blob: 10ac80c9a2ef32a69f3e328afa01ac4f2efc6ab5 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000056 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000101 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
157 unicode->length = length;
158
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000159 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000160 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 }
165 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000166
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 return 0;
168}
169
170/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000171 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
175
176*/
177
178static
179PyUnicodeObject *_PyUnicode_New(int length)
180{
181 register PyUnicodeObject *unicode;
182
Tim Petersced69f82003-09-16 20:30:58 +0000183 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
187 }
188
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000192 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000197 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000198 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000199 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000201 }
202 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000203 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000205 }
206 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213 }
214
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000215 if (!unicode->str) {
216 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000217 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000219 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
225 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000226 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000230 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000235 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
Guido van Rossum9475a232001-10-05 20:51:39 +0000240void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000261 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000262 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264}
265
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000266int PyUnicode_Resize(PyObject **unicode, int length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000267{
268 register PyUnicodeObject *v;
269
270 /* Argument checks */
271 if (unicode == NULL) {
272 PyErr_BadInternalCall();
273 return -1;
274 }
275 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000277 PyErr_BadInternalCall();
278 return -1;
279 }
280
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000284 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285 (v == unicode_empty || v->length == 1)) {
286 PyUnicodeObject *w = _PyUnicode_New(length);
287 if (w == NULL)
288 return -1;
289 Py_UNICODE_COPY(w->str, v->str,
290 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000291 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000292 *unicode = (PyObject *)w;
293 return 0;
294 }
295
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v, length);
299}
300
/* Internal to unicodeobject.c: calls PyUnicode_Resize() on the address
   of a variable, supplying the PyObject ** cast for the caller. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), length)
304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
306 int size)
307{
308 PyUnicodeObject *unicode;
309
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
312 if (u != NULL) {
313
314 /* Optimization for empty strings */
315 if (size == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return (PyObject *)unicode_empty;
318 }
319
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size == 1 && *u < 256) {
323 unicode = unicode_latin1[*u];
324 if (!unicode) {
325 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 if (!unicode)
327 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000328 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 unicode_latin1[*u] = unicode;
330 }
331 Py_INCREF(unicode);
332 return (PyObject *)unicode;
333 }
334 }
Tim Petersced69f82003-09-16 20:30:58 +0000335
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 unicode = _PyUnicode_New(size);
337 if (!unicode)
338 return NULL;
339
340 /* Copy the Unicode data into the new object */
341 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343
344 return (PyObject *)unicode;
345}
346
347#ifdef HAVE_WCHAR_H
348
349PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
350 int size)
351{
352 PyUnicodeObject *unicode;
353
354 if (w == NULL) {
355 PyErr_BadInternalCall();
356 return NULL;
357 }
358
359 unicode = _PyUnicode_New(size);
360 if (!unicode)
361 return NULL;
362
363 /* Copy the wchar_t data into the new object */
364#ifdef HAVE_USABLE_WCHAR_T
365 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000366#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 {
368 register Py_UNICODE *u;
369 register int i;
370 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000371 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372 *u++ = *w++;
373 }
374#endif
375
376 return (PyObject *)unicode;
377}
378
379int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
380 register wchar_t *w,
381 int size)
382{
383 if (unicode == NULL) {
384 PyErr_BadInternalCall();
385 return -1;
386 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000387
388 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000390 size = PyUnicode_GET_SIZE(unicode) + 1;
391
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392#ifdef HAVE_USABLE_WCHAR_T
393 memcpy(w, unicode->str, size * sizeof(wchar_t));
394#else
395 {
396 register Py_UNICODE *u;
397 register int i;
398 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000399 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400 *w++ = *u++;
401 }
402#endif
403
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000404 if (size > PyUnicode_GET_SIZE(unicode))
405 return PyUnicode_GET_SIZE(unicode);
406 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000407 return size;
408}
409
410#endif
411
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000412PyObject *PyUnicode_FromOrdinal(int ordinal)
413{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000414 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000415
416#ifdef Py_UNICODE_WIDE
417 if (ordinal < 0 || ordinal > 0x10ffff) {
418 PyErr_SetString(PyExc_ValueError,
419 "unichr() arg not in range(0x110000) "
420 "(wide Python build)");
421 return NULL;
422 }
423#else
424 if (ordinal < 0 || ordinal > 0xffff) {
425 PyErr_SetString(PyExc_ValueError,
426 "unichr() arg not in range(0x10000) "
427 "(narrow Python build)");
428 return NULL;
429 }
430#endif
431
Hye-Shik Chang40574832004-04-06 07:24:51 +0000432 s[0] = (Py_UNICODE)ordinal;
433 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000434}
435
Guido van Rossumd57fd912000-03-10 22:53:23 +0000436PyObject *PyUnicode_FromObject(register PyObject *obj)
437{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000438 /* XXX Perhaps we should make this API an alias of
439 PyObject_Unicode() instead ?! */
440 if (PyUnicode_CheckExact(obj)) {
441 Py_INCREF(obj);
442 return obj;
443 }
444 if (PyUnicode_Check(obj)) {
445 /* For a Unicode subtype that's not a Unicode object,
446 return a true Unicode object with the same data. */
447 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
448 PyUnicode_GET_SIZE(obj));
449 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000450 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
451}
452
453PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
454 const char *encoding,
455 const char *errors)
456{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000457 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000458 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000460
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461 if (obj == NULL) {
462 PyErr_BadInternalCall();
463 return NULL;
464 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000465
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000466#if 0
467 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000468 that no encodings is given and then redirect to
469 PyObject_Unicode() which then applies the additional logic for
470 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000471
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000472 NOTE: This API should really only be used for object which
473 represent *encoded* Unicode !
474
475 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000476 if (PyUnicode_Check(obj)) {
477 if (encoding) {
478 PyErr_SetString(PyExc_TypeError,
479 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000480 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000481 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000482 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000483 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000484#else
485 if (PyUnicode_Check(obj)) {
486 PyErr_SetString(PyExc_TypeError,
487 "decoding Unicode is not supported");
488 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000489 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000490#endif
491
492 /* Coerce object */
493 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000494 s = PyString_AS_STRING(obj);
495 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000496 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000497 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
498 /* Overwrite the error message with something more useful in
499 case of a TypeError. */
500 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000501 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000502 "coercing to Unicode: need string or buffer, "
503 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000504 obj->ob_type->tp_name);
505 goto onError;
506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 if (len == 0) {
510 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000511 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000512 }
Tim Petersced69f82003-09-16 20:30:58 +0000513 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000514 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000515
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000516 return v;
517
518 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000519 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000520}
521
522PyObject *PyUnicode_Decode(const char *s,
523 int size,
524 const char *encoding,
525 const char *errors)
526{
527 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000528
529 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000530 encoding = PyUnicode_GetDefaultEncoding();
531
532 /* Shortcuts for common default encodings */
533 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000534 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000535 else if (strcmp(encoding, "latin-1") == 0)
536 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000537#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
538 else if (strcmp(encoding, "mbcs") == 0)
539 return PyUnicode_DecodeMBCS(s, size, errors);
540#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000541 else if (strcmp(encoding, "ascii") == 0)
542 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543
544 /* Decode via the codec registry */
545 buffer = PyBuffer_FromMemory((void *)s, size);
546 if (buffer == NULL)
547 goto onError;
548 unicode = PyCodec_Decode(buffer, encoding, errors);
549 if (unicode == NULL)
550 goto onError;
551 if (!PyUnicode_Check(unicode)) {
552 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000553 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000554 unicode->ob_type->tp_name);
555 Py_DECREF(unicode);
556 goto onError;
557 }
558 Py_DECREF(buffer);
559 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000560
Guido van Rossumd57fd912000-03-10 22:53:23 +0000561 onError:
562 Py_XDECREF(buffer);
563 return NULL;
564}
565
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000566PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
567 const char *encoding,
568 const char *errors)
569{
570 PyObject *v;
571
572 if (!PyUnicode_Check(unicode)) {
573 PyErr_BadArgument();
574 goto onError;
575 }
576
577 if (encoding == NULL)
578 encoding = PyUnicode_GetDefaultEncoding();
579
580 /* Decode via the codec registry */
581 v = PyCodec_Decode(unicode, encoding, errors);
582 if (v == NULL)
583 goto onError;
584 return v;
585
586 onError:
587 return NULL;
588}
589
Guido van Rossumd57fd912000-03-10 22:53:23 +0000590PyObject *PyUnicode_Encode(const Py_UNICODE *s,
591 int size,
592 const char *encoding,
593 const char *errors)
594{
595 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000596
Guido van Rossumd57fd912000-03-10 22:53:23 +0000597 unicode = PyUnicode_FromUnicode(s, size);
598 if (unicode == NULL)
599 return NULL;
600 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
601 Py_DECREF(unicode);
602 return v;
603}
604
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000605PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
606 const char *encoding,
607 const char *errors)
608{
609 PyObject *v;
610
611 if (!PyUnicode_Check(unicode)) {
612 PyErr_BadArgument();
613 goto onError;
614 }
615
616 if (encoding == NULL)
617 encoding = PyUnicode_GetDefaultEncoding();
618
619 /* Encode via the codec registry */
620 v = PyCodec_Encode(unicode, encoding, errors);
621 if (v == NULL)
622 goto onError;
623 return v;
624
625 onError:
626 return NULL;
627}
628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
630 const char *encoding,
631 const char *errors)
632{
633 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000634
Guido van Rossumd57fd912000-03-10 22:53:23 +0000635 if (!PyUnicode_Check(unicode)) {
636 PyErr_BadArgument();
637 goto onError;
638 }
Fred Drakee4315f52000-05-09 19:53:39 +0000639
Tim Petersced69f82003-09-16 20:30:58 +0000640 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000641 encoding = PyUnicode_GetDefaultEncoding();
642
643 /* Shortcuts for common default encodings */
644 if (errors == NULL) {
645 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000646 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000647 else if (strcmp(encoding, "latin-1") == 0)
648 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000649#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
650 else if (strcmp(encoding, "mbcs") == 0)
651 return PyUnicode_AsMBCSString(unicode);
652#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000653 else if (strcmp(encoding, "ascii") == 0)
654 return PyUnicode_AsASCIIString(unicode);
655 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656
657 /* Encode via the codec registry */
658 v = PyCodec_Encode(unicode, encoding, errors);
659 if (v == NULL)
660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 if (!PyString_Check(v)) {
662 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000663 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 v->ob_type->tp_name);
665 Py_DECREF(v);
666 goto onError;
667 }
668 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000669
Guido van Rossumd57fd912000-03-10 22:53:23 +0000670 onError:
671 return NULL;
672}
673
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000674PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
675 const char *errors)
676{
677 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
678
679 if (v)
680 return v;
681 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
682 if (v && errors == NULL)
683 ((PyUnicodeObject *)unicode)->defenc = v;
684 return v;
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
688{
689 if (!PyUnicode_Check(unicode)) {
690 PyErr_BadArgument();
691 goto onError;
692 }
693 return PyUnicode_AS_UNICODE(unicode);
694
695 onError:
696 return NULL;
697}
698
699int PyUnicode_GetSize(PyObject *unicode)
700{
701 if (!PyUnicode_Check(unicode)) {
702 PyErr_BadArgument();
703 goto onError;
704 }
705 return PyUnicode_GET_SIZE(unicode);
706
707 onError:
708 return -1;
709}
710
Thomas Wouters78890102000-07-22 19:25:51 +0000711const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000712{
713 return unicode_default_encoding;
714}
715
716int PyUnicode_SetDefaultEncoding(const char *encoding)
717{
718 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000719
Fred Drakee4315f52000-05-09 19:53:39 +0000720 /* Make sure the encoding is valid. As side effect, this also
721 loads the encoding into the codec registry cache. */
722 v = _PyCodec_Lookup(encoding);
723 if (v == NULL)
724 goto onError;
725 Py_DECREF(v);
726 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000727 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000728 sizeof(unicode_default_encoding));
729 return 0;
730
731 onError:
732 return -1;
733}
734
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error
*/
741
742static
743int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
744 const char *encoding, const char *reason,
745 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
746 PyObject **output, int *outpos, Py_UNICODE **outptr)
747{
748 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
749
750 PyObject *restuple = NULL;
751 PyObject *repunicode = NULL;
752 int outsize = PyUnicode_GET_SIZE(*output);
753 int requiredsize;
754 int newpos;
755 Py_UNICODE *repptr;
756 int repsize;
757 int res = -1;
758
759 if (*errorHandler == NULL) {
760 *errorHandler = PyCodec_LookupError(errors);
761 if (*errorHandler == NULL)
762 goto onError;
763 }
764
765 if (*exceptionObject == NULL) {
766 *exceptionObject = PyUnicodeDecodeError_Create(
767 encoding, input, insize, *startinpos, *endinpos, reason);
768 if (*exceptionObject == NULL)
769 goto onError;
770 }
771 else {
772 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
773 goto onError;
774 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
775 goto onError;
776 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
777 goto onError;
778 }
779
780 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
781 if (restuple == NULL)
782 goto onError;
783 if (!PyTuple_Check(restuple)) {
784 PyErr_Format(PyExc_TypeError, &argparse[4]);
785 goto onError;
786 }
787 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
788 goto onError;
789 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000790 newpos = insize+newpos;
791 if (newpos<0 || newpos>insize) {
792 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
793 goto onError;
794 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000795
796 /* need more space? (at least enough for what we
797 have+the replacement+the rest of the string (starting
798 at the new input position), so we won't have to check space
799 when there are no errors in the rest of the string) */
800 repptr = PyUnicode_AS_UNICODE(repunicode);
801 repsize = PyUnicode_GET_SIZE(repunicode);
802 requiredsize = *outpos + repsize + insize-newpos;
803 if (requiredsize > outsize) {
804 if (requiredsize<2*outsize)
805 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000806 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000807 goto onError;
808 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
809 }
810 *endinpos = newpos;
811 *inptr = input + newpos;
812 Py_UNICODE_COPY(*outptr, repptr, repsize);
813 *outptr += repsize;
814 *outpos += repsize;
815 /* we made it! */
816 res = 0;
817
818 onError:
819 Py_XDECREF(restuple);
820 return res;
821}
822
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000823/* --- UTF-7 Codec -------------------------------------------------------- */
824
825/* see RFC2152 for details */
826
/* Classification of the 128 ASCII code points for the UTF-7 codec;
   says whether a character is special, i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f (TAB, LF and CR are whitespace) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* sp !  "  #  $  %  &  '  (  )  *  +  ,  -  .  / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* @  A  B  C  D  E  F  G  H  I  J  K  L  M  N  O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* `  a  b  c  d  e  f  g  h  i  j  k  l  m  n  o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~ del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
845
/* Note: The comparison (c) <= 0 is a trick to work-around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true */

/* True if character c has to be base64-encoded: anything outside
   1..127, every class 1 entry of utf7_special, plus whitespace and
   Set O characters when the respective flag is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low six bits of n to its base64 digit. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a base64 digit.  isalnum() is only defined for values
   representable as unsigned char (and is locale dependent above 127),
   so anything outside ASCII is rejected before it gets there. */
#define B64CHAR(c) \
    (((unsigned long)(c) < 128 && isalnum((int)(c))) || \
     (c) == '+' || (c) == '/')

/* Inverse of B64; only meaningful when B64CHAR(c) is true. */
#define UB64(c)        \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000863
/* Write as many complete base64 digits as the bit buffer (ch, holding
   `bits` valid low-order bits) contains to *out, advancing out and
   decrementing bits.  Wrapped in do { } while (0) so that it behaves
   like a single statement. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* Move as many complete 16-bit units as the bit buffer (ch, bits)
   contains to *out.  Relies on the expansion site providing a variable
   `errmsg` and a label `utf7Error`: a unit in the range 0xDC00..0xDFFF
   sets surrogate, stores a message and jumps there; while surrogate is
   set, the next unit is dropped and the flag is cleared. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000888
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000889PyObject *PyUnicode_DecodeUTF7(const char *s,
890 int size,
891 const char *errors)
892{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000893 const char *starts = s;
894 int startinpos;
895 int endinpos;
896 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000897 const char *e;
898 PyUnicodeObject *unicode;
899 Py_UNICODE *p;
900 const char *errmsg = "";
901 int inShift = 0;
902 unsigned int bitsleft = 0;
903 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000904 int surrogate = 0;
905 PyObject *errorHandler = NULL;
906 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000907
908 unicode = _PyUnicode_New(size);
909 if (!unicode)
910 return NULL;
911 if (size == 0)
912 return (PyObject *)unicode;
913
914 p = unicode->str;
915 e = s + size;
916
917 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000918 Py_UNICODE ch;
919 restart:
920 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000921
922 if (inShift) {
923 if ((ch == '-') || !B64CHAR(ch)) {
924 inShift = 0;
925 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000926
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000927 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
928 if (bitsleft >= 6) {
929 /* The shift sequence has a partial character in it. If
930 bitsleft < 6 then we could just classify it as padding
931 but that is not the case here */
932
933 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000934 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000935 }
936 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000937 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000938 here so indicate the potential of a misencoded character. */
939
940 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
941 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
942 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000943 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000944 }
945
946 if (ch == '-') {
947 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000948 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000949 inShift = 1;
950 }
951 } else if (SPECIAL(ch,0,0)) {
952 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000953 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000954 } else {
955 *p++ = ch;
956 }
957 } else {
958 charsleft = (charsleft << 6) | UB64(ch);
959 bitsleft += 6;
960 s++;
961 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
962 }
963 }
964 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000965 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000966 s++;
967 if (s < e && *s == '-') {
968 s++;
969 *p++ = '+';
970 } else
971 {
972 inShift = 1;
973 bitsleft = 0;
974 }
975 }
976 else if (SPECIAL(ch,0,0)) {
977 errmsg = "unexpected special character";
978 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000979 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000980 }
981 else {
982 *p++ = ch;
983 s++;
984 }
985 continue;
986 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000987 outpos = p-PyUnicode_AS_UNICODE(unicode);
988 endinpos = s-starts;
989 if (unicode_decode_call_errorhandler(
990 errors, &errorHandler,
991 "utf7", errmsg,
992 starts, size, &startinpos, &endinpos, &exc, &s,
993 (PyObject **)&unicode, &outpos, &p))
994 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000995 }
996
997 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000998 outpos = p-PyUnicode_AS_UNICODE(unicode);
999 endinpos = size;
1000 if (unicode_decode_call_errorhandler(
1001 errors, &errorHandler,
1002 "utf7", "unterminated shift sequence",
1003 starts, size, &startinpos, &endinpos, &exc, &s,
1004 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001005 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001006 if (s < e)
1007 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001008 }
1009
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001010 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001011 goto onError;
1012
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001013 Py_XDECREF(errorHandler);
1014 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001015 return (PyObject *)unicode;
1016
1017onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 Py_XDECREF(errorHandler);
1019 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001020 Py_DECREF(unicode);
1021 return NULL;
1022}
1023
1024
1025PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1026 int size,
1027 int encodeSetO,
1028 int encodeWhiteSpace,
1029 const char *errors)
1030{
1031 PyObject *v;
1032 /* It might be possible to tighten this worst case */
1033 unsigned int cbAllocated = 5 * size;
1034 int inShift = 0;
1035 int i = 0;
1036 unsigned int bitsleft = 0;
1037 unsigned long charsleft = 0;
1038 char * out;
1039 char * start;
1040
1041 if (size == 0)
1042 return PyString_FromStringAndSize(NULL, 0);
1043
1044 v = PyString_FromStringAndSize(NULL, cbAllocated);
1045 if (v == NULL)
1046 return NULL;
1047
1048 start = out = PyString_AS_STRING(v);
1049 for (;i < size; ++i) {
1050 Py_UNICODE ch = s[i];
1051
1052 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001053 if (ch == '+') {
1054 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001055 *out++ = '-';
1056 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1057 charsleft = ch;
1058 bitsleft = 16;
1059 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001060 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001062 } else {
1063 *out++ = (char) ch;
1064 }
1065 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001066 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1067 *out++ = B64(charsleft << (6-bitsleft));
1068 charsleft = 0;
1069 bitsleft = 0;
1070 /* Characters not in the BASE64 set implicitly unshift the sequence
1071 so no '-' is required, except if the character is itself a '-' */
1072 if (B64CHAR(ch) || ch == '-') {
1073 *out++ = '-';
1074 }
1075 inShift = 0;
1076 *out++ = (char) ch;
1077 } else {
1078 bitsleft += 16;
1079 charsleft = (charsleft << 16) | ch;
1080 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1081
1082 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001083 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001084 or '-' then the shift sequence will be terminated implicitly and we
1085 don't have to insert a '-'. */
1086
1087 if (bitsleft == 0) {
1088 if (i + 1 < size) {
1089 Py_UNICODE ch2 = s[i+1];
1090
1091 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001092
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001093 } else if (B64CHAR(ch2) || ch2 == '-') {
1094 *out++ = '-';
1095 inShift = 0;
1096 } else {
1097 inShift = 0;
1098 }
1099
1100 }
1101 else {
1102 *out++ = '-';
1103 inShift = 0;
1104 }
1105 }
Tim Petersced69f82003-09-16 20:30:58 +00001106 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001107 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001108 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001109 if (bitsleft) {
1110 *out++= B64(charsleft << (6-bitsleft) );
1111 *out++ = '-';
1112 }
1113
Tim Peters5de98422002-04-27 18:44:32 +00001114 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001115 return v;
1116}
1117
/* The UTF-7 helper macros are private to the codec above. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1124
Guido van Rossumd57fd912000-03-10 22:53:23 +00001125/* --- UTF-8 Codec -------------------------------------------------------- */
1126
/* Map UTF-8 encoded prefix byte to sequence length.  zero means
   illegal prefix.  see RFC 2279 for details */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: not a valid prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four, five and six bytes; 0xfe and 0xff are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1148
Guido van Rossumd57fd912000-03-10 22:53:23 +00001149PyObject *PyUnicode_DecodeUTF8(const char *s,
1150 int size,
1151 const char *errors)
1152{
Walter Dörwald69652032004-09-07 20:24:22 +00001153 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1154}
1155
1156PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1157 int size,
1158 const char *errors,
1159 int *consumed)
1160{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001161 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001163 int startinpos;
1164 int endinpos;
1165 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001166 const char *e;
1167 PyUnicodeObject *unicode;
1168 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001169 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001170 PyObject *errorHandler = NULL;
1171 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001172
1173 /* Note: size will always be longer than the resulting Unicode
1174 character count */
1175 unicode = _PyUnicode_New(size);
1176 if (!unicode)
1177 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001178 if (size == 0) {
1179 if (consumed)
1180 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183
1184 /* Unpack UTF-8 encoded data */
1185 p = unicode->str;
1186 e = s + size;
1187
1188 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001189 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190
1191 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001192 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193 s++;
1194 continue;
1195 }
1196
1197 n = utf8_code_length[ch];
1198
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001199 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001200 if (consumed)
1201 break;
1202 else {
1203 errmsg = "unexpected end of data";
1204 startinpos = s-starts;
1205 endinpos = size;
1206 goto utf8Error;
1207 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209
1210 switch (n) {
1211
1212 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001213 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001214 startinpos = s-starts;
1215 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001216 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
1218 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001219 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001220 startinpos = s-starts;
1221 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001222 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223
1224 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001225 if ((s[1] & 0xc0) != 0x80) {
1226 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001227 startinpos = s-starts;
1228 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001229 goto utf8Error;
1230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001232 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001233 startinpos = s-starts;
1234 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001235 errmsg = "illegal encoding";
1236 goto utf8Error;
1237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001239 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 break;
1241
1242 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001243 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001244 (s[2] & 0xc0) != 0x80) {
1245 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001246 startinpos = s-starts;
1247 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001248 goto utf8Error;
1249 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001251 if (ch < 0x0800) {
1252 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001253 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001254
1255 XXX For wide builds (UCS-4) we should probably try
1256 to recombine the surrogates into a single code
1257 unit.
1258 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001259 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001260 startinpos = s-starts;
1261 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001262 goto utf8Error;
1263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001265 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001266 break;
1267
1268 case 4:
1269 if ((s[1] & 0xc0) != 0x80 ||
1270 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001271 (s[3] & 0xc0) != 0x80) {
1272 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273 startinpos = s-starts;
1274 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001275 goto utf8Error;
1276 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001277 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1278 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1279 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001280 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001281 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001282 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001283 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001284 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 goto utf8Error;
1289 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001290#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001291 *p++ = (Py_UNICODE)ch;
1292#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001293 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001294
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001295 /* translate from 10000..10FFFF to 0..FFFF */
1296 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001297
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001298 /* high surrogate = top 10 bits added to D800 */
1299 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001300
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001301 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001302 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001303#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 break;
1305
1306 default:
1307 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001308 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001309 startinpos = s-starts;
1310 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001311 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001312 }
1313 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001314 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001315
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001316 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001317 outpos = p-PyUnicode_AS_UNICODE(unicode);
1318 if (unicode_decode_call_errorhandler(
1319 errors, &errorHandler,
1320 "utf8", errmsg,
1321 starts, size, &startinpos, &endinpos, &exc, &s,
1322 (PyObject **)&unicode, &outpos, &p))
1323 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001324 }
Walter Dörwald69652032004-09-07 20:24:22 +00001325 if (consumed)
1326 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327
1328 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001329 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001330 goto onError;
1331
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001332 Py_XDECREF(errorHandler);
1333 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334 return (PyObject *)unicode;
1335
1336onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337 Py_XDECREF(errorHandler);
1338 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001339 Py_DECREF(unicode);
1340 return NULL;
1341}
1342
Tim Peters602f7402002-04-27 18:03:26 +00001343/* Allocation strategy: if the string is short, convert into a stack buffer
1344 and allocate exactly as much space needed at the end. Else allocate the
1345 maximum possible needed (4 result bytes per Unicode character), and return
1346 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001347*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001348PyObject *
1349PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1350 int size,
1351 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001352{
Tim Peters602f7402002-04-27 18:03:26 +00001353#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001354
Tim Peters602f7402002-04-27 18:03:26 +00001355 int i; /* index into s of next input byte */
1356 PyObject *v; /* result string object */
1357 char *p; /* next free byte in output buffer */
1358 int nallocated; /* number of result bytes allocated */
1359 int nneeded; /* number of result bytes needed */
1360 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001361
Tim Peters602f7402002-04-27 18:03:26 +00001362 assert(s != NULL);
1363 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001364
Tim Peters602f7402002-04-27 18:03:26 +00001365 if (size <= MAX_SHORT_UNICHARS) {
1366 /* Write into the stack buffer; nallocated can't overflow.
1367 * At the end, we'll allocate exactly as much heap space as it
1368 * turns out we need.
1369 */
1370 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1371 v = NULL; /* will allocate after we're done */
1372 p = stackbuf;
1373 }
1374 else {
1375 /* Overallocate on the heap, and give the excess back at the end. */
1376 nallocated = size * 4;
1377 if (nallocated / 4 != size) /* overflow! */
1378 return PyErr_NoMemory();
1379 v = PyString_FromStringAndSize(NULL, nallocated);
1380 if (v == NULL)
1381 return NULL;
1382 p = PyString_AS_STRING(v);
1383 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001384
Tim Peters602f7402002-04-27 18:03:26 +00001385 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001386 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001387
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001388 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001389 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001390 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001391
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001393 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001394 *p++ = (char)(0xc0 | (ch >> 6));
1395 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001396 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001397 else {
Tim Peters602f7402002-04-27 18:03:26 +00001398 /* Encode UCS2 Unicode ordinals */
1399 if (ch < 0x10000) {
1400 /* Special case: check for high surrogate */
1401 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1402 Py_UCS4 ch2 = s[i];
1403 /* Check for low surrogate and combine the two to
1404 form a UCS4 value */
1405 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001406 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001407 i++;
1408 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001409 }
Tim Peters602f7402002-04-27 18:03:26 +00001410 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001411 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001412 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001413 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1414 *p++ = (char)(0x80 | (ch & 0x3f));
1415 continue;
1416 }
1417encodeUCS4:
1418 /* Encode UCS4 Unicode ordinals */
1419 *p++ = (char)(0xf0 | (ch >> 18));
1420 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1421 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1422 *p++ = (char)(0x80 | (ch & 0x3f));
1423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001425
Tim Peters602f7402002-04-27 18:03:26 +00001426 if (v == NULL) {
1427 /* This was stack allocated. */
1428 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1429 assert(nneeded <= nallocated);
1430 v = PyString_FromStringAndSize(stackbuf, nneeded);
1431 }
1432 else {
1433 /* Cut back to size actually needed. */
1434 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1435 assert(nneeded <= nallocated);
1436 _PyString_Resize(&v, nneeded);
1437 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001438 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001439
Tim Peters602f7402002-04-27 18:03:26 +00001440#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441}
1442
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1444{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 if (!PyUnicode_Check(unicode)) {
1446 PyErr_BadArgument();
1447 return NULL;
1448 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001449 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1450 PyUnicode_GET_SIZE(unicode),
1451 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001452}
1453
1454/* --- UTF-16 Codec ------------------------------------------------------- */
1455
Tim Peters772747b2001-08-09 22:21:55 +00001456PyObject *
1457PyUnicode_DecodeUTF16(const char *s,
1458 int size,
1459 const char *errors,
1460 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001461{
Walter Dörwald69652032004-09-07 20:24:22 +00001462 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1463}
1464
1465PyObject *
1466PyUnicode_DecodeUTF16Stateful(const char *s,
1467 int size,
1468 const char *errors,
1469 int *byteorder,
1470 int *consumed)
1471{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001472 const char *starts = s;
1473 int startinpos;
1474 int endinpos;
1475 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001476 PyUnicodeObject *unicode;
1477 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001478 const unsigned char *q, *e;
1479 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001480 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001481 /* Offsets from q for retrieving byte pairs in the right order. */
1482#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1483 int ihi = 1, ilo = 0;
1484#else
1485 int ihi = 0, ilo = 1;
1486#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001487 PyObject *errorHandler = NULL;
1488 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001489
1490 /* Note: size will always be longer than the resulting Unicode
1491 character count */
1492 unicode = _PyUnicode_New(size);
1493 if (!unicode)
1494 return NULL;
1495 if (size == 0)
1496 return (PyObject *)unicode;
1497
1498 /* Unpack UTF-16 encoded data */
1499 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001500 q = (unsigned char *)s;
1501 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001502
1503 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001504 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001506 /* Check for BOM marks (U+FEFF) in the input and adjust current
1507 byte order setting accordingly. In native mode, the leading BOM
1508 mark is skipped, in all other modes, it is copied to the output
1509 stream as-is (giving a ZWNBSP character). */
1510 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001511 if (size >= 2) {
1512 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001513#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001514 if (bom == 0xFEFF) {
1515 q += 2;
1516 bo = -1;
1517 }
1518 else if (bom == 0xFFFE) {
1519 q += 2;
1520 bo = 1;
1521 }
Tim Petersced69f82003-09-16 20:30:58 +00001522#else
Walter Dörwald69652032004-09-07 20:24:22 +00001523 if (bom == 0xFEFF) {
1524 q += 2;
1525 bo = 1;
1526 }
1527 else if (bom == 0xFFFE) {
1528 q += 2;
1529 bo = -1;
1530 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001531#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001532 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001534
Tim Peters772747b2001-08-09 22:21:55 +00001535 if (bo == -1) {
1536 /* force LE */
1537 ihi = 1;
1538 ilo = 0;
1539 }
1540 else if (bo == 1) {
1541 /* force BE */
1542 ihi = 0;
1543 ilo = 1;
1544 }
1545
1546 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001547 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001548 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001549 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001550 if (consumed)
1551 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001552 errmsg = "truncated data";
1553 startinpos = ((const char *)q)-starts;
1554 endinpos = ((const char *)e)-starts;
1555 goto utf16Error;
1556 /* The remaining input chars are ignored if the callback
1557 chooses to skip the input */
1558 }
1559 ch = (q[ihi] << 8) | q[ilo];
1560
Tim Peters772747b2001-08-09 22:21:55 +00001561 q += 2;
1562
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563 if (ch < 0xD800 || ch > 0xDFFF) {
1564 *p++ = ch;
1565 continue;
1566 }
1567
1568 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001569 if (q >= e) {
1570 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001571 startinpos = (((const char *)q)-2)-starts;
1572 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001573 goto utf16Error;
1574 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001575 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001576 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1577 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001578 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001579#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001580 *p++ = ch;
1581 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001582#else
1583 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001584#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001585 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001586 }
1587 else {
1588 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001589 startinpos = (((const char *)q)-4)-starts;
1590 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001591 goto utf16Error;
1592 }
1593
Guido van Rossumd57fd912000-03-10 22:53:23 +00001594 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001595 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001596 startinpos = (((const char *)q)-2)-starts;
1597 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001598 /* Fall through to report the error */
1599
1600 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001601 outpos = p-PyUnicode_AS_UNICODE(unicode);
1602 if (unicode_decode_call_errorhandler(
1603 errors, &errorHandler,
1604 "utf16", errmsg,
1605 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1606 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001607 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608 }
1609
1610 if (byteorder)
1611 *byteorder = bo;
1612
Walter Dörwald69652032004-09-07 20:24:22 +00001613 if (consumed)
1614 *consumed = (const char *)q-starts;
1615
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001617 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618 goto onError;
1619
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001620 Py_XDECREF(errorHandler);
1621 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622 return (PyObject *)unicode;
1623
1624onError:
1625 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001626 Py_XDECREF(errorHandler);
1627 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001628 return NULL;
1629}
1630
Tim Peters772747b2001-08-09 22:21:55 +00001631PyObject *
1632PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1633 int size,
1634 const char *errors,
1635 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001636{
1637 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001638 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001639#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001640 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001641#else
1642 const int pairs = 0;
1643#endif
Tim Peters772747b2001-08-09 22:21:55 +00001644 /* Offsets from p for storing byte pairs in the right order. */
1645#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1646 int ihi = 1, ilo = 0;
1647#else
1648 int ihi = 0, ilo = 1;
1649#endif
1650
1651#define STORECHAR(CH) \
1652 do { \
1653 p[ihi] = ((CH) >> 8) & 0xff; \
1654 p[ilo] = (CH) & 0xff; \
1655 p += 2; \
1656 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001658#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001659 for (i = pairs = 0; i < size; i++)
1660 if (s[i] >= 0x10000)
1661 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001662#endif
Tim Petersced69f82003-09-16 20:30:58 +00001663 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001664 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665 if (v == NULL)
1666 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001667
Tim Peters772747b2001-08-09 22:21:55 +00001668 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001670 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001671 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001672 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001673
1674 if (byteorder == -1) {
1675 /* force LE */
1676 ihi = 1;
1677 ilo = 0;
1678 }
1679 else if (byteorder == 1) {
1680 /* force BE */
1681 ihi = 0;
1682 ilo = 1;
1683 }
1684
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001685 while (size-- > 0) {
1686 Py_UNICODE ch = *s++;
1687 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001688#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001689 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001690 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1691 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001693#endif
Tim Peters772747b2001-08-09 22:21:55 +00001694 STORECHAR(ch);
1695 if (ch2)
1696 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001697 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001699#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700}
1701
1702PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1703{
1704 if (!PyUnicode_Check(unicode)) {
1705 PyErr_BadArgument();
1706 return NULL;
1707 }
1708 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1709 PyUnicode_GET_SIZE(unicode),
1710 NULL,
1711 0);
1712}
1713
1714/* --- Unicode Escape Codec ----------------------------------------------- */
1715
Fredrik Lundh06d12682001-01-24 07:59:11 +00001716static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001717
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1719 int size,
1720 const char *errors)
1721{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001722 const char *starts = s;
1723 int startinpos;
1724 int endinpos;
1725 int outpos;
1726 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001728 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001730 char* message;
1731 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001732 PyObject *errorHandler = NULL;
1733 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001734
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735 /* Escaped strings will always be longer than the resulting
1736 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001737 length after conversion to the true value.
1738 (but if the error callback returns a long replacement string
1739 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001740 v = _PyUnicode_New(size);
1741 if (v == NULL)
1742 goto onError;
1743 if (size == 0)
1744 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001745
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001748
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749 while (s < end) {
1750 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001751 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001752 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753
1754 /* Non-escape characters are interpreted as Unicode ordinals */
1755 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001756 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757 continue;
1758 }
1759
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001760 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001761 /* \ - Escapes */
1762 s++;
1763 switch (*s++) {
1764
1765 /* \x escapes */
1766 case '\n': break;
1767 case '\\': *p++ = '\\'; break;
1768 case '\'': *p++ = '\''; break;
1769 case '\"': *p++ = '\"'; break;
1770 case 'b': *p++ = '\b'; break;
1771 case 'f': *p++ = '\014'; break; /* FF */
1772 case 't': *p++ = '\t'; break;
1773 case 'n': *p++ = '\n'; break;
1774 case 'r': *p++ = '\r'; break;
1775 case 'v': *p++ = '\013'; break; /* VT */
1776 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1777
1778 /* \OOO (octal) escapes */
1779 case '0': case '1': case '2': case '3':
1780 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001781 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001783 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001785 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001787 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788 break;
1789
Fredrik Lundhccc74732001-02-18 22:13:49 +00001790 /* hex escapes */
1791 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001792 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001793 digits = 2;
1794 message = "truncated \\xXX escape";
1795 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001796
Fredrik Lundhccc74732001-02-18 22:13:49 +00001797 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001799 digits = 4;
1800 message = "truncated \\uXXXX escape";
1801 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802
Fredrik Lundhccc74732001-02-18 22:13:49 +00001803 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001804 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001805 digits = 8;
1806 message = "truncated \\UXXXXXXXX escape";
1807 hexescape:
1808 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001809 outpos = p-PyUnicode_AS_UNICODE(v);
1810 if (s+digits>end) {
1811 endinpos = size;
1812 if (unicode_decode_call_errorhandler(
1813 errors, &errorHandler,
1814 "unicodeescape", "end of string in escape sequence",
1815 starts, size, &startinpos, &endinpos, &exc, &s,
1816 (PyObject **)&v, &outpos, &p))
1817 goto onError;
1818 goto nextByte;
1819 }
1820 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001821 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001822 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001823 endinpos = (s+i+1)-starts;
1824 if (unicode_decode_call_errorhandler(
1825 errors, &errorHandler,
1826 "unicodeescape", message,
1827 starts, size, &startinpos, &endinpos, &exc, &s,
1828 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001829 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001830 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001831 }
1832 chr = (chr<<4) & ~0xF;
1833 if (c >= '0' && c <= '9')
1834 chr += c - '0';
1835 else if (c >= 'a' && c <= 'f')
1836 chr += 10 + c - 'a';
1837 else
1838 chr += 10 + c - 'A';
1839 }
1840 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001841 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001842 /* _decoding_error will have already written into the
1843 target buffer. */
1844 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001845 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001846 /* when we get here, chr is a 32-bit unicode character */
1847 if (chr <= 0xffff)
1848 /* UCS-2 character */
1849 *p++ = (Py_UNICODE) chr;
1850 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001851 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001852 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001853#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001854 *p++ = chr;
1855#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001856 chr -= 0x10000L;
1857 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001858 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001859#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001860 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001861 endinpos = s-starts;
1862 outpos = p-PyUnicode_AS_UNICODE(v);
1863 if (unicode_decode_call_errorhandler(
1864 errors, &errorHandler,
1865 "unicodeescape", "illegal Unicode character",
1866 starts, size, &startinpos, &endinpos, &exc, &s,
1867 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001868 goto onError;
1869 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001870 break;
1871
1872 /* \N{name} */
1873 case 'N':
1874 message = "malformed \\N character escape";
1875 if (ucnhash_CAPI == NULL) {
1876 /* load the unicode data module */
1877 PyObject *m, *v;
1878 m = PyImport_ImportModule("unicodedata");
1879 if (m == NULL)
1880 goto ucnhashError;
1881 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1882 Py_DECREF(m);
1883 if (v == NULL)
1884 goto ucnhashError;
1885 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1886 Py_DECREF(v);
1887 if (ucnhash_CAPI == NULL)
1888 goto ucnhashError;
1889 }
1890 if (*s == '{') {
1891 const char *start = s+1;
1892 /* look for the closing brace */
1893 while (*s != '}' && s < end)
1894 s++;
1895 if (s > start && s < end && *s == '}') {
1896 /* found a name. look it up in the unicode database */
1897 message = "unknown Unicode character name";
1898 s++;
1899 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1900 goto store;
1901 }
1902 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001903 endinpos = s-starts;
1904 outpos = p-PyUnicode_AS_UNICODE(v);
1905 if (unicode_decode_call_errorhandler(
1906 errors, &errorHandler,
1907 "unicodeescape", message,
1908 starts, size, &startinpos, &endinpos, &exc, &s,
1909 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001910 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001911 break;
1912
1913 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001914 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001915 message = "\\ at end of string";
1916 s--;
1917 endinpos = s-starts;
1918 outpos = p-PyUnicode_AS_UNICODE(v);
1919 if (unicode_decode_call_errorhandler(
1920 errors, &errorHandler,
1921 "unicodeescape", message,
1922 starts, size, &startinpos, &endinpos, &exc, &s,
1923 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001924 goto onError;
1925 }
1926 else {
1927 *p++ = '\\';
1928 *p++ = (unsigned char)s[-1];
1929 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001930 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001931 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001932 nextByte:
1933 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001934 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001935 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001936 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001937 Py_XDECREF(errorHandler);
1938 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001940
Fredrik Lundhccc74732001-02-18 22:13:49 +00001941ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001942 PyErr_SetString(
1943 PyExc_UnicodeError,
1944 "\\N escapes not supported (can't load unicodedata module)"
1945 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001946 Py_XDECREF(errorHandler);
1947 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001948 return NULL;
1949
Fredrik Lundhccc74732001-02-18 22:13:49 +00001950onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001952 Py_XDECREF(errorHandler);
1953 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954 return NULL;
1955}
1956
1957/* Return a Unicode-Escape string version of the Unicode object.
1958
1959 If quotes is true, the string is enclosed in u"" or u'' quotes as
1960 appropriate.
1961
1962*/
1963
Barry Warsaw51ac5802000-03-20 16:36:48 +00001964static const Py_UNICODE *findchar(const Py_UNICODE *s,
1965 int size,
1966 Py_UNICODE ch);
1967
Guido van Rossumd57fd912000-03-10 22:53:23 +00001968static
1969PyObject *unicodeescape_string(const Py_UNICODE *s,
1970 int size,
1971 int quotes)
1972{
1973 PyObject *repr;
1974 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001976 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977
1978 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1979 if (repr == NULL)
1980 return NULL;
1981
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001982 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983
1984 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001986 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 !findchar(s, size, '"')) ? '"' : '\'';
1988 }
1989 while (size-- > 0) {
1990 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001991
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 /* Escape quotes */
Tim Petersced69f82003-09-16 20:30:58 +00001993 if (quotes &&
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001994 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001995 *p++ = '\\';
1996 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001997 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001998 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001999
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002000#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002001 /* Map 21-bit characters to '\U00xxxxxx' */
2002 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002003 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002004
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002005 /* Resize the string if necessary */
2006 if (offset + 12 > PyString_GET_SIZE(repr)) {
2007 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002008 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002009 p = PyString_AS_STRING(repr) + offset;
2010 }
2011
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002012 *p++ = '\\';
2013 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002014 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2015 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2016 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2017 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2018 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2019 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2020 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002021 *p++ = hexdigit[ch & 0x0000000F];
2022 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002023 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002024#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002025 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2026 else if (ch >= 0xD800 && ch < 0xDC00) {
2027 Py_UNICODE ch2;
2028 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002029
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002030 ch2 = *s++;
2031 size--;
2032 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2033 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2034 *p++ = '\\';
2035 *p++ = 'U';
2036 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2037 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2038 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2039 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2040 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2041 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2042 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2043 *p++ = hexdigit[ucs & 0x0000000F];
2044 continue;
2045 }
2046 /* Fall through: isolated surrogates are copied as-is */
2047 s--;
2048 size++;
2049 }
2050
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002052 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 *p++ = '\\';
2054 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002055 *p++ = hexdigit[(ch >> 12) & 0x000F];
2056 *p++ = hexdigit[(ch >> 8) & 0x000F];
2057 *p++ = hexdigit[(ch >> 4) & 0x000F];
2058 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002060
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002061 /* Map special whitespace to '\t', \n', '\r' */
2062 else if (ch == '\t') {
2063 *p++ = '\\';
2064 *p++ = 't';
2065 }
2066 else if (ch == '\n') {
2067 *p++ = '\\';
2068 *p++ = 'n';
2069 }
2070 else if (ch == '\r') {
2071 *p++ = '\\';
2072 *p++ = 'r';
2073 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002074
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002075 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002076 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002077 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002078 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002079 *p++ = hexdigit[(ch >> 4) & 0x000F];
2080 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002081 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002082
Guido van Rossumd57fd912000-03-10 22:53:23 +00002083 /* Copy everything else as-is */
2084 else
2085 *p++ = (char) ch;
2086 }
2087 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002088 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002089
2090 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002091 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092 return repr;
2093}
2094
2095PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2096 int size)
2097{
2098 return unicodeescape_string(s, size, 0);
2099}
2100
2101PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2102{
2103 if (!PyUnicode_Check(unicode)) {
2104 PyErr_BadArgument();
2105 return NULL;
2106 }
2107 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2108 PyUnicode_GET_SIZE(unicode));
2109}
2110
2111/* --- Raw Unicode Escape Codec ------------------------------------------- */
2112
2113PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2114 int size,
2115 const char *errors)
2116{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002117 const char *starts = s;
2118 int startinpos;
2119 int endinpos;
2120 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002122 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 const char *end;
2124 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002125 PyObject *errorHandler = NULL;
2126 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002127
Guido van Rossumd57fd912000-03-10 22:53:23 +00002128 /* Escaped strings will always be longer than the resulting
2129 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002130 length after conversion to the true value. (But decoding error
2131 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002132 v = _PyUnicode_New(size);
2133 if (v == NULL)
2134 goto onError;
2135 if (size == 0)
2136 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002137 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002138 end = s + size;
2139 while (s < end) {
2140 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002141 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002143 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144
2145 /* Non-escape characters are interpreted as Unicode ordinals */
2146 if (*s != '\\') {
2147 *p++ = (unsigned char)*s++;
2148 continue;
2149 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002150 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002151
2152 /* \u-escapes are only interpreted iff the number of leading
2153 backslashes if odd */
2154 bs = s;
2155 for (;s < end;) {
2156 if (*s != '\\')
2157 break;
2158 *p++ = (unsigned char)*s++;
2159 }
2160 if (((s - bs) & 1) == 0 ||
2161 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002162 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163 continue;
2164 }
2165 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002166 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167 s++;
2168
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002169 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002170 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002171 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002172 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002173 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002174 endinpos = s-starts;
2175 if (unicode_decode_call_errorhandler(
2176 errors, &errorHandler,
2177 "rawunicodeescape", "truncated \\uXXXX",
2178 starts, size, &startinpos, &endinpos, &exc, &s,
2179 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002181 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 }
2183 x = (x<<4) & ~0xF;
2184 if (c >= '0' && c <= '9')
2185 x += c - '0';
2186 else if (c >= 'a' && c <= 'f')
2187 x += 10 + c - 'a';
2188 else
2189 x += 10 + c - 'A';
2190 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002191#ifndef Py_UNICODE_WIDE
2192 if (x > 0x10000) {
2193 if (unicode_decode_call_errorhandler(
2194 errors, &errorHandler,
2195 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2196 starts, size, &startinpos, &endinpos, &exc, &s,
2197 (PyObject **)&v, &outpos, &p))
2198 goto onError;
2199 }
2200#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002201 *p++ = x;
2202 nextByte:
2203 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002205 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002206 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002207 Py_XDECREF(errorHandler);
2208 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002210
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211 onError:
2212 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002213 Py_XDECREF(errorHandler);
2214 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215 return NULL;
2216}
2217
2218PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2219 int size)
2220{
2221 PyObject *repr;
2222 char *p;
2223 char *q;
2224
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002225 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002227#ifdef Py_UNICODE_WIDE
2228 repr = PyString_FromStringAndSize(NULL, 10 * size);
2229#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002231#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 if (repr == NULL)
2233 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002234 if (size == 0)
2235 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236
2237 p = q = PyString_AS_STRING(repr);
2238 while (size-- > 0) {
2239 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002240#ifdef Py_UNICODE_WIDE
2241 /* Map 32-bit characters to '\Uxxxxxxxx' */
2242 if (ch >= 0x10000) {
2243 *p++ = '\\';
2244 *p++ = 'U';
2245 *p++ = hexdigit[(ch >> 28) & 0xf];
2246 *p++ = hexdigit[(ch >> 24) & 0xf];
2247 *p++ = hexdigit[(ch >> 20) & 0xf];
2248 *p++ = hexdigit[(ch >> 16) & 0xf];
2249 *p++ = hexdigit[(ch >> 12) & 0xf];
2250 *p++ = hexdigit[(ch >> 8) & 0xf];
2251 *p++ = hexdigit[(ch >> 4) & 0xf];
2252 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002253 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002254 else
2255#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002256 /* Map 16-bit characters to '\uxxxx' */
2257 if (ch >= 256) {
2258 *p++ = '\\';
2259 *p++ = 'u';
2260 *p++ = hexdigit[(ch >> 12) & 0xf];
2261 *p++ = hexdigit[(ch >> 8) & 0xf];
2262 *p++ = hexdigit[(ch >> 4) & 0xf];
2263 *p++ = hexdigit[ch & 15];
2264 }
2265 /* Copy everything else as-is */
2266 else
2267 *p++ = (char) ch;
2268 }
2269 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002270 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002271 return repr;
2272}
2273
2274PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2275{
2276 if (!PyUnicode_Check(unicode)) {
2277 PyErr_BadArgument();
2278 return NULL;
2279 }
2280 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2281 PyUnicode_GET_SIZE(unicode));
2282}
2283
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002284/* --- Unicode Internal Codec ------------------------------------------- */
2285
2286PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
2287 int size,
2288 const char *errors)
2289{
2290 const char *starts = s;
2291 int startinpos;
2292 int endinpos;
2293 int outpos;
2294 Py_UNICODE unimax;
2295 PyUnicodeObject *v;
2296 Py_UNICODE *p;
2297 const char *end;
2298 const char *reason;
2299 PyObject *errorHandler = NULL;
2300 PyObject *exc = NULL;
2301
2302 unimax = PyUnicode_GetMax();
2303 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2304 if (v == NULL)
2305 goto onError;
2306 if (PyUnicode_GetSize((PyObject *)v) == 0)
2307 return (PyObject *)v;
2308 p = PyUnicode_AS_UNICODE(v);
2309 end = s + size;
2310
2311 while (s < end) {
2312 *p = *(Py_UNICODE *)s;
2313 /* We have to sanity check the raw data, otherwise doom looms for
2314 some malformed UCS-4 data. */
2315 if (
2316 #ifdef Py_UNICODE_WIDE
2317 *p > unimax || *p < 0 ||
2318 #endif
2319 end-s < Py_UNICODE_SIZE
2320 )
2321 {
2322 startinpos = s - starts;
2323 if (end-s < Py_UNICODE_SIZE) {
2324 endinpos = end-starts;
2325 reason = "truncated input";
2326 }
2327 else {
2328 endinpos = s - starts + Py_UNICODE_SIZE;
2329 reason = "illegal code point (> 0x10FFFF)";
2330 }
2331 outpos = p - PyUnicode_AS_UNICODE(v);
2332 if (unicode_decode_call_errorhandler(
2333 errors, &errorHandler,
2334 "unicode_internal", reason,
2335 starts, size, &startinpos, &endinpos, &exc, &s,
2336 (PyObject **)&v, &outpos, &p)) {
2337 goto onError;
2338 }
2339 }
2340 else {
2341 p++;
2342 s += Py_UNICODE_SIZE;
2343 }
2344 }
2345
2346 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
2347 goto onError;
2348 Py_XDECREF(errorHandler);
2349 Py_XDECREF(exc);
2350 return (PyObject *)v;
2351
2352 onError:
2353 Py_XDECREF(v);
2354 Py_XDECREF(errorHandler);
2355 Py_XDECREF(exc);
2356 return NULL;
2357}
2358
Guido van Rossumd57fd912000-03-10 22:53:23 +00002359/* --- Latin-1 Codec ------------------------------------------------------ */
2360
2361PyObject *PyUnicode_DecodeLatin1(const char *s,
2362 int size,
2363 const char *errors)
2364{
2365 PyUnicodeObject *v;
2366 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002367
Guido van Rossumd57fd912000-03-10 22:53:23 +00002368 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002369 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002370 Py_UNICODE r = *(unsigned char*)s;
2371 return PyUnicode_FromUnicode(&r, 1);
2372 }
2373
Guido van Rossumd57fd912000-03-10 22:53:23 +00002374 v = _PyUnicode_New(size);
2375 if (v == NULL)
2376 goto onError;
2377 if (size == 0)
2378 return (PyObject *)v;
2379 p = PyUnicode_AS_UNICODE(v);
2380 while (size-- > 0)
2381 *p++ = (unsigned char)*s++;
2382 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002383
Guido van Rossumd57fd912000-03-10 22:53:23 +00002384 onError:
2385 Py_XDECREF(v);
2386 return NULL;
2387}
2388
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002389/* create or adjust a UnicodeEncodeError */
2390static void make_encode_exception(PyObject **exceptionObject,
2391 const char *encoding,
2392 const Py_UNICODE *unicode, int size,
2393 int startpos, int endpos,
2394 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002395{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002396 if (*exceptionObject == NULL) {
2397 *exceptionObject = PyUnicodeEncodeError_Create(
2398 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002399 }
2400 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002401 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2402 goto onError;
2403 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2404 goto onError;
2405 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2406 goto onError;
2407 return;
2408 onError:
2409 Py_DECREF(*exceptionObject);
2410 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002411 }
2412}
2413
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002414/* raises a UnicodeEncodeError */
2415static void raise_encode_exception(PyObject **exceptionObject,
2416 const char *encoding,
2417 const Py_UNICODE *unicode, int size,
2418 int startpos, int endpos,
2419 const char *reason)
2420{
2421 make_encode_exception(exceptionObject,
2422 encoding, unicode, size, startpos, endpos, reason);
2423 if (*exceptionObject != NULL)
2424 PyCodec_StrictErrors(*exceptionObject);
2425}
2426
2427/* error handling callback helper:
2428 build arguments, call the callback and check the arguments,
2429 put the result into newpos and return the replacement string, which
2430 has to be freed by the caller */
2431static PyObject *unicode_encode_call_errorhandler(const char *errors,
2432 PyObject **errorHandler,
2433 const char *encoding, const char *reason,
2434 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2435 int startpos, int endpos,
2436 int *newpos)
2437{
2438 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2439
2440 PyObject *restuple;
2441 PyObject *resunicode;
2442
2443 if (*errorHandler == NULL) {
2444 *errorHandler = PyCodec_LookupError(errors);
2445 if (*errorHandler == NULL)
2446 return NULL;
2447 }
2448
2449 make_encode_exception(exceptionObject,
2450 encoding, unicode, size, startpos, endpos, reason);
2451 if (*exceptionObject == NULL)
2452 return NULL;
2453
2454 restuple = PyObject_CallFunctionObjArgs(
2455 *errorHandler, *exceptionObject, NULL);
2456 if (restuple == NULL)
2457 return NULL;
2458 if (!PyTuple_Check(restuple)) {
2459 PyErr_Format(PyExc_TypeError, &argparse[4]);
2460 Py_DECREF(restuple);
2461 return NULL;
2462 }
2463 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2464 &resunicode, newpos)) {
2465 Py_DECREF(restuple);
2466 return NULL;
2467 }
2468 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002469 *newpos = size+*newpos;
2470 if (*newpos<0 || *newpos>size) {
2471 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2472 Py_DECREF(restuple);
2473 return NULL;
2474 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002475 Py_INCREF(resunicode);
2476 Py_DECREF(restuple);
2477 return resunicode;
2478}
2479
2480static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2481 int size,
2482 const char *errors,
2483 int limit)
2484{
2485 /* output object */
2486 PyObject *res;
2487 /* pointers to the beginning and end+1 of input */
2488 const Py_UNICODE *startp = p;
2489 const Py_UNICODE *endp = p + size;
2490 /* pointer to the beginning of the unencodable characters */
2491 /* const Py_UNICODE *badp = NULL; */
2492 /* pointer into the output */
2493 char *str;
2494 /* current output position */
2495 int respos = 0;
2496 int ressize;
2497 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2498 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2499 PyObject *errorHandler = NULL;
2500 PyObject *exc = NULL;
2501 /* the following variable is used for caching string comparisons
2502 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2503 int known_errorHandler = -1;
2504
2505 /* allocate enough for a simple encoding without
2506 replacements, if we need more, we'll resize */
2507 res = PyString_FromStringAndSize(NULL, size);
2508 if (res == NULL)
2509 goto onError;
2510 if (size == 0)
2511 return res;
2512 str = PyString_AS_STRING(res);
2513 ressize = size;
2514
2515 while (p<endp) {
2516 Py_UNICODE c = *p;
2517
2518 /* can we encode this? */
2519 if (c<limit) {
2520 /* no overflow check, because we know that the space is enough */
2521 *str++ = (char)c;
2522 ++p;
2523 }
2524 else {
2525 int unicodepos = p-startp;
2526 int requiredsize;
2527 PyObject *repunicode;
2528 int repsize;
2529 int newpos;
2530 int respos;
2531 Py_UNICODE *uni2;
2532 /* startpos for collecting unencodable chars */
2533 const Py_UNICODE *collstart = p;
2534 const Py_UNICODE *collend = p;
2535 /* find all unecodable characters */
2536 while ((collend < endp) && ((*collend)>=limit))
2537 ++collend;
2538 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2539 if (known_errorHandler==-1) {
2540 if ((errors==NULL) || (!strcmp(errors, "strict")))
2541 known_errorHandler = 1;
2542 else if (!strcmp(errors, "replace"))
2543 known_errorHandler = 2;
2544 else if (!strcmp(errors, "ignore"))
2545 known_errorHandler = 3;
2546 else if (!strcmp(errors, "xmlcharrefreplace"))
2547 known_errorHandler = 4;
2548 else
2549 known_errorHandler = 0;
2550 }
2551 switch (known_errorHandler) {
2552 case 1: /* strict */
2553 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2554 goto onError;
2555 case 2: /* replace */
2556 while (collstart++<collend)
2557 *str++ = '?'; /* fall through */
2558 case 3: /* ignore */
2559 p = collend;
2560 break;
2561 case 4: /* xmlcharrefreplace */
2562 respos = str-PyString_AS_STRING(res);
2563 /* determine replacement size (temporarily (mis)uses p) */
2564 for (p = collstart, repsize = 0; p < collend; ++p) {
2565 if (*p<10)
2566 repsize += 2+1+1;
2567 else if (*p<100)
2568 repsize += 2+2+1;
2569 else if (*p<1000)
2570 repsize += 2+3+1;
2571 else if (*p<10000)
2572 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002573#ifndef Py_UNICODE_WIDE
2574 else
2575 repsize += 2+5+1;
2576#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002577 else if (*p<100000)
2578 repsize += 2+5+1;
2579 else if (*p<1000000)
2580 repsize += 2+6+1;
2581 else
2582 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002583#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002584 }
2585 requiredsize = respos+repsize+(endp-collend);
2586 if (requiredsize > ressize) {
2587 if (requiredsize<2*ressize)
2588 requiredsize = 2*ressize;
2589 if (_PyString_Resize(&res, requiredsize))
2590 goto onError;
2591 str = PyString_AS_STRING(res) + respos;
2592 ressize = requiredsize;
2593 }
2594 /* generate replacement (temporarily (mis)uses p) */
2595 for (p = collstart; p < collend; ++p) {
2596 str += sprintf(str, "&#%d;", (int)*p);
2597 }
2598 p = collend;
2599 break;
2600 default:
2601 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2602 encoding, reason, startp, size, &exc,
2603 collstart-startp, collend-startp, &newpos);
2604 if (repunicode == NULL)
2605 goto onError;
2606 /* need more space? (at least enough for what we
2607 have+the replacement+the rest of the string, so
2608 we won't have to check space for encodable characters) */
2609 respos = str-PyString_AS_STRING(res);
2610 repsize = PyUnicode_GET_SIZE(repunicode);
2611 requiredsize = respos+repsize+(endp-collend);
2612 if (requiredsize > ressize) {
2613 if (requiredsize<2*ressize)
2614 requiredsize = 2*ressize;
2615 if (_PyString_Resize(&res, requiredsize)) {
2616 Py_DECREF(repunicode);
2617 goto onError;
2618 }
2619 str = PyString_AS_STRING(res) + respos;
2620 ressize = requiredsize;
2621 }
2622 /* check if there is anything unencodable in the replacement
2623 and copy it to the output */
2624 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2625 c = *uni2;
2626 if (c >= limit) {
2627 raise_encode_exception(&exc, encoding, startp, size,
2628 unicodepos, unicodepos+1, reason);
2629 Py_DECREF(repunicode);
2630 goto onError;
2631 }
2632 *str = (char)c;
2633 }
2634 p = startp + newpos;
2635 Py_DECREF(repunicode);
2636 }
2637 }
2638 }
2639 /* Resize if we allocated to much */
2640 respos = str-PyString_AS_STRING(res);
2641 if (respos<ressize)
2642 /* If this falls res will be NULL */
2643 _PyString_Resize(&res, respos);
2644 Py_XDECREF(errorHandler);
2645 Py_XDECREF(exc);
2646 return res;
2647
2648 onError:
2649 Py_XDECREF(res);
2650 Py_XDECREF(errorHandler);
2651 Py_XDECREF(exc);
2652 return NULL;
2653}
2654
Guido van Rossumd57fd912000-03-10 22:53:23 +00002655PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2656 int size,
2657 const char *errors)
2658{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002659 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660}
2661
2662PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2663{
2664 if (!PyUnicode_Check(unicode)) {
2665 PyErr_BadArgument();
2666 return NULL;
2667 }
2668 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2669 PyUnicode_GET_SIZE(unicode),
2670 NULL);
2671}
2672
2673/* --- 7-bit ASCII Codec -------------------------------------------------- */
2674
Guido van Rossumd57fd912000-03-10 22:53:23 +00002675PyObject *PyUnicode_DecodeASCII(const char *s,
2676 int size,
2677 const char *errors)
2678{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002679 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680 PyUnicodeObject *v;
2681 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002682 int startinpos;
2683 int endinpos;
2684 int outpos;
2685 const char *e;
2686 PyObject *errorHandler = NULL;
2687 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002688
Guido van Rossumd57fd912000-03-10 22:53:23 +00002689 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002690 if (size == 1 && *(unsigned char*)s < 128) {
2691 Py_UNICODE r = *(unsigned char*)s;
2692 return PyUnicode_FromUnicode(&r, 1);
2693 }
Tim Petersced69f82003-09-16 20:30:58 +00002694
Guido van Rossumd57fd912000-03-10 22:53:23 +00002695 v = _PyUnicode_New(size);
2696 if (v == NULL)
2697 goto onError;
2698 if (size == 0)
2699 return (PyObject *)v;
2700 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002701 e = s + size;
2702 while (s < e) {
2703 register unsigned char c = (unsigned char)*s;
2704 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002705 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002706 ++s;
2707 }
2708 else {
2709 startinpos = s-starts;
2710 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002711 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002712 if (unicode_decode_call_errorhandler(
2713 errors, &errorHandler,
2714 "ascii", "ordinal not in range(128)",
2715 starts, size, &startinpos, &endinpos, &exc, &s,
2716 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002717 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002718 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002719 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002720 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002721 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002722 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002723 Py_XDECREF(errorHandler);
2724 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002725 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002726
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727 onError:
2728 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002729 Py_XDECREF(errorHandler);
2730 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731 return NULL;
2732}
2733
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2735 int size,
2736 const char *errors)
2737{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739}
2740
2741PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2742{
2743 if (!PyUnicode_Check(unicode)) {
2744 PyErr_BadArgument();
2745 return NULL;
2746 }
2747 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2748 PyUnicode_GET_SIZE(unicode),
2749 NULL);
2750}
2751
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002752#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002753
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002754/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002755
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002756PyObject *PyUnicode_DecodeMBCS(const char *s,
2757 int size,
2758 const char *errors)
2759{
2760 PyUnicodeObject *v;
2761 Py_UNICODE *p;
2762
2763 /* First get the size of the result */
2764 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002765 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002766 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2767
2768 v = _PyUnicode_New(usize);
2769 if (v == NULL)
2770 return NULL;
2771 if (usize == 0)
2772 return (PyObject *)v;
2773 p = PyUnicode_AS_UNICODE(v);
2774 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2775 Py_DECREF(v);
2776 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2777 }
2778
2779 return (PyObject *)v;
2780}
2781
2782PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2783 int size,
2784 const char *errors)
2785{
2786 PyObject *repr;
2787 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002788 DWORD mbcssize;
2789
2790 /* If there are no characters, bail now! */
2791 if (size==0)
2792 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002793
2794 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002795 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002796 if (mbcssize==0)
2797 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2798
2799 repr = PyString_FromStringAndSize(NULL, mbcssize);
2800 if (repr == NULL)
2801 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002802 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002803 return repr;
2804
2805 /* Do the conversion */
2806 s = PyString_AS_STRING(repr);
2807 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2808 Py_DECREF(repr);
2809 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2810 }
2811 return repr;
2812}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002813
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002814PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2815{
2816 if (!PyUnicode_Check(unicode)) {
2817 PyErr_BadArgument();
2818 return NULL;
2819 }
2820 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2821 PyUnicode_GET_SIZE(unicode),
2822 NULL);
2823}
2824
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002825#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002826
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827/* --- Character Mapping Codec -------------------------------------------- */
2828
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829PyObject *PyUnicode_DecodeCharmap(const char *s,
2830 int size,
2831 PyObject *mapping,
2832 const char *errors)
2833{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002834 const char *starts = s;
2835 int startinpos;
2836 int endinpos;
2837 int outpos;
2838 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839 PyUnicodeObject *v;
2840 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002841 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002842 PyObject *errorHandler = NULL;
2843 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002844 Py_UNICODE *mapstring = NULL;
2845 int maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00002846
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 /* Default to Latin-1 */
2848 if (mapping == NULL)
2849 return PyUnicode_DecodeLatin1(s, size, errors);
2850
2851 v = _PyUnicode_New(size);
2852 if (v == NULL)
2853 goto onError;
2854 if (size == 0)
2855 return (PyObject *)v;
2856 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002857 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002858 if (PyUnicode_CheckExact(mapping)) {
2859 mapstring = PyUnicode_AS_UNICODE(mapping);
2860 maplen = PyUnicode_GET_SIZE(mapping);
2861 while (s < e) {
2862 unsigned char ch = *s;
2863 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002864
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002865 if (ch < maplen)
2866 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002868 if (x == 0xfffe) {
2869 /* undefined mapping */
2870 outpos = p-PyUnicode_AS_UNICODE(v);
2871 startinpos = s-starts;
2872 endinpos = startinpos+1;
2873 if (unicode_decode_call_errorhandler(
2874 errors, &errorHandler,
2875 "charmap", "character maps to <undefined>",
2876 starts, size, &startinpos, &endinpos, &exc, &s,
2877 (PyObject **)&v, &outpos, &p)) {
2878 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002879 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002880 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002881 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002882 *p++ = x;
2883 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002884 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002885 }
2886 else {
2887 while (s < e) {
2888 unsigned char ch = *s;
2889 PyObject *w, *x;
2890
2891 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2892 w = PyInt_FromLong((long)ch);
2893 if (w == NULL)
2894 goto onError;
2895 x = PyObject_GetItem(mapping, w);
2896 Py_DECREF(w);
2897 if (x == NULL) {
2898 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2899 /* No mapping found means: mapping is undefined. */
2900 PyErr_Clear();
2901 x = Py_None;
2902 Py_INCREF(x);
2903 } else
2904 goto onError;
2905 }
2906
2907 /* Apply mapping */
2908 if (PyInt_Check(x)) {
2909 long value = PyInt_AS_LONG(x);
2910 if (value < 0 || value > 65535) {
2911 PyErr_SetString(PyExc_TypeError,
2912 "character mapping must be in range(65536)");
2913 Py_DECREF(x);
2914 goto onError;
2915 }
2916 *p++ = (Py_UNICODE)value;
2917 }
2918 else if (x == Py_None) {
2919 /* undefined mapping */
2920 outpos = p-PyUnicode_AS_UNICODE(v);
2921 startinpos = s-starts;
2922 endinpos = startinpos+1;
2923 if (unicode_decode_call_errorhandler(
2924 errors, &errorHandler,
2925 "charmap", "character maps to <undefined>",
2926 starts, size, &startinpos, &endinpos, &exc, &s,
2927 (PyObject **)&v, &outpos, &p)) {
2928 Py_DECREF(x);
2929 goto onError;
2930 }
2931 continue;
2932 }
2933 else if (PyUnicode_Check(x)) {
2934 int targetsize = PyUnicode_GET_SIZE(x);
2935
2936 if (targetsize == 1)
2937 /* 1-1 mapping */
2938 *p++ = *PyUnicode_AS_UNICODE(x);
2939
2940 else if (targetsize > 1) {
2941 /* 1-n mapping */
2942 if (targetsize > extrachars) {
2943 /* resize first */
2944 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2945 int needed = (targetsize - extrachars) + \
2946 (targetsize << 2);
2947 extrachars += needed;
2948 if (_PyUnicode_Resize(&v,
2949 PyUnicode_GET_SIZE(v) + needed) < 0) {
2950 Py_DECREF(x);
2951 goto onError;
2952 }
2953 p = PyUnicode_AS_UNICODE(v) + oldpos;
2954 }
2955 Py_UNICODE_COPY(p,
2956 PyUnicode_AS_UNICODE(x),
2957 targetsize);
2958 p += targetsize;
2959 extrachars -= targetsize;
2960 }
2961 /* 1-0 mapping: skip the character */
2962 }
2963 else {
2964 /* wrong return value */
2965 PyErr_SetString(PyExc_TypeError,
2966 "character mapping must return integer, None or unicode");
2967 Py_DECREF(x);
2968 goto onError;
2969 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002970 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002971 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002973 }
2974 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002975 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002976 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002977 Py_XDECREF(errorHandler);
2978 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002980
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002982 Py_XDECREF(errorHandler);
2983 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984 Py_XDECREF(v);
2985 return NULL;
2986}
2987
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002988/* Lookup the character ch in the mapping. If the character
2989 can't be found, Py_None is returned (or NULL, if another
2990 error occured). */
2991static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002992{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002993 PyObject *w = PyInt_FromLong((long)c);
2994 PyObject *x;
2995
2996 if (w == NULL)
2997 return NULL;
2998 x = PyObject_GetItem(mapping, w);
2999 Py_DECREF(w);
3000 if (x == NULL) {
3001 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3002 /* No mapping found means: mapping is undefined. */
3003 PyErr_Clear();
3004 x = Py_None;
3005 Py_INCREF(x);
3006 return x;
3007 } else
3008 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003009 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003010 else if (x == Py_None)
3011 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003012 else if (PyInt_Check(x)) {
3013 long value = PyInt_AS_LONG(x);
3014 if (value < 0 || value > 255) {
3015 PyErr_SetString(PyExc_TypeError,
3016 "character mapping must be in range(256)");
3017 Py_DECREF(x);
3018 return NULL;
3019 }
3020 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003022 else if (PyString_Check(x))
3023 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003025 /* wrong return value */
3026 PyErr_SetString(PyExc_TypeError,
3027 "character mapping must return integer, None or str");
3028 Py_DECREF(x);
3029 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 }
3031}
3032
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003033/* lookup the character, put the result in the output string and adjust
3034 various state variables. Reallocate the output string if not enough
3035 space is available. Return a new reference to the object that
3036 was put in the output buffer, or Py_None, if the mapping was undefined
3037 (in which case no character was written) or NULL, if a
3038 reallocation error ocurred. The called must decref the result */
3039static
3040PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
3041 PyObject **outobj, int *outpos)
3042{
3043 PyObject *rep = charmapencode_lookup(c, mapping);
3044
3045 if (rep==NULL)
3046 return NULL;
3047 else if (rep==Py_None)
3048 return rep;
3049 else {
3050 char *outstart = PyString_AS_STRING(*outobj);
3051 int outsize = PyString_GET_SIZE(*outobj);
3052 if (PyInt_Check(rep)) {
3053 int requiredsize = *outpos+1;
3054 if (outsize<requiredsize) {
3055 /* exponentially overallocate to minimize reallocations */
3056 if (requiredsize < 2*outsize)
3057 requiredsize = 2*outsize;
3058 if (_PyString_Resize(outobj, requiredsize)) {
3059 Py_DECREF(rep);
3060 return NULL;
3061 }
3062 outstart = PyString_AS_STRING(*outobj);
3063 }
3064 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3065 }
3066 else {
3067 const char *repchars = PyString_AS_STRING(rep);
3068 int repsize = PyString_GET_SIZE(rep);
3069 int requiredsize = *outpos+repsize;
3070 if (outsize<requiredsize) {
3071 /* exponentially overallocate to minimize reallocations */
3072 if (requiredsize < 2*outsize)
3073 requiredsize = 2*outsize;
3074 if (_PyString_Resize(outobj, requiredsize)) {
3075 Py_DECREF(rep);
3076 return NULL;
3077 }
3078 outstart = PyString_AS_STRING(*outobj);
3079 }
3080 memcpy(outstart + *outpos, repchars, repsize);
3081 *outpos += repsize;
3082 }
3083 }
3084 return rep;
3085}
3086
3087/* handle an error in PyUnicode_EncodeCharmap
3088 Return 0 on success, -1 on error */
3089static
3090int charmap_encoding_error(
3091 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
3092 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003093 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003094 PyObject **res, int *respos)
3095{
3096 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3097 int repsize;
3098 int newpos;
3099 Py_UNICODE *uni2;
3100 /* startpos for collecting unencodable chars */
3101 int collstartpos = *inpos;
3102 int collendpos = *inpos+1;
3103 int collpos;
3104 char *encoding = "charmap";
3105 char *reason = "character maps to <undefined>";
3106
3107 PyObject *x;
3108 /* find all unencodable characters */
3109 while (collendpos < size) {
3110 x = charmapencode_lookup(p[collendpos], mapping);
3111 if (x==NULL)
3112 return -1;
3113 else if (x!=Py_None) {
3114 Py_DECREF(x);
3115 break;
3116 }
3117 Py_DECREF(x);
3118 ++collendpos;
3119 }
3120 /* cache callback name lookup
3121 * (if not done yet, i.e. it's the first error) */
3122 if (*known_errorHandler==-1) {
3123 if ((errors==NULL) || (!strcmp(errors, "strict")))
3124 *known_errorHandler = 1;
3125 else if (!strcmp(errors, "replace"))
3126 *known_errorHandler = 2;
3127 else if (!strcmp(errors, "ignore"))
3128 *known_errorHandler = 3;
3129 else if (!strcmp(errors, "xmlcharrefreplace"))
3130 *known_errorHandler = 4;
3131 else
3132 *known_errorHandler = 0;
3133 }
3134 switch (*known_errorHandler) {
3135 case 1: /* strict */
3136 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3137 return -1;
3138 case 2: /* replace */
3139 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3140 x = charmapencode_output('?', mapping, res, respos);
3141 if (x==NULL) {
3142 return -1;
3143 }
3144 else if (x==Py_None) {
3145 Py_DECREF(x);
3146 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3147 return -1;
3148 }
3149 Py_DECREF(x);
3150 }
3151 /* fall through */
3152 case 3: /* ignore */
3153 *inpos = collendpos;
3154 break;
3155 case 4: /* xmlcharrefreplace */
3156 /* generate replacement (temporarily (mis)uses p) */
3157 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3158 char buffer[2+29+1+1];
3159 char *cp;
3160 sprintf(buffer, "&#%d;", (int)p[collpos]);
3161 for (cp = buffer; *cp; ++cp) {
3162 x = charmapencode_output(*cp, mapping, res, respos);
3163 if (x==NULL)
3164 return -1;
3165 else if (x==Py_None) {
3166 Py_DECREF(x);
3167 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3168 return -1;
3169 }
3170 Py_DECREF(x);
3171 }
3172 }
3173 *inpos = collendpos;
3174 break;
3175 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003176 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003177 encoding, reason, p, size, exceptionObject,
3178 collstartpos, collendpos, &newpos);
3179 if (repunicode == NULL)
3180 return -1;
3181 /* generate replacement */
3182 repsize = PyUnicode_GET_SIZE(repunicode);
3183 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3184 x = charmapencode_output(*uni2, mapping, res, respos);
3185 if (x==NULL) {
3186 Py_DECREF(repunicode);
3187 return -1;
3188 }
3189 else if (x==Py_None) {
3190 Py_DECREF(repunicode);
3191 Py_DECREF(x);
3192 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3193 return -1;
3194 }
3195 Py_DECREF(x);
3196 }
3197 *inpos = newpos;
3198 Py_DECREF(repunicode);
3199 }
3200 return 0;
3201}
3202
Guido van Rossumd57fd912000-03-10 22:53:23 +00003203PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3204 int size,
3205 PyObject *mapping,
3206 const char *errors)
3207{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003208 /* output object */
3209 PyObject *res = NULL;
3210 /* current input position */
3211 int inpos = 0;
3212 /* current output position */
3213 int respos = 0;
3214 PyObject *errorHandler = NULL;
3215 PyObject *exc = NULL;
3216 /* the following variable is used for caching string comparisons
3217 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3218 * 3=ignore, 4=xmlcharrefreplace */
3219 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220
3221 /* Default to Latin-1 */
3222 if (mapping == NULL)
3223 return PyUnicode_EncodeLatin1(p, size, errors);
3224
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003225 /* allocate enough for a simple encoding without
3226 replacements, if we need more, we'll resize */
3227 res = PyString_FromStringAndSize(NULL, size);
3228 if (res == NULL)
3229 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003230 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003231 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003232
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003233 while (inpos<size) {
3234 /* try to encode it */
3235 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3236 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003238 if (x==Py_None) { /* unencodable character */
3239 if (charmap_encoding_error(p, size, &inpos, mapping,
3240 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003241 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003242 &res, &respos)) {
3243 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003244 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003245 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003247 else
3248 /* done with this character => adjust input position */
3249 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003250 Py_DECREF(x);
3251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003252
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003253 /* Resize if we allocated to much */
3254 if (respos<PyString_GET_SIZE(res)) {
3255 if (_PyString_Resize(&res, respos))
3256 goto onError;
3257 }
3258 Py_XDECREF(exc);
3259 Py_XDECREF(errorHandler);
3260 return res;
3261
3262 onError:
3263 Py_XDECREF(res);
3264 Py_XDECREF(exc);
3265 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266 return NULL;
3267}
3268
3269PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3270 PyObject *mapping)
3271{
3272 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3273 PyErr_BadArgument();
3274 return NULL;
3275 }
3276 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3277 PyUnicode_GET_SIZE(unicode),
3278 mapping,
3279 NULL);
3280}
3281
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003282/* create or adjust a UnicodeTranslateError */
3283static void make_translate_exception(PyObject **exceptionObject,
3284 const Py_UNICODE *unicode, int size,
3285 int startpos, int endpos,
3286 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003288 if (*exceptionObject == NULL) {
3289 *exceptionObject = PyUnicodeTranslateError_Create(
3290 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003291 }
3292 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003293 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3294 goto onError;
3295 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3296 goto onError;
3297 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3298 goto onError;
3299 return;
3300 onError:
3301 Py_DECREF(*exceptionObject);
3302 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 }
3304}
3305
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003306/* raises a UnicodeTranslateError */
3307static void raise_translate_exception(PyObject **exceptionObject,
3308 const Py_UNICODE *unicode, int size,
3309 int startpos, int endpos,
3310 const char *reason)
3311{
3312 make_translate_exception(exceptionObject,
3313 unicode, size, startpos, endpos, reason);
3314 if (*exceptionObject != NULL)
3315 PyCodec_StrictErrors(*exceptionObject);
3316}
3317
3318/* error handling callback helper:
3319 build arguments, call the callback and check the arguments,
3320 put the result into newpos and return the replacement string, which
3321 has to be freed by the caller */
3322static PyObject *unicode_translate_call_errorhandler(const char *errors,
3323 PyObject **errorHandler,
3324 const char *reason,
3325 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3326 int startpos, int endpos,
3327 int *newpos)
3328{
3329 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3330
3331 PyObject *restuple;
3332 PyObject *resunicode;
3333
3334 if (*errorHandler == NULL) {
3335 *errorHandler = PyCodec_LookupError(errors);
3336 if (*errorHandler == NULL)
3337 return NULL;
3338 }
3339
3340 make_translate_exception(exceptionObject,
3341 unicode, size, startpos, endpos, reason);
3342 if (*exceptionObject == NULL)
3343 return NULL;
3344
3345 restuple = PyObject_CallFunctionObjArgs(
3346 *errorHandler, *exceptionObject, NULL);
3347 if (restuple == NULL)
3348 return NULL;
3349 if (!PyTuple_Check(restuple)) {
3350 PyErr_Format(PyExc_TypeError, &argparse[4]);
3351 Py_DECREF(restuple);
3352 return NULL;
3353 }
3354 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3355 &resunicode, newpos)) {
3356 Py_DECREF(restuple);
3357 return NULL;
3358 }
3359 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003360 *newpos = size+*newpos;
3361 if (*newpos<0 || *newpos>size) {
3362 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3363 Py_DECREF(restuple);
3364 return NULL;
3365 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003366 Py_INCREF(resunicode);
3367 Py_DECREF(restuple);
3368 return resunicode;
3369}
3370
3371/* Lookup the character ch in the mapping and put the result in result,
3372 which must be decrefed by the caller.
3373 Return 0 on success, -1 on error */
3374static
3375int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3376{
3377 PyObject *w = PyInt_FromLong((long)c);
3378 PyObject *x;
3379
3380 if (w == NULL)
3381 return -1;
3382 x = PyObject_GetItem(mapping, w);
3383 Py_DECREF(w);
3384 if (x == NULL) {
3385 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3386 /* No mapping found means: use 1:1 mapping. */
3387 PyErr_Clear();
3388 *result = NULL;
3389 return 0;
3390 } else
3391 return -1;
3392 }
3393 else if (x == Py_None) {
3394 *result = x;
3395 return 0;
3396 }
3397 else if (PyInt_Check(x)) {
3398 long value = PyInt_AS_LONG(x);
3399 long max = PyUnicode_GetMax();
3400 if (value < 0 || value > max) {
3401 PyErr_Format(PyExc_TypeError,
3402 "character mapping must be in range(0x%lx)", max+1);
3403 Py_DECREF(x);
3404 return -1;
3405 }
3406 *result = x;
3407 return 0;
3408 }
3409 else if (PyUnicode_Check(x)) {
3410 *result = x;
3411 return 0;
3412 }
3413 else {
3414 /* wrong return value */
3415 PyErr_SetString(PyExc_TypeError,
3416 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003417 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003418 return -1;
3419 }
3420}
3421/* ensure that *outobj is at least requiredsize characters long,
3422if not reallocate and adjust various state variables.
3423Return 0 on success, -1 on error */
3424static
Walter Dörwald4894c302003-10-24 14:25:28 +00003425int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003426 int requiredsize)
3427{
Walter Dörwald4894c302003-10-24 14:25:28 +00003428 int oldsize = PyUnicode_GET_SIZE(*outobj);
3429 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430 /* remember old output position */
3431 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3432 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003433 if (requiredsize < 2 * oldsize)
3434 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003435 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003436 return -1;
3437 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003438 }
3439 return 0;
3440}
3441/* lookup the character, put the result in the output string and adjust
3442 various state variables. Return a new reference to the object that
3443 was put in the output buffer in *result, or Py_None, if the mapping was
3444 undefined (in which case no character was written).
3445 The called must decref result.
3446 Return 0 on success, -1 on error. */
3447static
Walter Dörwald4894c302003-10-24 14:25:28 +00003448int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3449 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3450 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003451{
Walter Dörwald4894c302003-10-24 14:25:28 +00003452 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003453 return -1;
3454 if (*res==NULL) {
3455 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003456 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003457 }
3458 else if (*res==Py_None)
3459 ;
3460 else if (PyInt_Check(*res)) {
3461 /* no overflow check, because we know that the space is enough */
3462 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3463 }
3464 else if (PyUnicode_Check(*res)) {
3465 int repsize = PyUnicode_GET_SIZE(*res);
3466 if (repsize==1) {
3467 /* no overflow check, because we know that the space is enough */
3468 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3469 }
3470 else if (repsize!=0) {
3471 /* more than one character */
Walter Dörwald4894c302003-10-24 14:25:28 +00003472 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003473 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003474 repsize - 1;
3475 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003476 return -1;
3477 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3478 *outp += repsize;
3479 }
3480 }
3481 else
3482 return -1;
3483 return 0;
3484}
3485
3486PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003487 int size,
3488 PyObject *mapping,
3489 const char *errors)
3490{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003491 /* output object */
3492 PyObject *res = NULL;
3493 /* pointers to the beginning and end+1 of input */
3494 const Py_UNICODE *startp = p;
3495 const Py_UNICODE *endp = p + size;
3496 /* pointer into the output */
3497 Py_UNICODE *str;
3498 /* current output position */
3499 int respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003500 char *reason = "character maps to <undefined>";
3501 PyObject *errorHandler = NULL;
3502 PyObject *exc = NULL;
3503 /* the following variable is used for caching string comparisons
3504 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3505 * 3=ignore, 4=xmlcharrefreplace */
3506 int known_errorHandler = -1;
3507
Guido van Rossumd57fd912000-03-10 22:53:23 +00003508 if (mapping == NULL) {
3509 PyErr_BadArgument();
3510 return NULL;
3511 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003512
3513 /* allocate enough for a simple 1:1 translation without
3514 replacements, if we need more, we'll resize */
3515 res = PyUnicode_FromUnicode(NULL, size);
3516 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003517 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003518 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003519 return res;
3520 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003521
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522 while (p<endp) {
3523 /* try to encode it */
3524 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003525 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527 goto onError;
3528 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003529 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 if (x!=Py_None) /* it worked => adjust input pointer */
3531 ++p;
3532 else { /* untranslatable character */
3533 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3534 int repsize;
3535 int newpos;
3536 Py_UNICODE *uni2;
3537 /* startpos for collecting untranslatable chars */
3538 const Py_UNICODE *collstart = p;
3539 const Py_UNICODE *collend = p+1;
3540 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003541
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003542 /* find all untranslatable characters */
3543 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003544 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 goto onError;
3546 Py_XDECREF(x);
3547 if (x!=Py_None)
3548 break;
3549 ++collend;
3550 }
3551 /* cache callback name lookup
3552 * (if not done yet, i.e. it's the first error) */
3553 if (known_errorHandler==-1) {
3554 if ((errors==NULL) || (!strcmp(errors, "strict")))
3555 known_errorHandler = 1;
3556 else if (!strcmp(errors, "replace"))
3557 known_errorHandler = 2;
3558 else if (!strcmp(errors, "ignore"))
3559 known_errorHandler = 3;
3560 else if (!strcmp(errors, "xmlcharrefreplace"))
3561 known_errorHandler = 4;
3562 else
3563 known_errorHandler = 0;
3564 }
3565 switch (known_errorHandler) {
3566 case 1: /* strict */
3567 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3568 goto onError;
3569 case 2: /* replace */
3570 /* No need to check for space, this is a 1:1 replacement */
3571 for (coll = collstart; coll<collend; ++coll)
3572 *str++ = '?';
3573 /* fall through */
3574 case 3: /* ignore */
3575 p = collend;
3576 break;
3577 case 4: /* xmlcharrefreplace */
3578 /* generate replacement (temporarily (mis)uses p) */
3579 for (p = collstart; p < collend; ++p) {
3580 char buffer[2+29+1+1];
3581 char *cp;
3582 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003583 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3585 goto onError;
3586 for (cp = buffer; *cp; ++cp)
3587 *str++ = *cp;
3588 }
3589 p = collend;
3590 break;
3591 default:
3592 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3593 reason, startp, size, &exc,
3594 collstart-startp, collend-startp, &newpos);
3595 if (repunicode == NULL)
3596 goto onError;
3597 /* generate replacement */
3598 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003599 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3601 Py_DECREF(repunicode);
3602 goto onError;
3603 }
3604 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3605 *str++ = *uni2;
3606 p = startp + newpos;
3607 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608 }
3609 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 /* Resize if we allocated to much */
3612 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003613 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003614 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003615 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003616 }
3617 Py_XDECREF(exc);
3618 Py_XDECREF(errorHandler);
3619 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003621 onError:
3622 Py_XDECREF(res);
3623 Py_XDECREF(exc);
3624 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003625 return NULL;
3626}
3627
3628PyObject *PyUnicode_Translate(PyObject *str,
3629 PyObject *mapping,
3630 const char *errors)
3631{
3632 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003633
Guido van Rossumd57fd912000-03-10 22:53:23 +00003634 str = PyUnicode_FromObject(str);
3635 if (str == NULL)
3636 goto onError;
3637 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3638 PyUnicode_GET_SIZE(str),
3639 mapping,
3640 errors);
3641 Py_DECREF(str);
3642 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003643
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644 onError:
3645 Py_XDECREF(str);
3646 return NULL;
3647}
Tim Petersced69f82003-09-16 20:30:58 +00003648
Guido van Rossum9e896b32000-04-05 20:11:21 +00003649/* --- Decimal Encoder ---------------------------------------------------- */
3650
3651int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3652 int length,
3653 char *output,
3654 const char *errors)
3655{
3656 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003657 PyObject *errorHandler = NULL;
3658 PyObject *exc = NULL;
3659 const char *encoding = "decimal";
3660 const char *reason = "invalid decimal Unicode string";
3661 /* the following variable is used for caching string comparisons
3662 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3663 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003664
3665 if (output == NULL) {
3666 PyErr_BadArgument();
3667 return -1;
3668 }
3669
3670 p = s;
3671 end = s + length;
3672 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003673 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003674 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 PyObject *repunicode;
3676 int repsize;
3677 int newpos;
3678 Py_UNICODE *uni2;
3679 Py_UNICODE *collstart;
3680 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003681
Guido van Rossum9e896b32000-04-05 20:11:21 +00003682 if (Py_UNICODE_ISSPACE(ch)) {
3683 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003684 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003685 continue;
3686 }
3687 decimal = Py_UNICODE_TODECIMAL(ch);
3688 if (decimal >= 0) {
3689 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003690 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003691 continue;
3692 }
Guido van Rossumba477042000-04-06 18:18:10 +00003693 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003694 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003695 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003696 continue;
3697 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003698 /* All other characters are considered unencodable */
3699 collstart = p;
3700 collend = p+1;
3701 while (collend < end) {
3702 if ((0 < *collend && *collend < 256) ||
3703 !Py_UNICODE_ISSPACE(*collend) ||
3704 Py_UNICODE_TODECIMAL(*collend))
3705 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003706 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003707 /* cache callback name lookup
3708 * (if not done yet, i.e. it's the first error) */
3709 if (known_errorHandler==-1) {
3710 if ((errors==NULL) || (!strcmp(errors, "strict")))
3711 known_errorHandler = 1;
3712 else if (!strcmp(errors, "replace"))
3713 known_errorHandler = 2;
3714 else if (!strcmp(errors, "ignore"))
3715 known_errorHandler = 3;
3716 else if (!strcmp(errors, "xmlcharrefreplace"))
3717 known_errorHandler = 4;
3718 else
3719 known_errorHandler = 0;
3720 }
3721 switch (known_errorHandler) {
3722 case 1: /* strict */
3723 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3724 goto onError;
3725 case 2: /* replace */
3726 for (p = collstart; p < collend; ++p)
3727 *output++ = '?';
3728 /* fall through */
3729 case 3: /* ignore */
3730 p = collend;
3731 break;
3732 case 4: /* xmlcharrefreplace */
3733 /* generate replacement (temporarily (mis)uses p) */
3734 for (p = collstart; p < collend; ++p)
3735 output += sprintf(output, "&#%d;", (int)*p);
3736 p = collend;
3737 break;
3738 default:
3739 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3740 encoding, reason, s, length, &exc,
3741 collstart-s, collend-s, &newpos);
3742 if (repunicode == NULL)
3743 goto onError;
3744 /* generate replacement */
3745 repsize = PyUnicode_GET_SIZE(repunicode);
3746 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3747 Py_UNICODE ch = *uni2;
3748 if (Py_UNICODE_ISSPACE(ch))
3749 *output++ = ' ';
3750 else {
3751 decimal = Py_UNICODE_TODECIMAL(ch);
3752 if (decimal >= 0)
3753 *output++ = '0' + decimal;
3754 else if (0 < ch && ch < 256)
3755 *output++ = (char)ch;
3756 else {
3757 Py_DECREF(repunicode);
3758 raise_encode_exception(&exc, encoding,
3759 s, length, collstart-s, collend-s, reason);
3760 goto onError;
3761 }
3762 }
3763 }
3764 p = s + newpos;
3765 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003766 }
3767 }
3768 /* 0-terminate the output string */
3769 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003770 Py_XDECREF(exc);
3771 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003772 return 0;
3773
3774 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003775 Py_XDECREF(exc);
3776 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003777 return -1;
3778}
3779
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780/* --- Helpers ------------------------------------------------------------ */
3781
Tim Petersced69f82003-09-16 20:30:58 +00003782static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783int count(PyUnicodeObject *self,
3784 int start,
3785 int end,
3786 PyUnicodeObject *substring)
3787{
3788 int count = 0;
3789
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003790 if (start < 0)
3791 start += self->length;
3792 if (start < 0)
3793 start = 0;
3794 if (end > self->length)
3795 end = self->length;
3796 if (end < 0)
3797 end += self->length;
3798 if (end < 0)
3799 end = 0;
3800
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003801 if (substring->length == 0)
3802 return (end - start + 1);
3803
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804 end -= substring->length;
3805
3806 while (start <= end)
3807 if (Py_UNICODE_MATCH(self, start, substring)) {
3808 count++;
3809 start += substring->length;
3810 } else
3811 start++;
3812
3813 return count;
3814}
3815
3816int PyUnicode_Count(PyObject *str,
3817 PyObject *substr,
3818 int start,
3819 int end)
3820{
3821 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003822
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823 str = PyUnicode_FromObject(str);
3824 if (str == NULL)
3825 return -1;
3826 substr = PyUnicode_FromObject(substr);
3827 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003828 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829 return -1;
3830 }
Tim Petersced69f82003-09-16 20:30:58 +00003831
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832 result = count((PyUnicodeObject *)str,
3833 start, end,
3834 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003835
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836 Py_DECREF(str);
3837 Py_DECREF(substr);
3838 return result;
3839}
3840
Tim Petersced69f82003-09-16 20:30:58 +00003841static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003842int findstring(PyUnicodeObject *self,
3843 PyUnicodeObject *substring,
3844 int start,
3845 int end,
3846 int direction)
3847{
3848 if (start < 0)
3849 start += self->length;
3850 if (start < 0)
3851 start = 0;
3852
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853 if (end > self->length)
3854 end = self->length;
3855 if (end < 0)
3856 end += self->length;
3857 if (end < 0)
3858 end = 0;
3859
Guido van Rossum76afbd92002-08-20 17:29:29 +00003860 if (substring->length == 0)
3861 return (direction > 0) ? start : end;
3862
Guido van Rossumd57fd912000-03-10 22:53:23 +00003863 end -= substring->length;
3864
3865 if (direction < 0) {
3866 for (; end >= start; end--)
3867 if (Py_UNICODE_MATCH(self, end, substring))
3868 return end;
3869 } else {
3870 for (; start <= end; start++)
3871 if (Py_UNICODE_MATCH(self, start, substring))
3872 return start;
3873 }
3874
3875 return -1;
3876}
3877
3878int PyUnicode_Find(PyObject *str,
3879 PyObject *substr,
3880 int start,
3881 int end,
3882 int direction)
3883{
3884 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003885
Guido van Rossumd57fd912000-03-10 22:53:23 +00003886 str = PyUnicode_FromObject(str);
3887 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003888 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003889 substr = PyUnicode_FromObject(substr);
3890 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003891 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003892 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003893 }
Tim Petersced69f82003-09-16 20:30:58 +00003894
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895 result = findstring((PyUnicodeObject *)str,
3896 (PyUnicodeObject *)substr,
3897 start, end, direction);
3898 Py_DECREF(str);
3899 Py_DECREF(substr);
3900 return result;
3901}
3902
Tim Petersced69f82003-09-16 20:30:58 +00003903static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003904int tailmatch(PyUnicodeObject *self,
3905 PyUnicodeObject *substring,
3906 int start,
3907 int end,
3908 int direction)
3909{
3910 if (start < 0)
3911 start += self->length;
3912 if (start < 0)
3913 start = 0;
3914
3915 if (substring->length == 0)
3916 return 1;
3917
3918 if (end > self->length)
3919 end = self->length;
3920 if (end < 0)
3921 end += self->length;
3922 if (end < 0)
3923 end = 0;
3924
3925 end -= substring->length;
3926 if (end < start)
3927 return 0;
3928
3929 if (direction > 0) {
3930 if (Py_UNICODE_MATCH(self, end, substring))
3931 return 1;
3932 } else {
3933 if (Py_UNICODE_MATCH(self, start, substring))
3934 return 1;
3935 }
3936
3937 return 0;
3938}
3939
3940int PyUnicode_Tailmatch(PyObject *str,
3941 PyObject *substr,
3942 int start,
3943 int end,
3944 int direction)
3945{
3946 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003947
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948 str = PyUnicode_FromObject(str);
3949 if (str == NULL)
3950 return -1;
3951 substr = PyUnicode_FromObject(substr);
3952 if (substr == NULL) {
3953 Py_DECREF(substr);
3954 return -1;
3955 }
Tim Petersced69f82003-09-16 20:30:58 +00003956
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957 result = tailmatch((PyUnicodeObject *)str,
3958 (PyUnicodeObject *)substr,
3959 start, end, direction);
3960 Py_DECREF(str);
3961 Py_DECREF(substr);
3962 return result;
3963}
3964
Tim Petersced69f82003-09-16 20:30:58 +00003965static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966const Py_UNICODE *findchar(const Py_UNICODE *s,
3967 int size,
3968 Py_UNICODE ch)
3969{
3970 /* like wcschr, but doesn't stop at NULL characters */
3971
3972 while (size-- > 0) {
3973 if (*s == ch)
3974 return s;
3975 s++;
3976 }
3977
3978 return NULL;
3979}
3980
3981/* Apply fixfct filter to the Unicode object self and return a
3982 reference to the modified object */
3983
Tim Petersced69f82003-09-16 20:30:58 +00003984static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985PyObject *fixup(PyUnicodeObject *self,
3986 int (*fixfct)(PyUnicodeObject *s))
3987{
3988
3989 PyUnicodeObject *u;
3990
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003991 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992 if (u == NULL)
3993 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003994
3995 Py_UNICODE_COPY(u->str, self->str, self->length);
3996
Tim Peters7a29bd52001-09-12 03:03:31 +00003997 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998 /* fixfct should return TRUE if it modified the buffer. If
3999 FALSE, return a reference to the original buffer instead
4000 (to save space, not time) */
4001 Py_INCREF(self);
4002 Py_DECREF(u);
4003 return (PyObject*) self;
4004 }
4005 return (PyObject*) u;
4006}
4007
Tim Petersced69f82003-09-16 20:30:58 +00004008static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009int fixupper(PyUnicodeObject *self)
4010{
4011 int len = self->length;
4012 Py_UNICODE *s = self->str;
4013 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004014
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015 while (len-- > 0) {
4016 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004017
Guido van Rossumd57fd912000-03-10 22:53:23 +00004018 ch = Py_UNICODE_TOUPPER(*s);
4019 if (ch != *s) {
4020 status = 1;
4021 *s = ch;
4022 }
4023 s++;
4024 }
4025
4026 return status;
4027}
4028
Tim Petersced69f82003-09-16 20:30:58 +00004029static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004030int fixlower(PyUnicodeObject *self)
4031{
4032 int len = self->length;
4033 Py_UNICODE *s = self->str;
4034 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004035
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 while (len-- > 0) {
4037 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004038
Guido van Rossumd57fd912000-03-10 22:53:23 +00004039 ch = Py_UNICODE_TOLOWER(*s);
4040 if (ch != *s) {
4041 status = 1;
4042 *s = ch;
4043 }
4044 s++;
4045 }
4046
4047 return status;
4048}
4049
Tim Petersced69f82003-09-16 20:30:58 +00004050static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051int fixswapcase(PyUnicodeObject *self)
4052{
4053 int len = self->length;
4054 Py_UNICODE *s = self->str;
4055 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004056
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057 while (len-- > 0) {
4058 if (Py_UNICODE_ISUPPER(*s)) {
4059 *s = Py_UNICODE_TOLOWER(*s);
4060 status = 1;
4061 } else if (Py_UNICODE_ISLOWER(*s)) {
4062 *s = Py_UNICODE_TOUPPER(*s);
4063 status = 1;
4064 }
4065 s++;
4066 }
4067
4068 return status;
4069}
4070
Tim Petersced69f82003-09-16 20:30:58 +00004071static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072int fixcapitalize(PyUnicodeObject *self)
4073{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004074 int len = self->length;
4075 Py_UNICODE *s = self->str;
4076 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004077
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004078 if (len == 0)
4079 return 0;
4080 if (Py_UNICODE_ISLOWER(*s)) {
4081 *s = Py_UNICODE_TOUPPER(*s);
4082 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004083 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004084 s++;
4085 while (--len > 0) {
4086 if (Py_UNICODE_ISUPPER(*s)) {
4087 *s = Py_UNICODE_TOLOWER(*s);
4088 status = 1;
4089 }
4090 s++;
4091 }
4092 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004093}
4094
4095static
4096int fixtitle(PyUnicodeObject *self)
4097{
4098 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4099 register Py_UNICODE *e;
4100 int previous_is_cased;
4101
4102 /* Shortcut for single character strings */
4103 if (PyUnicode_GET_SIZE(self) == 1) {
4104 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4105 if (*p != ch) {
4106 *p = ch;
4107 return 1;
4108 }
4109 else
4110 return 0;
4111 }
Tim Petersced69f82003-09-16 20:30:58 +00004112
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113 e = p + PyUnicode_GET_SIZE(self);
4114 previous_is_cased = 0;
4115 for (; p < e; p++) {
4116 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004117
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118 if (previous_is_cased)
4119 *p = Py_UNICODE_TOLOWER(ch);
4120 else
4121 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004122
4123 if (Py_UNICODE_ISLOWER(ch) ||
4124 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 Py_UNICODE_ISTITLE(ch))
4126 previous_is_cased = 1;
4127 else
4128 previous_is_cased = 0;
4129 }
4130 return 1;
4131}
4132
Tim Peters8ce9f162004-08-27 01:49:32 +00004133PyObject *
4134PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135{
Tim Peters8ce9f162004-08-27 01:49:32 +00004136 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004137 const Py_UNICODE blank = ' ';
4138 const Py_UNICODE *sep = &blank;
4139 size_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004140 PyUnicodeObject *res = NULL; /* the result */
4141 size_t res_alloc = 100; /* # allocated bytes for string in res */
4142 size_t res_used; /* # used bytes */
4143 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4144 PyObject *fseq; /* PySequence_Fast(seq) */
4145 int seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004146 PyObject *item;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147 int i;
4148
Tim Peters05eba1f2004-08-27 21:32:02 +00004149 fseq = PySequence_Fast(seq, "");
4150 if (fseq == NULL) {
4151 if (PyErr_ExceptionMatches(PyExc_TypeError))
4152 PyErr_Format(PyExc_TypeError,
4153 "sequence expected, %.80s found",
4154 seq->ob_type->tp_name);
4155 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004156 }
4157
Tim Peters91879ab2004-08-27 22:35:44 +00004158 /* Grrrr. A codec may be invoked to convert str objects to
4159 * Unicode, and so it's possible to call back into Python code
4160 * during PyUnicode_FromObject(), and so it's possible for a sick
4161 * codec to change the size of fseq (if seq is a list). Therefore
4162 * we have to keep refetching the size -- can't assume seqlen
4163 * is invariant.
4164 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004165 seqlen = PySequence_Fast_GET_SIZE(fseq);
4166 /* If empty sequence, return u"". */
4167 if (seqlen == 0) {
4168 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4169 goto Done;
4170 }
4171 /* If singleton sequence with an exact Unicode, return that. */
4172 if (seqlen == 1) {
4173 item = PySequence_Fast_GET_ITEM(fseq, 0);
4174 if (PyUnicode_CheckExact(item)) {
4175 Py_INCREF(item);
4176 res = (PyUnicodeObject *)item;
4177 goto Done;
4178 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004179 }
4180
Tim Peters05eba1f2004-08-27 21:32:02 +00004181 /* At least two items to join, or one that isn't exact Unicode. */
4182 if (seqlen > 1) {
4183 /* Set up sep and seplen -- they're needed. */
4184 if (separator == NULL) {
4185 sep = &blank;
4186 seplen = 1;
4187 }
4188 else {
4189 internal_separator = PyUnicode_FromObject(separator);
4190 if (internal_separator == NULL)
4191 goto onError;
4192 sep = PyUnicode_AS_UNICODE(internal_separator);
4193 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004194 /* In case PyUnicode_FromObject() mutated seq. */
4195 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004196 }
4197 }
4198
4199 /* Get space. */
4200 res = _PyUnicode_New((int)res_alloc);
4201 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004202 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004203 res_p = PyUnicode_AS_UNICODE(res);
4204 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004205
Tim Peters05eba1f2004-08-27 21:32:02 +00004206 for (i = 0; i < seqlen; ++i) {
4207 size_t itemlen;
4208 size_t new_res_used;
4209
4210 item = PySequence_Fast_GET_ITEM(fseq, i);
4211 /* Convert item to Unicode. */
4212 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4213 PyErr_Format(PyExc_TypeError,
4214 "sequence item %i: expected string or Unicode,"
4215 " %.80s found",
4216 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004217 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004218 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004219 item = PyUnicode_FromObject(item);
4220 if (item == NULL)
4221 goto onError;
4222 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004223
Tim Peters91879ab2004-08-27 22:35:44 +00004224 /* In case PyUnicode_FromObject() mutated seq. */
4225 seqlen = PySequence_Fast_GET_SIZE(fseq);
4226
Tim Peters8ce9f162004-08-27 01:49:32 +00004227 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004229 new_res_used = res_used + itemlen;
4230 if (new_res_used < res_used || new_res_used > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004231 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004232 if (i < seqlen - 1) {
4233 new_res_used += seplen;
4234 if (new_res_used < res_used || new_res_used > INT_MAX)
4235 goto Overflow;
4236 }
4237 if (new_res_used > res_alloc) {
4238 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004239 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004240 size_t oldsize = res_alloc;
4241 res_alloc += res_alloc;
4242 if (res_alloc < oldsize || res_alloc > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004243 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004244 } while (new_res_used > res_alloc);
4245 if (_PyUnicode_Resize(&res, (int)res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004246 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004247 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004248 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004249 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004250 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004251
4252 /* Copy item, and maybe the separator. */
4253 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), (int)itemlen);
4254 res_p += itemlen;
4255 if (i < seqlen - 1) {
4256 Py_UNICODE_COPY(res_p, sep, (int)seplen);
4257 res_p += seplen;
4258 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004259 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004260 res_used = new_res_used;
4261 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004262
Tim Peters05eba1f2004-08-27 21:32:02 +00004263 /* Shrink res to match the used area; this probably can't fail,
4264 * but it's cheap to check.
4265 */
4266 if (_PyUnicode_Resize(&res, (int)res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004267 goto onError;
4268
4269 Done:
4270 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004271 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004272 return (PyObject *)res;
4273
Tim Peters8ce9f162004-08-27 01:49:32 +00004274 Overflow:
4275 PyErr_SetString(PyExc_OverflowError,
4276 "join() is too long for a Python string");
4277 Py_DECREF(item);
4278 /* fall through */
4279
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004281 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004282 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004283 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284 return NULL;
4285}
4286
Tim Petersced69f82003-09-16 20:30:58 +00004287static
4288PyUnicodeObject *pad(PyUnicodeObject *self,
4289 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004290 int right,
4291 Py_UNICODE fill)
4292{
4293 PyUnicodeObject *u;
4294
4295 if (left < 0)
4296 left = 0;
4297 if (right < 0)
4298 right = 0;
4299
Tim Peters7a29bd52001-09-12 03:03:31 +00004300 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004301 Py_INCREF(self);
4302 return self;
4303 }
4304
4305 u = _PyUnicode_New(left + self->length + right);
4306 if (u) {
4307 if (left)
4308 Py_UNICODE_FILL(u->str, fill, left);
4309 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4310 if (right)
4311 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4312 }
4313
4314 return u;
4315}
4316
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Relies on names from the expanding function: a `PyObject *str`
   scratch variable, the `list` being built and an `onError` label that
   is jumped to when the slice cannot be created or appended.

   Wrapped in do { } while (0) so that the expansion is one statement
   and stays correct under an unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4327
/* Insert the slice data[left:right] at the front of `list` as a new
   Unicode object.

   Relies on names from the expanding function: a `PyObject *str`
   scratch variable, the `list` being built and an `onError` label that
   is jumped to when the slice cannot be created or inserted.

   Wrapped in do { } while (0) so that the expansion is one statement
   and stays correct under an unbraced if/else. */
#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Insert(list, 0, str)) {                              \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4338
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339static
4340PyObject *split_whitespace(PyUnicodeObject *self,
4341 PyObject *list,
4342 int maxcount)
4343{
4344 register int i;
4345 register int j;
4346 int len = self->length;
4347 PyObject *str;
4348
4349 for (i = j = 0; i < len; ) {
4350 /* find a token */
4351 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4352 i++;
4353 j = i;
4354 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4355 i++;
4356 if (j < i) {
4357 if (maxcount-- <= 0)
4358 break;
4359 SPLIT_APPEND(self->str, j, i);
4360 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4361 i++;
4362 j = i;
4363 }
4364 }
4365 if (j < len) {
4366 SPLIT_APPEND(self->str, j, len);
4367 }
4368 return list;
4369
4370 onError:
4371 Py_DECREF(list);
4372 return NULL;
4373}
4374
4375PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004376 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004377{
4378 register int i;
4379 register int j;
4380 int len;
4381 PyObject *list;
4382 PyObject *str;
4383 Py_UNICODE *data;
4384
4385 string = PyUnicode_FromObject(string);
4386 if (string == NULL)
4387 return NULL;
4388 data = PyUnicode_AS_UNICODE(string);
4389 len = PyUnicode_GET_SIZE(string);
4390
Guido van Rossumd57fd912000-03-10 22:53:23 +00004391 list = PyList_New(0);
4392 if (!list)
4393 goto onError;
4394
4395 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004396 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004397
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398 /* Find a line and append it */
4399 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4400 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004401
4402 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004403 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404 if (i < len) {
4405 if (data[i] == '\r' && i + 1 < len &&
4406 data[i+1] == '\n')
4407 i += 2;
4408 else
4409 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004410 if (keepends)
4411 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412 }
Guido van Rossum86662912000-04-11 15:38:46 +00004413 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414 j = i;
4415 }
4416 if (j < len) {
4417 SPLIT_APPEND(data, j, len);
4418 }
4419
4420 Py_DECREF(string);
4421 return list;
4422
4423 onError:
4424 Py_DECREF(list);
4425 Py_DECREF(string);
4426 return NULL;
4427}
4428
Tim Petersced69f82003-09-16 20:30:58 +00004429static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004430PyObject *split_char(PyUnicodeObject *self,
4431 PyObject *list,
4432 Py_UNICODE ch,
4433 int maxcount)
4434{
4435 register int i;
4436 register int j;
4437 int len = self->length;
4438 PyObject *str;
4439
4440 for (i = j = 0; i < len; ) {
4441 if (self->str[i] == ch) {
4442 if (maxcount-- <= 0)
4443 break;
4444 SPLIT_APPEND(self->str, j, i);
4445 i = j = i + 1;
4446 } else
4447 i++;
4448 }
4449 if (j <= len) {
4450 SPLIT_APPEND(self->str, j, len);
4451 }
4452 return list;
4453
4454 onError:
4455 Py_DECREF(list);
4456 return NULL;
4457}
4458
Tim Petersced69f82003-09-16 20:30:58 +00004459static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004460PyObject *split_substring(PyUnicodeObject *self,
4461 PyObject *list,
4462 PyUnicodeObject *substring,
4463 int maxcount)
4464{
4465 register int i;
4466 register int j;
4467 int len = self->length;
4468 int sublen = substring->length;
4469 PyObject *str;
4470
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004471 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004472 if (Py_UNICODE_MATCH(self, i, substring)) {
4473 if (maxcount-- <= 0)
4474 break;
4475 SPLIT_APPEND(self->str, j, i);
4476 i = j = i + sublen;
4477 } else
4478 i++;
4479 }
4480 if (j <= len) {
4481 SPLIT_APPEND(self->str, j, len);
4482 }
4483 return list;
4484
4485 onError:
4486 Py_DECREF(list);
4487 return NULL;
4488}
4489
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004490static
4491PyObject *rsplit_whitespace(PyUnicodeObject *self,
4492 PyObject *list,
4493 int maxcount)
4494{
4495 register int i;
4496 register int j;
4497 int len = self->length;
4498 PyObject *str;
4499
4500 for (i = j = len - 1; i >= 0; ) {
4501 /* find a token */
4502 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4503 i--;
4504 j = i;
4505 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4506 i--;
4507 if (j > i) {
4508 if (maxcount-- <= 0)
4509 break;
4510 SPLIT_INSERT(self->str, i + 1, j + 1);
4511 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4512 i--;
4513 j = i;
4514 }
4515 }
4516 if (j >= 0) {
4517 SPLIT_INSERT(self->str, 0, j + 1);
4518 }
4519 return list;
4520
4521 onError:
4522 Py_DECREF(list);
4523 return NULL;
4524}
4525
4526static
4527PyObject *rsplit_char(PyUnicodeObject *self,
4528 PyObject *list,
4529 Py_UNICODE ch,
4530 int maxcount)
4531{
4532 register int i;
4533 register int j;
4534 int len = self->length;
4535 PyObject *str;
4536
4537 for (i = j = len - 1; i >= 0; ) {
4538 if (self->str[i] == ch) {
4539 if (maxcount-- <= 0)
4540 break;
4541 SPLIT_INSERT(self->str, i + 1, j + 1);
4542 j = i = i - 1;
4543 } else
4544 i--;
4545 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004546 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004547 SPLIT_INSERT(self->str, 0, j + 1);
4548 }
4549 return list;
4550
4551 onError:
4552 Py_DECREF(list);
4553 return NULL;
4554}
4555
4556static
4557PyObject *rsplit_substring(PyUnicodeObject *self,
4558 PyObject *list,
4559 PyUnicodeObject *substring,
4560 int maxcount)
4561{
4562 register int i;
4563 register int j;
4564 int len = self->length;
4565 int sublen = substring->length;
4566 PyObject *str;
4567
4568 for (i = len - sublen, j = len; i >= 0; ) {
4569 if (Py_UNICODE_MATCH(self, i, substring)) {
4570 if (maxcount-- <= 0)
4571 break;
4572 SPLIT_INSERT(self->str, i + sublen, j);
4573 j = i;
4574 i -= sublen;
4575 } else
4576 i--;
4577 }
4578 if (j >= 0) {
4579 SPLIT_INSERT(self->str, 0, j);
4580 }
4581 return list;
4582
4583 onError:
4584 Py_DECREF(list);
4585 return NULL;
4586}
4587
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004589#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004590
4591static
4592PyObject *split(PyUnicodeObject *self,
4593 PyUnicodeObject *substring,
4594 int maxcount)
4595{
4596 PyObject *list;
4597
4598 if (maxcount < 0)
4599 maxcount = INT_MAX;
4600
4601 list = PyList_New(0);
4602 if (!list)
4603 return NULL;
4604
4605 if (substring == NULL)
4606 return split_whitespace(self,list,maxcount);
4607
4608 else if (substring->length == 1)
4609 return split_char(self,list,substring->str[0],maxcount);
4610
4611 else if (substring->length == 0) {
4612 Py_DECREF(list);
4613 PyErr_SetString(PyExc_ValueError, "empty separator");
4614 return NULL;
4615 }
4616 else
4617 return split_substring(self,list,substring,maxcount);
4618}
4619
Tim Petersced69f82003-09-16 20:30:58 +00004620static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004621PyObject *rsplit(PyUnicodeObject *self,
4622 PyUnicodeObject *substring,
4623 int maxcount)
4624{
4625 PyObject *list;
4626
4627 if (maxcount < 0)
4628 maxcount = INT_MAX;
4629
4630 list = PyList_New(0);
4631 if (!list)
4632 return NULL;
4633
4634 if (substring == NULL)
4635 return rsplit_whitespace(self,list,maxcount);
4636
4637 else if (substring->length == 1)
4638 return rsplit_char(self,list,substring->str[0],maxcount);
4639
4640 else if (substring->length == 0) {
4641 Py_DECREF(list);
4642 PyErr_SetString(PyExc_ValueError, "empty separator");
4643 return NULL;
4644 }
4645 else
4646 return rsplit_substring(self,list,substring,maxcount);
4647}
4648
4649static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004650PyObject *replace(PyUnicodeObject *self,
4651 PyUnicodeObject *str1,
4652 PyUnicodeObject *str2,
4653 int maxcount)
4654{
4655 PyUnicodeObject *u;
4656
4657 if (maxcount < 0)
4658 maxcount = INT_MAX;
4659
4660 if (str1->length == 1 && str2->length == 1) {
4661 int i;
4662
4663 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004664 if (!findchar(self->str, self->length, str1->str[0]) &&
4665 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004666 /* nothing to replace, return original string */
4667 Py_INCREF(self);
4668 u = self;
4669 } else {
4670 Py_UNICODE u1 = str1->str[0];
4671 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004672
Guido van Rossumd57fd912000-03-10 22:53:23 +00004673 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004674 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004675 self->length
4676 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004677 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004678 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004679 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004680 for (i = 0; i < u->length; i++)
4681 if (u->str[i] == u1) {
4682 if (--maxcount < 0)
4683 break;
4684 u->str[i] = u2;
4685 }
4686 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004687 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004688
4689 } else {
4690 int n, i;
4691 Py_UNICODE *p;
4692
4693 /* replace strings */
4694 n = count(self, 0, self->length, str1);
4695 if (n > maxcount)
4696 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004697 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004699 if (PyUnicode_CheckExact(self)) {
4700 Py_INCREF(self);
4701 u = self;
4702 }
4703 else {
4704 u = (PyUnicodeObject *)
4705 PyUnicode_FromUnicode(self->str, self->length);
4706 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707 } else {
4708 u = _PyUnicode_New(
4709 self->length + n * (str2->length - str1->length));
4710 if (u) {
4711 i = 0;
4712 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004713 if (str1->length > 0) {
4714 while (i <= self->length - str1->length)
4715 if (Py_UNICODE_MATCH(self, i, str1)) {
4716 /* replace string segment */
4717 Py_UNICODE_COPY(p, str2->str, str2->length);
4718 p += str2->length;
4719 i += str1->length;
4720 if (--n <= 0) {
4721 /* copy remaining part */
4722 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4723 break;
4724 }
4725 } else
4726 *p++ = self->str[i++];
4727 } else {
4728 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729 Py_UNICODE_COPY(p, str2->str, str2->length);
4730 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004731 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004734 }
4735 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4736 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737 }
4738 }
4739 }
Tim Petersced69f82003-09-16 20:30:58 +00004740
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741 return (PyObject *) u;
4742}
4743
4744/* --- Unicode Object Methods --------------------------------------------- */
4745
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004746PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747"S.title() -> unicode\n\
4748\n\
4749Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004750characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751
4752static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004753unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755 return fixup(self, fixtitle);
4756}
4757
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004758PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759"S.capitalize() -> unicode\n\
4760\n\
4761Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004762have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763
4764static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004765unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767 return fixup(self, fixcapitalize);
4768}
4769
#if 0
/* Disabled: S.capwords() is compiled out. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *item;
    int idx;

    /* Break the string into whitespace-separated words. */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Replace every word by its capitalized form. */
    idx = 0;
    while (idx < PyList_GET_SIZE(words)) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, item);
        idx++;
    }

    /* Glue the words back together (NULL separator). */
    item = PyUnicode_Join(NULL, words);

onError:
    /* item is NULL here when capitalizing a word failed. */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
4807
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004808/* Argument converter. Coerces to a single unicode character */
4809
4810static int
4811convert_uc(PyObject *obj, void *addr)
4812{
4813 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4814 PyObject *uniobj;
4815 Py_UNICODE *unistr;
4816
4817 uniobj = PyUnicode_FromObject(obj);
4818 if (uniobj == NULL) {
4819 PyErr_SetString(PyExc_TypeError,
4820 "The fill character cannot be converted to Unicode");
4821 return 0;
4822 }
4823 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4824 PyErr_SetString(PyExc_TypeError,
4825 "The fill character must be exactly one character long");
4826 Py_DECREF(uniobj);
4827 return 0;
4828 }
4829 unistr = PyUnicode_AS_UNICODE(uniobj);
4830 *fillcharloc = unistr[0];
4831 Py_DECREF(uniobj);
4832 return 1;
4833}
4834
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004835PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004836"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004838Return S centered in a Unicode string of length width. Padding is\n\
4839done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840
4841static PyObject *
4842unicode_center(PyUnicodeObject *self, PyObject *args)
4843{
4844 int marg, left;
4845 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004846 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004848 if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849 return NULL;
4850
Tim Peters7a29bd52001-09-12 03:03:31 +00004851 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852 Py_INCREF(self);
4853 return (PyObject*) self;
4854 }
4855
4856 marg = width - self->length;
4857 left = marg / 2 + (marg & width & 1);
4858
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004859 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860}
4861
Marc-André Lemburge5034372000-08-08 08:04:29 +00004862#if 0
4863
4864/* This code should go into some future Unicode collation support
4865 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004866 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004867
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004868/* speedy UTF-16 code point order comparison */
4869/* gleaned from: */
4870/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4871
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004872static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004873{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004874 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004875 0, 0, 0, 0, 0, 0, 0, 0,
4876 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004877 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004878};
4879
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880static int
4881unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4882{
4883 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004884
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885 Py_UNICODE *s1 = str1->str;
4886 Py_UNICODE *s2 = str2->str;
4887
4888 len1 = str1->length;
4889 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004890
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004892 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004893
4894 c1 = *s1++;
4895 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004896
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004897 if (c1 > (1<<11) * 26)
4898 c1 += utf16Fixup[c1>>11];
4899 if (c2 > (1<<11) * 26)
4900 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004901 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004902
4903 if (c1 != c2)
4904 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004905
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004906 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907 }
4908
4909 return (len1 < len2) ? -1 : (len1 != len2);
4910}
4911
Marc-André Lemburge5034372000-08-08 08:04:29 +00004912#else
4913
4914static int
4915unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4916{
4917 register int len1, len2;
4918
4919 Py_UNICODE *s1 = str1->str;
4920 Py_UNICODE *s2 = str2->str;
4921
4922 len1 = str1->length;
4923 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004924
Marc-André Lemburge5034372000-08-08 08:04:29 +00004925 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004926 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004927
Fredrik Lundh45714e92001-06-26 16:39:36 +00004928 c1 = *s1++;
4929 c2 = *s2++;
4930
4931 if (c1 != c2)
4932 return (c1 < c2) ? -1 : 1;
4933
Marc-André Lemburge5034372000-08-08 08:04:29 +00004934 len1--; len2--;
4935 }
4936
4937 return (len1 < len2) ? -1 : (len1 != len2);
4938}
4939
4940#endif
4941
Guido van Rossumd57fd912000-03-10 22:53:23 +00004942int PyUnicode_Compare(PyObject *left,
4943 PyObject *right)
4944{
4945 PyUnicodeObject *u = NULL, *v = NULL;
4946 int result;
4947
4948 /* Coerce the two arguments */
4949 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4950 if (u == NULL)
4951 goto onError;
4952 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4953 if (v == NULL)
4954 goto onError;
4955
Thomas Wouters7e474022000-07-16 12:04:32 +00004956 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004957 if (v == u) {
4958 Py_DECREF(u);
4959 Py_DECREF(v);
4960 return 0;
4961 }
4962
4963 result = unicode_compare(u, v);
4964
4965 Py_DECREF(u);
4966 Py_DECREF(v);
4967 return result;
4968
4969onError:
4970 Py_XDECREF(u);
4971 Py_XDECREF(v);
4972 return -1;
4973}
4974
Guido van Rossum403d68b2000-03-13 15:55:09 +00004975int PyUnicode_Contains(PyObject *container,
4976 PyObject *element)
4977{
4978 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004979 int result, size;
4980 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004981
4982 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004983 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004984 if (v == NULL) {
4985 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004986 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004987 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004988 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004989 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004990 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004991 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004992
Barry Warsaw817918c2002-08-06 16:58:21 +00004993 size = PyUnicode_GET_SIZE(v);
4994 rhs = PyUnicode_AS_UNICODE(v);
4995 lhs = PyUnicode_AS_UNICODE(u);
4996
Guido van Rossum403d68b2000-03-13 15:55:09 +00004997 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004998 if (size == 1) {
4999 end = lhs + PyUnicode_GET_SIZE(u);
5000 while (lhs < end) {
5001 if (*lhs++ == *rhs) {
5002 result = 1;
5003 break;
5004 }
5005 }
5006 }
5007 else {
5008 end = lhs + (PyUnicode_GET_SIZE(u) - size);
5009 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00005010 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00005011 result = 1;
5012 break;
5013 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005014 }
5015 }
5016
5017 Py_DECREF(u);
5018 Py_DECREF(v);
5019 return result;
5020
5021onError:
5022 Py_XDECREF(u);
5023 Py_XDECREF(v);
5024 return -1;
5025}
5026
Guido van Rossumd57fd912000-03-10 22:53:23 +00005027/* Concat to string or Unicode object giving a new Unicode object. */
5028
5029PyObject *PyUnicode_Concat(PyObject *left,
5030 PyObject *right)
5031{
5032 PyUnicodeObject *u = NULL, *v = NULL, *w;
5033
5034 /* Coerce the two arguments */
5035 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5036 if (u == NULL)
5037 goto onError;
5038 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5039 if (v == NULL)
5040 goto onError;
5041
5042 /* Shortcuts */
5043 if (v == unicode_empty) {
5044 Py_DECREF(v);
5045 return (PyObject *)u;
5046 }
5047 if (u == unicode_empty) {
5048 Py_DECREF(u);
5049 return (PyObject *)v;
5050 }
5051
5052 /* Concat the two Unicode strings */
5053 w = _PyUnicode_New(u->length + v->length);
5054 if (w == NULL)
5055 goto onError;
5056 Py_UNICODE_COPY(w->str, u->str, u->length);
5057 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5058
5059 Py_DECREF(u);
5060 Py_DECREF(v);
5061 return (PyObject *)w;
5062
5063onError:
5064 Py_XDECREF(u);
5065 Py_XDECREF(v);
5066 return NULL;
5067}
5068
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005069PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070"S.count(sub[, start[, end]]) -> int\n\
5071\n\
5072Return the number of occurrences of substring sub in Unicode string\n\
5073S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005074interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075
5076static PyObject *
5077unicode_count(PyUnicodeObject *self, PyObject *args)
5078{
5079 PyUnicodeObject *substring;
5080 int start = 0;
5081 int end = INT_MAX;
5082 PyObject *result;
5083
Guido van Rossumb8872e62000-05-09 14:14:27 +00005084 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5085 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005086 return NULL;
5087
5088 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5089 (PyObject *)substring);
5090 if (substring == NULL)
5091 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005092
Guido van Rossumd57fd912000-03-10 22:53:23 +00005093 if (start < 0)
5094 start += self->length;
5095 if (start < 0)
5096 start = 0;
5097 if (end > self->length)
5098 end = self->length;
5099 if (end < 0)
5100 end += self->length;
5101 if (end < 0)
5102 end = 0;
5103
5104 result = PyInt_FromLong((long) count(self, start, end, substring));
5105
5106 Py_DECREF(substring);
5107 return result;
5108}
5109
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005110PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005111"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005113Encodes S using the codec registered for encoding. encoding defaults\n\
5114to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005115handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005116a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5117'xmlcharrefreplace' as well as any other name registered with\n\
5118codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119
5120static PyObject *
5121unicode_encode(PyUnicodeObject *self, PyObject *args)
5122{
5123 char *encoding = NULL;
5124 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005125 PyObject *v;
5126
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5128 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005129 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005130 if (v == NULL)
5131 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005132 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5133 PyErr_Format(PyExc_TypeError,
5134 "encoder did not return a string/unicode object "
5135 "(type=%.400s)",
5136 v->ob_type->tp_name);
5137 Py_DECREF(v);
5138 return NULL;
5139 }
5140 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005141
5142 onError:
5143 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005144}
5145
5146PyDoc_STRVAR(decode__doc__,
5147"S.decode([encoding[,errors]]) -> string or unicode\n\
5148\n\
5149Decodes S using the codec registered for encoding. encoding defaults\n\
5150to the default encoding. errors may be given to set a different error\n\
5151handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5152a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5153as well as any other name registerd with codecs.register_error that is\n\
5154able to handle UnicodeDecodeErrors.");
5155
5156static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005157unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005158{
5159 char *encoding = NULL;
5160 char *errors = NULL;
5161 PyObject *v;
5162
5163 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5164 return NULL;
5165 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005166 if (v == NULL)
5167 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005168 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5169 PyErr_Format(PyExc_TypeError,
5170 "decoder did not return a string/unicode object "
5171 "(type=%.400s)",
5172 v->ob_type->tp_name);
5173 Py_DECREF(v);
5174 return NULL;
5175 }
5176 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005177
5178 onError:
5179 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180}
5181
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005182PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183"S.expandtabs([tabsize]) -> unicode\n\
5184\n\
5185Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005186If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187
5188static PyObject*
5189unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5190{
5191 Py_UNICODE *e;
5192 Py_UNICODE *p;
5193 Py_UNICODE *q;
5194 int i, j;
5195 PyUnicodeObject *u;
5196 int tabsize = 8;
5197
5198 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5199 return NULL;
5200
Thomas Wouters7e474022000-07-16 12:04:32 +00005201 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 i = j = 0;
5203 e = self->str + self->length;
5204 for (p = self->str; p < e; p++)
5205 if (*p == '\t') {
5206 if (tabsize > 0)
5207 j += tabsize - (j % tabsize);
5208 }
5209 else {
5210 j++;
5211 if (*p == '\n' || *p == '\r') {
5212 i += j;
5213 j = 0;
5214 }
5215 }
5216
5217 /* Second pass: create output string and fill it */
5218 u = _PyUnicode_New(i + j);
5219 if (!u)
5220 return NULL;
5221
5222 j = 0;
5223 q = u->str;
5224
5225 for (p = self->str; p < e; p++)
5226 if (*p == '\t') {
5227 if (tabsize > 0) {
5228 i = tabsize - (j % tabsize);
5229 j += i;
5230 while (i--)
5231 *q++ = ' ';
5232 }
5233 }
5234 else {
5235 j++;
5236 *q++ = *p;
5237 if (*p == '\n' || *p == '\r')
5238 j = 0;
5239 }
5240
5241 return (PyObject*) u;
5242}
5243
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005244PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245"S.find(sub [,start [,end]]) -> int\n\
5246\n\
5247Return the lowest index in S where substring sub is found,\n\
5248such that sub is contained within s[start,end]. Optional\n\
5249arguments start and end are interpreted as in slice notation.\n\
5250\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005251Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005252
5253static PyObject *
5254unicode_find(PyUnicodeObject *self, PyObject *args)
5255{
5256 PyUnicodeObject *substring;
5257 int start = 0;
5258 int end = INT_MAX;
5259 PyObject *result;
5260
Guido van Rossumb8872e62000-05-09 14:14:27 +00005261 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5262 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263 return NULL;
5264 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5265 (PyObject *)substring);
5266 if (substring == NULL)
5267 return NULL;
5268
5269 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
5270
5271 Py_DECREF(substring);
5272 return result;
5273}
5274
5275static PyObject *
5276unicode_getitem(PyUnicodeObject *self, int index)
5277{
5278 if (index < 0 || index >= self->length) {
5279 PyErr_SetString(PyExc_IndexError, "string index out of range");
5280 return NULL;
5281 }
5282
5283 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5284}
5285
5286static long
5287unicode_hash(PyUnicodeObject *self)
5288{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005289 /* Since Unicode objects compare equal to their ASCII string
5290 counterparts, they should use the individual character values
5291 as basis for their hash value. This is needed to assure that
5292 strings and Unicode objects behave in the same way as
5293 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294
Fredrik Lundhdde61642000-07-10 18:27:47 +00005295 register int len;
5296 register Py_UNICODE *p;
5297 register long x;
5298
Guido van Rossumd57fd912000-03-10 22:53:23 +00005299 if (self->hash != -1)
5300 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005301 len = PyUnicode_GET_SIZE(self);
5302 p = PyUnicode_AS_UNICODE(self);
5303 x = *p << 7;
5304 while (--len >= 0)
5305 x = (1000003*x) ^ *p++;
5306 x ^= PyUnicode_GET_SIZE(self);
5307 if (x == -1)
5308 x = -2;
5309 self->hash = x;
5310 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311}
5312
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005313PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314"S.index(sub [,start [,end]]) -> int\n\
5315\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005316Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317
5318static PyObject *
5319unicode_index(PyUnicodeObject *self, PyObject *args)
5320{
5321 int result;
5322 PyUnicodeObject *substring;
5323 int start = 0;
5324 int end = INT_MAX;
5325
Guido van Rossumb8872e62000-05-09 14:14:27 +00005326 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5327 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005329
Guido van Rossumd57fd912000-03-10 22:53:23 +00005330 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5331 (PyObject *)substring);
5332 if (substring == NULL)
5333 return NULL;
5334
5335 result = findstring(self, substring, start, end, 1);
5336
5337 Py_DECREF(substring);
5338 if (result < 0) {
5339 PyErr_SetString(PyExc_ValueError, "substring not found");
5340 return NULL;
5341 }
5342 return PyInt_FromLong(result);
5343}
5344
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005345PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005346"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005348Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005349at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350
5351static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005352unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353{
5354 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5355 register const Py_UNICODE *e;
5356 int cased;
5357
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 /* Shortcut for single character strings */
5359 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005360 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005362 /* Special case for empty strings */
5363 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005364 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005365
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366 e = p + PyUnicode_GET_SIZE(self);
5367 cased = 0;
5368 for (; p < e; p++) {
5369 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005370
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005372 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373 else if (!cased && Py_UNICODE_ISLOWER(ch))
5374 cased = 1;
5375 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005376 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377}
5378
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005379PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005380"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005382Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005383at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384
5385static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005386unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387{
5388 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5389 register const Py_UNICODE *e;
5390 int cased;
5391
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392 /* Shortcut for single character strings */
5393 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005394 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005396 /* Special case for empty strings */
5397 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005398 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005399
Guido van Rossumd57fd912000-03-10 22:53:23 +00005400 e = p + PyUnicode_GET_SIZE(self);
5401 cased = 0;
5402 for (; p < e; p++) {
5403 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005404
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005406 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407 else if (!cased && Py_UNICODE_ISUPPER(ch))
5408 cased = 1;
5409 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005410 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411}
5412
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005413PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005414"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005415\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005416Return True if S is a titlecased string and there is at least one\n\
5417character in S, i.e. upper- and titlecase characters may only\n\
5418follow uncased characters and lowercase characters only cased ones.\n\
5419Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420
5421static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005422unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423{
5424 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5425 register const Py_UNICODE *e;
5426 int cased, previous_is_cased;
5427
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428 /* Shortcut for single character strings */
5429 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005430 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5431 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005432
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005433 /* Special case for empty strings */
5434 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005435 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005436
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437 e = p + PyUnicode_GET_SIZE(self);
5438 cased = 0;
5439 previous_is_cased = 0;
5440 for (; p < e; p++) {
5441 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005442
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5444 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005445 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 previous_is_cased = 1;
5447 cased = 1;
5448 }
5449 else if (Py_UNICODE_ISLOWER(ch)) {
5450 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005451 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 previous_is_cased = 1;
5453 cased = 1;
5454 }
5455 else
5456 previous_is_cased = 0;
5457 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005458 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459}
5460
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005461PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005462"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005464Return True if all characters in S are whitespace\n\
5465and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466
5467static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005468unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469{
5470 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5471 register const Py_UNICODE *e;
5472
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 /* Shortcut for single character strings */
5474 if (PyUnicode_GET_SIZE(self) == 1 &&
5475 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005476 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005478 /* Special case for empty strings */
5479 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005480 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005481
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 e = p + PyUnicode_GET_SIZE(self);
5483 for (; p < e; p++) {
5484 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005485 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005486 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005487 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488}
5489
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005490PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005491"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005492\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005493Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005494and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005495
5496static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005497unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005498{
5499 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5500 register const Py_UNICODE *e;
5501
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005502 /* Shortcut for single character strings */
5503 if (PyUnicode_GET_SIZE(self) == 1 &&
5504 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005505 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005506
5507 /* Special case for empty strings */
5508 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005509 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005510
5511 e = p + PyUnicode_GET_SIZE(self);
5512 for (; p < e; p++) {
5513 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005514 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005515 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005516 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005517}
5518
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005519PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005520"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005521\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005522Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005523and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005524
5525static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005526unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005527{
5528 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5529 register const Py_UNICODE *e;
5530
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005531 /* Shortcut for single character strings */
5532 if (PyUnicode_GET_SIZE(self) == 1 &&
5533 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005534 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005535
5536 /* Special case for empty strings */
5537 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005538 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005539
5540 e = p + PyUnicode_GET_SIZE(self);
5541 for (; p < e; p++) {
5542 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005543 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005544 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005545 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005546}
5547
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005548PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005549"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005551Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005552False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553
5554static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005555unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556{
5557 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5558 register const Py_UNICODE *e;
5559
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560 /* Shortcut for single character strings */
5561 if (PyUnicode_GET_SIZE(self) == 1 &&
5562 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005563 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005565 /* Special case for empty strings */
5566 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005567 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005568
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 e = p + PyUnicode_GET_SIZE(self);
5570 for (; p < e; p++) {
5571 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005572 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005573 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005574 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575}
5576
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005577PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005578"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005580Return True if all characters in S are digits\n\
5581and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582
5583static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005584unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585{
5586 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5587 register const Py_UNICODE *e;
5588
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 /* Shortcut for single character strings */
5590 if (PyUnicode_GET_SIZE(self) == 1 &&
5591 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005592 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005594 /* Special case for empty strings */
5595 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005596 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005597
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598 e = p + PyUnicode_GET_SIZE(self);
5599 for (; p < e; p++) {
5600 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005601 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005602 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005603 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604}
5605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005606PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005607"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005609Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005610False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611
5612static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005613unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614{
5615 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5616 register const Py_UNICODE *e;
5617
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 /* Shortcut for single character strings */
5619 if (PyUnicode_GET_SIZE(self) == 1 &&
5620 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005621 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005623 /* Special case for empty strings */
5624 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005625 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005626
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 e = p + PyUnicode_GET_SIZE(self);
5628 for (; p < e; p++) {
5629 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005630 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005631 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005632 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633}
5634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005635PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636"S.join(sequence) -> unicode\n\
5637\n\
5638Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005639sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640
5641static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005642unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005644 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645}
5646
5647static int
5648unicode_length(PyUnicodeObject *self)
5649{
5650 return self->length;
5651}
5652
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005653PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005654"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655\n\
5656Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005657done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005658
5659static PyObject *
5660unicode_ljust(PyUnicodeObject *self, PyObject *args)
5661{
5662 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005663 Py_UNICODE fillchar = ' ';
5664
5665 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 return NULL;
5667
Tim Peters7a29bd52001-09-12 03:03:31 +00005668 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669 Py_INCREF(self);
5670 return (PyObject*) self;
5671 }
5672
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005673 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674}
5675
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005676PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677"S.lower() -> unicode\n\
5678\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005679Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680
5681static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005682unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684 return fixup(self, fixlower);
5685}
5686
/* Strip modes used by the strip helpers below.  Each value doubles as
   an index into stripformat[]. */
#define LEFTSTRIP   0
#define RIGHTSTRIP  1
#define BOTHSTRIP   2

/* Argument format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",        /* LEFTSTRIP */
    "|O:rstrip",        /* RIGHTSTRIP */
    "|O:strip"          /* BOTHSTRIP */
};

/* Method name for a strip mode: its format string minus the leading
   three characters ("|O:") */
#define STRIPNAME(i) (stripformat[i] + 3)
5695
5696static const Py_UNICODE *
5697unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5698{
Tim Peters030a5ce2002-04-22 19:00:10 +00005699 size_t i;
5700 for (i = 0; i < n; ++i)
5701 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005702 return s+i;
5703 return NULL;
5704}
5705
5706/* externally visible for str.strip(unicode) */
5707PyObject *
5708_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5709{
5710 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5711 int len = PyUnicode_GET_SIZE(self);
5712 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5713 int seplen = PyUnicode_GET_SIZE(sepobj);
5714 int i, j;
5715
5716 i = 0;
5717 if (striptype != RIGHTSTRIP) {
5718 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5719 i++;
5720 }
5721 }
5722
5723 j = len;
5724 if (striptype != LEFTSTRIP) {
5725 do {
5726 j--;
5727 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5728 j++;
5729 }
5730
5731 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5732 Py_INCREF(self);
5733 return (PyObject*)self;
5734 }
5735 else
5736 return PyUnicode_FromUnicode(s+i, j-i);
5737}
5738
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739
5740static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005741do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005743 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5744 int len = PyUnicode_GET_SIZE(self), i, j;
5745
5746 i = 0;
5747 if (striptype != RIGHTSTRIP) {
5748 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5749 i++;
5750 }
5751 }
5752
5753 j = len;
5754 if (striptype != LEFTSTRIP) {
5755 do {
5756 j--;
5757 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5758 j++;
5759 }
5760
5761 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5762 Py_INCREF(self);
5763 return (PyObject*)self;
5764 }
5765 else
5766 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767}
5768
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005769
5770static PyObject *
5771do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5772{
5773 PyObject *sep = NULL;
5774
5775 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5776 return NULL;
5777
5778 if (sep != NULL && sep != Py_None) {
5779 if (PyUnicode_Check(sep))
5780 return _PyUnicode_XStrip(self, striptype, sep);
5781 else if (PyString_Check(sep)) {
5782 PyObject *res;
5783 sep = PyUnicode_FromObject(sep);
5784 if (sep==NULL)
5785 return NULL;
5786 res = _PyUnicode_XStrip(self, striptype, sep);
5787 Py_DECREF(sep);
5788 return res;
5789 }
5790 else {
5791 PyErr_Format(PyExc_TypeError,
5792 "%s arg must be None, unicode or str",
5793 STRIPNAME(striptype));
5794 return NULL;
5795 }
5796 }
5797
5798 return do_strip(self, striptype);
5799}
5800
5801
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005802PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005803"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005804\n\
5805Return a copy of the string S with leading and trailing\n\
5806whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005807If chars is given and not None, remove characters in chars instead.\n\
5808If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005809
5810static PyObject *
5811unicode_strip(PyUnicodeObject *self, PyObject *args)
5812{
5813 if (PyTuple_GET_SIZE(args) == 0)
5814 return do_strip(self, BOTHSTRIP); /* Common case */
5815 else
5816 return do_argstrip(self, BOTHSTRIP, args);
5817}
5818
5819
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005820PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005821"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005822\n\
5823Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005824If chars is given and not None, remove characters in chars instead.\n\
5825If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005826
5827static PyObject *
5828unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5829{
5830 if (PyTuple_GET_SIZE(args) == 0)
5831 return do_strip(self, LEFTSTRIP); /* Common case */
5832 else
5833 return do_argstrip(self, LEFTSTRIP, args);
5834}
5835
5836
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005837PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005838"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005839\n\
5840Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005841If chars is given and not None, remove characters in chars instead.\n\
5842If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005843
5844static PyObject *
5845unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5846{
5847 if (PyTuple_GET_SIZE(args) == 0)
5848 return do_strip(self, RIGHTSTRIP); /* Common case */
5849 else
5850 return do_argstrip(self, RIGHTSTRIP, args);
5851}
5852
5853
Guido van Rossumd57fd912000-03-10 22:53:23 +00005854static PyObject*
5855unicode_repeat(PyUnicodeObject *str, int len)
5856{
5857 PyUnicodeObject *u;
5858 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005859 int nchars;
5860 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861
5862 if (len < 0)
5863 len = 0;
5864
Tim Peters7a29bd52001-09-12 03:03:31 +00005865 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005866 /* no repeat, return original string */
5867 Py_INCREF(str);
5868 return (PyObject*) str;
5869 }
Tim Peters8f422462000-09-09 06:13:41 +00005870
5871 /* ensure # of chars needed doesn't overflow int and # of bytes
5872 * needed doesn't overflow size_t
5873 */
5874 nchars = len * str->length;
5875 if (len && nchars / len != str->length) {
5876 PyErr_SetString(PyExc_OverflowError,
5877 "repeated string is too long");
5878 return NULL;
5879 }
5880 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5881 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5882 PyErr_SetString(PyExc_OverflowError,
5883 "repeated string is too long");
5884 return NULL;
5885 }
5886 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887 if (!u)
5888 return NULL;
5889
5890 p = u->str;
5891
5892 while (len-- > 0) {
5893 Py_UNICODE_COPY(p, str->str, str->length);
5894 p += str->length;
5895 }
5896
5897 return (PyObject*) u;
5898}
5899
5900PyObject *PyUnicode_Replace(PyObject *obj,
5901 PyObject *subobj,
5902 PyObject *replobj,
5903 int maxcount)
5904{
5905 PyObject *self;
5906 PyObject *str1;
5907 PyObject *str2;
5908 PyObject *result;
5909
5910 self = PyUnicode_FromObject(obj);
5911 if (self == NULL)
5912 return NULL;
5913 str1 = PyUnicode_FromObject(subobj);
5914 if (str1 == NULL) {
5915 Py_DECREF(self);
5916 return NULL;
5917 }
5918 str2 = PyUnicode_FromObject(replobj);
5919 if (str2 == NULL) {
5920 Py_DECREF(self);
5921 Py_DECREF(str1);
5922 return NULL;
5923 }
Tim Petersced69f82003-09-16 20:30:58 +00005924 result = replace((PyUnicodeObject *)self,
5925 (PyUnicodeObject *)str1,
5926 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005927 maxcount);
5928 Py_DECREF(self);
5929 Py_DECREF(str1);
5930 Py_DECREF(str2);
5931 return result;
5932}
5933
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005934PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005935"S.replace (old, new[, maxsplit]) -> unicode\n\
5936\n\
5937Return a copy of S with all occurrences of substring\n\
5938old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005939given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005940
5941static PyObject*
5942unicode_replace(PyUnicodeObject *self, PyObject *args)
5943{
5944 PyUnicodeObject *str1;
5945 PyUnicodeObject *str2;
5946 int maxcount = -1;
5947 PyObject *result;
5948
5949 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5950 return NULL;
5951 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5952 if (str1 == NULL)
5953 return NULL;
5954 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005955 if (str2 == NULL) {
5956 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005957 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005958 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005959
5960 result = replace(self, str1, str2, maxcount);
5961
5962 Py_DECREF(str1);
5963 Py_DECREF(str2);
5964 return result;
5965}
5966
5967static
5968PyObject *unicode_repr(PyObject *unicode)
5969{
5970 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5971 PyUnicode_GET_SIZE(unicode),
5972 1);
5973}
5974
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005975PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005976"S.rfind(sub [,start [,end]]) -> int\n\
5977\n\
5978Return the highest index in S where substring sub is found,\n\
5979such that sub is contained within s[start,end]. Optional\n\
5980arguments start and end are interpreted as in slice notation.\n\
5981\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005982Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005983
5984static PyObject *
5985unicode_rfind(PyUnicodeObject *self, PyObject *args)
5986{
5987 PyUnicodeObject *substring;
5988 int start = 0;
5989 int end = INT_MAX;
5990 PyObject *result;
5991
Guido van Rossumb8872e62000-05-09 14:14:27 +00005992 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5993 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005994 return NULL;
5995 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5996 (PyObject *)substring);
5997 if (substring == NULL)
5998 return NULL;
5999
6000 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
6001
6002 Py_DECREF(substring);
6003 return result;
6004}
6005
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006006PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007"S.rindex(sub [,start [,end]]) -> int\n\
6008\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006009Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010
6011static PyObject *
6012unicode_rindex(PyUnicodeObject *self, PyObject *args)
6013{
6014 int result;
6015 PyUnicodeObject *substring;
6016 int start = 0;
6017 int end = INT_MAX;
6018
Guido van Rossumb8872e62000-05-09 14:14:27 +00006019 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6020 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006021 return NULL;
6022 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6023 (PyObject *)substring);
6024 if (substring == NULL)
6025 return NULL;
6026
6027 result = findstring(self, substring, start, end, -1);
6028
6029 Py_DECREF(substring);
6030 if (result < 0) {
6031 PyErr_SetString(PyExc_ValueError, "substring not found");
6032 return NULL;
6033 }
6034 return PyInt_FromLong(result);
6035}
6036
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006037PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006038"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039\n\
6040Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006041done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006042
6043static PyObject *
6044unicode_rjust(PyUnicodeObject *self, PyObject *args)
6045{
6046 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006047 Py_UNICODE fillchar = ' ';
6048
6049 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 return NULL;
6051
Tim Peters7a29bd52001-09-12 03:03:31 +00006052 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053 Py_INCREF(self);
6054 return (PyObject*) self;
6055 }
6056
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006057 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006058}
6059
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060static PyObject*
6061unicode_slice(PyUnicodeObject *self, int start, int end)
6062{
6063 /* standard clamping */
6064 if (start < 0)
6065 start = 0;
6066 if (end < 0)
6067 end = 0;
6068 if (end > self->length)
6069 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006070 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006071 /* full slice, return original string */
6072 Py_INCREF(self);
6073 return (PyObject*) self;
6074 }
6075 if (start > end)
6076 start = end;
6077 /* copy slice */
6078 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6079 end - start);
6080}
6081
6082PyObject *PyUnicode_Split(PyObject *s,
6083 PyObject *sep,
6084 int maxsplit)
6085{
6086 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006087
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 s = PyUnicode_FromObject(s);
6089 if (s == NULL)
6090 return NULL;
6091 if (sep != NULL) {
6092 sep = PyUnicode_FromObject(sep);
6093 if (sep == NULL) {
6094 Py_DECREF(s);
6095 return NULL;
6096 }
6097 }
6098
6099 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6100
6101 Py_DECREF(s);
6102 Py_XDECREF(sep);
6103 return result;
6104}
6105
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006106PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006107"S.split([sep [,maxsplit]]) -> list of strings\n\
6108\n\
6109Return a list of the words in S, using sep as the\n\
6110delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006111splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006112any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113
6114static PyObject*
6115unicode_split(PyUnicodeObject *self, PyObject *args)
6116{
6117 PyObject *substring = Py_None;
6118 int maxcount = -1;
6119
6120 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
6121 return NULL;
6122
6123 if (substring == Py_None)
6124 return split(self, NULL, maxcount);
6125 else if (PyUnicode_Check(substring))
6126 return split(self, (PyUnicodeObject *)substring, maxcount);
6127 else
6128 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6129}
6130
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006131PyObject *PyUnicode_RSplit(PyObject *s,
6132 PyObject *sep,
6133 int maxsplit)
6134{
6135 PyObject *result;
6136
6137 s = PyUnicode_FromObject(s);
6138 if (s == NULL)
6139 return NULL;
6140 if (sep != NULL) {
6141 sep = PyUnicode_FromObject(sep);
6142 if (sep == NULL) {
6143 Py_DECREF(s);
6144 return NULL;
6145 }
6146 }
6147
6148 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6149
6150 Py_DECREF(s);
6151 Py_XDECREF(sep);
6152 return result;
6153}
6154
6155PyDoc_STRVAR(rsplit__doc__,
6156"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6157\n\
6158Return a list of the words in S, using sep as the\n\
6159delimiter string, starting at the end of the string and\n\
6160working to the front. If maxsplit is given, at most maxsplit\n\
6161splits are done. If sep is not specified, any whitespace string\n\
6162is a separator.");
6163
6164static PyObject*
6165unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6166{
6167 PyObject *substring = Py_None;
6168 int maxcount = -1;
6169
6170 if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
6171 return NULL;
6172
6173 if (substring == Py_None)
6174 return rsplit(self, NULL, maxcount);
6175 else if (PyUnicode_Check(substring))
6176 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6177 else
6178 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6179}
6180
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006181PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006182"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183\n\
6184Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006185Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006186is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187
6188static PyObject*
6189unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6190{
Guido van Rossum86662912000-04-11 15:38:46 +00006191 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192
Guido van Rossum86662912000-04-11 15:38:46 +00006193 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194 return NULL;
6195
Guido van Rossum86662912000-04-11 15:38:46 +00006196 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006197}
6198
6199static
6200PyObject *unicode_str(PyUnicodeObject *self)
6201{
Fred Drakee4315f52000-05-09 19:53:39 +00006202 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203}
6204
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006205PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206"S.swapcase() -> unicode\n\
6207\n\
6208Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006209and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210
6211static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006212unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006213{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214 return fixup(self, fixswapcase);
6215}
6216
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006217PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006218"S.translate(table) -> unicode\n\
6219\n\
6220Return a copy of the string S, where all characters have been mapped\n\
6221through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006222Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6223Unmapped characters are left untouched. Characters mapped to None\n\
6224are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225
6226static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006227unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228{
Tim Petersced69f82003-09-16 20:30:58 +00006229 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006231 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 "ignore");
6233}
6234
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006235PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236"S.upper() -> unicode\n\
6237\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006238Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239
6240static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006241unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006242{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243 return fixup(self, fixupper);
6244}
6245
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006246PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247"S.zfill(width) -> unicode\n\
6248\n\
6249Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006250of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251
6252static PyObject *
6253unicode_zfill(PyUnicodeObject *self, PyObject *args)
6254{
6255 int fill;
6256 PyUnicodeObject *u;
6257
6258 int width;
6259 if (!PyArg_ParseTuple(args, "i:zfill", &width))
6260 return NULL;
6261
6262 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006263 if (PyUnicode_CheckExact(self)) {
6264 Py_INCREF(self);
6265 return (PyObject*) self;
6266 }
6267 else
6268 return PyUnicode_FromUnicode(
6269 PyUnicode_AS_UNICODE(self),
6270 PyUnicode_GET_SIZE(self)
6271 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006272 }
6273
6274 fill = width - self->length;
6275
6276 u = pad(self, fill, 0, '0');
6277
Walter Dörwald068325e2002-04-15 13:36:47 +00006278 if (u == NULL)
6279 return NULL;
6280
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281 if (u->str[fill] == '+' || u->str[fill] == '-') {
6282 /* move sign to beginning of string */
6283 u->str[0] = u->str[fill];
6284 u->str[fill] = '0';
6285 }
6286
6287 return (PyObject*) u;
6288}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006289
#if 0
/* Compiled out: returns unicode_freelist_size as an int object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006298PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006299"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006301Return True if S starts with the specified prefix, False otherwise.\n\
6302With optional start, test S beginning at that position.\n\
6303With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304
6305static PyObject *
6306unicode_startswith(PyUnicodeObject *self,
6307 PyObject *args)
6308{
6309 PyUnicodeObject *substring;
6310 int start = 0;
6311 int end = INT_MAX;
6312 PyObject *result;
6313
Guido van Rossumb8872e62000-05-09 14:14:27 +00006314 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6315 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006316 return NULL;
6317 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6318 (PyObject *)substring);
6319 if (substring == NULL)
6320 return NULL;
6321
Guido van Rossum77f6a652002-04-03 22:41:51 +00006322 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006323
6324 Py_DECREF(substring);
6325 return result;
6326}
6327
6328
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006329PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006330"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006332Return True if S ends with the specified suffix, False otherwise.\n\
6333With optional start, test S beginning at that position.\n\
6334With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006335
6336static PyObject *
6337unicode_endswith(PyUnicodeObject *self,
6338 PyObject *args)
6339{
6340 PyUnicodeObject *substring;
6341 int start = 0;
6342 int end = INT_MAX;
6343 PyObject *result;
6344
Guido van Rossumb8872e62000-05-09 14:14:27 +00006345 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6346 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006347 return NULL;
6348 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6349 (PyObject *)substring);
6350 if (substring == NULL)
6351 return NULL;
6352
Guido van Rossum77f6a652002-04-03 22:41:51 +00006353 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006354
6355 Py_DECREF(substring);
6356 return result;
6357}
6358
6359
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006360
6361static PyObject *
6362unicode_getnewargs(PyUnicodeObject *v)
6363{
6364 return Py_BuildValue("(u#)", v->str, v->length);
6365}
6366
6367
Guido van Rossumd57fd912000-03-10 22:53:23 +00006368static PyMethodDef unicode_methods[] = {
6369
6370 /* Order is according to common usage: often used methods should
6371 appear first, since lookup is done sequentially. */
6372
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006373 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6374 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6375 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006376 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006377 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6378 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6379 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6380 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6381 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6382 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6383 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6384 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6385 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6386 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006387 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006388 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006389/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6390 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6391 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6392 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006393 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006394 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006395 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006396 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6397 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6398 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6399 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6400 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6401 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6402 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6403 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6404 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6405 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6406 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6407 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6408 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6409 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006410 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006411#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006412 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413#endif
6414
6415#if 0
6416 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006417 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418#endif
6419
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006420 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006421 {NULL, NULL}
6422};
6423
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006424static PyObject *
6425unicode_mod(PyObject *v, PyObject *w)
6426{
6427 if (!PyUnicode_Check(v)) {
6428 Py_INCREF(Py_NotImplemented);
6429 return Py_NotImplemented;
6430 }
6431 return PyUnicode_Format(v, w);
6432}
6433
6434static PyNumberMethods unicode_as_number = {
6435 0, /*nb_add*/
6436 0, /*nb_subtract*/
6437 0, /*nb_multiply*/
6438 0, /*nb_divide*/
6439 unicode_mod, /*nb_remainder*/
6440};
6441
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442static PySequenceMethods unicode_as_sequence = {
6443 (inquiry) unicode_length, /* sq_length */
6444 (binaryfunc) PyUnicode_Concat, /* sq_concat */
6445 (intargfunc) unicode_repeat, /* sq_repeat */
6446 (intargfunc) unicode_getitem, /* sq_item */
6447 (intintargfunc) unicode_slice, /* sq_slice */
6448 0, /* sq_ass_item */
6449 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00006450 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451};
6452
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006453static PyObject*
6454unicode_subscript(PyUnicodeObject* self, PyObject* item)
6455{
6456 if (PyInt_Check(item)) {
6457 long i = PyInt_AS_LONG(item);
6458 if (i < 0)
6459 i += PyString_GET_SIZE(self);
6460 return unicode_getitem(self, i);
6461 } else if (PyLong_Check(item)) {
6462 long i = PyLong_AsLong(item);
6463 if (i == -1 && PyErr_Occurred())
6464 return NULL;
6465 if (i < 0)
6466 i += PyString_GET_SIZE(self);
6467 return unicode_getitem(self, i);
6468 } else if (PySlice_Check(item)) {
6469 int start, stop, step, slicelength, cur, i;
6470 Py_UNICODE* source_buf;
6471 Py_UNICODE* result_buf;
6472 PyObject* result;
6473
6474 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
6475 &start, &stop, &step, &slicelength) < 0) {
6476 return NULL;
6477 }
6478
6479 if (slicelength <= 0) {
6480 return PyUnicode_FromUnicode(NULL, 0);
6481 } else {
6482 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6483 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
6484
6485 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6486 result_buf[i] = source_buf[cur];
6487 }
Tim Petersced69f82003-09-16 20:30:58 +00006488
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006489 result = PyUnicode_FromUnicode(result_buf, slicelength);
6490 PyMem_FREE(result_buf);
6491 return result;
6492 }
6493 } else {
6494 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6495 return NULL;
6496 }
6497}
6498
6499static PyMappingMethods unicode_as_mapping = {
6500 (inquiry)unicode_length, /* mp_length */
6501 (binaryfunc)unicode_subscript, /* mp_subscript */
6502 (objobjargproc)0, /* mp_ass_subscript */
6503};
6504
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505static int
6506unicode_buffer_getreadbuf(PyUnicodeObject *self,
6507 int index,
6508 const void **ptr)
6509{
6510 if (index != 0) {
6511 PyErr_SetString(PyExc_SystemError,
6512 "accessing non-existent unicode segment");
6513 return -1;
6514 }
6515 *ptr = (void *) self->str;
6516 return PyUnicode_GET_DATA_SIZE(self);
6517}
6518
6519static int
6520unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6521 const void **ptr)
6522{
6523 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006524 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525 return -1;
6526}
6527
6528static int
6529unicode_buffer_getsegcount(PyUnicodeObject *self,
6530 int *lenp)
6531{
6532 if (lenp)
6533 *lenp = PyUnicode_GET_DATA_SIZE(self);
6534 return 1;
6535}
6536
6537static int
6538unicode_buffer_getcharbuf(PyUnicodeObject *self,
6539 int index,
6540 const void **ptr)
6541{
6542 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006543
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544 if (index != 0) {
6545 PyErr_SetString(PyExc_SystemError,
6546 "accessing non-existent unicode segment");
6547 return -1;
6548 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006549 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 if (str == NULL)
6551 return -1;
6552 *ptr = (void *) PyString_AS_STRING(str);
6553 return PyString_GET_SIZE(str);
6554}
6555
6556/* Helpers for PyUnicode_Format() */
6557
6558static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006559getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560{
6561 int argidx = *p_argidx;
6562 if (argidx < arglen) {
6563 (*p_argidx)++;
6564 if (arglen < 0)
6565 return args;
6566 else
6567 return PyTuple_GetItem(args, argidx);
6568 }
6569 PyErr_SetString(PyExc_TypeError,
6570 "not enough arguments for format string");
6571 return NULL;
6572}
6573
/* Bit flags, one bit each.  F_ALT selects the '#' modifier in
   formatfloat(); NOTE(review): the others presumably mirror the
   '-', '+', ' ' and '0' conversion flags -- confirm at their uses. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
6579
6580static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582{
6583 register int i;
6584 int len;
6585 va_list va;
6586 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006588
6589 /* First, format the string as char array, then expand to Py_UNICODE
6590 array. */
6591 charbuffer = (char *)buffer;
6592 len = vsprintf(charbuffer, format, va);
6593 for (i = len - 1; i >= 0; i--)
6594 buffer[i] = (Py_UNICODE) charbuffer[i];
6595
6596 va_end(va);
6597 return len;
6598}
6599
Guido van Rossum078151d2002-08-11 04:24:12 +00006600/* XXX To save some code duplication, formatfloat/long/int could have been
6601 shared with stringobject.c, converting from 8-bit to Unicode after the
6602 formatting is done. */
6603
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604static int
6605formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006606 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 int flags,
6608 int prec,
6609 int type,
6610 PyObject *v)
6611{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006612 /* fmt = '%#.' + `prec` + `type`
6613 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 char fmt[20];
6615 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006616
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617 x = PyFloat_AsDouble(v);
6618 if (x == -1.0 && PyErr_Occurred())
6619 return -1;
6620 if (prec < 0)
6621 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006622 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6623 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006624 /* Worst case length calc to ensure no buffer overrun:
6625
6626 'g' formats:
6627 fmt = %#.<prec>g
6628 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6629 for any double rep.)
6630 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6631
6632 'f' formats:
6633 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6634 len = 1 + 50 + 1 + prec = 52 + prec
6635
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006636 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006637 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006638
6639 */
6640 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6641 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006642 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006643 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006644 return -1;
6645 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006646 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6647 (flags&F_ALT) ? "#" : "",
6648 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006649 return usprintf(buf, fmt, x);
6650}
6651
Tim Peters38fd5b62000-09-21 05:43:11 +00006652static PyObject*
6653formatlong(PyObject *val, int flags, int prec, int type)
6654{
6655 char *buf;
6656 int i, len;
6657 PyObject *str; /* temporary string object. */
6658 PyUnicodeObject *result;
6659
6660 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6661 if (!str)
6662 return NULL;
6663 result = _PyUnicode_New(len);
6664 for (i = 0; i < len; i++)
6665 result->str[i] = buf[i];
6666 result->str[len] = 0;
6667 Py_DECREF(str);
6668 return (PyObject*)result;
6669}
6670
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671static int
6672formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006673 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006674 int flags,
6675 int prec,
6676 int type,
6677 PyObject *v)
6678{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006679 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006680 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6681 * + 1 + 1
6682 * = 24
6683 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006684 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006685 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006686 long x;
6687
6688 x = PyInt_AsLong(v);
6689 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006690 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006691 if (x < 0 && type == 'u') {
6692 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006693 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006694 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6695 sign = "-";
6696 else
6697 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006698 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006699 prec = 1;
6700
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006701 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6702 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006703 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006704 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006705 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006706 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006707 return -1;
6708 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006709
6710 if ((flags & F_ALT) &&
6711 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006712 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006713 * of issues that cause pain:
6714 * - when 0 is being converted, the C standard leaves off
6715 * the '0x' or '0X', which is inconsistent with other
6716 * %#x/%#X conversions and inconsistent with Python's
6717 * hex() function
6718 * - there are platforms that violate the standard and
6719 * convert 0 with the '0x' or '0X'
6720 * (Metrowerks, Compaq Tru64)
6721 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006722 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006723 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006724 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006725 * We can achieve the desired consistency by inserting our
6726 * own '0x' or '0X' prefix, and substituting %x/%X in place
6727 * of %#x/%#X.
6728 *
6729 * Note that this is the same approach as used in
6730 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006731 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006732 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6733 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006734 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006735 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006736 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6737 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006738 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006739 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006740 if (sign[0])
6741 return usprintf(buf, fmt, -x);
6742 else
6743 return usprintf(buf, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006744}
6745
6746static int
6747formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006748 size_t buflen,
6749 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006750{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006751 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006752 if (PyUnicode_Check(v)) {
6753 if (PyUnicode_GET_SIZE(v) != 1)
6754 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006756 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006758 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006759 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006760 goto onError;
6761 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6762 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006763
6764 else {
6765 /* Integer input truncated to a character */
6766 long x;
6767 x = PyInt_AsLong(v);
6768 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006769 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006770#ifdef Py_UNICODE_WIDE
6771 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006772 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006773 "%c arg not in range(0x110000) "
6774 "(wide Python build)");
6775 return -1;
6776 }
6777#else
6778 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006779 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006780 "%c arg not in range(0x10000) "
6781 "(narrow Python build)");
6782 return -1;
6783 }
6784#endif
6785 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006786 }
6787 buf[1] = '\0';
6788 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006789
6790 onError:
6791 PyErr_SetString(PyExc_TypeError,
6792 "%c requires int or char");
6793 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006794}
6795
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in Py_UNICODE elements, of the scratch
   buffer in which floats, ints and chars are formatted.  XXX This is a
   magic number.  formatfloat() and formatint() compare the requested
   precision against the buffer length they are given; formatchar()
   simply presumes room for two elements.  A better solution may be to
   malloc a buffer of appropriate size for each format.  For now, the
   current solution is sufficient.
*/
#define FORMATBUFLEN (size_t)120
6805
Guido van Rossumd57fd912000-03-10 22:53:23 +00006806PyObject *PyUnicode_Format(PyObject *format,
6807 PyObject *args)
6808{
6809 Py_UNICODE *fmt, *res;
6810 int fmtcnt, rescnt, reslen, arglen, argidx;
6811 int args_owned = 0;
6812 PyUnicodeObject *result = NULL;
6813 PyObject *dict = NULL;
6814 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006815
Guido van Rossumd57fd912000-03-10 22:53:23 +00006816 if (format == NULL || args == NULL) {
6817 PyErr_BadInternalCall();
6818 return NULL;
6819 }
6820 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006821 if (uformat == NULL)
6822 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006823 fmt = PyUnicode_AS_UNICODE(uformat);
6824 fmtcnt = PyUnicode_GET_SIZE(uformat);
6825
6826 reslen = rescnt = fmtcnt + 100;
6827 result = _PyUnicode_New(reslen);
6828 if (result == NULL)
6829 goto onError;
6830 res = PyUnicode_AS_UNICODE(result);
6831
6832 if (PyTuple_Check(args)) {
6833 arglen = PyTuple_Size(args);
6834 argidx = 0;
6835 }
6836 else {
6837 arglen = -1;
6838 argidx = -2;
6839 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006840 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6841 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006842 dict = args;
6843
6844 while (--fmtcnt >= 0) {
6845 if (*fmt != '%') {
6846 if (--rescnt < 0) {
6847 rescnt = fmtcnt + 100;
6848 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006849 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006850 return NULL;
6851 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6852 --rescnt;
6853 }
6854 *res++ = *fmt++;
6855 }
6856 else {
6857 /* Got a format specifier */
6858 int flags = 0;
6859 int width = -1;
6860 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861 Py_UNICODE c = '\0';
6862 Py_UNICODE fill;
6863 PyObject *v = NULL;
6864 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006865 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866 Py_UNICODE sign;
6867 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006868 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006869
6870 fmt++;
6871 if (*fmt == '(') {
6872 Py_UNICODE *keystart;
6873 int keylen;
6874 PyObject *key;
6875 int pcount = 1;
6876
6877 if (dict == NULL) {
6878 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006879 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006880 goto onError;
6881 }
6882 ++fmt;
6883 --fmtcnt;
6884 keystart = fmt;
6885 /* Skip over balanced parentheses */
6886 while (pcount > 0 && --fmtcnt >= 0) {
6887 if (*fmt == ')')
6888 --pcount;
6889 else if (*fmt == '(')
6890 ++pcount;
6891 fmt++;
6892 }
6893 keylen = fmt - keystart - 1;
6894 if (fmtcnt < 0 || pcount > 0) {
6895 PyErr_SetString(PyExc_ValueError,
6896 "incomplete format key");
6897 goto onError;
6898 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006899#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006900 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 then looked up since Python uses strings to hold
6902 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006903 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006904 key = PyUnicode_EncodeUTF8(keystart,
6905 keylen,
6906 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006907#else
6908 key = PyUnicode_FromUnicode(keystart, keylen);
6909#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006910 if (key == NULL)
6911 goto onError;
6912 if (args_owned) {
6913 Py_DECREF(args);
6914 args_owned = 0;
6915 }
6916 args = PyObject_GetItem(dict, key);
6917 Py_DECREF(key);
6918 if (args == NULL) {
6919 goto onError;
6920 }
6921 args_owned = 1;
6922 arglen = -1;
6923 argidx = -2;
6924 }
6925 while (--fmtcnt >= 0) {
6926 switch (c = *fmt++) {
6927 case '-': flags |= F_LJUST; continue;
6928 case '+': flags |= F_SIGN; continue;
6929 case ' ': flags |= F_BLANK; continue;
6930 case '#': flags |= F_ALT; continue;
6931 case '0': flags |= F_ZERO; continue;
6932 }
6933 break;
6934 }
6935 if (c == '*') {
6936 v = getnextarg(args, arglen, &argidx);
6937 if (v == NULL)
6938 goto onError;
6939 if (!PyInt_Check(v)) {
6940 PyErr_SetString(PyExc_TypeError,
6941 "* wants int");
6942 goto onError;
6943 }
6944 width = PyInt_AsLong(v);
6945 if (width < 0) {
6946 flags |= F_LJUST;
6947 width = -width;
6948 }
6949 if (--fmtcnt >= 0)
6950 c = *fmt++;
6951 }
6952 else if (c >= '0' && c <= '9') {
6953 width = c - '0';
6954 while (--fmtcnt >= 0) {
6955 c = *fmt++;
6956 if (c < '0' || c > '9')
6957 break;
6958 if ((width*10) / 10 != width) {
6959 PyErr_SetString(PyExc_ValueError,
6960 "width too big");
6961 goto onError;
6962 }
6963 width = width*10 + (c - '0');
6964 }
6965 }
6966 if (c == '.') {
6967 prec = 0;
6968 if (--fmtcnt >= 0)
6969 c = *fmt++;
6970 if (c == '*') {
6971 v = getnextarg(args, arglen, &argidx);
6972 if (v == NULL)
6973 goto onError;
6974 if (!PyInt_Check(v)) {
6975 PyErr_SetString(PyExc_TypeError,
6976 "* wants int");
6977 goto onError;
6978 }
6979 prec = PyInt_AsLong(v);
6980 if (prec < 0)
6981 prec = 0;
6982 if (--fmtcnt >= 0)
6983 c = *fmt++;
6984 }
6985 else if (c >= '0' && c <= '9') {
6986 prec = c - '0';
6987 while (--fmtcnt >= 0) {
6988 c = Py_CHARMASK(*fmt++);
6989 if (c < '0' || c > '9')
6990 break;
6991 if ((prec*10) / 10 != prec) {
6992 PyErr_SetString(PyExc_ValueError,
6993 "prec too big");
6994 goto onError;
6995 }
6996 prec = prec*10 + (c - '0');
6997 }
6998 }
6999 } /* prec */
7000 if (fmtcnt >= 0) {
7001 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 if (--fmtcnt >= 0)
7003 c = *fmt++;
7004 }
7005 }
7006 if (fmtcnt < 0) {
7007 PyErr_SetString(PyExc_ValueError,
7008 "incomplete format");
7009 goto onError;
7010 }
7011 if (c != '%') {
7012 v = getnextarg(args, arglen, &argidx);
7013 if (v == NULL)
7014 goto onError;
7015 }
7016 sign = 0;
7017 fill = ' ';
7018 switch (c) {
7019
7020 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007021 pbuf = formatbuf;
7022 /* presume that buffer length is at least 1 */
7023 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007024 len = 1;
7025 break;
7026
7027 case 's':
7028 case 'r':
7029 if (PyUnicode_Check(v) && c == 's') {
7030 temp = v;
7031 Py_INCREF(temp);
7032 }
7033 else {
7034 PyObject *unicode;
7035 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007036 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007037 else
7038 temp = PyObject_Repr(v);
7039 if (temp == NULL)
7040 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007041 if (PyUnicode_Check(temp))
7042 /* nothing to do */;
7043 else if (PyString_Check(temp)) {
7044 /* convert to string to Unicode */
Fred Drakee4315f52000-05-09 19:53:39 +00007045 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007046 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00007047 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007048 "strict");
7049 Py_DECREF(temp);
7050 temp = unicode;
7051 if (temp == NULL)
7052 goto onError;
7053 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007054 else {
7055 Py_DECREF(temp);
7056 PyErr_SetString(PyExc_TypeError,
7057 "%s argument has non-string str()");
7058 goto onError;
7059 }
7060 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007061 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062 len = PyUnicode_GET_SIZE(temp);
7063 if (prec >= 0 && len > prec)
7064 len = prec;
7065 break;
7066
7067 case 'i':
7068 case 'd':
7069 case 'u':
7070 case 'o':
7071 case 'x':
7072 case 'X':
7073 if (c == 'i')
7074 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007075 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007076 temp = formatlong(v, flags, prec, c);
7077 if (!temp)
7078 goto onError;
7079 pbuf = PyUnicode_AS_UNICODE(temp);
7080 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007081 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007082 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007083 else {
7084 pbuf = formatbuf;
7085 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7086 flags, prec, c, v);
7087 if (len < 0)
7088 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007089 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007090 }
7091 if (flags & F_ZERO)
7092 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007093 break;
7094
7095 case 'e':
7096 case 'E':
7097 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007098 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007099 case 'g':
7100 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007101 if (c == 'F')
7102 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007103 pbuf = formatbuf;
7104 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7105 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106 if (len < 0)
7107 goto onError;
7108 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007109 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007110 fill = '0';
7111 break;
7112
7113 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007114 pbuf = formatbuf;
7115 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007116 if (len < 0)
7117 goto onError;
7118 break;
7119
7120 default:
7121 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007122 "unsupported format character '%c' (0x%x) "
7123 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007124 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007125 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007126 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007127 goto onError;
7128 }
7129 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007130 if (*pbuf == '-' || *pbuf == '+') {
7131 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007132 len--;
7133 }
7134 else if (flags & F_SIGN)
7135 sign = '+';
7136 else if (flags & F_BLANK)
7137 sign = ' ';
7138 else
7139 sign = 0;
7140 }
7141 if (width < len)
7142 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007143 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007144 reslen -= rescnt;
7145 rescnt = width + fmtcnt + 100;
7146 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007147 if (reslen < 0) {
7148 Py_DECREF(result);
7149 return PyErr_NoMemory();
7150 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007151 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007152 return NULL;
7153 res = PyUnicode_AS_UNICODE(result)
7154 + reslen - rescnt;
7155 }
7156 if (sign) {
7157 if (fill != ' ')
7158 *res++ = sign;
7159 rescnt--;
7160 if (width > len)
7161 width--;
7162 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007163 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7164 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007165 assert(pbuf[1] == c);
7166 if (fill != ' ') {
7167 *res++ = *pbuf++;
7168 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007169 }
Tim Petersfff53252001-04-12 18:38:48 +00007170 rescnt -= 2;
7171 width -= 2;
7172 if (width < 0)
7173 width = 0;
7174 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007175 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007176 if (width > len && !(flags & F_LJUST)) {
7177 do {
7178 --rescnt;
7179 *res++ = fill;
7180 } while (--width > len);
7181 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007182 if (fill == ' ') {
7183 if (sign)
7184 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007185 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007186 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007187 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007188 *res++ = *pbuf++;
7189 *res++ = *pbuf++;
7190 }
7191 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007192 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193 res += len;
7194 rescnt -= len;
7195 while (--width >= len) {
7196 --rescnt;
7197 *res++ = ' ';
7198 }
7199 if (dict && (argidx < arglen) && c != '%') {
7200 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007201 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007202 goto onError;
7203 }
7204 Py_XDECREF(temp);
7205 } /* '%' */
7206 } /* until end */
7207 if (argidx < arglen && !dict) {
7208 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007209 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210 goto onError;
7211 }
7212
7213 if (args_owned) {
7214 Py_DECREF(args);
7215 }
7216 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00007217 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007218 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219 return (PyObject *)result;
7220
7221 onError:
7222 Py_XDECREF(result);
7223 Py_DECREF(uformat);
7224 if (args_owned) {
7225 Py_DECREF(args);
7226 }
7227 return NULL;
7228}
7229
/* Buffer-interface function table for unicode objects; installed as the
   tp_as_buffer slot of PyUnicode_Type.  The entries are, in order, the
   read-buffer, write-buffer, segment-count and char-buffer functions. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
7236
Jeremy Hylton938ace62002-07-17 16:30:39 +00007237static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007238unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7239
Tim Peters6d6c1a32001-08-02 04:15:00 +00007240static PyObject *
7241unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7242{
7243 PyObject *x = NULL;
7244 static char *kwlist[] = {"string", "encoding", "errors", 0};
7245 char *encoding = NULL;
7246 char *errors = NULL;
7247
Guido van Rossume023fe02001-08-30 03:12:59 +00007248 if (type != &PyUnicode_Type)
7249 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007250 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7251 kwlist, &x, &encoding, &errors))
7252 return NULL;
7253 if (x == NULL)
7254 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007255 if (encoding == NULL && errors == NULL)
7256 return PyObject_Unicode(x);
7257 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007258 return PyUnicode_FromEncodedObject(x, encoding, errors);
7259}
7260
Guido van Rossume023fe02001-08-30 03:12:59 +00007261static PyObject *
7262unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7263{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007264 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007265 int n;
7266
7267 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7268 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7269 if (tmp == NULL)
7270 return NULL;
7271 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007272 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007273 if (pnew == NULL) {
7274 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007275 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007276 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007277 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7278 if (pnew->str == NULL) {
7279 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007280 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007281 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007282 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007283 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007284 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7285 pnew->length = n;
7286 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007287 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007288 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007289}
7290
/* Docstring of the unicode type; installed as tp_doc in PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007297
/* Type object for unicode.  It derives from PyBaseString_Type, may itself
   be subclassed (Py_TPFLAGS_BASETYPE), and creates its instances through
   unicode_new(). */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
        Py_TPFLAGS_BASETYPE,            /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
7342
7343/* Initialize the Unicode implementation */
7344
Thomas Wouters78890102000-07-22 19:25:51 +00007345void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007346{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007347 int i;
7348
Fred Drakee4315f52000-05-09 19:53:39 +00007349 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007350 unicode_freelist = NULL;
7351 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007352 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007353 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007354 for (i = 0; i < 256; i++)
7355 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007356 if (PyType_Ready(&PyUnicode_Type) < 0)
7357 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007358}
7359
7360/* Finalize the Unicode implementation */
7361
7362void
Thomas Wouters78890102000-07-22 19:25:51 +00007363_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007365 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007366 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007367
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007368 Py_XDECREF(unicode_empty);
7369 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007370
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007371 for (i = 0; i < 256; i++) {
7372 if (unicode_latin1[i]) {
7373 Py_DECREF(unicode_latin1[i]);
7374 unicode_latin1[i] = NULL;
7375 }
7376 }
7377
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007378 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007379 PyUnicodeObject *v = u;
7380 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007381 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007382 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007383 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007384 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007386 unicode_freelist = NULL;
7387 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007388}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007389
7390/*
7391Local variables:
7392c-basic-offset: 4
7393indent-tabs-mode: nil
7394End:
7395*/