blob: 9e5e3b47a3a37685f89e2e9661457f502fc2b002 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000056 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000101 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
157 unicode->length = length;
158
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000159 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000160 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 }
165 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000166
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 return 0;
168}
169
170/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000171 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
175
176*/
177
178static
179PyUnicodeObject *_PyUnicode_New(int length)
180{
181 register PyUnicodeObject *unicode;
182
Tim Petersced69f82003-09-16 20:30:58 +0000183 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
187 }
188
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000192 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000197 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000198 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000199 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000201 }
202 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000203 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000205 }
206 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213 }
214
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000215 if (!unicode->str) {
216 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000217 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000219 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
225 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000226 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000230 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000235 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
Guido van Rossum9475a232001-10-05 20:51:39 +0000240void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000261 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000262 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264}
265
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000266int PyUnicode_Resize(PyObject **unicode, int length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000267{
268 register PyUnicodeObject *v;
269
270 /* Argument checks */
271 if (unicode == NULL) {
272 PyErr_BadInternalCall();
273 return -1;
274 }
275 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000277 PyErr_BadInternalCall();
278 return -1;
279 }
280
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000284 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285 (v == unicode_empty || v->length == 1)) {
286 PyUnicodeObject *w = _PyUnicode_New(length);
287 if (w == NULL)
288 return -1;
289 Py_UNICODE_COPY(w->str, v->str,
290 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000291 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000292 *unicode = (PyObject *)w;
293 return 0;
294 }
295
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v, length);
299}
300
/* Internal API for use in unicodeobject.c only !
   Forwards to PyUnicode_Resize(), casting the variable's address to
   PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
306 int size)
307{
308 PyUnicodeObject *unicode;
309
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
312 if (u != NULL) {
313
314 /* Optimization for empty strings */
315 if (size == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return (PyObject *)unicode_empty;
318 }
319
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size == 1 && *u < 256) {
323 unicode = unicode_latin1[*u];
324 if (!unicode) {
325 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 if (!unicode)
327 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000328 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 unicode_latin1[*u] = unicode;
330 }
331 Py_INCREF(unicode);
332 return (PyObject *)unicode;
333 }
334 }
Tim Petersced69f82003-09-16 20:30:58 +0000335
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 unicode = _PyUnicode_New(size);
337 if (!unicode)
338 return NULL;
339
340 /* Copy the Unicode data into the new object */
341 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343
344 return (PyObject *)unicode;
345}
346
347#ifdef HAVE_WCHAR_H
348
349PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
350 int size)
351{
352 PyUnicodeObject *unicode;
353
354 if (w == NULL) {
355 PyErr_BadInternalCall();
356 return NULL;
357 }
358
359 unicode = _PyUnicode_New(size);
360 if (!unicode)
361 return NULL;
362
363 /* Copy the wchar_t data into the new object */
364#ifdef HAVE_USABLE_WCHAR_T
365 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000366#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 {
368 register Py_UNICODE *u;
369 register int i;
370 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000371 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372 *u++ = *w++;
373 }
374#endif
375
376 return (PyObject *)unicode;
377}
378
379int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
380 register wchar_t *w,
381 int size)
382{
383 if (unicode == NULL) {
384 PyErr_BadInternalCall();
385 return -1;
386 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000387
388 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000390 size = PyUnicode_GET_SIZE(unicode) + 1;
391
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392#ifdef HAVE_USABLE_WCHAR_T
393 memcpy(w, unicode->str, size * sizeof(wchar_t));
394#else
395 {
396 register Py_UNICODE *u;
397 register int i;
398 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000399 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400 *w++ = *u++;
401 }
402#endif
403
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000404 if (size > PyUnicode_GET_SIZE(unicode))
405 return PyUnicode_GET_SIZE(unicode);
406 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000407 return size;
408}
409
410#endif
411
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000412PyObject *PyUnicode_FromOrdinal(int ordinal)
413{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000414 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000415
416#ifdef Py_UNICODE_WIDE
417 if (ordinal < 0 || ordinal > 0x10ffff) {
418 PyErr_SetString(PyExc_ValueError,
419 "unichr() arg not in range(0x110000) "
420 "(wide Python build)");
421 return NULL;
422 }
423#else
424 if (ordinal < 0 || ordinal > 0xffff) {
425 PyErr_SetString(PyExc_ValueError,
426 "unichr() arg not in range(0x10000) "
427 "(narrow Python build)");
428 return NULL;
429 }
430#endif
431
Hye-Shik Chang40574832004-04-06 07:24:51 +0000432 s[0] = (Py_UNICODE)ordinal;
433 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000434}
435
Guido van Rossumd57fd912000-03-10 22:53:23 +0000436PyObject *PyUnicode_FromObject(register PyObject *obj)
437{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000438 /* XXX Perhaps we should make this API an alias of
439 PyObject_Unicode() instead ?! */
440 if (PyUnicode_CheckExact(obj)) {
441 Py_INCREF(obj);
442 return obj;
443 }
444 if (PyUnicode_Check(obj)) {
445 /* For a Unicode subtype that's not a Unicode object,
446 return a true Unicode object with the same data. */
447 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
448 PyUnicode_GET_SIZE(obj));
449 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000450 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
451}
452
453PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
454 const char *encoding,
455 const char *errors)
456{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000457 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000458 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000460
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461 if (obj == NULL) {
462 PyErr_BadInternalCall();
463 return NULL;
464 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000465
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000466#if 0
467 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000468 that no encodings is given and then redirect to
469 PyObject_Unicode() which then applies the additional logic for
470 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000471
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000472 NOTE: This API should really only be used for object which
473 represent *encoded* Unicode !
474
475 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000476 if (PyUnicode_Check(obj)) {
477 if (encoding) {
478 PyErr_SetString(PyExc_TypeError,
479 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000480 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000481 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000482 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000483 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000484#else
485 if (PyUnicode_Check(obj)) {
486 PyErr_SetString(PyExc_TypeError,
487 "decoding Unicode is not supported");
488 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000489 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000490#endif
491
492 /* Coerce object */
493 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000494 s = PyString_AS_STRING(obj);
495 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000496 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000497 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
498 /* Overwrite the error message with something more useful in
499 case of a TypeError. */
500 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000501 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000502 "coercing to Unicode: need string or buffer, "
503 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000504 obj->ob_type->tp_name);
505 goto onError;
506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 if (len == 0) {
510 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000511 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000512 }
Tim Petersced69f82003-09-16 20:30:58 +0000513 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000514 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000515
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000516 return v;
517
518 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000519 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000520}
521
522PyObject *PyUnicode_Decode(const char *s,
523 int size,
524 const char *encoding,
525 const char *errors)
526{
527 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000528
529 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000530 encoding = PyUnicode_GetDefaultEncoding();
531
532 /* Shortcuts for common default encodings */
533 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000534 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000535 else if (strcmp(encoding, "latin-1") == 0)
536 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000537#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
538 else if (strcmp(encoding, "mbcs") == 0)
539 return PyUnicode_DecodeMBCS(s, size, errors);
540#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000541 else if (strcmp(encoding, "ascii") == 0)
542 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543
544 /* Decode via the codec registry */
545 buffer = PyBuffer_FromMemory((void *)s, size);
546 if (buffer == NULL)
547 goto onError;
548 unicode = PyCodec_Decode(buffer, encoding, errors);
549 if (unicode == NULL)
550 goto onError;
551 if (!PyUnicode_Check(unicode)) {
552 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000553 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000554 unicode->ob_type->tp_name);
555 Py_DECREF(unicode);
556 goto onError;
557 }
558 Py_DECREF(buffer);
559 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000560
Guido van Rossumd57fd912000-03-10 22:53:23 +0000561 onError:
562 Py_XDECREF(buffer);
563 return NULL;
564}
565
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000566PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
567 const char *encoding,
568 const char *errors)
569{
570 PyObject *v;
571
572 if (!PyUnicode_Check(unicode)) {
573 PyErr_BadArgument();
574 goto onError;
575 }
576
577 if (encoding == NULL)
578 encoding = PyUnicode_GetDefaultEncoding();
579
580 /* Decode via the codec registry */
581 v = PyCodec_Decode(unicode, encoding, errors);
582 if (v == NULL)
583 goto onError;
584 return v;
585
586 onError:
587 return NULL;
588}
589
Guido van Rossumd57fd912000-03-10 22:53:23 +0000590PyObject *PyUnicode_Encode(const Py_UNICODE *s,
591 int size,
592 const char *encoding,
593 const char *errors)
594{
595 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000596
Guido van Rossumd57fd912000-03-10 22:53:23 +0000597 unicode = PyUnicode_FromUnicode(s, size);
598 if (unicode == NULL)
599 return NULL;
600 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
601 Py_DECREF(unicode);
602 return v;
603}
604
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000605PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
606 const char *encoding,
607 const char *errors)
608{
609 PyObject *v;
610
611 if (!PyUnicode_Check(unicode)) {
612 PyErr_BadArgument();
613 goto onError;
614 }
615
616 if (encoding == NULL)
617 encoding = PyUnicode_GetDefaultEncoding();
618
619 /* Encode via the codec registry */
620 v = PyCodec_Encode(unicode, encoding, errors);
621 if (v == NULL)
622 goto onError;
623 return v;
624
625 onError:
626 return NULL;
627}
628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
630 const char *encoding,
631 const char *errors)
632{
633 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000634
Guido van Rossumd57fd912000-03-10 22:53:23 +0000635 if (!PyUnicode_Check(unicode)) {
636 PyErr_BadArgument();
637 goto onError;
638 }
Fred Drakee4315f52000-05-09 19:53:39 +0000639
Tim Petersced69f82003-09-16 20:30:58 +0000640 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000641 encoding = PyUnicode_GetDefaultEncoding();
642
643 /* Shortcuts for common default encodings */
644 if (errors == NULL) {
645 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000646 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000647 else if (strcmp(encoding, "latin-1") == 0)
648 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000649#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
650 else if (strcmp(encoding, "mbcs") == 0)
651 return PyUnicode_AsMBCSString(unicode);
652#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000653 else if (strcmp(encoding, "ascii") == 0)
654 return PyUnicode_AsASCIIString(unicode);
655 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656
657 /* Encode via the codec registry */
658 v = PyCodec_Encode(unicode, encoding, errors);
659 if (v == NULL)
660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 if (!PyString_Check(v)) {
662 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000663 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 v->ob_type->tp_name);
665 Py_DECREF(v);
666 goto onError;
667 }
668 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000669
Guido van Rossumd57fd912000-03-10 22:53:23 +0000670 onError:
671 return NULL;
672}
673
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000674PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
675 const char *errors)
676{
677 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
678
679 if (v)
680 return v;
681 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
682 if (v && errors == NULL)
683 ((PyUnicodeObject *)unicode)->defenc = v;
684 return v;
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
688{
689 if (!PyUnicode_Check(unicode)) {
690 PyErr_BadArgument();
691 goto onError;
692 }
693 return PyUnicode_AS_UNICODE(unicode);
694
695 onError:
696 return NULL;
697}
698
699int PyUnicode_GetSize(PyObject *unicode)
700{
701 if (!PyUnicode_Check(unicode)) {
702 PyErr_BadArgument();
703 goto onError;
704 }
705 return PyUnicode_GET_SIZE(unicode);
706
707 onError:
708 return -1;
709}
710
Thomas Wouters78890102000-07-22 19:25:51 +0000711const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000712{
713 return unicode_default_encoding;
714}
715
716int PyUnicode_SetDefaultEncoding(const char *encoding)
717{
718 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000719
Fred Drakee4315f52000-05-09 19:53:39 +0000720 /* Make sure the encoding is valid. As side effect, this also
721 loads the encoding into the codec registry cache. */
722 v = _PyCodec_Lookup(encoding);
723 if (v == NULL)
724 goto onError;
725 Py_DECREF(v);
726 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000727 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000728 sizeof(unicode_default_encoding));
729 return 0;
730
731 onError:
732 return -1;
733}
734
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000735/* error handling callback helper:
736 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +0000737 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000738 and adjust various state variables.
739 return 0 on success, -1 on error
740*/
741
742static
743int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
744 const char *encoding, const char *reason,
745 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
746 PyObject **output, int *outpos, Py_UNICODE **outptr)
747{
748 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
749
750 PyObject *restuple = NULL;
751 PyObject *repunicode = NULL;
752 int outsize = PyUnicode_GET_SIZE(*output);
753 int requiredsize;
754 int newpos;
755 Py_UNICODE *repptr;
756 int repsize;
757 int res = -1;
758
759 if (*errorHandler == NULL) {
760 *errorHandler = PyCodec_LookupError(errors);
761 if (*errorHandler == NULL)
762 goto onError;
763 }
764
765 if (*exceptionObject == NULL) {
766 *exceptionObject = PyUnicodeDecodeError_Create(
767 encoding, input, insize, *startinpos, *endinpos, reason);
768 if (*exceptionObject == NULL)
769 goto onError;
770 }
771 else {
772 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
773 goto onError;
774 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
775 goto onError;
776 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
777 goto onError;
778 }
779
780 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
781 if (restuple == NULL)
782 goto onError;
783 if (!PyTuple_Check(restuple)) {
784 PyErr_Format(PyExc_TypeError, &argparse[4]);
785 goto onError;
786 }
787 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
788 goto onError;
789 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000790 newpos = insize+newpos;
791 if (newpos<0 || newpos>insize) {
792 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
793 goto onError;
794 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000795
796 /* need more space? (at least enough for what we
797 have+the replacement+the rest of the string (starting
798 at the new input position), so we won't have to check space
799 when there are no errors in the rest of the string) */
800 repptr = PyUnicode_AS_UNICODE(repunicode);
801 repsize = PyUnicode_GET_SIZE(repunicode);
802 requiredsize = *outpos + repsize + insize-newpos;
803 if (requiredsize > outsize) {
804 if (requiredsize<2*outsize)
805 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000806 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000807 goto onError;
808 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
809 }
810 *endinpos = newpos;
811 *inptr = input + newpos;
812 Py_UNICODE_COPY(*outptr, repptr, repsize);
813 *outptr += repsize;
814 *outpos += repsize;
815 /* we made it! */
816 res = 0;
817
818 onError:
819 Py_XDECREF(restuple);
820 return res;
821}
822
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000823/* --- UTF-7 Codec -------------------------------------------------------- */
824
825/* see RFC2152 for details */
826
/* Classification of the 128 ASCII code points for the UTF-7 codec.
   The table is only ever read, so it is declared const. */
static
const char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
845
/* SPECIAL(c, encodeO, encodeWS) is true if character c cannot be written
   directly in UTF-7: it is outside 1..127, is classified 1 in
   utf7_special, is whitespace (class 2) while encodeWS is set, or is in
   Set O (class 3) while encodeO is set.

   Note: The comparison (c) <= 0 is a trick to work around gcc
   warnings about the comparison always being false; since
   utf7_special[0] is 1, we can safely make that one comparison
   true.

   c is evaluated more than once, so it must not have side effects.  The
   range tests come first so the table is never indexed out of bounds. */

#define SPECIAL(c, encodeO, encodeWS) \
    ((c) > 127 || (c) <= 0 || utf7_special[(c)] == 1 || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))
855
/* Helper macros for the UTF-7 codec's modified base64 handling.
   All of them evaluate their arguments more than once; pass plain
   variables only.  B64CHAR and UB64 assume an ASCII execution
   character set. */

/* Base64 alphabet character for the low six bits of n. */
#define B64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 alphabet characters.  Explicit
   ranges are used instead of isalnum(): the result must not depend on
   the locale, and c may be a Py_UNICODE value that is outside the
   domain for which isalnum() is defined. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || (c) == '+' || (c) == '/')

/* Six-bit value of base64 character c; only meaningful if B64CHAR(c). */
#define UB64(c)        \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ?                   \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4 )

/* Emit base64 characters to *out for every complete group of six bits
   held in the low `bits` bits of ch; updates out and bits. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* Emit a Py_UNICODE to *out for every complete group of 16 bits held in
   the low `bits` bits of ch; updates out, bits and surrogate.  On a code
   unit in 0xDC00..0xDFFF it sets `errmsg` and jumps to the `utf7Error`
   label, both of which must exist in the expanding function; the code
   unit that follows such an error is dropped silently.
   NOTE(review): the range tested is 0xDC00..0xDFFF although the inline
   comments speak of the high surrogate -- confirm which was intended. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000888
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000889PyObject *PyUnicode_DecodeUTF7(const char *s,
890 int size,
891 const char *errors)
892{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000893 const char *starts = s;
894 int startinpos;
895 int endinpos;
896 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000897 const char *e;
898 PyUnicodeObject *unicode;
899 Py_UNICODE *p;
900 const char *errmsg = "";
901 int inShift = 0;
902 unsigned int bitsleft = 0;
903 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000904 int surrogate = 0;
905 PyObject *errorHandler = NULL;
906 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000907
908 unicode = _PyUnicode_New(size);
909 if (!unicode)
910 return NULL;
911 if (size == 0)
912 return (PyObject *)unicode;
913
914 p = unicode->str;
915 e = s + size;
916
917 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000918 Py_UNICODE ch;
919 restart:
920 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000921
922 if (inShift) {
923 if ((ch == '-') || !B64CHAR(ch)) {
924 inShift = 0;
925 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000926
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000927 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
928 if (bitsleft >= 6) {
929 /* The shift sequence has a partial character in it. If
930 bitsleft < 6 then we could just classify it as padding
931 but that is not the case here */
932
933 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000934 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000935 }
936 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000937 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000938 here so indicate the potential of a misencoded character. */
939
940 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
941 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
942 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000943 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000944 }
945
946 if (ch == '-') {
947 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000948 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000949 inShift = 1;
950 }
951 } else if (SPECIAL(ch,0,0)) {
952 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000953 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000954 } else {
955 *p++ = ch;
956 }
957 } else {
958 charsleft = (charsleft << 6) | UB64(ch);
959 bitsleft += 6;
960 s++;
961 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
962 }
963 }
964 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000965 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000966 s++;
967 if (s < e && *s == '-') {
968 s++;
969 *p++ = '+';
970 } else
971 {
972 inShift = 1;
973 bitsleft = 0;
974 }
975 }
976 else if (SPECIAL(ch,0,0)) {
977 errmsg = "unexpected special character";
978 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000979 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000980 }
981 else {
982 *p++ = ch;
983 s++;
984 }
985 continue;
986 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000987 outpos = p-PyUnicode_AS_UNICODE(unicode);
988 endinpos = s-starts;
989 if (unicode_decode_call_errorhandler(
990 errors, &errorHandler,
991 "utf7", errmsg,
992 starts, size, &startinpos, &endinpos, &exc, &s,
993 (PyObject **)&unicode, &outpos, &p))
994 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000995 }
996
997 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000998 outpos = p-PyUnicode_AS_UNICODE(unicode);
999 endinpos = size;
1000 if (unicode_decode_call_errorhandler(
1001 errors, &errorHandler,
1002 "utf7", "unterminated shift sequence",
1003 starts, size, &startinpos, &endinpos, &exc, &s,
1004 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001005 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001006 if (s < e)
1007 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001008 }
1009
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001010 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001011 goto onError;
1012
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001013 Py_XDECREF(errorHandler);
1014 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001015 return (PyObject *)unicode;
1016
1017onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 Py_XDECREF(errorHandler);
1019 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001020 Py_DECREF(unicode);
1021 return NULL;
1022}
1023
1024
1025PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1026 int size,
1027 int encodeSetO,
1028 int encodeWhiteSpace,
1029 const char *errors)
1030{
1031 PyObject *v;
1032 /* It might be possible to tighten this worst case */
1033 unsigned int cbAllocated = 5 * size;
1034 int inShift = 0;
1035 int i = 0;
1036 unsigned int bitsleft = 0;
1037 unsigned long charsleft = 0;
1038 char * out;
1039 char * start;
1040
1041 if (size == 0)
1042 return PyString_FromStringAndSize(NULL, 0);
1043
1044 v = PyString_FromStringAndSize(NULL, cbAllocated);
1045 if (v == NULL)
1046 return NULL;
1047
1048 start = out = PyString_AS_STRING(v);
1049 for (;i < size; ++i) {
1050 Py_UNICODE ch = s[i];
1051
1052 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001053 if (ch == '+') {
1054 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001055 *out++ = '-';
1056 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1057 charsleft = ch;
1058 bitsleft = 16;
1059 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001060 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001062 } else {
1063 *out++ = (char) ch;
1064 }
1065 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001066 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1067 *out++ = B64(charsleft << (6-bitsleft));
1068 charsleft = 0;
1069 bitsleft = 0;
1070 /* Characters not in the BASE64 set implicitly unshift the sequence
1071 so no '-' is required, except if the character is itself a '-' */
1072 if (B64CHAR(ch) || ch == '-') {
1073 *out++ = '-';
1074 }
1075 inShift = 0;
1076 *out++ = (char) ch;
1077 } else {
1078 bitsleft += 16;
1079 charsleft = (charsleft << 16) | ch;
1080 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1081
1082 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001083 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001084 or '-' then the shift sequence will be terminated implicitly and we
1085 don't have to insert a '-'. */
1086
1087 if (bitsleft == 0) {
1088 if (i + 1 < size) {
1089 Py_UNICODE ch2 = s[i+1];
1090
1091 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001092
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001093 } else if (B64CHAR(ch2) || ch2 == '-') {
1094 *out++ = '-';
1095 inShift = 0;
1096 } else {
1097 inShift = 0;
1098 }
1099
1100 }
1101 else {
1102 *out++ = '-';
1103 inShift = 0;
1104 }
1105 }
Tim Petersced69f82003-09-16 20:30:58 +00001106 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001107 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001108 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001109 if (bitsleft) {
1110 *out++= B64(charsleft << (6-bitsleft) );
1111 *out++ = '-';
1112 }
1113
Tim Peters5de98422002-04-27 18:44:32 +00001114 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001115 return v;
1116}
1117
1118#undef SPECIAL
1119#undef B64
1120#undef B64CHAR
1121#undef UB64
1122#undef ENCODE
1123#undef DECODE
1124
Guido van Rossumd57fd912000-03-10 22:53:23 +00001125/* --- UTF-8 Codec -------------------------------------------------------- */
1126
/* Sequence length for each possible UTF-8 lead byte.  The table is only
   ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1148
Guido van Rossumd57fd912000-03-10 22:53:23 +00001149PyObject *PyUnicode_DecodeUTF8(const char *s,
1150 int size,
1151 const char *errors)
1152{
Walter Dörwald69652032004-09-07 20:24:22 +00001153 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1154}
1155
1156PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1157 int size,
1158 const char *errors,
1159 int *consumed)
1160{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001161 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001163 int startinpos;
1164 int endinpos;
1165 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001166 const char *e;
1167 PyUnicodeObject *unicode;
1168 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001169 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001170 PyObject *errorHandler = NULL;
1171 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001172
1173 /* Note: size will always be longer than the resulting Unicode
1174 character count */
1175 unicode = _PyUnicode_New(size);
1176 if (!unicode)
1177 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001178 if (size == 0) {
1179 if (consumed)
1180 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183
1184 /* Unpack UTF-8 encoded data */
1185 p = unicode->str;
1186 e = s + size;
1187
1188 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001189 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190
1191 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001192 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193 s++;
1194 continue;
1195 }
1196
1197 n = utf8_code_length[ch];
1198
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001199 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001200 if (consumed)
1201 break;
1202 else {
1203 errmsg = "unexpected end of data";
1204 startinpos = s-starts;
1205 endinpos = size;
1206 goto utf8Error;
1207 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209
1210 switch (n) {
1211
1212 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001213 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001214 startinpos = s-starts;
1215 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001216 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
1218 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001219 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001220 startinpos = s-starts;
1221 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001222 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223
1224 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001225 if ((s[1] & 0xc0) != 0x80) {
1226 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001227 startinpos = s-starts;
1228 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001229 goto utf8Error;
1230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001232 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001233 startinpos = s-starts;
1234 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001235 errmsg = "illegal encoding";
1236 goto utf8Error;
1237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001239 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 break;
1241
1242 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001243 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001244 (s[2] & 0xc0) != 0x80) {
1245 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001246 startinpos = s-starts;
1247 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001248 goto utf8Error;
1249 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001251 if (ch < 0x0800) {
1252 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001253 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001254
1255 XXX For wide builds (UCS-4) we should probably try
1256 to recombine the surrogates into a single code
1257 unit.
1258 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001259 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001260 startinpos = s-starts;
1261 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001262 goto utf8Error;
1263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001265 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001266 break;
1267
1268 case 4:
1269 if ((s[1] & 0xc0) != 0x80 ||
1270 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001271 (s[3] & 0xc0) != 0x80) {
1272 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273 startinpos = s-starts;
1274 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001275 goto utf8Error;
1276 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001277 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1278 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1279 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001280 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001281 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001282 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001283 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001284 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 goto utf8Error;
1289 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001290#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001291 *p++ = (Py_UNICODE)ch;
1292#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001293 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001294
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001295 /* translate from 10000..10FFFF to 0..FFFF */
1296 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001297
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001298 /* high surrogate = top 10 bits added to D800 */
1299 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001300
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001301 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001302 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001303#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 break;
1305
1306 default:
1307 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001308 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001309 startinpos = s-starts;
1310 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001311 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001312 }
1313 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001314 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001315
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001316 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001317 outpos = p-PyUnicode_AS_UNICODE(unicode);
1318 if (unicode_decode_call_errorhandler(
1319 errors, &errorHandler,
1320 "utf8", errmsg,
1321 starts, size, &startinpos, &endinpos, &exc, &s,
1322 (PyObject **)&unicode, &outpos, &p))
1323 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001324 }
Walter Dörwald69652032004-09-07 20:24:22 +00001325 if (consumed)
1326 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327
1328 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001329 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001330 goto onError;
1331
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001332 Py_XDECREF(errorHandler);
1333 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334 return (PyObject *)unicode;
1335
1336onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337 Py_XDECREF(errorHandler);
1338 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001339 Py_DECREF(unicode);
1340 return NULL;
1341}
1342
Tim Peters602f7402002-04-27 18:03:26 +00001343/* Allocation strategy: if the string is short, convert into a stack buffer
1344 and allocate exactly as much space needed at the end. Else allocate the
1345 maximum possible needed (4 result bytes per Unicode character), and return
1346 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001347*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001348PyObject *
1349PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1350 int size,
1351 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001352{
Tim Peters602f7402002-04-27 18:03:26 +00001353#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001354
Tim Peters602f7402002-04-27 18:03:26 +00001355 int i; /* index into s of next input byte */
1356 PyObject *v; /* result string object */
1357 char *p; /* next free byte in output buffer */
1358 int nallocated; /* number of result bytes allocated */
1359 int nneeded; /* number of result bytes needed */
1360 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001361
Tim Peters602f7402002-04-27 18:03:26 +00001362 assert(s != NULL);
1363 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001364
Tim Peters602f7402002-04-27 18:03:26 +00001365 if (size <= MAX_SHORT_UNICHARS) {
1366 /* Write into the stack buffer; nallocated can't overflow.
1367 * At the end, we'll allocate exactly as much heap space as it
1368 * turns out we need.
1369 */
1370 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1371 v = NULL; /* will allocate after we're done */
1372 p = stackbuf;
1373 }
1374 else {
1375 /* Overallocate on the heap, and give the excess back at the end. */
1376 nallocated = size * 4;
1377 if (nallocated / 4 != size) /* overflow! */
1378 return PyErr_NoMemory();
1379 v = PyString_FromStringAndSize(NULL, nallocated);
1380 if (v == NULL)
1381 return NULL;
1382 p = PyString_AS_STRING(v);
1383 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001384
Tim Peters602f7402002-04-27 18:03:26 +00001385 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001386 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001387
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001388 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001389 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001390 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001391
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001393 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001394 *p++ = (char)(0xc0 | (ch >> 6));
1395 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001396 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001397 else {
Tim Peters602f7402002-04-27 18:03:26 +00001398 /* Encode UCS2 Unicode ordinals */
1399 if (ch < 0x10000) {
1400 /* Special case: check for high surrogate */
1401 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1402 Py_UCS4 ch2 = s[i];
1403 /* Check for low surrogate and combine the two to
1404 form a UCS4 value */
1405 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001406 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001407 i++;
1408 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001409 }
Tim Peters602f7402002-04-27 18:03:26 +00001410 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001411 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001412 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001413 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1414 *p++ = (char)(0x80 | (ch & 0x3f));
1415 continue;
1416 }
1417encodeUCS4:
1418 /* Encode UCS4 Unicode ordinals */
1419 *p++ = (char)(0xf0 | (ch >> 18));
1420 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1421 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1422 *p++ = (char)(0x80 | (ch & 0x3f));
1423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001425
Tim Peters602f7402002-04-27 18:03:26 +00001426 if (v == NULL) {
1427 /* This was stack allocated. */
1428 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1429 assert(nneeded <= nallocated);
1430 v = PyString_FromStringAndSize(stackbuf, nneeded);
1431 }
1432 else {
1433 /* Cut back to size actually needed. */
1434 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1435 assert(nneeded <= nallocated);
1436 _PyString_Resize(&v, nneeded);
1437 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001438 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001439
Tim Peters602f7402002-04-27 18:03:26 +00001440#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441}
1442
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1444{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 if (!PyUnicode_Check(unicode)) {
1446 PyErr_BadArgument();
1447 return NULL;
1448 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001449 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1450 PyUnicode_GET_SIZE(unicode),
1451 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001452}
1453
1454/* --- UTF-16 Codec ------------------------------------------------------- */
1455
Tim Peters772747b2001-08-09 22:21:55 +00001456PyObject *
1457PyUnicode_DecodeUTF16(const char *s,
1458 int size,
1459 const char *errors,
1460 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001461{
Walter Dörwald69652032004-09-07 20:24:22 +00001462 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1463}
1464
1465PyObject *
1466PyUnicode_DecodeUTF16Stateful(const char *s,
1467 int size,
1468 const char *errors,
1469 int *byteorder,
1470 int *consumed)
1471{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001472 const char *starts = s;
1473 int startinpos;
1474 int endinpos;
1475 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001476 PyUnicodeObject *unicode;
1477 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001478 const unsigned char *q, *e;
1479 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001480 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001481 /* Offsets from q for retrieving byte pairs in the right order. */
1482#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1483 int ihi = 1, ilo = 0;
1484#else
1485 int ihi = 0, ilo = 1;
1486#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001487 PyObject *errorHandler = NULL;
1488 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001489
1490 /* Note: size will always be longer than the resulting Unicode
1491 character count */
1492 unicode = _PyUnicode_New(size);
1493 if (!unicode)
1494 return NULL;
1495 if (size == 0)
1496 return (PyObject *)unicode;
1497
1498 /* Unpack UTF-16 encoded data */
1499 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001500 q = (unsigned char *)s;
1501 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001502
1503 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001504 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001506 /* Check for BOM marks (U+FEFF) in the input and adjust current
1507 byte order setting accordingly. In native mode, the leading BOM
1508 mark is skipped, in all other modes, it is copied to the output
1509 stream as-is (giving a ZWNBSP character). */
1510 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001511 if (size >= 2) {
1512 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001513#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001514 if (bom == 0xFEFF) {
1515 q += 2;
1516 bo = -1;
1517 }
1518 else if (bom == 0xFFFE) {
1519 q += 2;
1520 bo = 1;
1521 }
Tim Petersced69f82003-09-16 20:30:58 +00001522#else
Walter Dörwald69652032004-09-07 20:24:22 +00001523 if (bom == 0xFEFF) {
1524 q += 2;
1525 bo = 1;
1526 }
1527 else if (bom == 0xFFFE) {
1528 q += 2;
1529 bo = -1;
1530 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001531#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001532 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001534
Tim Peters772747b2001-08-09 22:21:55 +00001535 if (bo == -1) {
1536 /* force LE */
1537 ihi = 1;
1538 ilo = 0;
1539 }
1540 else if (bo == 1) {
1541 /* force BE */
1542 ihi = 0;
1543 ilo = 1;
1544 }
1545
1546 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001547 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001548 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001549 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001550 if (consumed)
1551 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001552 errmsg = "truncated data";
1553 startinpos = ((const char *)q)-starts;
1554 endinpos = ((const char *)e)-starts;
1555 goto utf16Error;
1556 /* The remaining input chars are ignored if the callback
1557 chooses to skip the input */
1558 }
1559 ch = (q[ihi] << 8) | q[ilo];
1560
Tim Peters772747b2001-08-09 22:21:55 +00001561 q += 2;
1562
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563 if (ch < 0xD800 || ch > 0xDFFF) {
1564 *p++ = ch;
1565 continue;
1566 }
1567
1568 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001569 if (q >= e) {
1570 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001571 startinpos = (((const char *)q)-2)-starts;
1572 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001573 goto utf16Error;
1574 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001575 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001576 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1577 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001578 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001579#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001580 *p++ = ch;
1581 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001582#else
1583 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001584#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001585 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001586 }
1587 else {
1588 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001589 startinpos = (((const char *)q)-4)-starts;
1590 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001591 goto utf16Error;
1592 }
1593
Guido van Rossumd57fd912000-03-10 22:53:23 +00001594 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001595 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001596 startinpos = (((const char *)q)-2)-starts;
1597 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001598 /* Fall through to report the error */
1599
1600 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001601 outpos = p-PyUnicode_AS_UNICODE(unicode);
1602 if (unicode_decode_call_errorhandler(
1603 errors, &errorHandler,
1604 "utf16", errmsg,
1605 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1606 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001607 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608 }
1609
1610 if (byteorder)
1611 *byteorder = bo;
1612
Walter Dörwald69652032004-09-07 20:24:22 +00001613 if (consumed)
1614 *consumed = (const char *)q-starts;
1615
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001617 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618 goto onError;
1619
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001620 Py_XDECREF(errorHandler);
1621 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622 return (PyObject *)unicode;
1623
1624onError:
1625 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001626 Py_XDECREF(errorHandler);
1627 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001628 return NULL;
1629}
1630
Tim Peters772747b2001-08-09 22:21:55 +00001631PyObject *
1632PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1633 int size,
1634 const char *errors,
1635 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001636{
1637 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001638 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001639#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001640 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001641#else
1642 const int pairs = 0;
1643#endif
Tim Peters772747b2001-08-09 22:21:55 +00001644 /* Offsets from p for storing byte pairs in the right order. */
1645#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1646 int ihi = 1, ilo = 0;
1647#else
1648 int ihi = 0, ilo = 1;
1649#endif
1650
1651#define STORECHAR(CH) \
1652 do { \
1653 p[ihi] = ((CH) >> 8) & 0xff; \
1654 p[ilo] = (CH) & 0xff; \
1655 p += 2; \
1656 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001658#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001659 for (i = pairs = 0; i < size; i++)
1660 if (s[i] >= 0x10000)
1661 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001662#endif
Tim Petersced69f82003-09-16 20:30:58 +00001663 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001664 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665 if (v == NULL)
1666 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001667
Tim Peters772747b2001-08-09 22:21:55 +00001668 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001670 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001671 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001672 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001673
1674 if (byteorder == -1) {
1675 /* force LE */
1676 ihi = 1;
1677 ilo = 0;
1678 }
1679 else if (byteorder == 1) {
1680 /* force BE */
1681 ihi = 0;
1682 ilo = 1;
1683 }
1684
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001685 while (size-- > 0) {
1686 Py_UNICODE ch = *s++;
1687 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001688#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001689 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001690 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1691 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001693#endif
Tim Peters772747b2001-08-09 22:21:55 +00001694 STORECHAR(ch);
1695 if (ch2)
1696 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001697 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001699#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700}
1701
1702PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1703{
1704 if (!PyUnicode_Check(unicode)) {
1705 PyErr_BadArgument();
1706 return NULL;
1707 }
1708 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1709 PyUnicode_GET_SIZE(unicode),
1710 NULL,
1711 0);
1712}
1713
1714/* --- Unicode Escape Codec ----------------------------------------------- */
1715
Fredrik Lundh06d12682001-01-24 07:59:11 +00001716static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001717
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1719 int size,
1720 const char *errors)
1721{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001722 const char *starts = s;
1723 int startinpos;
1724 int endinpos;
1725 int outpos;
1726 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001728 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001730 char* message;
1731 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001732 PyObject *errorHandler = NULL;
1733 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001734
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735 /* Escaped strings will always be longer than the resulting
1736 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001737 length after conversion to the true value.
1738 (but if the error callback returns a long replacement string
1739 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001740 v = _PyUnicode_New(size);
1741 if (v == NULL)
1742 goto onError;
1743 if (size == 0)
1744 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001745
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001748
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749 while (s < end) {
1750 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001751 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001752 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753
1754 /* Non-escape characters are interpreted as Unicode ordinals */
1755 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001756 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757 continue;
1758 }
1759
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001760 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001761 /* \ - Escapes */
1762 s++;
1763 switch (*s++) {
1764
1765 /* \x escapes */
1766 case '\n': break;
1767 case '\\': *p++ = '\\'; break;
1768 case '\'': *p++ = '\''; break;
1769 case '\"': *p++ = '\"'; break;
1770 case 'b': *p++ = '\b'; break;
1771 case 'f': *p++ = '\014'; break; /* FF */
1772 case 't': *p++ = '\t'; break;
1773 case 'n': *p++ = '\n'; break;
1774 case 'r': *p++ = '\r'; break;
1775 case 'v': *p++ = '\013'; break; /* VT */
1776 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1777
1778 /* \OOO (octal) escapes */
1779 case '0': case '1': case '2': case '3':
1780 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001781 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001783 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001785 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001787 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788 break;
1789
Fredrik Lundhccc74732001-02-18 22:13:49 +00001790 /* hex escapes */
1791 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001792 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001793 digits = 2;
1794 message = "truncated \\xXX escape";
1795 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001796
Fredrik Lundhccc74732001-02-18 22:13:49 +00001797 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001799 digits = 4;
1800 message = "truncated \\uXXXX escape";
1801 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802
Fredrik Lundhccc74732001-02-18 22:13:49 +00001803 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001804 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001805 digits = 8;
1806 message = "truncated \\UXXXXXXXX escape";
1807 hexescape:
1808 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001809 outpos = p-PyUnicode_AS_UNICODE(v);
1810 if (s+digits>end) {
1811 endinpos = size;
1812 if (unicode_decode_call_errorhandler(
1813 errors, &errorHandler,
1814 "unicodeescape", "end of string in escape sequence",
1815 starts, size, &startinpos, &endinpos, &exc, &s,
1816 (PyObject **)&v, &outpos, &p))
1817 goto onError;
1818 goto nextByte;
1819 }
1820 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001821 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001822 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001823 endinpos = (s+i+1)-starts;
1824 if (unicode_decode_call_errorhandler(
1825 errors, &errorHandler,
1826 "unicodeescape", message,
1827 starts, size, &startinpos, &endinpos, &exc, &s,
1828 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001829 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001830 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001831 }
1832 chr = (chr<<4) & ~0xF;
1833 if (c >= '0' && c <= '9')
1834 chr += c - '0';
1835 else if (c >= 'a' && c <= 'f')
1836 chr += 10 + c - 'a';
1837 else
1838 chr += 10 + c - 'A';
1839 }
1840 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001841 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001842 /* _decoding_error will have already written into the
1843 target buffer. */
1844 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001845 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001846 /* when we get here, chr is a 32-bit unicode character */
1847 if (chr <= 0xffff)
1848 /* UCS-2 character */
1849 *p++ = (Py_UNICODE) chr;
1850 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001851 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001852 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001853#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001854 *p++ = chr;
1855#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001856 chr -= 0x10000L;
1857 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001858 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001859#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001860 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001861 endinpos = s-starts;
1862 outpos = p-PyUnicode_AS_UNICODE(v);
1863 if (unicode_decode_call_errorhandler(
1864 errors, &errorHandler,
1865 "unicodeescape", "illegal Unicode character",
1866 starts, size, &startinpos, &endinpos, &exc, &s,
1867 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001868 goto onError;
1869 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001870 break;
1871
1872 /* \N{name} */
1873 case 'N':
1874 message = "malformed \\N character escape";
1875 if (ucnhash_CAPI == NULL) {
1876 /* load the unicode data module */
1877 PyObject *m, *v;
1878 m = PyImport_ImportModule("unicodedata");
1879 if (m == NULL)
1880 goto ucnhashError;
1881 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1882 Py_DECREF(m);
1883 if (v == NULL)
1884 goto ucnhashError;
1885 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1886 Py_DECREF(v);
1887 if (ucnhash_CAPI == NULL)
1888 goto ucnhashError;
1889 }
1890 if (*s == '{') {
1891 const char *start = s+1;
1892 /* look for the closing brace */
1893 while (*s != '}' && s < end)
1894 s++;
1895 if (s > start && s < end && *s == '}') {
1896 /* found a name. look it up in the unicode database */
1897 message = "unknown Unicode character name";
1898 s++;
1899 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1900 goto store;
1901 }
1902 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001903 endinpos = s-starts;
1904 outpos = p-PyUnicode_AS_UNICODE(v);
1905 if (unicode_decode_call_errorhandler(
1906 errors, &errorHandler,
1907 "unicodeescape", message,
1908 starts, size, &startinpos, &endinpos, &exc, &s,
1909 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001910 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001911 break;
1912
1913 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001914 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001915 message = "\\ at end of string";
1916 s--;
1917 endinpos = s-starts;
1918 outpos = p-PyUnicode_AS_UNICODE(v);
1919 if (unicode_decode_call_errorhandler(
1920 errors, &errorHandler,
1921 "unicodeescape", message,
1922 starts, size, &startinpos, &endinpos, &exc, &s,
1923 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001924 goto onError;
1925 }
1926 else {
1927 *p++ = '\\';
1928 *p++ = (unsigned char)s[-1];
1929 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001930 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001931 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001932 nextByte:
1933 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001934 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001935 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001936 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001937 Py_XDECREF(errorHandler);
1938 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001940
Fredrik Lundhccc74732001-02-18 22:13:49 +00001941ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001942 PyErr_SetString(
1943 PyExc_UnicodeError,
1944 "\\N escapes not supported (can't load unicodedata module)"
1945 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001946 Py_XDECREF(errorHandler);
1947 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001948 return NULL;
1949
Fredrik Lundhccc74732001-02-18 22:13:49 +00001950onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001952 Py_XDECREF(errorHandler);
1953 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954 return NULL;
1955}
1956
1957/* Return a Unicode-Escape string version of the Unicode object.
1958
1959 If quotes is true, the string is enclosed in u"" or u'' quotes as
1960 appropriate.
1961
1962*/
1963
Barry Warsaw51ac5802000-03-20 16:36:48 +00001964static const Py_UNICODE *findchar(const Py_UNICODE *s,
1965 int size,
1966 Py_UNICODE ch);
1967
Guido van Rossumd57fd912000-03-10 22:53:23 +00001968static
1969PyObject *unicodeescape_string(const Py_UNICODE *s,
1970 int size,
1971 int quotes)
1972{
1973 PyObject *repr;
1974 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001976 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977
1978 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1979 if (repr == NULL)
1980 return NULL;
1981
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001982 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983
1984 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001986 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 !findchar(s, size, '"')) ? '"' : '\'';
1988 }
1989 while (size-- > 0) {
1990 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001991
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 /* Escape quotes */
Tim Petersced69f82003-09-16 20:30:58 +00001993 if (quotes &&
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001994 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001995 *p++ = '\\';
1996 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001997 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001998 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001999
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002000#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002001 /* Map 21-bit characters to '\U00xxxxxx' */
2002 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002003 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002004
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002005 /* Resize the string if necessary */
2006 if (offset + 12 > PyString_GET_SIZE(repr)) {
2007 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002008 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002009 p = PyString_AS_STRING(repr) + offset;
2010 }
2011
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002012 *p++ = '\\';
2013 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002014 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2015 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2016 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2017 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2018 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2019 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2020 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002021 *p++ = hexdigit[ch & 0x0000000F];
2022 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002023 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002024#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002025 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2026 else if (ch >= 0xD800 && ch < 0xDC00) {
2027 Py_UNICODE ch2;
2028 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002029
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002030 ch2 = *s++;
2031 size--;
2032 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2033 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2034 *p++ = '\\';
2035 *p++ = 'U';
2036 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2037 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2038 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2039 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2040 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2041 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2042 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2043 *p++ = hexdigit[ucs & 0x0000000F];
2044 continue;
2045 }
2046 /* Fall through: isolated surrogates are copied as-is */
2047 s--;
2048 size++;
2049 }
2050
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002052 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 *p++ = '\\';
2054 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002055 *p++ = hexdigit[(ch >> 12) & 0x000F];
2056 *p++ = hexdigit[(ch >> 8) & 0x000F];
2057 *p++ = hexdigit[(ch >> 4) & 0x000F];
2058 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002060
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002061 /* Map special whitespace to '\t', \n', '\r' */
2062 else if (ch == '\t') {
2063 *p++ = '\\';
2064 *p++ = 't';
2065 }
2066 else if (ch == '\n') {
2067 *p++ = '\\';
2068 *p++ = 'n';
2069 }
2070 else if (ch == '\r') {
2071 *p++ = '\\';
2072 *p++ = 'r';
2073 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002074
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002075 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002076 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002077 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002078 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002079 *p++ = hexdigit[(ch >> 4) & 0x000F];
2080 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002081 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002082
Guido van Rossumd57fd912000-03-10 22:53:23 +00002083 /* Copy everything else as-is */
2084 else
2085 *p++ = (char) ch;
2086 }
2087 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002088 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002089
2090 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002091 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092 return repr;
2093}
2094
2095PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2096 int size)
2097{
2098 return unicodeescape_string(s, size, 0);
2099}
2100
2101PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2102{
2103 if (!PyUnicode_Check(unicode)) {
2104 PyErr_BadArgument();
2105 return NULL;
2106 }
2107 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2108 PyUnicode_GET_SIZE(unicode));
2109}
2110
2111/* --- Raw Unicode Escape Codec ------------------------------------------- */
2112
2113PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2114 int size,
2115 const char *errors)
2116{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002117 const char *starts = s;
2118 int startinpos;
2119 int endinpos;
2120 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002122 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 const char *end;
2124 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002125 PyObject *errorHandler = NULL;
2126 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002127
Guido van Rossumd57fd912000-03-10 22:53:23 +00002128 /* Escaped strings will always be longer than the resulting
2129 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002130 length after conversion to the true value. (But decoding error
2131 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002132 v = _PyUnicode_New(size);
2133 if (v == NULL)
2134 goto onError;
2135 if (size == 0)
2136 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002137 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002138 end = s + size;
2139 while (s < end) {
2140 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002141 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002143 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144
2145 /* Non-escape characters are interpreted as Unicode ordinals */
2146 if (*s != '\\') {
2147 *p++ = (unsigned char)*s++;
2148 continue;
2149 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002150 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002151
2152 /* \u-escapes are only interpreted iff the number of leading
2153 backslashes if odd */
2154 bs = s;
2155 for (;s < end;) {
2156 if (*s != '\\')
2157 break;
2158 *p++ = (unsigned char)*s++;
2159 }
2160 if (((s - bs) & 1) == 0 ||
2161 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002162 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163 continue;
2164 }
2165 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002166 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167 s++;
2168
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002169 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002170 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002171 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002172 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002173 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002174 endinpos = s-starts;
2175 if (unicode_decode_call_errorhandler(
2176 errors, &errorHandler,
2177 "rawunicodeescape", "truncated \\uXXXX",
2178 starts, size, &startinpos, &endinpos, &exc, &s,
2179 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002181 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 }
2183 x = (x<<4) & ~0xF;
2184 if (c >= '0' && c <= '9')
2185 x += c - '0';
2186 else if (c >= 'a' && c <= 'f')
2187 x += 10 + c - 'a';
2188 else
2189 x += 10 + c - 'A';
2190 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002191#ifndef Py_UNICODE_WIDE
2192 if (x > 0x10000) {
2193 if (unicode_decode_call_errorhandler(
2194 errors, &errorHandler,
2195 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2196 starts, size, &startinpos, &endinpos, &exc, &s,
2197 (PyObject **)&v, &outpos, &p))
2198 goto onError;
2199 }
2200#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002201 *p++ = x;
2202 nextByte:
2203 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002205 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002206 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002207 Py_XDECREF(errorHandler);
2208 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002210
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211 onError:
2212 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002213 Py_XDECREF(errorHandler);
2214 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215 return NULL;
2216}
2217
2218PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2219 int size)
2220{
2221 PyObject *repr;
2222 char *p;
2223 char *q;
2224
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002225 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002227#ifdef Py_UNICODE_WIDE
2228 repr = PyString_FromStringAndSize(NULL, 10 * size);
2229#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002231#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 if (repr == NULL)
2233 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002234 if (size == 0)
2235 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236
2237 p = q = PyString_AS_STRING(repr);
2238 while (size-- > 0) {
2239 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002240#ifdef Py_UNICODE_WIDE
2241 /* Map 32-bit characters to '\Uxxxxxxxx' */
2242 if (ch >= 0x10000) {
2243 *p++ = '\\';
2244 *p++ = 'U';
2245 *p++ = hexdigit[(ch >> 28) & 0xf];
2246 *p++ = hexdigit[(ch >> 24) & 0xf];
2247 *p++ = hexdigit[(ch >> 20) & 0xf];
2248 *p++ = hexdigit[(ch >> 16) & 0xf];
2249 *p++ = hexdigit[(ch >> 12) & 0xf];
2250 *p++ = hexdigit[(ch >> 8) & 0xf];
2251 *p++ = hexdigit[(ch >> 4) & 0xf];
2252 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002253 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002254 else
2255#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002256 /* Map 16-bit characters to '\uxxxx' */
2257 if (ch >= 256) {
2258 *p++ = '\\';
2259 *p++ = 'u';
2260 *p++ = hexdigit[(ch >> 12) & 0xf];
2261 *p++ = hexdigit[(ch >> 8) & 0xf];
2262 *p++ = hexdigit[(ch >> 4) & 0xf];
2263 *p++ = hexdigit[ch & 15];
2264 }
2265 /* Copy everything else as-is */
2266 else
2267 *p++ = (char) ch;
2268 }
2269 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002270 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002271 return repr;
2272}
2273
2274PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2275{
2276 if (!PyUnicode_Check(unicode)) {
2277 PyErr_BadArgument();
2278 return NULL;
2279 }
2280 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2281 PyUnicode_GET_SIZE(unicode));
2282}
2283
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002284/* --- Unicode Internal Codec ------------------------------------------- */
2285
2286PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
2287 int size,
2288 const char *errors)
2289{
2290 const char *starts = s;
2291 int startinpos;
2292 int endinpos;
2293 int outpos;
2294 Py_UNICODE unimax;
2295 PyUnicodeObject *v;
2296 Py_UNICODE *p;
2297 const char *end;
2298 const char *reason;
2299 PyObject *errorHandler = NULL;
2300 PyObject *exc = NULL;
2301
2302 unimax = PyUnicode_GetMax();
2303 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2304 if (v == NULL)
2305 goto onError;
2306 if (PyUnicode_GetSize((PyObject *)v) == 0)
2307 return (PyObject *)v;
2308 p = PyUnicode_AS_UNICODE(v);
2309 end = s + size;
2310
2311 while (s < end) {
2312 *p = *(Py_UNICODE *)s;
2313 /* We have to sanity check the raw data, otherwise doom looms for
2314 some malformed UCS-4 data. */
2315 if (
2316 #ifdef Py_UNICODE_WIDE
2317 *p > unimax || *p < 0 ||
2318 #endif
2319 end-s < Py_UNICODE_SIZE
2320 )
2321 {
2322 startinpos = s - starts;
2323 if (end-s < Py_UNICODE_SIZE) {
2324 endinpos = end-starts;
2325 reason = "truncated input";
2326 }
2327 else {
2328 endinpos = s - starts + Py_UNICODE_SIZE;
2329 reason = "illegal code point (> 0x10FFFF)";
2330 }
2331 outpos = p - PyUnicode_AS_UNICODE(v);
2332 if (unicode_decode_call_errorhandler(
2333 errors, &errorHandler,
2334 "unicode_internal", reason,
2335 starts, size, &startinpos, &endinpos, &exc, &s,
2336 (PyObject **)&v, &outpos, &p)) {
2337 goto onError;
2338 }
2339 }
2340 else {
2341 p++;
2342 s += Py_UNICODE_SIZE;
2343 }
2344 }
2345
2346 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
2347 goto onError;
2348 Py_XDECREF(errorHandler);
2349 Py_XDECREF(exc);
2350 return (PyObject *)v;
2351
2352 onError:
2353 Py_XDECREF(v);
2354 Py_XDECREF(errorHandler);
2355 Py_XDECREF(exc);
2356 return NULL;
2357}
2358
/* --- Latin-1 Codec ------------------------------------------------------ */
2360
2361PyObject *PyUnicode_DecodeLatin1(const char *s,
2362 int size,
2363 const char *errors)
2364{
2365 PyUnicodeObject *v;
2366 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002367
Guido van Rossumd57fd912000-03-10 22:53:23 +00002368 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002369 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002370 Py_UNICODE r = *(unsigned char*)s;
2371 return PyUnicode_FromUnicode(&r, 1);
2372 }
2373
Guido van Rossumd57fd912000-03-10 22:53:23 +00002374 v = _PyUnicode_New(size);
2375 if (v == NULL)
2376 goto onError;
2377 if (size == 0)
2378 return (PyObject *)v;
2379 p = PyUnicode_AS_UNICODE(v);
2380 while (size-- > 0)
2381 *p++ = (unsigned char)*s++;
2382 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002383
Guido van Rossumd57fd912000-03-10 22:53:23 +00002384 onError:
2385 Py_XDECREF(v);
2386 return NULL;
2387}
2388
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002389/* create or adjust a UnicodeEncodeError */
2390static void make_encode_exception(PyObject **exceptionObject,
2391 const char *encoding,
2392 const Py_UNICODE *unicode, int size,
2393 int startpos, int endpos,
2394 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002395{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002396 if (*exceptionObject == NULL) {
2397 *exceptionObject = PyUnicodeEncodeError_Create(
2398 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002399 }
2400 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002401 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2402 goto onError;
2403 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2404 goto onError;
2405 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2406 goto onError;
2407 return;
2408 onError:
2409 Py_DECREF(*exceptionObject);
2410 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002411 }
2412}
2413
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002414/* raises a UnicodeEncodeError */
2415static void raise_encode_exception(PyObject **exceptionObject,
2416 const char *encoding,
2417 const Py_UNICODE *unicode, int size,
2418 int startpos, int endpos,
2419 const char *reason)
2420{
2421 make_encode_exception(exceptionObject,
2422 encoding, unicode, size, startpos, endpos, reason);
2423 if (*exceptionObject != NULL)
2424 PyCodec_StrictErrors(*exceptionObject);
2425}
2426
2427/* error handling callback helper:
2428 build arguments, call the callback and check the arguments,
2429 put the result into newpos and return the replacement string, which
2430 has to be freed by the caller */
2431static PyObject *unicode_encode_call_errorhandler(const char *errors,
2432 PyObject **errorHandler,
2433 const char *encoding, const char *reason,
2434 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2435 int startpos, int endpos,
2436 int *newpos)
2437{
2438 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2439
2440 PyObject *restuple;
2441 PyObject *resunicode;
2442
2443 if (*errorHandler == NULL) {
2444 *errorHandler = PyCodec_LookupError(errors);
2445 if (*errorHandler == NULL)
2446 return NULL;
2447 }
2448
2449 make_encode_exception(exceptionObject,
2450 encoding, unicode, size, startpos, endpos, reason);
2451 if (*exceptionObject == NULL)
2452 return NULL;
2453
2454 restuple = PyObject_CallFunctionObjArgs(
2455 *errorHandler, *exceptionObject, NULL);
2456 if (restuple == NULL)
2457 return NULL;
2458 if (!PyTuple_Check(restuple)) {
2459 PyErr_Format(PyExc_TypeError, &argparse[4]);
2460 Py_DECREF(restuple);
2461 return NULL;
2462 }
2463 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2464 &resunicode, newpos)) {
2465 Py_DECREF(restuple);
2466 return NULL;
2467 }
2468 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002469 *newpos = size+*newpos;
2470 if (*newpos<0 || *newpos>size) {
2471 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2472 Py_DECREF(restuple);
2473 return NULL;
2474 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002475 Py_INCREF(resunicode);
2476 Py_DECREF(restuple);
2477 return resunicode;
2478}
2479
2480static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2481 int size,
2482 const char *errors,
2483 int limit)
2484{
2485 /* output object */
2486 PyObject *res;
2487 /* pointers to the beginning and end+1 of input */
2488 const Py_UNICODE *startp = p;
2489 const Py_UNICODE *endp = p + size;
2490 /* pointer to the beginning of the unencodable characters */
2491 /* const Py_UNICODE *badp = NULL; */
2492 /* pointer into the output */
2493 char *str;
2494 /* current output position */
2495 int respos = 0;
2496 int ressize;
2497 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2498 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2499 PyObject *errorHandler = NULL;
2500 PyObject *exc = NULL;
2501 /* the following variable is used for caching string comparisons
2502 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2503 int known_errorHandler = -1;
2504
2505 /* allocate enough for a simple encoding without
2506 replacements, if we need more, we'll resize */
2507 res = PyString_FromStringAndSize(NULL, size);
2508 if (res == NULL)
2509 goto onError;
2510 if (size == 0)
2511 return res;
2512 str = PyString_AS_STRING(res);
2513 ressize = size;
2514
2515 while (p<endp) {
2516 Py_UNICODE c = *p;
2517
2518 /* can we encode this? */
2519 if (c<limit) {
2520 /* no overflow check, because we know that the space is enough */
2521 *str++ = (char)c;
2522 ++p;
2523 }
2524 else {
2525 int unicodepos = p-startp;
2526 int requiredsize;
2527 PyObject *repunicode;
2528 int repsize;
2529 int newpos;
2530 int respos;
2531 Py_UNICODE *uni2;
2532 /* startpos for collecting unencodable chars */
2533 const Py_UNICODE *collstart = p;
2534 const Py_UNICODE *collend = p;
2535 /* find all unecodable characters */
2536 while ((collend < endp) && ((*collend)>=limit))
2537 ++collend;
2538 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2539 if (known_errorHandler==-1) {
2540 if ((errors==NULL) || (!strcmp(errors, "strict")))
2541 known_errorHandler = 1;
2542 else if (!strcmp(errors, "replace"))
2543 known_errorHandler = 2;
2544 else if (!strcmp(errors, "ignore"))
2545 known_errorHandler = 3;
2546 else if (!strcmp(errors, "xmlcharrefreplace"))
2547 known_errorHandler = 4;
2548 else
2549 known_errorHandler = 0;
2550 }
2551 switch (known_errorHandler) {
2552 case 1: /* strict */
2553 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2554 goto onError;
2555 case 2: /* replace */
2556 while (collstart++<collend)
2557 *str++ = '?'; /* fall through */
2558 case 3: /* ignore */
2559 p = collend;
2560 break;
2561 case 4: /* xmlcharrefreplace */
2562 respos = str-PyString_AS_STRING(res);
2563 /* determine replacement size (temporarily (mis)uses p) */
2564 for (p = collstart, repsize = 0; p < collend; ++p) {
2565 if (*p<10)
2566 repsize += 2+1+1;
2567 else if (*p<100)
2568 repsize += 2+2+1;
2569 else if (*p<1000)
2570 repsize += 2+3+1;
2571 else if (*p<10000)
2572 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002573#ifndef Py_UNICODE_WIDE
2574 else
2575 repsize += 2+5+1;
2576#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002577 else if (*p<100000)
2578 repsize += 2+5+1;
2579 else if (*p<1000000)
2580 repsize += 2+6+1;
2581 else
2582 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002583#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002584 }
2585 requiredsize = respos+repsize+(endp-collend);
2586 if (requiredsize > ressize) {
2587 if (requiredsize<2*ressize)
2588 requiredsize = 2*ressize;
2589 if (_PyString_Resize(&res, requiredsize))
2590 goto onError;
2591 str = PyString_AS_STRING(res) + respos;
2592 ressize = requiredsize;
2593 }
2594 /* generate replacement (temporarily (mis)uses p) */
2595 for (p = collstart; p < collend; ++p) {
2596 str += sprintf(str, "&#%d;", (int)*p);
2597 }
2598 p = collend;
2599 break;
2600 default:
2601 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2602 encoding, reason, startp, size, &exc,
2603 collstart-startp, collend-startp, &newpos);
2604 if (repunicode == NULL)
2605 goto onError;
2606 /* need more space? (at least enough for what we
2607 have+the replacement+the rest of the string, so
2608 we won't have to check space for encodable characters) */
2609 respos = str-PyString_AS_STRING(res);
2610 repsize = PyUnicode_GET_SIZE(repunicode);
2611 requiredsize = respos+repsize+(endp-collend);
2612 if (requiredsize > ressize) {
2613 if (requiredsize<2*ressize)
2614 requiredsize = 2*ressize;
2615 if (_PyString_Resize(&res, requiredsize)) {
2616 Py_DECREF(repunicode);
2617 goto onError;
2618 }
2619 str = PyString_AS_STRING(res) + respos;
2620 ressize = requiredsize;
2621 }
2622 /* check if there is anything unencodable in the replacement
2623 and copy it to the output */
2624 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2625 c = *uni2;
2626 if (c >= limit) {
2627 raise_encode_exception(&exc, encoding, startp, size,
2628 unicodepos, unicodepos+1, reason);
2629 Py_DECREF(repunicode);
2630 goto onError;
2631 }
2632 *str = (char)c;
2633 }
2634 p = startp + newpos;
2635 Py_DECREF(repunicode);
2636 }
2637 }
2638 }
2639 /* Resize if we allocated to much */
2640 respos = str-PyString_AS_STRING(res);
2641 if (respos<ressize)
2642 /* If this falls res will be NULL */
2643 _PyString_Resize(&res, respos);
2644 Py_XDECREF(errorHandler);
2645 Py_XDECREF(exc);
2646 return res;
2647
2648 onError:
2649 Py_XDECREF(res);
2650 Py_XDECREF(errorHandler);
2651 Py_XDECREF(exc);
2652 return NULL;
2653}
2654
Guido van Rossumd57fd912000-03-10 22:53:23 +00002655PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2656 int size,
2657 const char *errors)
2658{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002659 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660}
2661
2662PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2663{
2664 if (!PyUnicode_Check(unicode)) {
2665 PyErr_BadArgument();
2666 return NULL;
2667 }
2668 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2669 PyUnicode_GET_SIZE(unicode),
2670 NULL);
2671}
2672
/* --- 7-bit ASCII Codec -------------------------------------------------- */
2674
Guido van Rossumd57fd912000-03-10 22:53:23 +00002675PyObject *PyUnicode_DecodeASCII(const char *s,
2676 int size,
2677 const char *errors)
2678{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002679 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680 PyUnicodeObject *v;
2681 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002682 int startinpos;
2683 int endinpos;
2684 int outpos;
2685 const char *e;
2686 PyObject *errorHandler = NULL;
2687 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002688
Guido van Rossumd57fd912000-03-10 22:53:23 +00002689 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002690 if (size == 1 && *(unsigned char*)s < 128) {
2691 Py_UNICODE r = *(unsigned char*)s;
2692 return PyUnicode_FromUnicode(&r, 1);
2693 }
Tim Petersced69f82003-09-16 20:30:58 +00002694
Guido van Rossumd57fd912000-03-10 22:53:23 +00002695 v = _PyUnicode_New(size);
2696 if (v == NULL)
2697 goto onError;
2698 if (size == 0)
2699 return (PyObject *)v;
2700 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002701 e = s + size;
2702 while (s < e) {
2703 register unsigned char c = (unsigned char)*s;
2704 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002705 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002706 ++s;
2707 }
2708 else {
2709 startinpos = s-starts;
2710 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002711 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002712 if (unicode_decode_call_errorhandler(
2713 errors, &errorHandler,
2714 "ascii", "ordinal not in range(128)",
2715 starts, size, &startinpos, &endinpos, &exc, &s,
2716 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002717 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002718 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002719 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002720 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002721 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002722 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002723 Py_XDECREF(errorHandler);
2724 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002725 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002726
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727 onError:
2728 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002729 Py_XDECREF(errorHandler);
2730 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731 return NULL;
2732}
2733
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2735 int size,
2736 const char *errors)
2737{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739}
2740
2741PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2742{
2743 if (!PyUnicode_Check(unicode)) {
2744 PyErr_BadArgument();
2745 return NULL;
2746 }
2747 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2748 PyUnicode_GET_SIZE(unicode),
2749 NULL);
2750}
2751
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002752#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002753
/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002755
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002756PyObject *PyUnicode_DecodeMBCS(const char *s,
2757 int size,
2758 const char *errors)
2759{
2760 PyUnicodeObject *v;
2761 Py_UNICODE *p;
2762
2763 /* First get the size of the result */
2764 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002765 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002766 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2767
2768 v = _PyUnicode_New(usize);
2769 if (v == NULL)
2770 return NULL;
2771 if (usize == 0)
2772 return (PyObject *)v;
2773 p = PyUnicode_AS_UNICODE(v);
2774 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2775 Py_DECREF(v);
2776 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2777 }
2778
2779 return (PyObject *)v;
2780}
2781
2782PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2783 int size,
2784 const char *errors)
2785{
2786 PyObject *repr;
2787 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002788 DWORD mbcssize;
2789
2790 /* If there are no characters, bail now! */
2791 if (size==0)
2792 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002793
2794 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002795 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002796 if (mbcssize==0)
2797 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2798
2799 repr = PyString_FromStringAndSize(NULL, mbcssize);
2800 if (repr == NULL)
2801 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002802 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002803 return repr;
2804
2805 /* Do the conversion */
2806 s = PyString_AS_STRING(repr);
2807 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2808 Py_DECREF(repr);
2809 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2810 }
2811 return repr;
2812}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002813
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002814PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2815{
2816 if (!PyUnicode_Check(unicode)) {
2817 PyErr_BadArgument();
2818 return NULL;
2819 }
2820 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2821 PyUnicode_GET_SIZE(unicode),
2822 NULL);
2823}
2824
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002825#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002826
/* --- Character Mapping Codec -------------------------------------------- */
2828
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829PyObject *PyUnicode_DecodeCharmap(const char *s,
2830 int size,
2831 PyObject *mapping,
2832 const char *errors)
2833{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002834 const char *starts = s;
2835 int startinpos;
2836 int endinpos;
2837 int outpos;
2838 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839 PyUnicodeObject *v;
2840 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002841 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002842 PyObject *errorHandler = NULL;
2843 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002844 Py_UNICODE *mapstring = NULL;
2845 int maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00002846
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 /* Default to Latin-1 */
2848 if (mapping == NULL)
2849 return PyUnicode_DecodeLatin1(s, size, errors);
2850
2851 v = _PyUnicode_New(size);
2852 if (v == NULL)
2853 goto onError;
2854 if (size == 0)
2855 return (PyObject *)v;
2856 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002857 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002858 if (PyUnicode_CheckExact(mapping)) {
2859 mapstring = PyUnicode_AS_UNICODE(mapping);
2860 maplen = PyUnicode_GET_SIZE(mapping);
2861 while (s < e) {
2862 unsigned char ch = *s;
2863 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002864
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002865 if (ch < maplen)
2866 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002868 if (x == 0xfffe) {
2869 /* undefined mapping */
2870 outpos = p-PyUnicode_AS_UNICODE(v);
2871 startinpos = s-starts;
2872 endinpos = startinpos+1;
2873 if (unicode_decode_call_errorhandler(
2874 errors, &errorHandler,
2875 "charmap", "character maps to <undefined>",
2876 starts, size, &startinpos, &endinpos, &exc, &s,
2877 (PyObject **)&v, &outpos, &p)) {
2878 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002879 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002880 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002881 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002882 *p++ = x;
2883 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002884 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002885 }
2886 else {
2887 while (s < e) {
2888 unsigned char ch = *s;
2889 PyObject *w, *x;
2890
2891 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2892 w = PyInt_FromLong((long)ch);
2893 if (w == NULL)
2894 goto onError;
2895 x = PyObject_GetItem(mapping, w);
2896 Py_DECREF(w);
2897 if (x == NULL) {
2898 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2899 /* No mapping found means: mapping is undefined. */
2900 PyErr_Clear();
2901 x = Py_None;
2902 Py_INCREF(x);
2903 } else
2904 goto onError;
2905 }
2906
2907 /* Apply mapping */
2908 if (PyInt_Check(x)) {
2909 long value = PyInt_AS_LONG(x);
2910 if (value < 0 || value > 65535) {
2911 PyErr_SetString(PyExc_TypeError,
2912 "character mapping must be in range(65536)");
2913 Py_DECREF(x);
2914 goto onError;
2915 }
2916 *p++ = (Py_UNICODE)value;
2917 }
2918 else if (x == Py_None) {
2919 /* undefined mapping */
2920 outpos = p-PyUnicode_AS_UNICODE(v);
2921 startinpos = s-starts;
2922 endinpos = startinpos+1;
2923 if (unicode_decode_call_errorhandler(
2924 errors, &errorHandler,
2925 "charmap", "character maps to <undefined>",
2926 starts, size, &startinpos, &endinpos, &exc, &s,
2927 (PyObject **)&v, &outpos, &p)) {
2928 Py_DECREF(x);
2929 goto onError;
2930 }
Walter Dörwaldd4fff172005-11-28 22:15:56 +00002931 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002932 continue;
2933 }
2934 else if (PyUnicode_Check(x)) {
2935 int targetsize = PyUnicode_GET_SIZE(x);
2936
2937 if (targetsize == 1)
2938 /* 1-1 mapping */
2939 *p++ = *PyUnicode_AS_UNICODE(x);
2940
2941 else if (targetsize > 1) {
2942 /* 1-n mapping */
2943 if (targetsize > extrachars) {
2944 /* resize first */
2945 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2946 int needed = (targetsize - extrachars) + \
2947 (targetsize << 2);
2948 extrachars += needed;
2949 if (_PyUnicode_Resize(&v,
2950 PyUnicode_GET_SIZE(v) + needed) < 0) {
2951 Py_DECREF(x);
2952 goto onError;
2953 }
2954 p = PyUnicode_AS_UNICODE(v) + oldpos;
2955 }
2956 Py_UNICODE_COPY(p,
2957 PyUnicode_AS_UNICODE(x),
2958 targetsize);
2959 p += targetsize;
2960 extrachars -= targetsize;
2961 }
2962 /* 1-0 mapping: skip the character */
2963 }
2964 else {
2965 /* wrong return value */
2966 PyErr_SetString(PyExc_TypeError,
2967 "character mapping must return integer, None or unicode");
2968 Py_DECREF(x);
2969 goto onError;
2970 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002971 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002972 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002973 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002974 }
2975 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002976 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002977 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002978 Py_XDECREF(errorHandler);
2979 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002980 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002981
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002983 Py_XDECREF(errorHandler);
2984 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985 Py_XDECREF(v);
2986 return NULL;
2987}
2988
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002989/* Lookup the character ch in the mapping. If the character
2990 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00002991 error occurred). */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002992static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002994 PyObject *w = PyInt_FromLong((long)c);
2995 PyObject *x;
2996
2997 if (w == NULL)
2998 return NULL;
2999 x = PyObject_GetItem(mapping, w);
3000 Py_DECREF(w);
3001 if (x == NULL) {
3002 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3003 /* No mapping found means: mapping is undefined. */
3004 PyErr_Clear();
3005 x = Py_None;
3006 Py_INCREF(x);
3007 return x;
3008 } else
3009 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003010 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003011 else if (x == Py_None)
3012 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003013 else if (PyInt_Check(x)) {
3014 long value = PyInt_AS_LONG(x);
3015 if (value < 0 || value > 255) {
3016 PyErr_SetString(PyExc_TypeError,
3017 "character mapping must be in range(256)");
3018 Py_DECREF(x);
3019 return NULL;
3020 }
3021 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003022 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003023 else if (PyString_Check(x))
3024 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003025 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003026 /* wrong return value */
3027 PyErr_SetString(PyExc_TypeError,
3028 "character mapping must return integer, None or str");
3029 Py_DECREF(x);
3030 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003031 }
3032}
3033
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003034/* lookup the character, put the result in the output string and adjust
3035 various state variables. Reallocate the output string if not enough
3036 space is available. Return a new reference to the object that
3037 was put in the output buffer, or Py_None, if the mapping was undefined
3038 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00003039 reallocation error occurred. The caller must decref the result */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003040static
3041PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
3042 PyObject **outobj, int *outpos)
3043{
3044 PyObject *rep = charmapencode_lookup(c, mapping);
3045
3046 if (rep==NULL)
3047 return NULL;
3048 else if (rep==Py_None)
3049 return rep;
3050 else {
3051 char *outstart = PyString_AS_STRING(*outobj);
3052 int outsize = PyString_GET_SIZE(*outobj);
3053 if (PyInt_Check(rep)) {
3054 int requiredsize = *outpos+1;
3055 if (outsize<requiredsize) {
3056 /* exponentially overallocate to minimize reallocations */
3057 if (requiredsize < 2*outsize)
3058 requiredsize = 2*outsize;
3059 if (_PyString_Resize(outobj, requiredsize)) {
3060 Py_DECREF(rep);
3061 return NULL;
3062 }
3063 outstart = PyString_AS_STRING(*outobj);
3064 }
3065 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3066 }
3067 else {
3068 const char *repchars = PyString_AS_STRING(rep);
3069 int repsize = PyString_GET_SIZE(rep);
3070 int requiredsize = *outpos+repsize;
3071 if (outsize<requiredsize) {
3072 /* exponentially overallocate to minimize reallocations */
3073 if (requiredsize < 2*outsize)
3074 requiredsize = 2*outsize;
3075 if (_PyString_Resize(outobj, requiredsize)) {
3076 Py_DECREF(rep);
3077 return NULL;
3078 }
3079 outstart = PyString_AS_STRING(*outobj);
3080 }
3081 memcpy(outstart + *outpos, repchars, repsize);
3082 *outpos += repsize;
3083 }
3084 }
3085 return rep;
3086}
3087
3088/* handle an error in PyUnicode_EncodeCharmap
3089 Return 0 on success, -1 on error */
3090static
3091int charmap_encoding_error(
3092 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
3093 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003094 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003095 PyObject **res, int *respos)
3096{
3097 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3098 int repsize;
3099 int newpos;
3100 Py_UNICODE *uni2;
3101 /* startpos for collecting unencodable chars */
3102 int collstartpos = *inpos;
3103 int collendpos = *inpos+1;
3104 int collpos;
3105 char *encoding = "charmap";
3106 char *reason = "character maps to <undefined>";
3107
3108 PyObject *x;
3109 /* find all unencodable characters */
3110 while (collendpos < size) {
3111 x = charmapencode_lookup(p[collendpos], mapping);
3112 if (x==NULL)
3113 return -1;
3114 else if (x!=Py_None) {
3115 Py_DECREF(x);
3116 break;
3117 }
3118 Py_DECREF(x);
3119 ++collendpos;
3120 }
3121 /* cache callback name lookup
3122 * (if not done yet, i.e. it's the first error) */
3123 if (*known_errorHandler==-1) {
3124 if ((errors==NULL) || (!strcmp(errors, "strict")))
3125 *known_errorHandler = 1;
3126 else if (!strcmp(errors, "replace"))
3127 *known_errorHandler = 2;
3128 else if (!strcmp(errors, "ignore"))
3129 *known_errorHandler = 3;
3130 else if (!strcmp(errors, "xmlcharrefreplace"))
3131 *known_errorHandler = 4;
3132 else
3133 *known_errorHandler = 0;
3134 }
3135 switch (*known_errorHandler) {
3136 case 1: /* strict */
3137 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3138 return -1;
3139 case 2: /* replace */
3140 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3141 x = charmapencode_output('?', mapping, res, respos);
3142 if (x==NULL) {
3143 return -1;
3144 }
3145 else if (x==Py_None) {
3146 Py_DECREF(x);
3147 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3148 return -1;
3149 }
3150 Py_DECREF(x);
3151 }
3152 /* fall through */
3153 case 3: /* ignore */
3154 *inpos = collendpos;
3155 break;
3156 case 4: /* xmlcharrefreplace */
3157 /* generate replacement (temporarily (mis)uses p) */
3158 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3159 char buffer[2+29+1+1];
3160 char *cp;
3161 sprintf(buffer, "&#%d;", (int)p[collpos]);
3162 for (cp = buffer; *cp; ++cp) {
3163 x = charmapencode_output(*cp, mapping, res, respos);
3164 if (x==NULL)
3165 return -1;
3166 else if (x==Py_None) {
3167 Py_DECREF(x);
3168 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3169 return -1;
3170 }
3171 Py_DECREF(x);
3172 }
3173 }
3174 *inpos = collendpos;
3175 break;
3176 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003177 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003178 encoding, reason, p, size, exceptionObject,
3179 collstartpos, collendpos, &newpos);
3180 if (repunicode == NULL)
3181 return -1;
3182 /* generate replacement */
3183 repsize = PyUnicode_GET_SIZE(repunicode);
3184 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3185 x = charmapencode_output(*uni2, mapping, res, respos);
3186 if (x==NULL) {
3187 Py_DECREF(repunicode);
3188 return -1;
3189 }
3190 else if (x==Py_None) {
3191 Py_DECREF(repunicode);
3192 Py_DECREF(x);
3193 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3194 return -1;
3195 }
3196 Py_DECREF(x);
3197 }
3198 *inpos = newpos;
3199 Py_DECREF(repunicode);
3200 }
3201 return 0;
3202}
3203
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3205 int size,
3206 PyObject *mapping,
3207 const char *errors)
3208{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003209 /* output object */
3210 PyObject *res = NULL;
3211 /* current input position */
3212 int inpos = 0;
3213 /* current output position */
3214 int respos = 0;
3215 PyObject *errorHandler = NULL;
3216 PyObject *exc = NULL;
3217 /* the following variable is used for caching string comparisons
3218 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3219 * 3=ignore, 4=xmlcharrefreplace */
3220 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221
3222 /* Default to Latin-1 */
3223 if (mapping == NULL)
3224 return PyUnicode_EncodeLatin1(p, size, errors);
3225
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003226 /* allocate enough for a simple encoding without
3227 replacements, if we need more, we'll resize */
3228 res = PyString_FromStringAndSize(NULL, size);
3229 if (res == NULL)
3230 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003231 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003232 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003233
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003234 while (inpos<size) {
3235 /* try to encode it */
3236 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3237 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003238 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003239 if (x==Py_None) { /* unencodable character */
3240 if (charmap_encoding_error(p, size, &inpos, mapping,
3241 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003242 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003243 &res, &respos)) {
3244 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003245 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003246 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003248 else
3249 /* done with this character => adjust input position */
3250 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003251 Py_DECREF(x);
3252 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003254 /* Resize if we allocated to much */
3255 if (respos<PyString_GET_SIZE(res)) {
3256 if (_PyString_Resize(&res, respos))
3257 goto onError;
3258 }
3259 Py_XDECREF(exc);
3260 Py_XDECREF(errorHandler);
3261 return res;
3262
3263 onError:
3264 Py_XDECREF(res);
3265 Py_XDECREF(exc);
3266 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 return NULL;
3268}
3269
3270PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3271 PyObject *mapping)
3272{
3273 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3274 PyErr_BadArgument();
3275 return NULL;
3276 }
3277 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3278 PyUnicode_GET_SIZE(unicode),
3279 mapping,
3280 NULL);
3281}
3282
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003283/* create or adjust a UnicodeTranslateError */
3284static void make_translate_exception(PyObject **exceptionObject,
3285 const Py_UNICODE *unicode, int size,
3286 int startpos, int endpos,
3287 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003288{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003289 if (*exceptionObject == NULL) {
3290 *exceptionObject = PyUnicodeTranslateError_Create(
3291 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003292 }
3293 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003294 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3295 goto onError;
3296 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3297 goto onError;
3298 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3299 goto onError;
3300 return;
3301 onError:
3302 Py_DECREF(*exceptionObject);
3303 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003304 }
3305}
3306
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003307/* raises a UnicodeTranslateError */
3308static void raise_translate_exception(PyObject **exceptionObject,
3309 const Py_UNICODE *unicode, int size,
3310 int startpos, int endpos,
3311 const char *reason)
3312{
3313 make_translate_exception(exceptionObject,
3314 unicode, size, startpos, endpos, reason);
3315 if (*exceptionObject != NULL)
3316 PyCodec_StrictErrors(*exceptionObject);
3317}
3318
3319/* error handling callback helper:
3320 build arguments, call the callback and check the arguments,
3321 put the result into newpos and return the replacement string, which
3322 has to be freed by the caller */
3323static PyObject *unicode_translate_call_errorhandler(const char *errors,
3324 PyObject **errorHandler,
3325 const char *reason,
3326 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3327 int startpos, int endpos,
3328 int *newpos)
3329{
3330 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3331
3332 PyObject *restuple;
3333 PyObject *resunicode;
3334
3335 if (*errorHandler == NULL) {
3336 *errorHandler = PyCodec_LookupError(errors);
3337 if (*errorHandler == NULL)
3338 return NULL;
3339 }
3340
3341 make_translate_exception(exceptionObject,
3342 unicode, size, startpos, endpos, reason);
3343 if (*exceptionObject == NULL)
3344 return NULL;
3345
3346 restuple = PyObject_CallFunctionObjArgs(
3347 *errorHandler, *exceptionObject, NULL);
3348 if (restuple == NULL)
3349 return NULL;
3350 if (!PyTuple_Check(restuple)) {
3351 PyErr_Format(PyExc_TypeError, &argparse[4]);
3352 Py_DECREF(restuple);
3353 return NULL;
3354 }
3355 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3356 &resunicode, newpos)) {
3357 Py_DECREF(restuple);
3358 return NULL;
3359 }
3360 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003361 *newpos = size+*newpos;
3362 if (*newpos<0 || *newpos>size) {
3363 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3364 Py_DECREF(restuple);
3365 return NULL;
3366 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003367 Py_INCREF(resunicode);
3368 Py_DECREF(restuple);
3369 return resunicode;
3370}
3371
3372/* Lookup the character ch in the mapping and put the result in result,
3373 which must be decrefed by the caller.
3374 Return 0 on success, -1 on error */
3375static
3376int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3377{
3378 PyObject *w = PyInt_FromLong((long)c);
3379 PyObject *x;
3380
3381 if (w == NULL)
3382 return -1;
3383 x = PyObject_GetItem(mapping, w);
3384 Py_DECREF(w);
3385 if (x == NULL) {
3386 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3387 /* No mapping found means: use 1:1 mapping. */
3388 PyErr_Clear();
3389 *result = NULL;
3390 return 0;
3391 } else
3392 return -1;
3393 }
3394 else if (x == Py_None) {
3395 *result = x;
3396 return 0;
3397 }
3398 else if (PyInt_Check(x)) {
3399 long value = PyInt_AS_LONG(x);
3400 long max = PyUnicode_GetMax();
3401 if (value < 0 || value > max) {
3402 PyErr_Format(PyExc_TypeError,
3403 "character mapping must be in range(0x%lx)", max+1);
3404 Py_DECREF(x);
3405 return -1;
3406 }
3407 *result = x;
3408 return 0;
3409 }
3410 else if (PyUnicode_Check(x)) {
3411 *result = x;
3412 return 0;
3413 }
3414 else {
3415 /* wrong return value */
3416 PyErr_SetString(PyExc_TypeError,
3417 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003418 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003419 return -1;
3420 }
3421}
3422/* ensure that *outobj is at least requiredsize characters long,
3423if not reallocate and adjust various state variables.
3424Return 0 on success, -1 on error */
3425static
Walter Dörwald4894c302003-10-24 14:25:28 +00003426int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003427 int requiredsize)
3428{
Walter Dörwald4894c302003-10-24 14:25:28 +00003429 int oldsize = PyUnicode_GET_SIZE(*outobj);
3430 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003431 /* remember old output position */
3432 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3433 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003434 if (requiredsize < 2 * oldsize)
3435 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003436 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003437 return -1;
3438 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003439 }
3440 return 0;
3441}
3442/* lookup the character, put the result in the output string and adjust
3443 various state variables. Return a new reference to the object that
3444 was put in the output buffer in *result, or Py_None, if the mapping was
3445 undefined (in which case no character was written).
3446 The called must decref result.
3447 Return 0 on success, -1 on error. */
3448static
Walter Dörwald4894c302003-10-24 14:25:28 +00003449int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3450 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3451 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003452{
Walter Dörwald4894c302003-10-24 14:25:28 +00003453 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003454 return -1;
3455 if (*res==NULL) {
3456 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003457 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003458 }
3459 else if (*res==Py_None)
3460 ;
3461 else if (PyInt_Check(*res)) {
3462 /* no overflow check, because we know that the space is enough */
3463 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3464 }
3465 else if (PyUnicode_Check(*res)) {
3466 int repsize = PyUnicode_GET_SIZE(*res);
3467 if (repsize==1) {
3468 /* no overflow check, because we know that the space is enough */
3469 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3470 }
3471 else if (repsize!=0) {
3472 /* more than one character */
Walter Dörwald4894c302003-10-24 14:25:28 +00003473 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003474 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003475 repsize - 1;
3476 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003477 return -1;
3478 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3479 *outp += repsize;
3480 }
3481 }
3482 else
3483 return -1;
3484 return 0;
3485}
3486
3487PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003488 int size,
3489 PyObject *mapping,
3490 const char *errors)
3491{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003492 /* output object */
3493 PyObject *res = NULL;
3494 /* pointers to the beginning and end+1 of input */
3495 const Py_UNICODE *startp = p;
3496 const Py_UNICODE *endp = p + size;
3497 /* pointer into the output */
3498 Py_UNICODE *str;
3499 /* current output position */
3500 int respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003501 char *reason = "character maps to <undefined>";
3502 PyObject *errorHandler = NULL;
3503 PyObject *exc = NULL;
3504 /* the following variable is used for caching string comparisons
3505 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3506 * 3=ignore, 4=xmlcharrefreplace */
3507 int known_errorHandler = -1;
3508
Guido van Rossumd57fd912000-03-10 22:53:23 +00003509 if (mapping == NULL) {
3510 PyErr_BadArgument();
3511 return NULL;
3512 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003513
3514 /* allocate enough for a simple 1:1 translation without
3515 replacements, if we need more, we'll resize */
3516 res = PyUnicode_FromUnicode(NULL, size);
3517 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003518 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003519 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003520 return res;
3521 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003523 while (p<endp) {
3524 /* try to encode it */
3525 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003526 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003528 goto onError;
3529 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003530 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003531 if (x!=Py_None) /* it worked => adjust input pointer */
3532 ++p;
3533 else { /* untranslatable character */
3534 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3535 int repsize;
3536 int newpos;
3537 Py_UNICODE *uni2;
3538 /* startpos for collecting untranslatable chars */
3539 const Py_UNICODE *collstart = p;
3540 const Py_UNICODE *collend = p+1;
3541 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 /* find all untranslatable characters */
3544 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003545 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003546 goto onError;
3547 Py_XDECREF(x);
3548 if (x!=Py_None)
3549 break;
3550 ++collend;
3551 }
3552 /* cache callback name lookup
3553 * (if not done yet, i.e. it's the first error) */
3554 if (known_errorHandler==-1) {
3555 if ((errors==NULL) || (!strcmp(errors, "strict")))
3556 known_errorHandler = 1;
3557 else if (!strcmp(errors, "replace"))
3558 known_errorHandler = 2;
3559 else if (!strcmp(errors, "ignore"))
3560 known_errorHandler = 3;
3561 else if (!strcmp(errors, "xmlcharrefreplace"))
3562 known_errorHandler = 4;
3563 else
3564 known_errorHandler = 0;
3565 }
3566 switch (known_errorHandler) {
3567 case 1: /* strict */
3568 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3569 goto onError;
3570 case 2: /* replace */
3571 /* No need to check for space, this is a 1:1 replacement */
3572 for (coll = collstart; coll<collend; ++coll)
3573 *str++ = '?';
3574 /* fall through */
3575 case 3: /* ignore */
3576 p = collend;
3577 break;
3578 case 4: /* xmlcharrefreplace */
3579 /* generate replacement (temporarily (mis)uses p) */
3580 for (p = collstart; p < collend; ++p) {
3581 char buffer[2+29+1+1];
3582 char *cp;
3583 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003584 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003585 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3586 goto onError;
3587 for (cp = buffer; *cp; ++cp)
3588 *str++ = *cp;
3589 }
3590 p = collend;
3591 break;
3592 default:
3593 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3594 reason, startp, size, &exc,
3595 collstart-startp, collend-startp, &newpos);
3596 if (repunicode == NULL)
3597 goto onError;
3598 /* generate replacement */
3599 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003600 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003601 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3602 Py_DECREF(repunicode);
3603 goto onError;
3604 }
3605 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3606 *str++ = *uni2;
3607 p = startp + newpos;
3608 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003609 }
3610 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003611 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003612 /* Resize if we allocated to much */
3613 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003614 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003615 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003616 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003617 }
3618 Py_XDECREF(exc);
3619 Py_XDECREF(errorHandler);
3620 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003621
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003622 onError:
3623 Py_XDECREF(res);
3624 Py_XDECREF(exc);
3625 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003626 return NULL;
3627}
3628
3629PyObject *PyUnicode_Translate(PyObject *str,
3630 PyObject *mapping,
3631 const char *errors)
3632{
3633 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003634
Guido van Rossumd57fd912000-03-10 22:53:23 +00003635 str = PyUnicode_FromObject(str);
3636 if (str == NULL)
3637 goto onError;
3638 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3639 PyUnicode_GET_SIZE(str),
3640 mapping,
3641 errors);
3642 Py_DECREF(str);
3643 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003644
Guido van Rossumd57fd912000-03-10 22:53:23 +00003645 onError:
3646 Py_XDECREF(str);
3647 return NULL;
3648}
Tim Petersced69f82003-09-16 20:30:58 +00003649
Guido van Rossum9e896b32000-04-05 20:11:21 +00003650/* --- Decimal Encoder ---------------------------------------------------- */
3651
3652int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3653 int length,
3654 char *output,
3655 const char *errors)
3656{
3657 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003658 PyObject *errorHandler = NULL;
3659 PyObject *exc = NULL;
3660 const char *encoding = "decimal";
3661 const char *reason = "invalid decimal Unicode string";
3662 /* the following variable is used for caching string comparisons
3663 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3664 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003665
3666 if (output == NULL) {
3667 PyErr_BadArgument();
3668 return -1;
3669 }
3670
3671 p = s;
3672 end = s + length;
3673 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003674 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003675 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003676 PyObject *repunicode;
3677 int repsize;
3678 int newpos;
3679 Py_UNICODE *uni2;
3680 Py_UNICODE *collstart;
3681 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003682
Guido van Rossum9e896b32000-04-05 20:11:21 +00003683 if (Py_UNICODE_ISSPACE(ch)) {
3684 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003685 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003686 continue;
3687 }
3688 decimal = Py_UNICODE_TODECIMAL(ch);
3689 if (decimal >= 0) {
3690 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003691 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003692 continue;
3693 }
Guido van Rossumba477042000-04-06 18:18:10 +00003694 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003695 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003696 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003697 continue;
3698 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003699 /* All other characters are considered unencodable */
3700 collstart = p;
3701 collend = p+1;
3702 while (collend < end) {
3703 if ((0 < *collend && *collend < 256) ||
3704 !Py_UNICODE_ISSPACE(*collend) ||
3705 Py_UNICODE_TODECIMAL(*collend))
3706 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003707 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003708 /* cache callback name lookup
3709 * (if not done yet, i.e. it's the first error) */
3710 if (known_errorHandler==-1) {
3711 if ((errors==NULL) || (!strcmp(errors, "strict")))
3712 known_errorHandler = 1;
3713 else if (!strcmp(errors, "replace"))
3714 known_errorHandler = 2;
3715 else if (!strcmp(errors, "ignore"))
3716 known_errorHandler = 3;
3717 else if (!strcmp(errors, "xmlcharrefreplace"))
3718 known_errorHandler = 4;
3719 else
3720 known_errorHandler = 0;
3721 }
3722 switch (known_errorHandler) {
3723 case 1: /* strict */
3724 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3725 goto onError;
3726 case 2: /* replace */
3727 for (p = collstart; p < collend; ++p)
3728 *output++ = '?';
3729 /* fall through */
3730 case 3: /* ignore */
3731 p = collend;
3732 break;
3733 case 4: /* xmlcharrefreplace */
3734 /* generate replacement (temporarily (mis)uses p) */
3735 for (p = collstart; p < collend; ++p)
3736 output += sprintf(output, "&#%d;", (int)*p);
3737 p = collend;
3738 break;
3739 default:
3740 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3741 encoding, reason, s, length, &exc,
3742 collstart-s, collend-s, &newpos);
3743 if (repunicode == NULL)
3744 goto onError;
3745 /* generate replacement */
3746 repsize = PyUnicode_GET_SIZE(repunicode);
3747 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3748 Py_UNICODE ch = *uni2;
3749 if (Py_UNICODE_ISSPACE(ch))
3750 *output++ = ' ';
3751 else {
3752 decimal = Py_UNICODE_TODECIMAL(ch);
3753 if (decimal >= 0)
3754 *output++ = '0' + decimal;
3755 else if (0 < ch && ch < 256)
3756 *output++ = (char)ch;
3757 else {
3758 Py_DECREF(repunicode);
3759 raise_encode_exception(&exc, encoding,
3760 s, length, collstart-s, collend-s, reason);
3761 goto onError;
3762 }
3763 }
3764 }
3765 p = s + newpos;
3766 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003767 }
3768 }
3769 /* 0-terminate the output string */
3770 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003771 Py_XDECREF(exc);
3772 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003773 return 0;
3774
3775 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003776 Py_XDECREF(exc);
3777 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003778 return -1;
3779}
3780
Guido van Rossumd57fd912000-03-10 22:53:23 +00003781/* --- Helpers ------------------------------------------------------------ */
3782
Tim Petersced69f82003-09-16 20:30:58 +00003783static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784int count(PyUnicodeObject *self,
3785 int start,
3786 int end,
3787 PyUnicodeObject *substring)
3788{
3789 int count = 0;
3790
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003791 if (start < 0)
3792 start += self->length;
3793 if (start < 0)
3794 start = 0;
3795 if (end > self->length)
3796 end = self->length;
3797 if (end < 0)
3798 end += self->length;
3799 if (end < 0)
3800 end = 0;
3801
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003802 if (substring->length == 0)
3803 return (end - start + 1);
3804
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805 end -= substring->length;
3806
3807 while (start <= end)
3808 if (Py_UNICODE_MATCH(self, start, substring)) {
3809 count++;
3810 start += substring->length;
3811 } else
3812 start++;
3813
3814 return count;
3815}
3816
3817int PyUnicode_Count(PyObject *str,
3818 PyObject *substr,
3819 int start,
3820 int end)
3821{
3822 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003823
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824 str = PyUnicode_FromObject(str);
3825 if (str == NULL)
3826 return -1;
3827 substr = PyUnicode_FromObject(substr);
3828 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003829 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830 return -1;
3831 }
Tim Petersced69f82003-09-16 20:30:58 +00003832
Guido van Rossumd57fd912000-03-10 22:53:23 +00003833 result = count((PyUnicodeObject *)str,
3834 start, end,
3835 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003836
Guido van Rossumd57fd912000-03-10 22:53:23 +00003837 Py_DECREF(str);
3838 Py_DECREF(substr);
3839 return result;
3840}
3841
Tim Petersced69f82003-09-16 20:30:58 +00003842static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843int findstring(PyUnicodeObject *self,
3844 PyUnicodeObject *substring,
3845 int start,
3846 int end,
3847 int direction)
3848{
3849 if (start < 0)
3850 start += self->length;
3851 if (start < 0)
3852 start = 0;
3853
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854 if (end > self->length)
3855 end = self->length;
3856 if (end < 0)
3857 end += self->length;
3858 if (end < 0)
3859 end = 0;
3860
Guido van Rossum76afbd92002-08-20 17:29:29 +00003861 if (substring->length == 0)
3862 return (direction > 0) ? start : end;
3863
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864 end -= substring->length;
3865
3866 if (direction < 0) {
3867 for (; end >= start; end--)
3868 if (Py_UNICODE_MATCH(self, end, substring))
3869 return end;
3870 } else {
3871 for (; start <= end; start++)
3872 if (Py_UNICODE_MATCH(self, start, substring))
3873 return start;
3874 }
3875
3876 return -1;
3877}
3878
3879int PyUnicode_Find(PyObject *str,
3880 PyObject *substr,
3881 int start,
3882 int end,
3883 int direction)
3884{
3885 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003886
Guido van Rossumd57fd912000-03-10 22:53:23 +00003887 str = PyUnicode_FromObject(str);
3888 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003889 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003890 substr = PyUnicode_FromObject(substr);
3891 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003892 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003893 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003894 }
Tim Petersced69f82003-09-16 20:30:58 +00003895
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896 result = findstring((PyUnicodeObject *)str,
3897 (PyUnicodeObject *)substr,
3898 start, end, direction);
3899 Py_DECREF(str);
3900 Py_DECREF(substr);
3901 return result;
3902}
3903
Tim Petersced69f82003-09-16 20:30:58 +00003904static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003905int tailmatch(PyUnicodeObject *self,
3906 PyUnicodeObject *substring,
3907 int start,
3908 int end,
3909 int direction)
3910{
3911 if (start < 0)
3912 start += self->length;
3913 if (start < 0)
3914 start = 0;
3915
3916 if (substring->length == 0)
3917 return 1;
3918
3919 if (end > self->length)
3920 end = self->length;
3921 if (end < 0)
3922 end += self->length;
3923 if (end < 0)
3924 end = 0;
3925
3926 end -= substring->length;
3927 if (end < start)
3928 return 0;
3929
3930 if (direction > 0) {
3931 if (Py_UNICODE_MATCH(self, end, substring))
3932 return 1;
3933 } else {
3934 if (Py_UNICODE_MATCH(self, start, substring))
3935 return 1;
3936 }
3937
3938 return 0;
3939}
3940
3941int PyUnicode_Tailmatch(PyObject *str,
3942 PyObject *substr,
3943 int start,
3944 int end,
3945 int direction)
3946{
3947 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003948
Guido van Rossumd57fd912000-03-10 22:53:23 +00003949 str = PyUnicode_FromObject(str);
3950 if (str == NULL)
3951 return -1;
3952 substr = PyUnicode_FromObject(substr);
3953 if (substr == NULL) {
3954 Py_DECREF(substr);
3955 return -1;
3956 }
Tim Petersced69f82003-09-16 20:30:58 +00003957
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 result = tailmatch((PyUnicodeObject *)str,
3959 (PyUnicodeObject *)substr,
3960 start, end, direction);
3961 Py_DECREF(str);
3962 Py_DECREF(substr);
3963 return result;
3964}
3965
Tim Petersced69f82003-09-16 20:30:58 +00003966static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967const Py_UNICODE *findchar(const Py_UNICODE *s,
3968 int size,
3969 Py_UNICODE ch)
3970{
3971 /* like wcschr, but doesn't stop at NULL characters */
3972
3973 while (size-- > 0) {
3974 if (*s == ch)
3975 return s;
3976 s++;
3977 }
3978
3979 return NULL;
3980}
3981
3982/* Apply fixfct filter to the Unicode object self and return a
3983 reference to the modified object */
3984
Tim Petersced69f82003-09-16 20:30:58 +00003985static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986PyObject *fixup(PyUnicodeObject *self,
3987 int (*fixfct)(PyUnicodeObject *s))
3988{
3989
3990 PyUnicodeObject *u;
3991
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003992 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003993 if (u == NULL)
3994 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003995
3996 Py_UNICODE_COPY(u->str, self->str, self->length);
3997
Tim Peters7a29bd52001-09-12 03:03:31 +00003998 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003999 /* fixfct should return TRUE if it modified the buffer. If
4000 FALSE, return a reference to the original buffer instead
4001 (to save space, not time) */
4002 Py_INCREF(self);
4003 Py_DECREF(u);
4004 return (PyObject*) self;
4005 }
4006 return (PyObject*) u;
4007}
4008
Tim Petersced69f82003-09-16 20:30:58 +00004009static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010int fixupper(PyUnicodeObject *self)
4011{
4012 int len = self->length;
4013 Py_UNICODE *s = self->str;
4014 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004015
Guido van Rossumd57fd912000-03-10 22:53:23 +00004016 while (len-- > 0) {
4017 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004018
Guido van Rossumd57fd912000-03-10 22:53:23 +00004019 ch = Py_UNICODE_TOUPPER(*s);
4020 if (ch != *s) {
4021 status = 1;
4022 *s = ch;
4023 }
4024 s++;
4025 }
4026
4027 return status;
4028}
4029
Tim Petersced69f82003-09-16 20:30:58 +00004030static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031int fixlower(PyUnicodeObject *self)
4032{
4033 int len = self->length;
4034 Py_UNICODE *s = self->str;
4035 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004036
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037 while (len-- > 0) {
4038 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004039
Guido van Rossumd57fd912000-03-10 22:53:23 +00004040 ch = Py_UNICODE_TOLOWER(*s);
4041 if (ch != *s) {
4042 status = 1;
4043 *s = ch;
4044 }
4045 s++;
4046 }
4047
4048 return status;
4049}
4050
Tim Petersced69f82003-09-16 20:30:58 +00004051static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052int fixswapcase(PyUnicodeObject *self)
4053{
4054 int len = self->length;
4055 Py_UNICODE *s = self->str;
4056 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004057
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058 while (len-- > 0) {
4059 if (Py_UNICODE_ISUPPER(*s)) {
4060 *s = Py_UNICODE_TOLOWER(*s);
4061 status = 1;
4062 } else if (Py_UNICODE_ISLOWER(*s)) {
4063 *s = Py_UNICODE_TOUPPER(*s);
4064 status = 1;
4065 }
4066 s++;
4067 }
4068
4069 return status;
4070}
4071
Tim Petersced69f82003-09-16 20:30:58 +00004072static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073int fixcapitalize(PyUnicodeObject *self)
4074{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004075 int len = self->length;
4076 Py_UNICODE *s = self->str;
4077 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004078
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004079 if (len == 0)
4080 return 0;
4081 if (Py_UNICODE_ISLOWER(*s)) {
4082 *s = Py_UNICODE_TOUPPER(*s);
4083 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004085 s++;
4086 while (--len > 0) {
4087 if (Py_UNICODE_ISUPPER(*s)) {
4088 *s = Py_UNICODE_TOLOWER(*s);
4089 status = 1;
4090 }
4091 s++;
4092 }
4093 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004094}
4095
4096static
4097int fixtitle(PyUnicodeObject *self)
4098{
4099 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4100 register Py_UNICODE *e;
4101 int previous_is_cased;
4102
4103 /* Shortcut for single character strings */
4104 if (PyUnicode_GET_SIZE(self) == 1) {
4105 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4106 if (*p != ch) {
4107 *p = ch;
4108 return 1;
4109 }
4110 else
4111 return 0;
4112 }
Tim Petersced69f82003-09-16 20:30:58 +00004113
Guido van Rossumd57fd912000-03-10 22:53:23 +00004114 e = p + PyUnicode_GET_SIZE(self);
4115 previous_is_cased = 0;
4116 for (; p < e; p++) {
4117 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004118
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119 if (previous_is_cased)
4120 *p = Py_UNICODE_TOLOWER(ch);
4121 else
4122 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004123
4124 if (Py_UNICODE_ISLOWER(ch) ||
4125 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126 Py_UNICODE_ISTITLE(ch))
4127 previous_is_cased = 1;
4128 else
4129 previous_is_cased = 0;
4130 }
4131 return 1;
4132}
4133
Tim Peters8ce9f162004-08-27 01:49:32 +00004134PyObject *
4135PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136{
Tim Peters8ce9f162004-08-27 01:49:32 +00004137 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004138 const Py_UNICODE blank = ' ';
4139 const Py_UNICODE *sep = &blank;
4140 size_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004141 PyUnicodeObject *res = NULL; /* the result */
4142 size_t res_alloc = 100; /* # allocated bytes for string in res */
4143 size_t res_used; /* # used bytes */
4144 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4145 PyObject *fseq; /* PySequence_Fast(seq) */
4146 int seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004147 PyObject *item;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148 int i;
4149
Tim Peters05eba1f2004-08-27 21:32:02 +00004150 fseq = PySequence_Fast(seq, "");
4151 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004152 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004153 }
4154
Tim Peters91879ab2004-08-27 22:35:44 +00004155 /* Grrrr. A codec may be invoked to convert str objects to
4156 * Unicode, and so it's possible to call back into Python code
4157 * during PyUnicode_FromObject(), and so it's possible for a sick
4158 * codec to change the size of fseq (if seq is a list). Therefore
4159 * we have to keep refetching the size -- can't assume seqlen
4160 * is invariant.
4161 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004162 seqlen = PySequence_Fast_GET_SIZE(fseq);
4163 /* If empty sequence, return u"". */
4164 if (seqlen == 0) {
4165 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4166 goto Done;
4167 }
4168 /* If singleton sequence with an exact Unicode, return that. */
4169 if (seqlen == 1) {
4170 item = PySequence_Fast_GET_ITEM(fseq, 0);
4171 if (PyUnicode_CheckExact(item)) {
4172 Py_INCREF(item);
4173 res = (PyUnicodeObject *)item;
4174 goto Done;
4175 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004176 }
4177
Tim Peters05eba1f2004-08-27 21:32:02 +00004178 /* At least two items to join, or one that isn't exact Unicode. */
4179 if (seqlen > 1) {
4180 /* Set up sep and seplen -- they're needed. */
4181 if (separator == NULL) {
4182 sep = &blank;
4183 seplen = 1;
4184 }
4185 else {
4186 internal_separator = PyUnicode_FromObject(separator);
4187 if (internal_separator == NULL)
4188 goto onError;
4189 sep = PyUnicode_AS_UNICODE(internal_separator);
4190 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004191 /* In case PyUnicode_FromObject() mutated seq. */
4192 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004193 }
4194 }
4195
4196 /* Get space. */
4197 res = _PyUnicode_New((int)res_alloc);
4198 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004199 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004200 res_p = PyUnicode_AS_UNICODE(res);
4201 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004202
Tim Peters05eba1f2004-08-27 21:32:02 +00004203 for (i = 0; i < seqlen; ++i) {
4204 size_t itemlen;
4205 size_t new_res_used;
4206
4207 item = PySequence_Fast_GET_ITEM(fseq, i);
4208 /* Convert item to Unicode. */
4209 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4210 PyErr_Format(PyExc_TypeError,
4211 "sequence item %i: expected string or Unicode,"
4212 " %.80s found",
4213 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004214 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004215 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004216 item = PyUnicode_FromObject(item);
4217 if (item == NULL)
4218 goto onError;
4219 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004220
Tim Peters91879ab2004-08-27 22:35:44 +00004221 /* In case PyUnicode_FromObject() mutated seq. */
4222 seqlen = PySequence_Fast_GET_SIZE(fseq);
4223
Tim Peters8ce9f162004-08-27 01:49:32 +00004224 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004225 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004226 new_res_used = res_used + itemlen;
4227 if (new_res_used < res_used || new_res_used > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004228 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004229 if (i < seqlen - 1) {
4230 new_res_used += seplen;
4231 if (new_res_used < res_used || new_res_used > INT_MAX)
4232 goto Overflow;
4233 }
4234 if (new_res_used > res_alloc) {
4235 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004236 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004237 size_t oldsize = res_alloc;
4238 res_alloc += res_alloc;
4239 if (res_alloc < oldsize || res_alloc > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004240 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004241 } while (new_res_used > res_alloc);
4242 if (_PyUnicode_Resize(&res, (int)res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004243 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004244 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004245 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004246 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004247 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004248
4249 /* Copy item, and maybe the separator. */
4250 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), (int)itemlen);
4251 res_p += itemlen;
4252 if (i < seqlen - 1) {
4253 Py_UNICODE_COPY(res_p, sep, (int)seplen);
4254 res_p += seplen;
4255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004256 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004257 res_used = new_res_used;
4258 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004259
Tim Peters05eba1f2004-08-27 21:32:02 +00004260 /* Shrink res to match the used area; this probably can't fail,
4261 * but it's cheap to check.
4262 */
4263 if (_PyUnicode_Resize(&res, (int)res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004264 goto onError;
4265
4266 Done:
4267 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004268 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004269 return (PyObject *)res;
4270
Tim Peters8ce9f162004-08-27 01:49:32 +00004271 Overflow:
4272 PyErr_SetString(PyExc_OverflowError,
4273 "join() is too long for a Python string");
4274 Py_DECREF(item);
4275 /* fall through */
4276
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004278 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004279 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004280 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004281 return NULL;
4282}
4283
Tim Petersced69f82003-09-16 20:30:58 +00004284static
4285PyUnicodeObject *pad(PyUnicodeObject *self,
4286 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004287 int right,
4288 Py_UNICODE fill)
4289{
4290 PyUnicodeObject *u;
4291
4292 if (left < 0)
4293 left = 0;
4294 if (right < 0)
4295 right = 0;
4296
Tim Peters7a29bd52001-09-12 03:03:31 +00004297 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004298 Py_INCREF(self);
4299 return self;
4300 }
4301
4302 u = _PyUnicode_New(left + self->length + right);
4303 if (u) {
4304 if (left)
4305 Py_UNICODE_FILL(u->str, fill, left);
4306 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4307 if (right)
4308 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4309 }
4310
4311 return u;
4312}
4313
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Relies on the expanding function providing a `PyObject *str`
   scratch variable, a `list` variable and an `onError` label, which
   is jumped to on failure.  Wrapped in do { } while (0) so that the
   expansion is a single statement and stays correct even under an
   unbraced if/else. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4324
/* Insert the slice data[left:right] at the front of `list` as a new
   Unicode object.

   Relies on the expanding function providing a `PyObject *str`
   scratch variable, a `list` variable and an `onError` label, which
   is jumped to on failure.  Wrapped in do { } while (0) so that the
   expansion is a single statement and stays correct even under an
   unbraced if/else. */
#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Insert(list, 0, str)) {                              \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4335
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336static
4337PyObject *split_whitespace(PyUnicodeObject *self,
4338 PyObject *list,
4339 int maxcount)
4340{
4341 register int i;
4342 register int j;
4343 int len = self->length;
4344 PyObject *str;
4345
4346 for (i = j = 0; i < len; ) {
4347 /* find a token */
4348 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4349 i++;
4350 j = i;
4351 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4352 i++;
4353 if (j < i) {
4354 if (maxcount-- <= 0)
4355 break;
4356 SPLIT_APPEND(self->str, j, i);
4357 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4358 i++;
4359 j = i;
4360 }
4361 }
4362 if (j < len) {
4363 SPLIT_APPEND(self->str, j, len);
4364 }
4365 return list;
4366
4367 onError:
4368 Py_DECREF(list);
4369 return NULL;
4370}
4371
4372PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004373 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004374{
4375 register int i;
4376 register int j;
4377 int len;
4378 PyObject *list;
4379 PyObject *str;
4380 Py_UNICODE *data;
4381
4382 string = PyUnicode_FromObject(string);
4383 if (string == NULL)
4384 return NULL;
4385 data = PyUnicode_AS_UNICODE(string);
4386 len = PyUnicode_GET_SIZE(string);
4387
Guido van Rossumd57fd912000-03-10 22:53:23 +00004388 list = PyList_New(0);
4389 if (!list)
4390 goto onError;
4391
4392 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004393 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004394
Guido van Rossumd57fd912000-03-10 22:53:23 +00004395 /* Find a line and append it */
4396 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4397 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398
4399 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004400 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004401 if (i < len) {
4402 if (data[i] == '\r' && i + 1 < len &&
4403 data[i+1] == '\n')
4404 i += 2;
4405 else
4406 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004407 if (keepends)
4408 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004409 }
Guido van Rossum86662912000-04-11 15:38:46 +00004410 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004411 j = i;
4412 }
4413 if (j < len) {
4414 SPLIT_APPEND(data, j, len);
4415 }
4416
4417 Py_DECREF(string);
4418 return list;
4419
4420 onError:
4421 Py_DECREF(list);
4422 Py_DECREF(string);
4423 return NULL;
4424}
4425
Tim Petersced69f82003-09-16 20:30:58 +00004426static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004427PyObject *split_char(PyUnicodeObject *self,
4428 PyObject *list,
4429 Py_UNICODE ch,
4430 int maxcount)
4431{
4432 register int i;
4433 register int j;
4434 int len = self->length;
4435 PyObject *str;
4436
4437 for (i = j = 0; i < len; ) {
4438 if (self->str[i] == ch) {
4439 if (maxcount-- <= 0)
4440 break;
4441 SPLIT_APPEND(self->str, j, i);
4442 i = j = i + 1;
4443 } else
4444 i++;
4445 }
4446 if (j <= len) {
4447 SPLIT_APPEND(self->str, j, len);
4448 }
4449 return list;
4450
4451 onError:
4452 Py_DECREF(list);
4453 return NULL;
4454}
4455
Tim Petersced69f82003-09-16 20:30:58 +00004456static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004457PyObject *split_substring(PyUnicodeObject *self,
4458 PyObject *list,
4459 PyUnicodeObject *substring,
4460 int maxcount)
4461{
4462 register int i;
4463 register int j;
4464 int len = self->length;
4465 int sublen = substring->length;
4466 PyObject *str;
4467
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004468 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469 if (Py_UNICODE_MATCH(self, i, substring)) {
4470 if (maxcount-- <= 0)
4471 break;
4472 SPLIT_APPEND(self->str, j, i);
4473 i = j = i + sublen;
4474 } else
4475 i++;
4476 }
4477 if (j <= len) {
4478 SPLIT_APPEND(self->str, j, len);
4479 }
4480 return list;
4481
4482 onError:
4483 Py_DECREF(list);
4484 return NULL;
4485}
4486
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004487static
4488PyObject *rsplit_whitespace(PyUnicodeObject *self,
4489 PyObject *list,
4490 int maxcount)
4491{
4492 register int i;
4493 register int j;
4494 int len = self->length;
4495 PyObject *str;
4496
4497 for (i = j = len - 1; i >= 0; ) {
4498 /* find a token */
4499 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4500 i--;
4501 j = i;
4502 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4503 i--;
4504 if (j > i) {
4505 if (maxcount-- <= 0)
4506 break;
4507 SPLIT_INSERT(self->str, i + 1, j + 1);
4508 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4509 i--;
4510 j = i;
4511 }
4512 }
4513 if (j >= 0) {
4514 SPLIT_INSERT(self->str, 0, j + 1);
4515 }
4516 return list;
4517
4518 onError:
4519 Py_DECREF(list);
4520 return NULL;
4521}
4522
4523static
4524PyObject *rsplit_char(PyUnicodeObject *self,
4525 PyObject *list,
4526 Py_UNICODE ch,
4527 int maxcount)
4528{
4529 register int i;
4530 register int j;
4531 int len = self->length;
4532 PyObject *str;
4533
4534 for (i = j = len - 1; i >= 0; ) {
4535 if (self->str[i] == ch) {
4536 if (maxcount-- <= 0)
4537 break;
4538 SPLIT_INSERT(self->str, i + 1, j + 1);
4539 j = i = i - 1;
4540 } else
4541 i--;
4542 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004543 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004544 SPLIT_INSERT(self->str, 0, j + 1);
4545 }
4546 return list;
4547
4548 onError:
4549 Py_DECREF(list);
4550 return NULL;
4551}
4552
4553static
4554PyObject *rsplit_substring(PyUnicodeObject *self,
4555 PyObject *list,
4556 PyUnicodeObject *substring,
4557 int maxcount)
4558{
4559 register int i;
4560 register int j;
4561 int len = self->length;
4562 int sublen = substring->length;
4563 PyObject *str;
4564
4565 for (i = len - sublen, j = len; i >= 0; ) {
4566 if (Py_UNICODE_MATCH(self, i, substring)) {
4567 if (maxcount-- <= 0)
4568 break;
4569 SPLIT_INSERT(self->str, i + sublen, j);
4570 j = i;
4571 i -= sublen;
4572 } else
4573 i--;
4574 }
4575 if (j >= 0) {
4576 SPLIT_INSERT(self->str, 0, j);
4577 }
4578 return list;
4579
4580 onError:
4581 Py_DECREF(list);
4582 return NULL;
4583}
4584
Guido van Rossumd57fd912000-03-10 22:53:23 +00004585#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004586#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587
4588static
4589PyObject *split(PyUnicodeObject *self,
4590 PyUnicodeObject *substring,
4591 int maxcount)
4592{
4593 PyObject *list;
4594
4595 if (maxcount < 0)
4596 maxcount = INT_MAX;
4597
4598 list = PyList_New(0);
4599 if (!list)
4600 return NULL;
4601
4602 if (substring == NULL)
4603 return split_whitespace(self,list,maxcount);
4604
4605 else if (substring->length == 1)
4606 return split_char(self,list,substring->str[0],maxcount);
4607
4608 else if (substring->length == 0) {
4609 Py_DECREF(list);
4610 PyErr_SetString(PyExc_ValueError, "empty separator");
4611 return NULL;
4612 }
4613 else
4614 return split_substring(self,list,substring,maxcount);
4615}
4616
Tim Petersced69f82003-09-16 20:30:58 +00004617static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004618PyObject *rsplit(PyUnicodeObject *self,
4619 PyUnicodeObject *substring,
4620 int maxcount)
4621{
4622 PyObject *list;
4623
4624 if (maxcount < 0)
4625 maxcount = INT_MAX;
4626
4627 list = PyList_New(0);
4628 if (!list)
4629 return NULL;
4630
4631 if (substring == NULL)
4632 return rsplit_whitespace(self,list,maxcount);
4633
4634 else if (substring->length == 1)
4635 return rsplit_char(self,list,substring->str[0],maxcount);
4636
4637 else if (substring->length == 0) {
4638 Py_DECREF(list);
4639 PyErr_SetString(PyExc_ValueError, "empty separator");
4640 return NULL;
4641 }
4642 else
4643 return rsplit_substring(self,list,substring,maxcount);
4644}
4645
4646static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004647PyObject *replace(PyUnicodeObject *self,
4648 PyUnicodeObject *str1,
4649 PyUnicodeObject *str2,
4650 int maxcount)
4651{
4652 PyUnicodeObject *u;
4653
4654 if (maxcount < 0)
4655 maxcount = INT_MAX;
4656
4657 if (str1->length == 1 && str2->length == 1) {
4658 int i;
4659
4660 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004661 if (!findchar(self->str, self->length, str1->str[0]) &&
4662 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663 /* nothing to replace, return original string */
4664 Py_INCREF(self);
4665 u = self;
4666 } else {
4667 Py_UNICODE u1 = str1->str[0];
4668 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004669
Guido van Rossumd57fd912000-03-10 22:53:23 +00004670 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004671 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004672 self->length
4673 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004674 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004675 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004676 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004677 for (i = 0; i < u->length; i++)
4678 if (u->str[i] == u1) {
4679 if (--maxcount < 0)
4680 break;
4681 u->str[i] = u2;
4682 }
4683 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004684 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685
4686 } else {
4687 int n, i;
4688 Py_UNICODE *p;
4689
4690 /* replace strings */
4691 n = count(self, 0, self->length, str1);
4692 if (n > maxcount)
4693 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004694 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004695 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004696 if (PyUnicode_CheckExact(self)) {
4697 Py_INCREF(self);
4698 u = self;
4699 }
4700 else {
4701 u = (PyUnicodeObject *)
4702 PyUnicode_FromUnicode(self->str, self->length);
4703 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004704 } else {
4705 u = _PyUnicode_New(
4706 self->length + n * (str2->length - str1->length));
4707 if (u) {
4708 i = 0;
4709 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004710 if (str1->length > 0) {
4711 while (i <= self->length - str1->length)
4712 if (Py_UNICODE_MATCH(self, i, str1)) {
4713 /* replace string segment */
4714 Py_UNICODE_COPY(p, str2->str, str2->length);
4715 p += str2->length;
4716 i += str1->length;
4717 if (--n <= 0) {
4718 /* copy remaining part */
4719 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4720 break;
4721 }
4722 } else
4723 *p++ = self->str[i++];
4724 } else {
4725 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726 Py_UNICODE_COPY(p, str2->str, str2->length);
4727 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004728 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004731 }
4732 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4733 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734 }
4735 }
4736 }
Tim Petersced69f82003-09-16 20:30:58 +00004737
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738 return (PyObject *) u;
4739}
4740
4741/* --- Unicode Object Methods --------------------------------------------- */
4742
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004743PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744"S.title() -> unicode\n\
4745\n\
4746Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004747characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748
4749static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004750unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004752 return fixup(self, fixtitle);
4753}
4754
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004755PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756"S.capitalize() -> unicode\n\
4757\n\
4758Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004759have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760
4761static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004762unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764 return fixup(self, fixcapitalize);
4765}
4766
#if 0
/* Disabled code: S.capwords() is not compiled in. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, run each word through fixup() with
   fixcapitalize(), and join the words again with single blanks.
   Returns a new reference, or NULL with an exception set. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized =
            fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                  fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
4804
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004805/* Argument converter. Coerces to a single unicode character */
4806
4807static int
4808convert_uc(PyObject *obj, void *addr)
4809{
4810 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4811 PyObject *uniobj;
4812 Py_UNICODE *unistr;
4813
4814 uniobj = PyUnicode_FromObject(obj);
4815 if (uniobj == NULL) {
4816 PyErr_SetString(PyExc_TypeError,
4817 "The fill character cannot be converted to Unicode");
4818 return 0;
4819 }
4820 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4821 PyErr_SetString(PyExc_TypeError,
4822 "The fill character must be exactly one character long");
4823 Py_DECREF(uniobj);
4824 return 0;
4825 }
4826 unistr = PyUnicode_AS_UNICODE(uniobj);
4827 *fillcharloc = unistr[0];
4828 Py_DECREF(uniobj);
4829 return 1;
4830}
4831
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004832PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004833"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004835Return S centered in a Unicode string of length width. Padding is\n\
4836done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004837
4838static PyObject *
4839unicode_center(PyUnicodeObject *self, PyObject *args)
4840{
4841 int marg, left;
4842 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004843 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004845 if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 return NULL;
4847
Tim Peters7a29bd52001-09-12 03:03:31 +00004848 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849 Py_INCREF(self);
4850 return (PyObject*) self;
4851 }
4852
4853 marg = width - self->length;
4854 left = marg / 2 + (marg & width & 1);
4855
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004856 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857}
4858
Marc-André Lemburge5034372000-08-08 08:04:29 +00004859#if 0
4860
4861/* This code should go into some future Unicode collation support
4862 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004863 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004864
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004865/* speedy UTF-16 code point order comparison */
4866/* gleaned from: */
4867/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4868
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004869static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004870{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004871 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004872 0, 0, 0, 0, 0, 0, 0, 0,
4873 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004874 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004875};
4876
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877static int
4878unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4879{
4880 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004881
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882 Py_UNICODE *s1 = str1->str;
4883 Py_UNICODE *s2 = str2->str;
4884
4885 len1 = str1->length;
4886 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004887
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004889 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004890
4891 c1 = *s1++;
4892 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004893
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004894 if (c1 > (1<<11) * 26)
4895 c1 += utf16Fixup[c1>>11];
4896 if (c2 > (1<<11) * 26)
4897 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004898 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004899
4900 if (c1 != c2)
4901 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004902
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004903 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904 }
4905
4906 return (len1 < len2) ? -1 : (len1 != len2);
4907}
4908
Marc-André Lemburge5034372000-08-08 08:04:29 +00004909#else
4910
4911static int
4912unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4913{
4914 register int len1, len2;
4915
4916 Py_UNICODE *s1 = str1->str;
4917 Py_UNICODE *s2 = str2->str;
4918
4919 len1 = str1->length;
4920 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004921
Marc-André Lemburge5034372000-08-08 08:04:29 +00004922 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004923 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004924
Fredrik Lundh45714e92001-06-26 16:39:36 +00004925 c1 = *s1++;
4926 c2 = *s2++;
4927
4928 if (c1 != c2)
4929 return (c1 < c2) ? -1 : 1;
4930
Marc-André Lemburge5034372000-08-08 08:04:29 +00004931 len1--; len2--;
4932 }
4933
4934 return (len1 < len2) ? -1 : (len1 != len2);
4935}
4936
4937#endif
4938
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939int PyUnicode_Compare(PyObject *left,
4940 PyObject *right)
4941{
4942 PyUnicodeObject *u = NULL, *v = NULL;
4943 int result;
4944
4945 /* Coerce the two arguments */
4946 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4947 if (u == NULL)
4948 goto onError;
4949 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4950 if (v == NULL)
4951 goto onError;
4952
Thomas Wouters7e474022000-07-16 12:04:32 +00004953 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004954 if (v == u) {
4955 Py_DECREF(u);
4956 Py_DECREF(v);
4957 return 0;
4958 }
4959
4960 result = unicode_compare(u, v);
4961
4962 Py_DECREF(u);
4963 Py_DECREF(v);
4964 return result;
4965
4966onError:
4967 Py_XDECREF(u);
4968 Py_XDECREF(v);
4969 return -1;
4970}
4971
Guido van Rossum403d68b2000-03-13 15:55:09 +00004972int PyUnicode_Contains(PyObject *container,
4973 PyObject *element)
4974{
4975 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004976 int result, size;
4977 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004978
4979 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004980 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004981 if (v == NULL) {
4982 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004983 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004984 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004985 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004986 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004987 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004988 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004989
Barry Warsaw817918c2002-08-06 16:58:21 +00004990 size = PyUnicode_GET_SIZE(v);
4991 rhs = PyUnicode_AS_UNICODE(v);
4992 lhs = PyUnicode_AS_UNICODE(u);
4993
Guido van Rossum403d68b2000-03-13 15:55:09 +00004994 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004995 if (size == 1) {
4996 end = lhs + PyUnicode_GET_SIZE(u);
4997 while (lhs < end) {
4998 if (*lhs++ == *rhs) {
4999 result = 1;
5000 break;
5001 }
5002 }
5003 }
5004 else {
5005 end = lhs + (PyUnicode_GET_SIZE(u) - size);
5006 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00005007 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00005008 result = 1;
5009 break;
5010 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005011 }
5012 }
5013
5014 Py_DECREF(u);
5015 Py_DECREF(v);
5016 return result;
5017
5018onError:
5019 Py_XDECREF(u);
5020 Py_XDECREF(v);
5021 return -1;
5022}
5023
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024/* Concat to string or Unicode object giving a new Unicode object. */
5025
5026PyObject *PyUnicode_Concat(PyObject *left,
5027 PyObject *right)
5028{
5029 PyUnicodeObject *u = NULL, *v = NULL, *w;
5030
5031 /* Coerce the two arguments */
5032 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5033 if (u == NULL)
5034 goto onError;
5035 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5036 if (v == NULL)
5037 goto onError;
5038
5039 /* Shortcuts */
5040 if (v == unicode_empty) {
5041 Py_DECREF(v);
5042 return (PyObject *)u;
5043 }
5044 if (u == unicode_empty) {
5045 Py_DECREF(u);
5046 return (PyObject *)v;
5047 }
5048
5049 /* Concat the two Unicode strings */
5050 w = _PyUnicode_New(u->length + v->length);
5051 if (w == NULL)
5052 goto onError;
5053 Py_UNICODE_COPY(w->str, u->str, u->length);
5054 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5055
5056 Py_DECREF(u);
5057 Py_DECREF(v);
5058 return (PyObject *)w;
5059
5060onError:
5061 Py_XDECREF(u);
5062 Py_XDECREF(v);
5063 return NULL;
5064}
5065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005066PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067"S.count(sub[, start[, end]]) -> int\n\
5068\n\
5069Return the number of occurrences of substring sub in Unicode string\n\
5070S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005071interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072
5073static PyObject *
5074unicode_count(PyUnicodeObject *self, PyObject *args)
5075{
5076 PyUnicodeObject *substring;
5077 int start = 0;
5078 int end = INT_MAX;
5079 PyObject *result;
5080
Guido van Rossumb8872e62000-05-09 14:14:27 +00005081 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5082 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083 return NULL;
5084
5085 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5086 (PyObject *)substring);
5087 if (substring == NULL)
5088 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005089
Guido van Rossumd57fd912000-03-10 22:53:23 +00005090 if (start < 0)
5091 start += self->length;
5092 if (start < 0)
5093 start = 0;
5094 if (end > self->length)
5095 end = self->length;
5096 if (end < 0)
5097 end += self->length;
5098 if (end < 0)
5099 end = 0;
5100
5101 result = PyInt_FromLong((long) count(self, start, end, substring));
5102
5103 Py_DECREF(substring);
5104 return result;
5105}
5106
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005107PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005108"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005110Encodes S using the codec registered for encoding. encoding defaults\n\
5111to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005112handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005113a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5114'xmlcharrefreplace' as well as any other name registered with\n\
5115codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116
5117static PyObject *
5118unicode_encode(PyUnicodeObject *self, PyObject *args)
5119{
5120 char *encoding = NULL;
5121 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005122 PyObject *v;
5123
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5125 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005126 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005127 if (v == NULL)
5128 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005129 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5130 PyErr_Format(PyExc_TypeError,
5131 "encoder did not return a string/unicode object "
5132 "(type=%.400s)",
5133 v->ob_type->tp_name);
5134 Py_DECREF(v);
5135 return NULL;
5136 }
5137 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005138
5139 onError:
5140 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005141}
5142
5143PyDoc_STRVAR(decode__doc__,
5144"S.decode([encoding[,errors]]) -> string or unicode\n\
5145\n\
5146Decodes S using the codec registered for encoding. encoding defaults\n\
5147to the default encoding. errors may be given to set a different error\n\
5148handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5149a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5150as well as any other name registerd with codecs.register_error that is\n\
5151able to handle UnicodeDecodeErrors.");
5152
5153static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005154unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005155{
5156 char *encoding = NULL;
5157 char *errors = NULL;
5158 PyObject *v;
5159
5160 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5161 return NULL;
5162 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005163 if (v == NULL)
5164 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005165 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5166 PyErr_Format(PyExc_TypeError,
5167 "decoder did not return a string/unicode object "
5168 "(type=%.400s)",
5169 v->ob_type->tp_name);
5170 Py_DECREF(v);
5171 return NULL;
5172 }
5173 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005174
5175 onError:
5176 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177}
5178
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005179PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180"S.expandtabs([tabsize]) -> unicode\n\
5181\n\
5182Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005183If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184
5185static PyObject*
5186unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5187{
5188 Py_UNICODE *e;
5189 Py_UNICODE *p;
5190 Py_UNICODE *q;
5191 int i, j;
5192 PyUnicodeObject *u;
5193 int tabsize = 8;
5194
5195 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5196 return NULL;
5197
Thomas Wouters7e474022000-07-16 12:04:32 +00005198 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199 i = j = 0;
5200 e = self->str + self->length;
5201 for (p = self->str; p < e; p++)
5202 if (*p == '\t') {
5203 if (tabsize > 0)
5204 j += tabsize - (j % tabsize);
5205 }
5206 else {
5207 j++;
5208 if (*p == '\n' || *p == '\r') {
5209 i += j;
5210 j = 0;
5211 }
5212 }
5213
5214 /* Second pass: create output string and fill it */
5215 u = _PyUnicode_New(i + j);
5216 if (!u)
5217 return NULL;
5218
5219 j = 0;
5220 q = u->str;
5221
5222 for (p = self->str; p < e; p++)
5223 if (*p == '\t') {
5224 if (tabsize > 0) {
5225 i = tabsize - (j % tabsize);
5226 j += i;
5227 while (i--)
5228 *q++ = ' ';
5229 }
5230 }
5231 else {
5232 j++;
5233 *q++ = *p;
5234 if (*p == '\n' || *p == '\r')
5235 j = 0;
5236 }
5237
5238 return (PyObject*) u;
5239}
5240
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005241PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242"S.find(sub [,start [,end]]) -> int\n\
5243\n\
5244Return the lowest index in S where substring sub is found,\n\
5245such that sub is contained within s[start,end]. Optional\n\
5246arguments start and end are interpreted as in slice notation.\n\
5247\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005248Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005249
5250static PyObject *
5251unicode_find(PyUnicodeObject *self, PyObject *args)
5252{
5253 PyUnicodeObject *substring;
5254 int start = 0;
5255 int end = INT_MAX;
5256 PyObject *result;
5257
Guido van Rossumb8872e62000-05-09 14:14:27 +00005258 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5259 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260 return NULL;
5261 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5262 (PyObject *)substring);
5263 if (substring == NULL)
5264 return NULL;
5265
5266 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
5267
5268 Py_DECREF(substring);
5269 return result;
5270}
5271
5272static PyObject *
5273unicode_getitem(PyUnicodeObject *self, int index)
5274{
5275 if (index < 0 || index >= self->length) {
5276 PyErr_SetString(PyExc_IndexError, "string index out of range");
5277 return NULL;
5278 }
5279
5280 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5281}
5282
5283static long
5284unicode_hash(PyUnicodeObject *self)
5285{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005286 /* Since Unicode objects compare equal to their ASCII string
5287 counterparts, they should use the individual character values
5288 as basis for their hash value. This is needed to assure that
5289 strings and Unicode objects behave in the same way as
5290 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291
Fredrik Lundhdde61642000-07-10 18:27:47 +00005292 register int len;
5293 register Py_UNICODE *p;
5294 register long x;
5295
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296 if (self->hash != -1)
5297 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005298 len = PyUnicode_GET_SIZE(self);
5299 p = PyUnicode_AS_UNICODE(self);
5300 x = *p << 7;
5301 while (--len >= 0)
5302 x = (1000003*x) ^ *p++;
5303 x ^= PyUnicode_GET_SIZE(self);
5304 if (x == -1)
5305 x = -2;
5306 self->hash = x;
5307 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308}
5309
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005310PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311"S.index(sub [,start [,end]]) -> int\n\
5312\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005313Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314
5315static PyObject *
5316unicode_index(PyUnicodeObject *self, PyObject *args)
5317{
5318 int result;
5319 PyUnicodeObject *substring;
5320 int start = 0;
5321 int end = INT_MAX;
5322
Guido van Rossumb8872e62000-05-09 14:14:27 +00005323 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5324 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005325 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005326
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5328 (PyObject *)substring);
5329 if (substring == NULL)
5330 return NULL;
5331
5332 result = findstring(self, substring, start, end, 1);
5333
5334 Py_DECREF(substring);
5335 if (result < 0) {
5336 PyErr_SetString(PyExc_ValueError, "substring not found");
5337 return NULL;
5338 }
5339 return PyInt_FromLong(result);
5340}
5341
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005342PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005343"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005345Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005346at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347
5348static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005349unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350{
5351 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5352 register const Py_UNICODE *e;
5353 int cased;
5354
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 /* Shortcut for single character strings */
5356 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005357 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005359 /* Special case for empty strings */
5360 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005361 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005362
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363 e = p + PyUnicode_GET_SIZE(self);
5364 cased = 0;
5365 for (; p < e; p++) {
5366 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005367
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005369 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370 else if (!cased && Py_UNICODE_ISLOWER(ch))
5371 cased = 1;
5372 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005373 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005374}
5375
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005376PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005377"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005379Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005380at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381
5382static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005383unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384{
5385 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5386 register const Py_UNICODE *e;
5387 int cased;
5388
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389 /* Shortcut for single character strings */
5390 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005391 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005393 /* Special case for empty strings */
5394 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005395 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005396
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 e = p + PyUnicode_GET_SIZE(self);
5398 cased = 0;
5399 for (; p < e; p++) {
5400 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005401
Guido van Rossumd57fd912000-03-10 22:53:23 +00005402 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005403 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404 else if (!cased && Py_UNICODE_ISUPPER(ch))
5405 cased = 1;
5406 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005407 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408}
5409
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005410PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005411"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005413Return True if S is a titlecased string and there is at least one\n\
5414character in S, i.e. upper- and titlecase characters may only\n\
5415follow uncased characters and lowercase characters only cased ones.\n\
5416Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417
5418static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005419unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005420{
5421 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5422 register const Py_UNICODE *e;
5423 int cased, previous_is_cased;
5424
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425 /* Shortcut for single character strings */
5426 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005427 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5428 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005430 /* Special case for empty strings */
5431 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005432 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005433
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434 e = p + PyUnicode_GET_SIZE(self);
5435 cased = 0;
5436 previous_is_cased = 0;
5437 for (; p < e; p++) {
5438 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005439
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5441 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005442 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005443 previous_is_cased = 1;
5444 cased = 1;
5445 }
5446 else if (Py_UNICODE_ISLOWER(ch)) {
5447 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005448 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449 previous_is_cased = 1;
5450 cased = 1;
5451 }
5452 else
5453 previous_is_cased = 0;
5454 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005455 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456}
5457
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005458PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005459"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005461Return True if all characters in S are whitespace\n\
5462and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463
5464static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005465unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005466{
5467 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5468 register const Py_UNICODE *e;
5469
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470 /* Shortcut for single character strings */
5471 if (PyUnicode_GET_SIZE(self) == 1 &&
5472 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005473 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005475 /* Special case for empty strings */
5476 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005477 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005478
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479 e = p + PyUnicode_GET_SIZE(self);
5480 for (; p < e; p++) {
5481 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005482 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005483 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005484 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485}
5486
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005487PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005488"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005489\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005490Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005491and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005492
5493static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005494unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005495{
5496 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5497 register const Py_UNICODE *e;
5498
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005499 /* Shortcut for single character strings */
5500 if (PyUnicode_GET_SIZE(self) == 1 &&
5501 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005502 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005503
5504 /* Special case for empty strings */
5505 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005506 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005507
5508 e = p + PyUnicode_GET_SIZE(self);
5509 for (; p < e; p++) {
5510 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005511 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005512 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005513 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005514}
5515
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005516PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005517"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005518\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005519Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005520and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005521
5522static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005523unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005524{
5525 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5526 register const Py_UNICODE *e;
5527
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005528 /* Shortcut for single character strings */
5529 if (PyUnicode_GET_SIZE(self) == 1 &&
5530 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005531 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005532
5533 /* Special case for empty strings */
5534 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005535 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005536
5537 e = p + PyUnicode_GET_SIZE(self);
5538 for (; p < e; p++) {
5539 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005540 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005541 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005542 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005543}
5544
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005545PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005546"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005547\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005548Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005549False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005550
5551static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005552unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553{
5554 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5555 register const Py_UNICODE *e;
5556
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557 /* Shortcut for single character strings */
5558 if (PyUnicode_GET_SIZE(self) == 1 &&
5559 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005560 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005562 /* Special case for empty strings */
5563 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005564 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005565
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 e = p + PyUnicode_GET_SIZE(self);
5567 for (; p < e; p++) {
5568 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005569 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005570 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005571 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572}
5573
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005574PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005575"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005577Return True if all characters in S are digits\n\
5578and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579
5580static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005581unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582{
5583 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5584 register const Py_UNICODE *e;
5585
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586 /* Shortcut for single character strings */
5587 if (PyUnicode_GET_SIZE(self) == 1 &&
5588 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005589 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005590
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005591 /* Special case for empty strings */
5592 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005593 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005594
Guido van Rossumd57fd912000-03-10 22:53:23 +00005595 e = p + PyUnicode_GET_SIZE(self);
5596 for (; p < e; p++) {
5597 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005598 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005600 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005601}
5602
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005603PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005604"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005606Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005607False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608
5609static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005610unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611{
5612 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5613 register const Py_UNICODE *e;
5614
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 /* Shortcut for single character strings */
5616 if (PyUnicode_GET_SIZE(self) == 1 &&
5617 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005618 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005620 /* Special case for empty strings */
5621 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005622 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005623
Guido van Rossumd57fd912000-03-10 22:53:23 +00005624 e = p + PyUnicode_GET_SIZE(self);
5625 for (; p < e; p++) {
5626 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005627 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005629 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630}
5631
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005632PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633"S.join(sequence) -> unicode\n\
5634\n\
5635Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005636sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637
5638static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005639unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005641 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005642}
5643
5644static int
5645unicode_length(PyUnicodeObject *self)
5646{
5647 return self->length;
5648}
5649
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005650PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005651"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005652\n\
5653Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005654done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005655
5656static PyObject *
5657unicode_ljust(PyUnicodeObject *self, PyObject *args)
5658{
5659 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005660 Py_UNICODE fillchar = ' ';
5661
5662 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 return NULL;
5664
Tim Peters7a29bd52001-09-12 03:03:31 +00005665 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 Py_INCREF(self);
5667 return (PyObject*) self;
5668 }
5669
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005670 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005671}
5672
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005673PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005674"S.lower() -> unicode\n\
5675\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005676Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677
5678static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005679unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005681 return fixup(self, fixlower);
5682}
5683
/* Strip modes; they double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
5692
5693static const Py_UNICODE *
5694unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5695{
Tim Peters030a5ce2002-04-22 19:00:10 +00005696 size_t i;
5697 for (i = 0; i < n; ++i)
5698 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005699 return s+i;
5700 return NULL;
5701}
5702
5703/* externally visible for str.strip(unicode) */
5704PyObject *
5705_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5706{
5707 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5708 int len = PyUnicode_GET_SIZE(self);
5709 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5710 int seplen = PyUnicode_GET_SIZE(sepobj);
5711 int i, j;
5712
5713 i = 0;
5714 if (striptype != RIGHTSTRIP) {
5715 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5716 i++;
5717 }
5718 }
5719
5720 j = len;
5721 if (striptype != LEFTSTRIP) {
5722 do {
5723 j--;
5724 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5725 j++;
5726 }
5727
5728 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5729 Py_INCREF(self);
5730 return (PyObject*)self;
5731 }
5732 else
5733 return PyUnicode_FromUnicode(s+i, j-i);
5734}
5735
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736
5737static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005738do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005740 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5741 int len = PyUnicode_GET_SIZE(self), i, j;
5742
5743 i = 0;
5744 if (striptype != RIGHTSTRIP) {
5745 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5746 i++;
5747 }
5748 }
5749
5750 j = len;
5751 if (striptype != LEFTSTRIP) {
5752 do {
5753 j--;
5754 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5755 j++;
5756 }
5757
5758 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5759 Py_INCREF(self);
5760 return (PyObject*)self;
5761 }
5762 else
5763 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764}
5765
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005766
5767static PyObject *
5768do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5769{
5770 PyObject *sep = NULL;
5771
5772 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5773 return NULL;
5774
5775 if (sep != NULL && sep != Py_None) {
5776 if (PyUnicode_Check(sep))
5777 return _PyUnicode_XStrip(self, striptype, sep);
5778 else if (PyString_Check(sep)) {
5779 PyObject *res;
5780 sep = PyUnicode_FromObject(sep);
5781 if (sep==NULL)
5782 return NULL;
5783 res = _PyUnicode_XStrip(self, striptype, sep);
5784 Py_DECREF(sep);
5785 return res;
5786 }
5787 else {
5788 PyErr_Format(PyExc_TypeError,
5789 "%s arg must be None, unicode or str",
5790 STRIPNAME(striptype));
5791 return NULL;
5792 }
5793 }
5794
5795 return do_strip(self, striptype);
5796}
5797
5798
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005799PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005800"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005801\n\
5802Return a copy of the string S with leading and trailing\n\
5803whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005804If chars is given and not None, remove characters in chars instead.\n\
5805If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005806
5807static PyObject *
5808unicode_strip(PyUnicodeObject *self, PyObject *args)
5809{
5810 if (PyTuple_GET_SIZE(args) == 0)
5811 return do_strip(self, BOTHSTRIP); /* Common case */
5812 else
5813 return do_argstrip(self, BOTHSTRIP, args);
5814}
5815
5816
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005817PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005818"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005819\n\
5820Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005821If chars is given and not None, remove characters in chars instead.\n\
5822If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005823
5824static PyObject *
5825unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5826{
5827 if (PyTuple_GET_SIZE(args) == 0)
5828 return do_strip(self, LEFTSTRIP); /* Common case */
5829 else
5830 return do_argstrip(self, LEFTSTRIP, args);
5831}
5832
5833
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005834PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005835"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005836\n\
5837Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005838If chars is given and not None, remove characters in chars instead.\n\
5839If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005840
5841static PyObject *
5842unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5843{
5844 if (PyTuple_GET_SIZE(args) == 0)
5845 return do_strip(self, RIGHTSTRIP); /* Common case */
5846 else
5847 return do_argstrip(self, RIGHTSTRIP, args);
5848}
5849
5850
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851static PyObject*
5852unicode_repeat(PyUnicodeObject *str, int len)
5853{
5854 PyUnicodeObject *u;
5855 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005856 int nchars;
5857 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858
5859 if (len < 0)
5860 len = 0;
5861
Tim Peters7a29bd52001-09-12 03:03:31 +00005862 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005863 /* no repeat, return original string */
5864 Py_INCREF(str);
5865 return (PyObject*) str;
5866 }
Tim Peters8f422462000-09-09 06:13:41 +00005867
5868 /* ensure # of chars needed doesn't overflow int and # of bytes
5869 * needed doesn't overflow size_t
5870 */
5871 nchars = len * str->length;
5872 if (len && nchars / len != str->length) {
5873 PyErr_SetString(PyExc_OverflowError,
5874 "repeated string is too long");
5875 return NULL;
5876 }
5877 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5878 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5879 PyErr_SetString(PyExc_OverflowError,
5880 "repeated string is too long");
5881 return NULL;
5882 }
5883 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005884 if (!u)
5885 return NULL;
5886
5887 p = u->str;
5888
5889 while (len-- > 0) {
5890 Py_UNICODE_COPY(p, str->str, str->length);
5891 p += str->length;
5892 }
5893
5894 return (PyObject*) u;
5895}
5896
5897PyObject *PyUnicode_Replace(PyObject *obj,
5898 PyObject *subobj,
5899 PyObject *replobj,
5900 int maxcount)
5901{
5902 PyObject *self;
5903 PyObject *str1;
5904 PyObject *str2;
5905 PyObject *result;
5906
5907 self = PyUnicode_FromObject(obj);
5908 if (self == NULL)
5909 return NULL;
5910 str1 = PyUnicode_FromObject(subobj);
5911 if (str1 == NULL) {
5912 Py_DECREF(self);
5913 return NULL;
5914 }
5915 str2 = PyUnicode_FromObject(replobj);
5916 if (str2 == NULL) {
5917 Py_DECREF(self);
5918 Py_DECREF(str1);
5919 return NULL;
5920 }
Tim Petersced69f82003-09-16 20:30:58 +00005921 result = replace((PyUnicodeObject *)self,
5922 (PyUnicodeObject *)str1,
5923 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005924 maxcount);
5925 Py_DECREF(self);
5926 Py_DECREF(str1);
5927 Py_DECREF(str2);
5928 return result;
5929}
5930
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005931PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932"S.replace (old, new[, maxsplit]) -> unicode\n\
5933\n\
5934Return a copy of S with all occurrences of substring\n\
5935old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005936given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937
5938static PyObject*
5939unicode_replace(PyUnicodeObject *self, PyObject *args)
5940{
5941 PyUnicodeObject *str1;
5942 PyUnicodeObject *str2;
5943 int maxcount = -1;
5944 PyObject *result;
5945
5946 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5947 return NULL;
5948 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5949 if (str1 == NULL)
5950 return NULL;
5951 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005952 if (str2 == NULL) {
5953 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005955 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005956
5957 result = replace(self, str1, str2, maxcount);
5958
5959 Py_DECREF(str1);
5960 Py_DECREF(str2);
5961 return result;
5962}
5963
5964static
5965PyObject *unicode_repr(PyObject *unicode)
5966{
5967 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5968 PyUnicode_GET_SIZE(unicode),
5969 1);
5970}
5971
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005972PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973"S.rfind(sub [,start [,end]]) -> int\n\
5974\n\
5975Return the highest index in S where substring sub is found,\n\
5976such that sub is contained within s[start,end]. Optional\n\
5977arguments start and end are interpreted as in slice notation.\n\
5978\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005979Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980
5981static PyObject *
5982unicode_rfind(PyUnicodeObject *self, PyObject *args)
5983{
5984 PyUnicodeObject *substring;
5985 int start = 0;
5986 int end = INT_MAX;
5987 PyObject *result;
5988
Guido van Rossumb8872e62000-05-09 14:14:27 +00005989 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5990 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005991 return NULL;
5992 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5993 (PyObject *)substring);
5994 if (substring == NULL)
5995 return NULL;
5996
5997 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5998
5999 Py_DECREF(substring);
6000 return result;
6001}
6002
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006003PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006004"S.rindex(sub [,start [,end]]) -> int\n\
6005\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006006Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006007
6008static PyObject *
6009unicode_rindex(PyUnicodeObject *self, PyObject *args)
6010{
6011 int result;
6012 PyUnicodeObject *substring;
6013 int start = 0;
6014 int end = INT_MAX;
6015
Guido van Rossumb8872e62000-05-09 14:14:27 +00006016 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6017 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018 return NULL;
6019 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6020 (PyObject *)substring);
6021 if (substring == NULL)
6022 return NULL;
6023
6024 result = findstring(self, substring, start, end, -1);
6025
6026 Py_DECREF(substring);
6027 if (result < 0) {
6028 PyErr_SetString(PyExc_ValueError, "substring not found");
6029 return NULL;
6030 }
6031 return PyInt_FromLong(result);
6032}
6033
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006034PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006035"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006036\n\
6037Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006038done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039
6040static PyObject *
6041unicode_rjust(PyUnicodeObject *self, PyObject *args)
6042{
6043 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006044 Py_UNICODE fillchar = ' ';
6045
6046 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047 return NULL;
6048
Tim Peters7a29bd52001-09-12 03:03:31 +00006049 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050 Py_INCREF(self);
6051 return (PyObject*) self;
6052 }
6053
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006054 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055}
6056
Guido van Rossumd57fd912000-03-10 22:53:23 +00006057static PyObject*
6058unicode_slice(PyUnicodeObject *self, int start, int end)
6059{
6060 /* standard clamping */
6061 if (start < 0)
6062 start = 0;
6063 if (end < 0)
6064 end = 0;
6065 if (end > self->length)
6066 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006067 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006068 /* full slice, return original string */
6069 Py_INCREF(self);
6070 return (PyObject*) self;
6071 }
6072 if (start > end)
6073 start = end;
6074 /* copy slice */
6075 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6076 end - start);
6077}
6078
6079PyObject *PyUnicode_Split(PyObject *s,
6080 PyObject *sep,
6081 int maxsplit)
6082{
6083 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006084
Guido van Rossumd57fd912000-03-10 22:53:23 +00006085 s = PyUnicode_FromObject(s);
6086 if (s == NULL)
6087 return NULL;
6088 if (sep != NULL) {
6089 sep = PyUnicode_FromObject(sep);
6090 if (sep == NULL) {
6091 Py_DECREF(s);
6092 return NULL;
6093 }
6094 }
6095
6096 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6097
6098 Py_DECREF(s);
6099 Py_XDECREF(sep);
6100 return result;
6101}
6102
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006103PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006104"S.split([sep [,maxsplit]]) -> list of strings\n\
6105\n\
6106Return a list of the words in S, using sep as the\n\
6107delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006108splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006109any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110
6111static PyObject*
6112unicode_split(PyUnicodeObject *self, PyObject *args)
6113{
6114 PyObject *substring = Py_None;
6115 int maxcount = -1;
6116
6117 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
6118 return NULL;
6119
6120 if (substring == Py_None)
6121 return split(self, NULL, maxcount);
6122 else if (PyUnicode_Check(substring))
6123 return split(self, (PyUnicodeObject *)substring, maxcount);
6124 else
6125 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6126}
6127
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006128PyObject *PyUnicode_RSplit(PyObject *s,
6129 PyObject *sep,
6130 int maxsplit)
6131{
6132 PyObject *result;
6133
6134 s = PyUnicode_FromObject(s);
6135 if (s == NULL)
6136 return NULL;
6137 if (sep != NULL) {
6138 sep = PyUnicode_FromObject(sep);
6139 if (sep == NULL) {
6140 Py_DECREF(s);
6141 return NULL;
6142 }
6143 }
6144
6145 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6146
6147 Py_DECREF(s);
6148 Py_XDECREF(sep);
6149 return result;
6150}
6151
6152PyDoc_STRVAR(rsplit__doc__,
6153"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6154\n\
6155Return a list of the words in S, using sep as the\n\
6156delimiter string, starting at the end of the string and\n\
6157working to the front. If maxsplit is given, at most maxsplit\n\
6158splits are done. If sep is not specified, any whitespace string\n\
6159is a separator.");
6160
6161static PyObject*
6162unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6163{
6164 PyObject *substring = Py_None;
6165 int maxcount = -1;
6166
6167 if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
6168 return NULL;
6169
6170 if (substring == Py_None)
6171 return rsplit(self, NULL, maxcount);
6172 else if (PyUnicode_Check(substring))
6173 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6174 else
6175 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6176}
6177
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006178PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006179"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180\n\
6181Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006182Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006183is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006184
6185static PyObject*
6186unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6187{
Guido van Rossum86662912000-04-11 15:38:46 +00006188 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006189
Guido van Rossum86662912000-04-11 15:38:46 +00006190 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191 return NULL;
6192
Guido van Rossum86662912000-04-11 15:38:46 +00006193 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006194}
6195
6196static
6197PyObject *unicode_str(PyUnicodeObject *self)
6198{
Fred Drakee4315f52000-05-09 19:53:39 +00006199 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006200}
6201
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006202PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203"S.swapcase() -> unicode\n\
6204\n\
6205Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006206and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207
6208static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006209unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211 return fixup(self, fixswapcase);
6212}
6213
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006214PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006215"S.translate(table) -> unicode\n\
6216\n\
6217Return a copy of the string S, where all characters have been mapped\n\
6218through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006219Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6220Unmapped characters are left untouched. Characters mapped to None\n\
6221are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006222
6223static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006224unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006225{
Tim Petersced69f82003-09-16 20:30:58 +00006226 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006227 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006228 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 "ignore");
6230}
6231
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006232PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233"S.upper() -> unicode\n\
6234\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006235Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236
6237static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006238unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 return fixup(self, fixupper);
6241}
6242
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006243PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244"S.zfill(width) -> unicode\n\
6245\n\
6246Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006247of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248
6249static PyObject *
6250unicode_zfill(PyUnicodeObject *self, PyObject *args)
6251{
6252 int fill;
6253 PyUnicodeObject *u;
6254
6255 int width;
6256 if (!PyArg_ParseTuple(args, "i:zfill", &width))
6257 return NULL;
6258
6259 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006260 if (PyUnicode_CheckExact(self)) {
6261 Py_INCREF(self);
6262 return (PyObject*) self;
6263 }
6264 else
6265 return PyUnicode_FromUnicode(
6266 PyUnicode_AS_UNICODE(self),
6267 PyUnicode_GET_SIZE(self)
6268 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006269 }
6270
6271 fill = width - self->length;
6272
6273 u = pad(self, fill, 0, '0');
6274
Walter Dörwald068325e2002-04-15 13:36:47 +00006275 if (u == NULL)
6276 return NULL;
6277
Guido van Rossumd57fd912000-03-10 22:53:23 +00006278 if (u->str[fill] == '+' || u->str[fill] == '-') {
6279 /* move sign to beginning of string */
6280 u->str[0] = u->str[fill];
6281 u->str[fill] = '0';
6282 }
6283
6284 return (PyObject*) u;
6285}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006286
#if 0
/* Disabled debugging helper: report unicode_freelist_size as a Python
   int.  Compiled out together with its method-table entry. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size = PyInt_FromLong(unicode_freelist_size);

    return size;
}
#endif
6294
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006295PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006296"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006298Return True if S starts with the specified prefix, False otherwise.\n\
6299With optional start, test S beginning at that position.\n\
6300With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301
6302static PyObject *
6303unicode_startswith(PyUnicodeObject *self,
6304 PyObject *args)
6305{
6306 PyUnicodeObject *substring;
6307 int start = 0;
6308 int end = INT_MAX;
6309 PyObject *result;
6310
Guido van Rossumb8872e62000-05-09 14:14:27 +00006311 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6312 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313 return NULL;
6314 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6315 (PyObject *)substring);
6316 if (substring == NULL)
6317 return NULL;
6318
Guido van Rossum77f6a652002-04-03 22:41:51 +00006319 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006320
6321 Py_DECREF(substring);
6322 return result;
6323}
6324
6325
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006326PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006327"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006329Return True if S ends with the specified suffix, False otherwise.\n\
6330With optional start, test S beginning at that position.\n\
6331With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332
6333static PyObject *
6334unicode_endswith(PyUnicodeObject *self,
6335 PyObject *args)
6336{
6337 PyUnicodeObject *substring;
6338 int start = 0;
6339 int end = INT_MAX;
6340 PyObject *result;
6341
Guido van Rossumb8872e62000-05-09 14:14:27 +00006342 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6343 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006344 return NULL;
6345 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6346 (PyObject *)substring);
6347 if (substring == NULL)
6348 return NULL;
6349
Guido van Rossum77f6a652002-04-03 22:41:51 +00006350 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351
6352 Py_DECREF(substring);
6353 return result;
6354}
6355
6356
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006357
6358static PyObject *
6359unicode_getnewargs(PyUnicodeObject *v)
6360{
6361 return Py_BuildValue("(u#)", v->str, v->length);
6362}
6363
6364
/* Method table: each entry gives the Python-level method name, the C
   function implementing it, its calling-convention flag (METH_VARARGS,
   METH_O or METH_NOARGS) and, where present, its docstring.  The
   {NULL, NULL} entry terminates the table.  Entries inside "#if 0" are
   compiled out.
   NOTE(review): presumably installed as tp_methods of the unicode type --
   the type object is not visible here, confirm. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006365static PyMethodDef unicode_methods[] = {
6366
6367 /* Order is according to common usage: often used methods should
6368 appear first, since lookup is done sequentially. */
6369
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006370 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6371 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6372 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006373 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006374 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6375 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6376 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6377 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6378 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6379 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6380 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6381 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6382 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6383 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006384 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006385 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006386/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6387 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6388 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6389 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006390 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006391 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006392 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006393 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6394 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6395 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6396 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6397 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6398 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6399 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6400 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6401 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6402 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6403 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6404 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6405 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6406 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006407 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006408#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006409 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006410#endif
6411
6412#if 0
6413 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006414 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415#endif
6416
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006417 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418 {NULL, NULL}
6419};
6420
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006421static PyObject *
6422unicode_mod(PyObject *v, PyObject *w)
6423{
6424 if (!PyUnicode_Check(v)) {
6425 Py_INCREF(Py_NotImplemented);
6426 return Py_NotImplemented;
6427 }
6428 return PyUnicode_Format(v, w);
6429}
6430
/* Number-protocol slots.  Only nb_remainder is filled in (with
   unicode_mod, which forwards to PyUnicode_Format); the four slots before
   it are 0 and any slots after it are left to default initialization. */
6431static PyNumberMethods unicode_as_number = {
6432 0, /*nb_add*/
6433 0, /*nb_subtract*/
6434 0, /*nb_multiply*/
6435 0, /*nb_divide*/
6436 unicode_mod, /*nb_remainder*/
6437};
6438
/* Sequence-protocol slots: length, concatenation, repetition, item access,
   slicing and containment.  The two assignment slots are 0. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006439static PySequenceMethods unicode_as_sequence = {
6440 (inquiry) unicode_length, /* sq_length */
6441 (binaryfunc) PyUnicode_Concat, /* sq_concat */
6442 (intargfunc) unicode_repeat, /* sq_repeat */
6443 (intargfunc) unicode_getitem, /* sq_item */
6444 (intintargfunc) unicode_slice, /* sq_slice */
6445 0, /* sq_ass_item */
6446 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00006447 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448};
6449
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006450static PyObject*
6451unicode_subscript(PyUnicodeObject* self, PyObject* item)
6452{
6453 if (PyInt_Check(item)) {
6454 long i = PyInt_AS_LONG(item);
6455 if (i < 0)
6456 i += PyString_GET_SIZE(self);
6457 return unicode_getitem(self, i);
6458 } else if (PyLong_Check(item)) {
6459 long i = PyLong_AsLong(item);
6460 if (i == -1 && PyErr_Occurred())
6461 return NULL;
6462 if (i < 0)
6463 i += PyString_GET_SIZE(self);
6464 return unicode_getitem(self, i);
6465 } else if (PySlice_Check(item)) {
6466 int start, stop, step, slicelength, cur, i;
6467 Py_UNICODE* source_buf;
6468 Py_UNICODE* result_buf;
6469 PyObject* result;
6470
6471 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
6472 &start, &stop, &step, &slicelength) < 0) {
6473 return NULL;
6474 }
6475
6476 if (slicelength <= 0) {
6477 return PyUnicode_FromUnicode(NULL, 0);
6478 } else {
6479 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6480 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
6481
6482 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6483 result_buf[i] = source_buf[cur];
6484 }
Tim Petersced69f82003-09-16 20:30:58 +00006485
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006486 result = PyUnicode_FromUnicode(result_buf, slicelength);
6487 PyMem_FREE(result_buf);
6488 return result;
6489 }
6490 } else {
6491 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6492 return NULL;
6493 }
6494}
6495
/* Mapping-protocol slots: length and subscript lookup are provided;
   subscript assignment is 0. */
6496static PyMappingMethods unicode_as_mapping = {
6497 (inquiry)unicode_length, /* mp_length */
6498 (binaryfunc)unicode_subscript, /* mp_subscript */
6499 (objobjargproc)0, /* mp_ass_subscript */
6500};
6501
Guido van Rossumd57fd912000-03-10 22:53:23 +00006502static int
6503unicode_buffer_getreadbuf(PyUnicodeObject *self,
6504 int index,
6505 const void **ptr)
6506{
6507 if (index != 0) {
6508 PyErr_SetString(PyExc_SystemError,
6509 "accessing non-existent unicode segment");
6510 return -1;
6511 }
6512 *ptr = (void *) self->str;
6513 return PyUnicode_GET_DATA_SIZE(self);
6514}
6515
6516static int
6517unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6518 const void **ptr)
6519{
6520 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006521 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006522 return -1;
6523}
6524
6525static int
6526unicode_buffer_getsegcount(PyUnicodeObject *self,
6527 int *lenp)
6528{
6529 if (lenp)
6530 *lenp = PyUnicode_GET_DATA_SIZE(self);
6531 return 1;
6532}
6533
6534static int
6535unicode_buffer_getcharbuf(PyUnicodeObject *self,
6536 int index,
6537 const void **ptr)
6538{
6539 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006540
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541 if (index != 0) {
6542 PyErr_SetString(PyExc_SystemError,
6543 "accessing non-existent unicode segment");
6544 return -1;
6545 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006546 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 if (str == NULL)
6548 return -1;
6549 *ptr = (void *) PyString_AS_STRING(str);
6550 return PyString_GET_SIZE(str);
6551}
6552
6553/* Helpers for PyUnicode_Format() */
6554
6555static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006556getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557{
6558 int argidx = *p_argidx;
6559 if (argidx < arglen) {
6560 (*p_argidx)++;
6561 if (arglen < 0)
6562 return args;
6563 else
6564 return PyTuple_GetItem(args, argidx);
6565 }
6566 PyErr_SetString(PyExc_TypeError,
6567 "not enough arguments for format string");
6568 return NULL;
6569}
6570
/* Formatting flags, one distinct bit each so they can be OR-ed together.
   NOTE(review): presumably these mirror the printf-style conversion flags
   '-', '+', ' ', '#' and '0' handled by PyUnicode_Format() -- confirm
   against the code that sets them. */
6571#define F_LJUST (1<<0)
6572#define F_SIGN (1<<1)
6573#define F_BLANK (1<<2)
6574#define F_ALT (1<<3)
6575#define F_ZERO (1<<4)
6576
6577static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006579{
6580 register int i;
6581 int len;
6582 va_list va;
6583 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006585
6586 /* First, format the string as char array, then expand to Py_UNICODE
6587 array. */
6588 charbuffer = (char *)buffer;
6589 len = vsprintf(charbuffer, format, va);
6590 for (i = len - 1; i >= 0; i--)
6591 buffer[i] = (Py_UNICODE) charbuffer[i];
6592
6593 va_end(va);
6594 return len;
6595}
6596
Guido van Rossum078151d2002-08-11 04:24:12 +00006597/* XXX To save some code duplication, formatfloat/long/int could have been
6598 shared with stringobject.c, converting from 8-bit to Unicode after the
6599 formatting is done. */
6600
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601static int
6602formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006603 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604 int flags,
6605 int prec,
6606 int type,
6607 PyObject *v)
6608{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006609 /* fmt = '%#.' + `prec` + `type`
6610 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611 char fmt[20];
6612 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006613
Guido van Rossumd57fd912000-03-10 22:53:23 +00006614 x = PyFloat_AsDouble(v);
6615 if (x == -1.0 && PyErr_Occurred())
6616 return -1;
6617 if (prec < 0)
6618 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006619 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6620 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006621 /* Worst case length calc to ensure no buffer overrun:
6622
6623 'g' formats:
6624 fmt = %#.<prec>g
6625 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6626 for any double rep.)
6627 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6628
6629 'f' formats:
6630 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6631 len = 1 + 50 + 1 + prec = 52 + prec
6632
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006633 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006634 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006635
6636 */
6637 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6638 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006639 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006640 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006641 return -1;
6642 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006643 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6644 (flags&F_ALT) ? "#" : "",
6645 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 return usprintf(buf, fmt, x);
6647}
6648
Tim Peters38fd5b62000-09-21 05:43:11 +00006649static PyObject*
6650formatlong(PyObject *val, int flags, int prec, int type)
6651{
6652 char *buf;
6653 int i, len;
6654 PyObject *str; /* temporary string object. */
6655 PyUnicodeObject *result;
6656
6657 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6658 if (!str)
6659 return NULL;
6660 result = _PyUnicode_New(len);
6661 for (i = 0; i < len; i++)
6662 result->str[i] = buf[i];
6663 result->str[len] = 0;
6664 Py_DECREF(str);
6665 return (PyObject*)result;
6666}
6667
Guido van Rossumd57fd912000-03-10 22:53:23 +00006668static int
6669formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006670 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006671 int flags,
6672 int prec,
6673 int type,
6674 PyObject *v)
6675{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006676 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006677 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6678 * + 1 + 1
6679 * = 24
6680 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006681 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006682 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006683 long x;
6684
6685 x = PyInt_AsLong(v);
6686 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006687 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006688 if (x < 0 && type == 'u') {
6689 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006690 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006691 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6692 sign = "-";
6693 else
6694 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006695 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006696 prec = 1;
6697
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006698 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6699 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006700 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006701 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006702 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006703 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006704 return -1;
6705 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006706
6707 if ((flags & F_ALT) &&
6708 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006709 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006710 * of issues that cause pain:
6711 * - when 0 is being converted, the C standard leaves off
6712 * the '0x' or '0X', which is inconsistent with other
6713 * %#x/%#X conversions and inconsistent with Python's
6714 * hex() function
6715 * - there are platforms that violate the standard and
6716 * convert 0 with the '0x' or '0X'
6717 * (Metrowerks, Compaq Tru64)
6718 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006719 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006720 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006721 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006722 * We can achieve the desired consistency by inserting our
6723 * own '0x' or '0X' prefix, and substituting %x/%X in place
6724 * of %#x/%#X.
6725 *
6726 * Note that this is the same approach as used in
6727 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006728 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006729 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6730 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006731 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006732 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006733 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6734 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006735 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006736 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006737 if (sign[0])
6738 return usprintf(buf, fmt, -x);
6739 else
6740 return usprintf(buf, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741}
6742
6743static int
6744formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006745 size_t buflen,
6746 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006747{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006748 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006749 if (PyUnicode_Check(v)) {
6750 if (PyUnicode_GET_SIZE(v) != 1)
6751 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006753 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006755 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006756 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006757 goto onError;
6758 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6759 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006760
6761 else {
6762 /* Integer input truncated to a character */
6763 long x;
6764 x = PyInt_AsLong(v);
6765 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006766 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006767#ifdef Py_UNICODE_WIDE
6768 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006769 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006770 "%c arg not in range(0x110000) "
6771 "(wide Python build)");
6772 return -1;
6773 }
6774#else
6775 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006776 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006777 "%c arg not in range(0x10000) "
6778 "(narrow Python build)");
6779 return -1;
6780 }
6781#endif
6782 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783 }
6784 buf[1] = '\0';
6785 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006786
6787 onError:
6788 PyErr_SetString(PyExc_TypeError,
6789 "%c requires int or char");
6790 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006791}
6792
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of Py_UNICODE elements in the scratch
   buffer used to format floats, ints and chars.  XXX The value is a
   magic number.  Every formatting routine bounds-checks against it so
   that no overflow can occur; allocating a buffer of the right size for
   each conversion might be a better solution, but the fixed buffer is
   sufficient for now.
*/
#define FORMATBUFLEN ((size_t)120)
6802
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803PyObject *PyUnicode_Format(PyObject *format,
6804 PyObject *args)
6805{
6806 Py_UNICODE *fmt, *res;
6807 int fmtcnt, rescnt, reslen, arglen, argidx;
6808 int args_owned = 0;
6809 PyUnicodeObject *result = NULL;
6810 PyObject *dict = NULL;
6811 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006812
Guido van Rossumd57fd912000-03-10 22:53:23 +00006813 if (format == NULL || args == NULL) {
6814 PyErr_BadInternalCall();
6815 return NULL;
6816 }
6817 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006818 if (uformat == NULL)
6819 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006820 fmt = PyUnicode_AS_UNICODE(uformat);
6821 fmtcnt = PyUnicode_GET_SIZE(uformat);
6822
6823 reslen = rescnt = fmtcnt + 100;
6824 result = _PyUnicode_New(reslen);
6825 if (result == NULL)
6826 goto onError;
6827 res = PyUnicode_AS_UNICODE(result);
6828
6829 if (PyTuple_Check(args)) {
6830 arglen = PyTuple_Size(args);
6831 argidx = 0;
6832 }
6833 else {
6834 arglen = -1;
6835 argidx = -2;
6836 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006837 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6838 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006839 dict = args;
6840
6841 while (--fmtcnt >= 0) {
6842 if (*fmt != '%') {
6843 if (--rescnt < 0) {
6844 rescnt = fmtcnt + 100;
6845 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006846 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006847 return NULL;
6848 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6849 --rescnt;
6850 }
6851 *res++ = *fmt++;
6852 }
6853 else {
6854 /* Got a format specifier */
6855 int flags = 0;
6856 int width = -1;
6857 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858 Py_UNICODE c = '\0';
6859 Py_UNICODE fill;
6860 PyObject *v = NULL;
6861 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006862 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006863 Py_UNICODE sign;
6864 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006865 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006866
6867 fmt++;
6868 if (*fmt == '(') {
6869 Py_UNICODE *keystart;
6870 int keylen;
6871 PyObject *key;
6872 int pcount = 1;
6873
6874 if (dict == NULL) {
6875 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006876 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006877 goto onError;
6878 }
6879 ++fmt;
6880 --fmtcnt;
6881 keystart = fmt;
6882 /* Skip over balanced parentheses */
6883 while (pcount > 0 && --fmtcnt >= 0) {
6884 if (*fmt == ')')
6885 --pcount;
6886 else if (*fmt == '(')
6887 ++pcount;
6888 fmt++;
6889 }
6890 keylen = fmt - keystart - 1;
6891 if (fmtcnt < 0 || pcount > 0) {
6892 PyErr_SetString(PyExc_ValueError,
6893 "incomplete format key");
6894 goto onError;
6895 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006896#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006897 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006898 then looked up since Python uses strings to hold
6899 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006900 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006901 key = PyUnicode_EncodeUTF8(keystart,
6902 keylen,
6903 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006904#else
6905 key = PyUnicode_FromUnicode(keystart, keylen);
6906#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 if (key == NULL)
6908 goto onError;
6909 if (args_owned) {
6910 Py_DECREF(args);
6911 args_owned = 0;
6912 }
6913 args = PyObject_GetItem(dict, key);
6914 Py_DECREF(key);
6915 if (args == NULL) {
6916 goto onError;
6917 }
6918 args_owned = 1;
6919 arglen = -1;
6920 argidx = -2;
6921 }
6922 while (--fmtcnt >= 0) {
6923 switch (c = *fmt++) {
6924 case '-': flags |= F_LJUST; continue;
6925 case '+': flags |= F_SIGN; continue;
6926 case ' ': flags |= F_BLANK; continue;
6927 case '#': flags |= F_ALT; continue;
6928 case '0': flags |= F_ZERO; continue;
6929 }
6930 break;
6931 }
6932 if (c == '*') {
6933 v = getnextarg(args, arglen, &argidx);
6934 if (v == NULL)
6935 goto onError;
6936 if (!PyInt_Check(v)) {
6937 PyErr_SetString(PyExc_TypeError,
6938 "* wants int");
6939 goto onError;
6940 }
6941 width = PyInt_AsLong(v);
6942 if (width < 0) {
6943 flags |= F_LJUST;
6944 width = -width;
6945 }
6946 if (--fmtcnt >= 0)
6947 c = *fmt++;
6948 }
6949 else if (c >= '0' && c <= '9') {
6950 width = c - '0';
6951 while (--fmtcnt >= 0) {
6952 c = *fmt++;
6953 if (c < '0' || c > '9')
6954 break;
6955 if ((width*10) / 10 != width) {
6956 PyErr_SetString(PyExc_ValueError,
6957 "width too big");
6958 goto onError;
6959 }
6960 width = width*10 + (c - '0');
6961 }
6962 }
6963 if (c == '.') {
6964 prec = 0;
6965 if (--fmtcnt >= 0)
6966 c = *fmt++;
6967 if (c == '*') {
6968 v = getnextarg(args, arglen, &argidx);
6969 if (v == NULL)
6970 goto onError;
6971 if (!PyInt_Check(v)) {
6972 PyErr_SetString(PyExc_TypeError,
6973 "* wants int");
6974 goto onError;
6975 }
6976 prec = PyInt_AsLong(v);
6977 if (prec < 0)
6978 prec = 0;
6979 if (--fmtcnt >= 0)
6980 c = *fmt++;
6981 }
6982 else if (c >= '0' && c <= '9') {
6983 prec = c - '0';
6984 while (--fmtcnt >= 0) {
6985 c = Py_CHARMASK(*fmt++);
6986 if (c < '0' || c > '9')
6987 break;
6988 if ((prec*10) / 10 != prec) {
6989 PyErr_SetString(PyExc_ValueError,
6990 "prec too big");
6991 goto onError;
6992 }
6993 prec = prec*10 + (c - '0');
6994 }
6995 }
6996 } /* prec */
6997 if (fmtcnt >= 0) {
6998 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999 if (--fmtcnt >= 0)
7000 c = *fmt++;
7001 }
7002 }
7003 if (fmtcnt < 0) {
7004 PyErr_SetString(PyExc_ValueError,
7005 "incomplete format");
7006 goto onError;
7007 }
7008 if (c != '%') {
7009 v = getnextarg(args, arglen, &argidx);
7010 if (v == NULL)
7011 goto onError;
7012 }
7013 sign = 0;
7014 fill = ' ';
7015 switch (c) {
7016
7017 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007018 pbuf = formatbuf;
7019 /* presume that buffer length is at least 1 */
7020 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007021 len = 1;
7022 break;
7023
7024 case 's':
7025 case 'r':
7026 if (PyUnicode_Check(v) && c == 's') {
7027 temp = v;
7028 Py_INCREF(temp);
7029 }
7030 else {
7031 PyObject *unicode;
7032 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007033 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007034 else
7035 temp = PyObject_Repr(v);
7036 if (temp == NULL)
7037 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007038 if (PyUnicode_Check(temp))
7039 /* nothing to do */;
7040 else if (PyString_Check(temp)) {
7041 /* convert to string to Unicode */
Fred Drakee4315f52000-05-09 19:53:39 +00007042 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007043 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00007044 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045 "strict");
7046 Py_DECREF(temp);
7047 temp = unicode;
7048 if (temp == NULL)
7049 goto onError;
7050 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007051 else {
7052 Py_DECREF(temp);
7053 PyErr_SetString(PyExc_TypeError,
7054 "%s argument has non-string str()");
7055 goto onError;
7056 }
7057 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007058 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059 len = PyUnicode_GET_SIZE(temp);
7060 if (prec >= 0 && len > prec)
7061 len = prec;
7062 break;
7063
7064 case 'i':
7065 case 'd':
7066 case 'u':
7067 case 'o':
7068 case 'x':
7069 case 'X':
7070 if (c == 'i')
7071 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007072 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007073 temp = formatlong(v, flags, prec, c);
7074 if (!temp)
7075 goto onError;
7076 pbuf = PyUnicode_AS_UNICODE(temp);
7077 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007078 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007079 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007080 else {
7081 pbuf = formatbuf;
7082 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7083 flags, prec, c, v);
7084 if (len < 0)
7085 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007086 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007087 }
7088 if (flags & F_ZERO)
7089 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007090 break;
7091
7092 case 'e':
7093 case 'E':
7094 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007095 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096 case 'g':
7097 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007098 if (c == 'F')
7099 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007100 pbuf = formatbuf;
7101 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7102 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007103 if (len < 0)
7104 goto onError;
7105 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007106 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107 fill = '0';
7108 break;
7109
7110 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007111 pbuf = formatbuf;
7112 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007113 if (len < 0)
7114 goto onError;
7115 break;
7116
7117 default:
7118 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007119 "unsupported format character '%c' (0x%x) "
7120 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007121 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007122 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007123 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007124 goto onError;
7125 }
7126 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007127 if (*pbuf == '-' || *pbuf == '+') {
7128 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007129 len--;
7130 }
7131 else if (flags & F_SIGN)
7132 sign = '+';
7133 else if (flags & F_BLANK)
7134 sign = ' ';
7135 else
7136 sign = 0;
7137 }
7138 if (width < len)
7139 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007140 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007141 reslen -= rescnt;
7142 rescnt = width + fmtcnt + 100;
7143 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007144 if (reslen < 0) {
7145 Py_DECREF(result);
7146 return PyErr_NoMemory();
7147 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007148 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007149 return NULL;
7150 res = PyUnicode_AS_UNICODE(result)
7151 + reslen - rescnt;
7152 }
7153 if (sign) {
7154 if (fill != ' ')
7155 *res++ = sign;
7156 rescnt--;
7157 if (width > len)
7158 width--;
7159 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007160 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7161 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007162 assert(pbuf[1] == c);
7163 if (fill != ' ') {
7164 *res++ = *pbuf++;
7165 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007166 }
Tim Petersfff53252001-04-12 18:38:48 +00007167 rescnt -= 2;
7168 width -= 2;
7169 if (width < 0)
7170 width = 0;
7171 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007172 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007173 if (width > len && !(flags & F_LJUST)) {
7174 do {
7175 --rescnt;
7176 *res++ = fill;
7177 } while (--width > len);
7178 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007179 if (fill == ' ') {
7180 if (sign)
7181 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007182 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007183 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007184 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007185 *res++ = *pbuf++;
7186 *res++ = *pbuf++;
7187 }
7188 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007189 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190 res += len;
7191 rescnt -= len;
7192 while (--width >= len) {
7193 --rescnt;
7194 *res++ = ' ';
7195 }
7196 if (dict && (argidx < arglen) && c != '%') {
7197 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007198 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007199 goto onError;
7200 }
7201 Py_XDECREF(temp);
7202 } /* '%' */
7203 } /* until end */
7204 if (argidx < arglen && !dict) {
7205 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007206 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007207 goto onError;
7208 }
7209
7210 if (args_owned) {
7211 Py_DECREF(args);
7212 }
7213 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00007214 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007215 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216 return (PyObject *)result;
7217
7218 onError:
7219 Py_XDECREF(result);
7220 Py_DECREF(uformat);
7221 if (args_owned) {
7222 Py_DECREF(args);
7223 }
7224 return NULL;
7225}
7226
7227static PyBufferProcs unicode_as_buffer = {
7228 (getreadbufferproc) unicode_buffer_getreadbuf,
7229 (getwritebufferproc) unicode_buffer_getwritebuf,
7230 (getsegcountproc) unicode_buffer_getsegcount,
7231 (getcharbufferproc) unicode_buffer_getcharbuf,
7232};
7233
Jeremy Hylton938ace62002-07-17 16:30:39 +00007234static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007235unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7236
Tim Peters6d6c1a32001-08-02 04:15:00 +00007237static PyObject *
7238unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7239{
7240 PyObject *x = NULL;
Jeremy Hyltonaf68c872005-12-10 18:50:16 +00007241 static const char *kwlist[] = {"string", "encoding", "errors", 0};
Tim Peters6d6c1a32001-08-02 04:15:00 +00007242 char *encoding = NULL;
7243 char *errors = NULL;
7244
Guido van Rossume023fe02001-08-30 03:12:59 +00007245 if (type != &PyUnicode_Type)
7246 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007247 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7248 kwlist, &x, &encoding, &errors))
7249 return NULL;
7250 if (x == NULL)
7251 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007252 if (encoding == NULL && errors == NULL)
7253 return PyObject_Unicode(x);
7254 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007255 return PyUnicode_FromEncodedObject(x, encoding, errors);
7256}
7257
Guido van Rossume023fe02001-08-30 03:12:59 +00007258static PyObject *
7259unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7260{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007261 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007262 int n;
7263
7264 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7265 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7266 if (tmp == NULL)
7267 return NULL;
7268 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007269 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007270 if (pnew == NULL) {
7271 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007272 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007273 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007274 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7275 if (pnew->str == NULL) {
7276 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007277 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007278 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007279 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007280 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007281 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7282 pnew->length = n;
7283 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007284 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007285 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007286}
7287
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007288PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007289"unicode(string [, encoding[, errors]]) -> object\n\
7290\n\
7291Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007292encoding defaults to the current default string encoding.\n\
7293errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007294
Guido van Rossumd57fd912000-03-10 22:53:23 +00007295PyTypeObject PyUnicode_Type = {
7296 PyObject_HEAD_INIT(&PyType_Type)
7297 0, /* ob_size */
7298 "unicode", /* tp_name */
7299 sizeof(PyUnicodeObject), /* tp_size */
7300 0, /* tp_itemsize */
7301 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007302 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007303 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007304 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007305 0, /* tp_setattr */
7306 (cmpfunc) unicode_compare, /* tp_compare */
7307 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007308 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007309 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007310 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007311 (hashfunc) unicode_hash, /* tp_hash*/
7312 0, /* tp_call*/
7313 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007314 PyObject_GenericGetAttr, /* tp_getattro */
7315 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007316 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007317 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7318 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007319 unicode_doc, /* tp_doc */
7320 0, /* tp_traverse */
7321 0, /* tp_clear */
7322 0, /* tp_richcompare */
7323 0, /* tp_weaklistoffset */
7324 0, /* tp_iter */
7325 0, /* tp_iternext */
7326 unicode_methods, /* tp_methods */
7327 0, /* tp_members */
7328 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007329 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007330 0, /* tp_dict */
7331 0, /* tp_descr_get */
7332 0, /* tp_descr_set */
7333 0, /* tp_dictoffset */
7334 0, /* tp_init */
7335 0, /* tp_alloc */
7336 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007337 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007338};
7339
7340/* Initialize the Unicode implementation */
7341
Thomas Wouters78890102000-07-22 19:25:51 +00007342void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007343{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007344 int i;
7345
Fred Drakee4315f52000-05-09 19:53:39 +00007346 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007347 unicode_freelist = NULL;
7348 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007349 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007350 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007351 for (i = 0; i < 256; i++)
7352 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007353 if (PyType_Ready(&PyUnicode_Type) < 0)
7354 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007355}
7356
7357/* Finalize the Unicode implementation */
7358
7359void
Thomas Wouters78890102000-07-22 19:25:51 +00007360_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007361{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007362 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007363 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007364
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007365 Py_XDECREF(unicode_empty);
7366 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007367
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007368 for (i = 0; i < 256; i++) {
7369 if (unicode_latin1[i]) {
7370 Py_DECREF(unicode_latin1[i]);
7371 unicode_latin1[i] = NULL;
7372 }
7373 }
7374
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007375 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007376 PyUnicodeObject *v = u;
7377 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007378 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007379 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007380 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007381 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007382 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007383 unicode_freelist = NULL;
7384 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007385}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007386
7387/*
7388Local variables:
7389c-basic-offset: 4
7390indent-tabs-mode: nil
7391End:
7392*/