blob: db2a6900cb46161dd571d28e98c3d41f47c63de7 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Threshold for the free list "keep alive" optimization.

   When an object is put on the free list and its length is below this
   limit, its character buffer is left allocated so that it can be
   reused by the next allocation. This reduces malloc() overhead for
   small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; little endian is assumed unless
   WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000101 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
157 unicode->length = length;
158
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000159 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000160 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 }
165 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000166
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 return 0;
168}
169
170/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000171 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
175
176*/
177
178static
179PyUnicodeObject *_PyUnicode_New(int length)
180{
181 register PyUnicodeObject *unicode;
182
Tim Petersced69f82003-09-16 20:30:58 +0000183 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
187 }
188
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000192 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000197 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000198 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000199 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000201 }
202 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000203 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000205 }
206 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213 }
214
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000215 if (!unicode->str) {
216 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000217 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000219 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
225 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000226 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000230 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000235 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
Guido van Rossum9475a232001-10-05 20:51:39 +0000240void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000261 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000262 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264}
265
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000266int PyUnicode_Resize(PyObject **unicode, int length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000267{
268 register PyUnicodeObject *v;
269
270 /* Argument checks */
271 if (unicode == NULL) {
272 PyErr_BadInternalCall();
273 return -1;
274 }
275 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000277 PyErr_BadInternalCall();
278 return -1;
279 }
280
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000284 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285 (v == unicode_empty || v->length == 1)) {
286 PyUnicodeObject *w = _PyUnicode_New(length);
287 if (w == NULL)
288 return -1;
289 Py_UNICODE_COPY(w->str, v->str,
290 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000291 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000292 *unicode = (PyObject *)w;
293 return 0;
294 }
295
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v, length);
299}
300
301/* Internal API for use in unicodeobject.c only ! */
302#define _PyUnicode_Resize(unicodevar, length) \
303 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
306 int size)
307{
308 PyUnicodeObject *unicode;
309
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
312 if (u != NULL) {
313
314 /* Optimization for empty strings */
315 if (size == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return (PyObject *)unicode_empty;
318 }
319
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size == 1 && *u < 256) {
323 unicode = unicode_latin1[*u];
324 if (!unicode) {
325 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 if (!unicode)
327 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000328 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 unicode_latin1[*u] = unicode;
330 }
331 Py_INCREF(unicode);
332 return (PyObject *)unicode;
333 }
334 }
Tim Petersced69f82003-09-16 20:30:58 +0000335
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 unicode = _PyUnicode_New(size);
337 if (!unicode)
338 return NULL;
339
340 /* Copy the Unicode data into the new object */
341 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343
344 return (PyObject *)unicode;
345}
346
347#ifdef HAVE_WCHAR_H
348
349PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
350 int size)
351{
352 PyUnicodeObject *unicode;
353
354 if (w == NULL) {
355 PyErr_BadInternalCall();
356 return NULL;
357 }
358
359 unicode = _PyUnicode_New(size);
360 if (!unicode)
361 return NULL;
362
363 /* Copy the wchar_t data into the new object */
364#ifdef HAVE_USABLE_WCHAR_T
365 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000366#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 {
368 register Py_UNICODE *u;
369 register int i;
370 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000371 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372 *u++ = *w++;
373 }
374#endif
375
376 return (PyObject *)unicode;
377}
378
379int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
380 register wchar_t *w,
381 int size)
382{
383 if (unicode == NULL) {
384 PyErr_BadInternalCall();
385 return -1;
386 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000387
388 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000390 size = PyUnicode_GET_SIZE(unicode) + 1;
391
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392#ifdef HAVE_USABLE_WCHAR_T
393 memcpy(w, unicode->str, size * sizeof(wchar_t));
394#else
395 {
396 register Py_UNICODE *u;
397 register int i;
398 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000399 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400 *w++ = *u++;
401 }
402#endif
403
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000404 if (size > PyUnicode_GET_SIZE(unicode))
405 return PyUnicode_GET_SIZE(unicode);
406 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000407 return size;
408}
409
410#endif
411
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000412PyObject *PyUnicode_FromOrdinal(int ordinal)
413{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000414 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000415
416#ifdef Py_UNICODE_WIDE
417 if (ordinal < 0 || ordinal > 0x10ffff) {
418 PyErr_SetString(PyExc_ValueError,
419 "unichr() arg not in range(0x110000) "
420 "(wide Python build)");
421 return NULL;
422 }
423#else
424 if (ordinal < 0 || ordinal > 0xffff) {
425 PyErr_SetString(PyExc_ValueError,
426 "unichr() arg not in range(0x10000) "
427 "(narrow Python build)");
428 return NULL;
429 }
430#endif
431
Hye-Shik Chang40574832004-04-06 07:24:51 +0000432 s[0] = (Py_UNICODE)ordinal;
433 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000434}
435
Guido van Rossumd57fd912000-03-10 22:53:23 +0000436PyObject *PyUnicode_FromObject(register PyObject *obj)
437{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000438 /* XXX Perhaps we should make this API an alias of
439 PyObject_Unicode() instead ?! */
440 if (PyUnicode_CheckExact(obj)) {
441 Py_INCREF(obj);
442 return obj;
443 }
444 if (PyUnicode_Check(obj)) {
445 /* For a Unicode subtype that's not a Unicode object,
446 return a true Unicode object with the same data. */
447 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
448 PyUnicode_GET_SIZE(obj));
449 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000450 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
451}
452
453PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
454 const char *encoding,
455 const char *errors)
456{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000457 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000458 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000460
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461 if (obj == NULL) {
462 PyErr_BadInternalCall();
463 return NULL;
464 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000465
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000466#if 0
467 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000468 that no encodings is given and then redirect to
469 PyObject_Unicode() which then applies the additional logic for
470 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000471
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000472 NOTE: This API should really only be used for object which
473 represent *encoded* Unicode !
474
475 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000476 if (PyUnicode_Check(obj)) {
477 if (encoding) {
478 PyErr_SetString(PyExc_TypeError,
479 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000480 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000481 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000482 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000483 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000484#else
485 if (PyUnicode_Check(obj)) {
486 PyErr_SetString(PyExc_TypeError,
487 "decoding Unicode is not supported");
488 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000489 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000490#endif
491
492 /* Coerce object */
493 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000494 s = PyString_AS_STRING(obj);
495 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000496 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000497 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
498 /* Overwrite the error message with something more useful in
499 case of a TypeError. */
500 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000501 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000502 "coercing to Unicode: need string or buffer, "
503 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000504 obj->ob_type->tp_name);
505 goto onError;
506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 if (len == 0) {
510 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000511 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000512 }
Tim Petersced69f82003-09-16 20:30:58 +0000513 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000514 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000515
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000516 return v;
517
518 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000519 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000520}
521
522PyObject *PyUnicode_Decode(const char *s,
523 int size,
524 const char *encoding,
525 const char *errors)
526{
527 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000528
529 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000530 encoding = PyUnicode_GetDefaultEncoding();
531
532 /* Shortcuts for common default encodings */
533 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000534 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000535 else if (strcmp(encoding, "latin-1") == 0)
536 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000537#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
538 else if (strcmp(encoding, "mbcs") == 0)
539 return PyUnicode_DecodeMBCS(s, size, errors);
540#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000541 else if (strcmp(encoding, "ascii") == 0)
542 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543
544 /* Decode via the codec registry */
545 buffer = PyBuffer_FromMemory((void *)s, size);
546 if (buffer == NULL)
547 goto onError;
548 unicode = PyCodec_Decode(buffer, encoding, errors);
549 if (unicode == NULL)
550 goto onError;
551 if (!PyUnicode_Check(unicode)) {
552 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000553 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000554 unicode->ob_type->tp_name);
555 Py_DECREF(unicode);
556 goto onError;
557 }
558 Py_DECREF(buffer);
559 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000560
Guido van Rossumd57fd912000-03-10 22:53:23 +0000561 onError:
562 Py_XDECREF(buffer);
563 return NULL;
564}
565
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000566PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
567 const char *encoding,
568 const char *errors)
569{
570 PyObject *v;
571
572 if (!PyUnicode_Check(unicode)) {
573 PyErr_BadArgument();
574 goto onError;
575 }
576
577 if (encoding == NULL)
578 encoding = PyUnicode_GetDefaultEncoding();
579
580 /* Decode via the codec registry */
581 v = PyCodec_Decode(unicode, encoding, errors);
582 if (v == NULL)
583 goto onError;
584 return v;
585
586 onError:
587 return NULL;
588}
589
Guido van Rossumd57fd912000-03-10 22:53:23 +0000590PyObject *PyUnicode_Encode(const Py_UNICODE *s,
591 int size,
592 const char *encoding,
593 const char *errors)
594{
595 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000596
Guido van Rossumd57fd912000-03-10 22:53:23 +0000597 unicode = PyUnicode_FromUnicode(s, size);
598 if (unicode == NULL)
599 return NULL;
600 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
601 Py_DECREF(unicode);
602 return v;
603}
604
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000605PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
606 const char *encoding,
607 const char *errors)
608{
609 PyObject *v;
610
611 if (!PyUnicode_Check(unicode)) {
612 PyErr_BadArgument();
613 goto onError;
614 }
615
616 if (encoding == NULL)
617 encoding = PyUnicode_GetDefaultEncoding();
618
619 /* Encode via the codec registry */
620 v = PyCodec_Encode(unicode, encoding, errors);
621 if (v == NULL)
622 goto onError;
623 return v;
624
625 onError:
626 return NULL;
627}
628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
630 const char *encoding,
631 const char *errors)
632{
633 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000634
Guido van Rossumd57fd912000-03-10 22:53:23 +0000635 if (!PyUnicode_Check(unicode)) {
636 PyErr_BadArgument();
637 goto onError;
638 }
Fred Drakee4315f52000-05-09 19:53:39 +0000639
Tim Petersced69f82003-09-16 20:30:58 +0000640 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000641 encoding = PyUnicode_GetDefaultEncoding();
642
643 /* Shortcuts for common default encodings */
644 if (errors == NULL) {
645 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000646 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000647 else if (strcmp(encoding, "latin-1") == 0)
648 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000649#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
650 else if (strcmp(encoding, "mbcs") == 0)
651 return PyUnicode_AsMBCSString(unicode);
652#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000653 else if (strcmp(encoding, "ascii") == 0)
654 return PyUnicode_AsASCIIString(unicode);
655 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656
657 /* Encode via the codec registry */
658 v = PyCodec_Encode(unicode, encoding, errors);
659 if (v == NULL)
660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 if (!PyString_Check(v)) {
662 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000663 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 v->ob_type->tp_name);
665 Py_DECREF(v);
666 goto onError;
667 }
668 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000669
Guido van Rossumd57fd912000-03-10 22:53:23 +0000670 onError:
671 return NULL;
672}
673
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000674PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
675 const char *errors)
676{
677 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
678
679 if (v)
680 return v;
681 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
682 if (v && errors == NULL)
683 ((PyUnicodeObject *)unicode)->defenc = v;
684 return v;
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
688{
689 if (!PyUnicode_Check(unicode)) {
690 PyErr_BadArgument();
691 goto onError;
692 }
693 return PyUnicode_AS_UNICODE(unicode);
694
695 onError:
696 return NULL;
697}
698
699int PyUnicode_GetSize(PyObject *unicode)
700{
701 if (!PyUnicode_Check(unicode)) {
702 PyErr_BadArgument();
703 goto onError;
704 }
705 return PyUnicode_GET_SIZE(unicode);
706
707 onError:
708 return -1;
709}
710
Thomas Wouters78890102000-07-22 19:25:51 +0000711const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000712{
713 return unicode_default_encoding;
714}
715
716int PyUnicode_SetDefaultEncoding(const char *encoding)
717{
718 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000719
Fred Drakee4315f52000-05-09 19:53:39 +0000720 /* Make sure the encoding is valid. As side effect, this also
721 loads the encoding into the codec registry cache. */
722 v = _PyCodec_Lookup(encoding);
723 if (v == NULL)
724 goto onError;
725 Py_DECREF(v);
726 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000727 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000728 sizeof(unicode_default_encoding));
729 return 0;
730
731 onError:
732 return -1;
733}
734
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000735/* error handling callback helper:
736 build arguments, call the callback and check the arguments,
737 if no exception occured, copy the replacement to the output
738 and adjust various state variables.
739 return 0 on success, -1 on error
740*/
741
742static
743int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
744 const char *encoding, const char *reason,
745 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
746 PyObject **output, int *outpos, Py_UNICODE **outptr)
747{
748 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
749
750 PyObject *restuple = NULL;
751 PyObject *repunicode = NULL;
752 int outsize = PyUnicode_GET_SIZE(*output);
753 int requiredsize;
754 int newpos;
755 Py_UNICODE *repptr;
756 int repsize;
757 int res = -1;
758
759 if (*errorHandler == NULL) {
760 *errorHandler = PyCodec_LookupError(errors);
761 if (*errorHandler == NULL)
762 goto onError;
763 }
764
765 if (*exceptionObject == NULL) {
766 *exceptionObject = PyUnicodeDecodeError_Create(
767 encoding, input, insize, *startinpos, *endinpos, reason);
768 if (*exceptionObject == NULL)
769 goto onError;
770 }
771 else {
772 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
773 goto onError;
774 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
775 goto onError;
776 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
777 goto onError;
778 }
779
780 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
781 if (restuple == NULL)
782 goto onError;
783 if (!PyTuple_Check(restuple)) {
784 PyErr_Format(PyExc_TypeError, &argparse[4]);
785 goto onError;
786 }
787 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
788 goto onError;
789 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000790 newpos = insize+newpos;
791 if (newpos<0 || newpos>insize) {
792 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
793 goto onError;
794 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000795
796 /* need more space? (at least enough for what we
797 have+the replacement+the rest of the string (starting
798 at the new input position), so we won't have to check space
799 when there are no errors in the rest of the string) */
800 repptr = PyUnicode_AS_UNICODE(repunicode);
801 repsize = PyUnicode_GET_SIZE(repunicode);
802 requiredsize = *outpos + repsize + insize-newpos;
803 if (requiredsize > outsize) {
804 if (requiredsize<2*outsize)
805 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000806 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000807 goto onError;
808 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
809 }
810 *endinpos = newpos;
811 *inptr = input + newpos;
812 Py_UNICODE_COPY(*outptr, repptr, repsize);
813 *outptr += repsize;
814 *outpos += repsize;
815 /* we made it! */
816 res = 0;
817
818 onError:
819 Py_XDECREF(restuple);
820 return res;
821}
822
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000823/* --- UTF-7 Codec -------------------------------------------------------- */
824
825/* see RFC2152 for details */
826
/* Classification of the 128 ASCII code points for the UTF-7 codec:
     0 - not special (may be written directly)
     1 - special (cannot be directly encoded)
     2 - whitespace (optional)
     3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
845
Marc-André Lemburge115ec82005-10-19 22:33:31 +0000846/* Note: The comparison (c) <= 0 is a trick to work-around gcc
847 warnings about the comparison always being false; since
848 utf7_special[0] is 1, we can safely make that one comparison
849 true */
850
/* True when c must go through a shift sequence: it lies outside 1..127,
   is class 1 in utf7_special, or is class 2/3 and the matching
   encodeWS/encodeO flag is set.  The range tests come first so the
   table is never indexed out of bounds. */
#define SPECIAL(c, encodeO, encodeWS) \
    ((c) <= 0 || (c) > 127 || \
     utf7_special[(c)] == 1 || \
     (utf7_special[(c)] == 2 && encodeWS) || \
     (utf7_special[(c)] == 3 && encodeO))
855
/* Map the low six bits of n to the corresponding base64 alphabet character. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
     "abcdefghijklmnopqrstuvwxyz" \
     "0123456789+/"[(n) & 0x3f])
/* True when c is one of the 64 base64 alphabet characters (A-Z, a-z,
   0-9, '+', '/').  Written as explicit ASCII range tests rather than
   isalnum(): isalnum() is locale dependent and undefined for arguments
   that are not representable as unsigned char, while c may be any
   Py_UNICODE value here. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')
/* Inverse of B64: map a base64 alphabet character to its 6-bit value
   ('A'..'Z' -> 0..25, 'a'..'z' -> 26..51, '0'..'9' -> 52..61,
   '+' -> 62, '/' -> 63).  Callers are expected to have checked the
   character with B64CHAR first; assumes ASCII character codes. */
#define UB64(c) \
    ((c) == '+' ? 62 : \
     (c) == '/' ? 63 : \
     (c) >= 'a' ? (c) - 'a' + 26 : \
     (c) >= 'A' ? (c) - 'A' : \
     (c) - '0' + 52)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000863
/* Emit one base64 character through out for every complete group of six
   pending bits.  ch holds the pending bits (most recent in the low end)
   and bits their count; bits is reduced by 6 per character written and
   ends below 6.  Wrapped in do/while(0) with parenthesized arguments so
   the macro behaves as a single statement. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6; \
        } \
    } while (0)
869
/* Move every complete 16-bit unit held in ch (bits = number of pending
   bits) to the output through out.  A unit in 0xDC00..0xDFFF is not
   stored: surrogate is set, errmsg is assigned and control jumps to the
   utf7Error label of the enclosing function; the unit decoded while
   surrogate is set is dropped and the flag cleared.  The expansion site
   must therefore provide errmsg and utf7Error.  Wrapped in do/while(0)
   with parenthesized arguments so the macro behaves as a single
   statement. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000888
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000889PyObject *PyUnicode_DecodeUTF7(const char *s,
890 int size,
891 const char *errors)
892{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000893 const char *starts = s;
894 int startinpos;
895 int endinpos;
896 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000897 const char *e;
898 PyUnicodeObject *unicode;
899 Py_UNICODE *p;
900 const char *errmsg = "";
901 int inShift = 0;
902 unsigned int bitsleft = 0;
903 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000904 int surrogate = 0;
905 PyObject *errorHandler = NULL;
906 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000907
908 unicode = _PyUnicode_New(size);
909 if (!unicode)
910 return NULL;
911 if (size == 0)
912 return (PyObject *)unicode;
913
914 p = unicode->str;
915 e = s + size;
916
917 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000918 Py_UNICODE ch;
919 restart:
920 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000921
922 if (inShift) {
923 if ((ch == '-') || !B64CHAR(ch)) {
924 inShift = 0;
925 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000926
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000927 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
928 if (bitsleft >= 6) {
929 /* The shift sequence has a partial character in it. If
930 bitsleft < 6 then we could just classify it as padding
931 but that is not the case here */
932
933 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000934 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000935 }
936 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000937 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000938 here so indicate the potential of a misencoded character. */
939
940 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
941 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
942 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000943 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000944 }
945
946 if (ch == '-') {
947 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000948 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000949 inShift = 1;
950 }
951 } else if (SPECIAL(ch,0,0)) {
952 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000953 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000954 } else {
955 *p++ = ch;
956 }
957 } else {
958 charsleft = (charsleft << 6) | UB64(ch);
959 bitsleft += 6;
960 s++;
961 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
962 }
963 }
964 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000965 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000966 s++;
967 if (s < e && *s == '-') {
968 s++;
969 *p++ = '+';
970 } else
971 {
972 inShift = 1;
973 bitsleft = 0;
974 }
975 }
976 else if (SPECIAL(ch,0,0)) {
977 errmsg = "unexpected special character";
978 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000979 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000980 }
981 else {
982 *p++ = ch;
983 s++;
984 }
985 continue;
986 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000987 outpos = p-PyUnicode_AS_UNICODE(unicode);
988 endinpos = s-starts;
989 if (unicode_decode_call_errorhandler(
990 errors, &errorHandler,
991 "utf7", errmsg,
992 starts, size, &startinpos, &endinpos, &exc, &s,
993 (PyObject **)&unicode, &outpos, &p))
994 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000995 }
996
997 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000998 outpos = p-PyUnicode_AS_UNICODE(unicode);
999 endinpos = size;
1000 if (unicode_decode_call_errorhandler(
1001 errors, &errorHandler,
1002 "utf7", "unterminated shift sequence",
1003 starts, size, &startinpos, &endinpos, &exc, &s,
1004 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001005 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001006 if (s < e)
1007 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001008 }
1009
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001010 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001011 goto onError;
1012
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001013 Py_XDECREF(errorHandler);
1014 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001015 return (PyObject *)unicode;
1016
1017onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001018 Py_XDECREF(errorHandler);
1019 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001020 Py_DECREF(unicode);
1021 return NULL;
1022}
1023
1024
1025PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1026 int size,
1027 int encodeSetO,
1028 int encodeWhiteSpace,
1029 const char *errors)
1030{
1031 PyObject *v;
1032 /* It might be possible to tighten this worst case */
1033 unsigned int cbAllocated = 5 * size;
1034 int inShift = 0;
1035 int i = 0;
1036 unsigned int bitsleft = 0;
1037 unsigned long charsleft = 0;
1038 char * out;
1039 char * start;
1040
1041 if (size == 0)
1042 return PyString_FromStringAndSize(NULL, 0);
1043
1044 v = PyString_FromStringAndSize(NULL, cbAllocated);
1045 if (v == NULL)
1046 return NULL;
1047
1048 start = out = PyString_AS_STRING(v);
1049 for (;i < size; ++i) {
1050 Py_UNICODE ch = s[i];
1051
1052 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001053 if (ch == '+') {
1054 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001055 *out++ = '-';
1056 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1057 charsleft = ch;
1058 bitsleft = 16;
1059 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001060 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001062 } else {
1063 *out++ = (char) ch;
1064 }
1065 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001066 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1067 *out++ = B64(charsleft << (6-bitsleft));
1068 charsleft = 0;
1069 bitsleft = 0;
1070 /* Characters not in the BASE64 set implicitly unshift the sequence
1071 so no '-' is required, except if the character is itself a '-' */
1072 if (B64CHAR(ch) || ch == '-') {
1073 *out++ = '-';
1074 }
1075 inShift = 0;
1076 *out++ = (char) ch;
1077 } else {
1078 bitsleft += 16;
1079 charsleft = (charsleft << 16) | ch;
1080 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1081
1082 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001083 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001084 or '-' then the shift sequence will be terminated implicitly and we
1085 don't have to insert a '-'. */
1086
1087 if (bitsleft == 0) {
1088 if (i + 1 < size) {
1089 Py_UNICODE ch2 = s[i+1];
1090
1091 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001092
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001093 } else if (B64CHAR(ch2) || ch2 == '-') {
1094 *out++ = '-';
1095 inShift = 0;
1096 } else {
1097 inShift = 0;
1098 }
1099
1100 }
1101 else {
1102 *out++ = '-';
1103 inShift = 0;
1104 }
1105 }
Tim Petersced69f82003-09-16 20:30:58 +00001106 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001107 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001108 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001109 if (bitsleft) {
1110 *out++= B64(charsleft << (6-bitsleft) );
1111 *out++ = '-';
1112 }
1113
Tim Peters5de98422002-04-27 18:44:32 +00001114 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001115 return v;
1116}
1117
1118#undef SPECIAL
1119#undef B64
1120#undef B64CHAR
1121#undef UB64
1122#undef ENCODE
1123#undef DECODE
1124
Guido van Rossumd57fd912000-03-10 22:53:23 +00001125/* --- UTF-8 Codec -------------------------------------------------------- */
1126
/* Map UTF-8 encoded prefix byte to sequence length. zero means
   illegal prefix. see RFC 2279 for details */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: not a legal prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four, five, six bytes; 0xfe and 0xff are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1148
Guido van Rossumd57fd912000-03-10 22:53:23 +00001149PyObject *PyUnicode_DecodeUTF8(const char *s,
1150 int size,
1151 const char *errors)
1152{
Walter Dörwald69652032004-09-07 20:24:22 +00001153 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1154}
1155
1156PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1157 int size,
1158 const char *errors,
1159 int *consumed)
1160{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001161 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001163 int startinpos;
1164 int endinpos;
1165 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001166 const char *e;
1167 PyUnicodeObject *unicode;
1168 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001169 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001170 PyObject *errorHandler = NULL;
1171 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001172
1173 /* Note: size will always be longer than the resulting Unicode
1174 character count */
1175 unicode = _PyUnicode_New(size);
1176 if (!unicode)
1177 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001178 if (size == 0) {
1179 if (consumed)
1180 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001182 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183
1184 /* Unpack UTF-8 encoded data */
1185 p = unicode->str;
1186 e = s + size;
1187
1188 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001189 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190
1191 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001192 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193 s++;
1194 continue;
1195 }
1196
1197 n = utf8_code_length[ch];
1198
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001199 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001200 if (consumed)
1201 break;
1202 else {
1203 errmsg = "unexpected end of data";
1204 startinpos = s-starts;
1205 endinpos = size;
1206 goto utf8Error;
1207 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001208 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209
1210 switch (n) {
1211
1212 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001213 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001214 startinpos = s-starts;
1215 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001216 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217
1218 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001219 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001220 startinpos = s-starts;
1221 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001222 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223
1224 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001225 if ((s[1] & 0xc0) != 0x80) {
1226 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001227 startinpos = s-starts;
1228 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001229 goto utf8Error;
1230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001232 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001233 startinpos = s-starts;
1234 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001235 errmsg = "illegal encoding";
1236 goto utf8Error;
1237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001239 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 break;
1241
1242 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001243 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001244 (s[2] & 0xc0) != 0x80) {
1245 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001246 startinpos = s-starts;
1247 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001248 goto utf8Error;
1249 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001251 if (ch < 0x0800) {
1252 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001253 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001254
1255 XXX For wide builds (UCS-4) we should probably try
1256 to recombine the surrogates into a single code
1257 unit.
1258 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001259 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001260 startinpos = s-starts;
1261 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001262 goto utf8Error;
1263 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001265 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001266 break;
1267
1268 case 4:
1269 if ((s[1] & 0xc0) != 0x80 ||
1270 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001271 (s[3] & 0xc0) != 0x80) {
1272 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001273 startinpos = s-starts;
1274 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001275 goto utf8Error;
1276 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001277 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1278 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1279 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001280 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001281 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001282 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001283 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001284 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001285 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001286 startinpos = s-starts;
1287 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001288 goto utf8Error;
1289 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001290#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001291 *p++ = (Py_UNICODE)ch;
1292#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001293 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001294
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001295 /* translate from 10000..10FFFF to 0..FFFF */
1296 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001297
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001298 /* high surrogate = top 10 bits added to D800 */
1299 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001300
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001301 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001302 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001303#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 break;
1305
1306 default:
1307 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001308 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001309 startinpos = s-starts;
1310 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001311 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001312 }
1313 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001314 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001315
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001316 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001317 outpos = p-PyUnicode_AS_UNICODE(unicode);
1318 if (unicode_decode_call_errorhandler(
1319 errors, &errorHandler,
1320 "utf8", errmsg,
1321 starts, size, &startinpos, &endinpos, &exc, &s,
1322 (PyObject **)&unicode, &outpos, &p))
1323 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001324 }
Walter Dörwald69652032004-09-07 20:24:22 +00001325 if (consumed)
1326 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327
1328 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001329 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001330 goto onError;
1331
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001332 Py_XDECREF(errorHandler);
1333 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334 return (PyObject *)unicode;
1335
1336onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001337 Py_XDECREF(errorHandler);
1338 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001339 Py_DECREF(unicode);
1340 return NULL;
1341}
1342
Tim Peters602f7402002-04-27 18:03:26 +00001343/* Allocation strategy: if the string is short, convert into a stack buffer
1344 and allocate exactly as much space needed at the end. Else allocate the
1345 maximum possible needed (4 result bytes per Unicode character), and return
1346 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001347*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001348PyObject *
1349PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1350 int size,
1351 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001352{
Tim Peters602f7402002-04-27 18:03:26 +00001353#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001354
Tim Peters602f7402002-04-27 18:03:26 +00001355 int i; /* index into s of next input byte */
1356 PyObject *v; /* result string object */
1357 char *p; /* next free byte in output buffer */
1358 int nallocated; /* number of result bytes allocated */
1359 int nneeded; /* number of result bytes needed */
1360 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001361
Tim Peters602f7402002-04-27 18:03:26 +00001362 assert(s != NULL);
1363 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001364
Tim Peters602f7402002-04-27 18:03:26 +00001365 if (size <= MAX_SHORT_UNICHARS) {
1366 /* Write into the stack buffer; nallocated can't overflow.
1367 * At the end, we'll allocate exactly as much heap space as it
1368 * turns out we need.
1369 */
1370 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1371 v = NULL; /* will allocate after we're done */
1372 p = stackbuf;
1373 }
1374 else {
1375 /* Overallocate on the heap, and give the excess back at the end. */
1376 nallocated = size * 4;
1377 if (nallocated / 4 != size) /* overflow! */
1378 return PyErr_NoMemory();
1379 v = PyString_FromStringAndSize(NULL, nallocated);
1380 if (v == NULL)
1381 return NULL;
1382 p = PyString_AS_STRING(v);
1383 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001384
Tim Peters602f7402002-04-27 18:03:26 +00001385 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001386 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001387
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001388 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001389 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001390 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001391
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001393 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001394 *p++ = (char)(0xc0 | (ch >> 6));
1395 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001396 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001397 else {
Tim Peters602f7402002-04-27 18:03:26 +00001398 /* Encode UCS2 Unicode ordinals */
1399 if (ch < 0x10000) {
1400 /* Special case: check for high surrogate */
1401 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1402 Py_UCS4 ch2 = s[i];
1403 /* Check for low surrogate and combine the two to
1404 form a UCS4 value */
1405 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001406 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001407 i++;
1408 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001409 }
Tim Peters602f7402002-04-27 18:03:26 +00001410 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001411 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001412 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001413 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1414 *p++ = (char)(0x80 | (ch & 0x3f));
1415 continue;
1416 }
1417encodeUCS4:
1418 /* Encode UCS4 Unicode ordinals */
1419 *p++ = (char)(0xf0 | (ch >> 18));
1420 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1421 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1422 *p++ = (char)(0x80 | (ch & 0x3f));
1423 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001424 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001425
Tim Peters602f7402002-04-27 18:03:26 +00001426 if (v == NULL) {
1427 /* This was stack allocated. */
1428 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1429 assert(nneeded <= nallocated);
1430 v = PyString_FromStringAndSize(stackbuf, nneeded);
1431 }
1432 else {
1433 /* Cut back to size actually needed. */
1434 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1435 assert(nneeded <= nallocated);
1436 _PyString_Resize(&v, nneeded);
1437 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001438 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001439
Tim Peters602f7402002-04-27 18:03:26 +00001440#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441}
1442
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1444{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445 if (!PyUnicode_Check(unicode)) {
1446 PyErr_BadArgument();
1447 return NULL;
1448 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001449 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1450 PyUnicode_GET_SIZE(unicode),
1451 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001452}
1453
1454/* --- UTF-16 Codec ------------------------------------------------------- */
1455
Tim Peters772747b2001-08-09 22:21:55 +00001456PyObject *
1457PyUnicode_DecodeUTF16(const char *s,
1458 int size,
1459 const char *errors,
1460 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001461{
Walter Dörwald69652032004-09-07 20:24:22 +00001462 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1463}
1464
1465PyObject *
1466PyUnicode_DecodeUTF16Stateful(const char *s,
1467 int size,
1468 const char *errors,
1469 int *byteorder,
1470 int *consumed)
1471{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001472 const char *starts = s;
1473 int startinpos;
1474 int endinpos;
1475 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001476 PyUnicodeObject *unicode;
1477 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001478 const unsigned char *q, *e;
1479 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001480 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001481 /* Offsets from q for retrieving byte pairs in the right order. */
1482#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1483 int ihi = 1, ilo = 0;
1484#else
1485 int ihi = 0, ilo = 1;
1486#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001487 PyObject *errorHandler = NULL;
1488 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001489
1490 /* Note: size will always be longer than the resulting Unicode
1491 character count */
1492 unicode = _PyUnicode_New(size);
1493 if (!unicode)
1494 return NULL;
1495 if (size == 0)
1496 return (PyObject *)unicode;
1497
1498 /* Unpack UTF-16 encoded data */
1499 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001500 q = (unsigned char *)s;
1501 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001502
1503 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001504 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001506 /* Check for BOM marks (U+FEFF) in the input and adjust current
1507 byte order setting accordingly. In native mode, the leading BOM
1508 mark is skipped, in all other modes, it is copied to the output
1509 stream as-is (giving a ZWNBSP character). */
1510 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001511 if (size >= 2) {
1512 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001513#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001514 if (bom == 0xFEFF) {
1515 q += 2;
1516 bo = -1;
1517 }
1518 else if (bom == 0xFFFE) {
1519 q += 2;
1520 bo = 1;
1521 }
Tim Petersced69f82003-09-16 20:30:58 +00001522#else
Walter Dörwald69652032004-09-07 20:24:22 +00001523 if (bom == 0xFEFF) {
1524 q += 2;
1525 bo = 1;
1526 }
1527 else if (bom == 0xFFFE) {
1528 q += 2;
1529 bo = -1;
1530 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001531#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001532 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001534
Tim Peters772747b2001-08-09 22:21:55 +00001535 if (bo == -1) {
1536 /* force LE */
1537 ihi = 1;
1538 ilo = 0;
1539 }
1540 else if (bo == 1) {
1541 /* force BE */
1542 ihi = 0;
1543 ilo = 1;
1544 }
1545
1546 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001547 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001548 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001549 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001550 if (consumed)
1551 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001552 errmsg = "truncated data";
1553 startinpos = ((const char *)q)-starts;
1554 endinpos = ((const char *)e)-starts;
1555 goto utf16Error;
1556 /* The remaining input chars are ignored if the callback
1557 chooses to skip the input */
1558 }
1559 ch = (q[ihi] << 8) | q[ilo];
1560
Tim Peters772747b2001-08-09 22:21:55 +00001561 q += 2;
1562
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563 if (ch < 0xD800 || ch > 0xDFFF) {
1564 *p++ = ch;
1565 continue;
1566 }
1567
1568 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001569 if (q >= e) {
1570 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001571 startinpos = (((const char *)q)-2)-starts;
1572 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001573 goto utf16Error;
1574 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001575 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001576 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1577 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001578 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001579#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001580 *p++ = ch;
1581 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001582#else
1583 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001584#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001585 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001586 }
1587 else {
1588 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001589 startinpos = (((const char *)q)-4)-starts;
1590 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001591 goto utf16Error;
1592 }
1593
Guido van Rossumd57fd912000-03-10 22:53:23 +00001594 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001595 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001596 startinpos = (((const char *)q)-2)-starts;
1597 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001598 /* Fall through to report the error */
1599
1600 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001601 outpos = p-PyUnicode_AS_UNICODE(unicode);
1602 if (unicode_decode_call_errorhandler(
1603 errors, &errorHandler,
1604 "utf16", errmsg,
1605 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1606 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001607 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608 }
1609
1610 if (byteorder)
1611 *byteorder = bo;
1612
Walter Dörwald69652032004-09-07 20:24:22 +00001613 if (consumed)
1614 *consumed = (const char *)q-starts;
1615
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001617 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618 goto onError;
1619
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001620 Py_XDECREF(errorHandler);
1621 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622 return (PyObject *)unicode;
1623
1624onError:
1625 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001626 Py_XDECREF(errorHandler);
1627 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001628 return NULL;
1629}
1630
Tim Peters772747b2001-08-09 22:21:55 +00001631PyObject *
1632PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1633 int size,
1634 const char *errors,
1635 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001636{
1637 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001638 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001639#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001640 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001641#else
1642 const int pairs = 0;
1643#endif
Tim Peters772747b2001-08-09 22:21:55 +00001644 /* Offsets from p for storing byte pairs in the right order. */
1645#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1646 int ihi = 1, ilo = 0;
1647#else
1648 int ihi = 0, ilo = 1;
1649#endif
1650
1651#define STORECHAR(CH) \
1652 do { \
1653 p[ihi] = ((CH) >> 8) & 0xff; \
1654 p[ilo] = (CH) & 0xff; \
1655 p += 2; \
1656 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001658#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001659 for (i = pairs = 0; i < size; i++)
1660 if (s[i] >= 0x10000)
1661 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001662#endif
Tim Petersced69f82003-09-16 20:30:58 +00001663 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001664 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665 if (v == NULL)
1666 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001667
Tim Peters772747b2001-08-09 22:21:55 +00001668 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001670 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001671 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001672 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001673
1674 if (byteorder == -1) {
1675 /* force LE */
1676 ihi = 1;
1677 ilo = 0;
1678 }
1679 else if (byteorder == 1) {
1680 /* force BE */
1681 ihi = 0;
1682 ilo = 1;
1683 }
1684
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001685 while (size-- > 0) {
1686 Py_UNICODE ch = *s++;
1687 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001688#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001689 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001690 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1691 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001693#endif
Tim Peters772747b2001-08-09 22:21:55 +00001694 STORECHAR(ch);
1695 if (ch2)
1696 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001697 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001699#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700}
1701
1702PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1703{
1704 if (!PyUnicode_Check(unicode)) {
1705 PyErr_BadArgument();
1706 return NULL;
1707 }
1708 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1709 PyUnicode_GET_SIZE(unicode),
1710 NULL,
1711 0);
1712}
1713
1714/* --- Unicode Escape Codec ----------------------------------------------- */
1715
Fredrik Lundh06d12682001-01-24 07:59:11 +00001716static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001717
Guido van Rossumd57fd912000-03-10 22:53:23 +00001718PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1719 int size,
1720 const char *errors)
1721{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001722 const char *starts = s;
1723 int startinpos;
1724 int endinpos;
1725 int outpos;
1726 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001728 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001730 char* message;
1731 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001732 PyObject *errorHandler = NULL;
1733 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001734
Guido van Rossumd57fd912000-03-10 22:53:23 +00001735 /* Escaped strings will always be longer than the resulting
1736 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001737 length after conversion to the true value.
1738 (but if the error callback returns a long replacement string
1739 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001740 v = _PyUnicode_New(size);
1741 if (v == NULL)
1742 goto onError;
1743 if (size == 0)
1744 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001745
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001746 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001748
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749 while (s < end) {
1750 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001751 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001752 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753
1754 /* Non-escape characters are interpreted as Unicode ordinals */
1755 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001756 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757 continue;
1758 }
1759
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001760 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001761 /* \ - Escapes */
1762 s++;
1763 switch (*s++) {
1764
1765 /* \x escapes */
1766 case '\n': break;
1767 case '\\': *p++ = '\\'; break;
1768 case '\'': *p++ = '\''; break;
1769 case '\"': *p++ = '\"'; break;
1770 case 'b': *p++ = '\b'; break;
1771 case 'f': *p++ = '\014'; break; /* FF */
1772 case 't': *p++ = '\t'; break;
1773 case 'n': *p++ = '\n'; break;
1774 case 'r': *p++ = '\r'; break;
1775 case 'v': *p++ = '\013'; break; /* VT */
1776 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1777
1778 /* \OOO (octal) escapes */
1779 case '0': case '1': case '2': case '3':
1780 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001781 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001783 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001785 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001787 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788 break;
1789
Fredrik Lundhccc74732001-02-18 22:13:49 +00001790 /* hex escapes */
1791 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001792 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001793 digits = 2;
1794 message = "truncated \\xXX escape";
1795 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001796
Fredrik Lundhccc74732001-02-18 22:13:49 +00001797 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001798 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001799 digits = 4;
1800 message = "truncated \\uXXXX escape";
1801 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802
Fredrik Lundhccc74732001-02-18 22:13:49 +00001803 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001804 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001805 digits = 8;
1806 message = "truncated \\UXXXXXXXX escape";
1807 hexescape:
1808 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001809 outpos = p-PyUnicode_AS_UNICODE(v);
1810 if (s+digits>end) {
1811 endinpos = size;
1812 if (unicode_decode_call_errorhandler(
1813 errors, &errorHandler,
1814 "unicodeescape", "end of string in escape sequence",
1815 starts, size, &startinpos, &endinpos, &exc, &s,
1816 (PyObject **)&v, &outpos, &p))
1817 goto onError;
1818 goto nextByte;
1819 }
1820 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001821 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001822 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001823 endinpos = (s+i+1)-starts;
1824 if (unicode_decode_call_errorhandler(
1825 errors, &errorHandler,
1826 "unicodeescape", message,
1827 starts, size, &startinpos, &endinpos, &exc, &s,
1828 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001829 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001830 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001831 }
1832 chr = (chr<<4) & ~0xF;
1833 if (c >= '0' && c <= '9')
1834 chr += c - '0';
1835 else if (c >= 'a' && c <= 'f')
1836 chr += 10 + c - 'a';
1837 else
1838 chr += 10 + c - 'A';
1839 }
1840 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001841 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001842 /* _decoding_error will have already written into the
1843 target buffer. */
1844 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001845 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001846 /* when we get here, chr is a 32-bit unicode character */
1847 if (chr <= 0xffff)
1848 /* UCS-2 character */
1849 *p++ = (Py_UNICODE) chr;
1850 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001851 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001852 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001853#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001854 *p++ = chr;
1855#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001856 chr -= 0x10000L;
1857 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001858 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001859#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001860 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001861 endinpos = s-starts;
1862 outpos = p-PyUnicode_AS_UNICODE(v);
1863 if (unicode_decode_call_errorhandler(
1864 errors, &errorHandler,
1865 "unicodeescape", "illegal Unicode character",
1866 starts, size, &startinpos, &endinpos, &exc, &s,
1867 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001868 goto onError;
1869 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001870 break;
1871
1872 /* \N{name} */
1873 case 'N':
1874 message = "malformed \\N character escape";
1875 if (ucnhash_CAPI == NULL) {
1876 /* load the unicode data module */
1877 PyObject *m, *v;
1878 m = PyImport_ImportModule("unicodedata");
1879 if (m == NULL)
1880 goto ucnhashError;
1881 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1882 Py_DECREF(m);
1883 if (v == NULL)
1884 goto ucnhashError;
1885 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1886 Py_DECREF(v);
1887 if (ucnhash_CAPI == NULL)
1888 goto ucnhashError;
1889 }
1890 if (*s == '{') {
1891 const char *start = s+1;
1892 /* look for the closing brace */
1893 while (*s != '}' && s < end)
1894 s++;
1895 if (s > start && s < end && *s == '}') {
1896 /* found a name. look it up in the unicode database */
1897 message = "unknown Unicode character name";
1898 s++;
1899 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1900 goto store;
1901 }
1902 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001903 endinpos = s-starts;
1904 outpos = p-PyUnicode_AS_UNICODE(v);
1905 if (unicode_decode_call_errorhandler(
1906 errors, &errorHandler,
1907 "unicodeescape", message,
1908 starts, size, &startinpos, &endinpos, &exc, &s,
1909 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001910 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001911 break;
1912
1913 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001914 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001915 message = "\\ at end of string";
1916 s--;
1917 endinpos = s-starts;
1918 outpos = p-PyUnicode_AS_UNICODE(v);
1919 if (unicode_decode_call_errorhandler(
1920 errors, &errorHandler,
1921 "unicodeescape", message,
1922 starts, size, &startinpos, &endinpos, &exc, &s,
1923 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001924 goto onError;
1925 }
1926 else {
1927 *p++ = '\\';
1928 *p++ = (unsigned char)s[-1];
1929 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001930 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001931 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001932 nextByte:
1933 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001934 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001935 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001936 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001937 Py_XDECREF(errorHandler);
1938 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001940
Fredrik Lundhccc74732001-02-18 22:13:49 +00001941ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001942 PyErr_SetString(
1943 PyExc_UnicodeError,
1944 "\\N escapes not supported (can't load unicodedata module)"
1945 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001946 Py_XDECREF(errorHandler);
1947 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001948 return NULL;
1949
Fredrik Lundhccc74732001-02-18 22:13:49 +00001950onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001952 Py_XDECREF(errorHandler);
1953 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954 return NULL;
1955}
1956
1957/* Return a Unicode-Escape string version of the Unicode object.
1958
1959 If quotes is true, the string is enclosed in u"" or u'' quotes as
1960 appropriate.
1961
1962*/
1963
Barry Warsaw51ac5802000-03-20 16:36:48 +00001964static const Py_UNICODE *findchar(const Py_UNICODE *s,
1965 int size,
1966 Py_UNICODE ch);
1967
Guido van Rossumd57fd912000-03-10 22:53:23 +00001968static
1969PyObject *unicodeescape_string(const Py_UNICODE *s,
1970 int size,
1971 int quotes)
1972{
1973 PyObject *repr;
1974 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001976 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977
1978 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1979 if (repr == NULL)
1980 return NULL;
1981
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001982 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983
1984 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001986 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 !findchar(s, size, '"')) ? '"' : '\'';
1988 }
1989 while (size-- > 0) {
1990 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001991
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992 /* Escape quotes */
Tim Petersced69f82003-09-16 20:30:58 +00001993 if (quotes &&
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001994 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001995 *p++ = '\\';
1996 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001997 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001998 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001999
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002000#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002001 /* Map 21-bit characters to '\U00xxxxxx' */
2002 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002003 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00002004
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002005 /* Resize the string if necessary */
2006 if (offset + 12 > PyString_GET_SIZE(repr)) {
2007 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002008 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002009 p = PyString_AS_STRING(repr) + offset;
2010 }
2011
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002012 *p++ = '\\';
2013 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002014 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2015 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2016 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2017 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2018 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2019 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2020 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002021 *p++ = hexdigit[ch & 0x0000000F];
2022 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002023 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002024#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002025 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2026 else if (ch >= 0xD800 && ch < 0xDC00) {
2027 Py_UNICODE ch2;
2028 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002029
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002030 ch2 = *s++;
2031 size--;
2032 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2033 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2034 *p++ = '\\';
2035 *p++ = 'U';
2036 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2037 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2038 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2039 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2040 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2041 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2042 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2043 *p++ = hexdigit[ucs & 0x0000000F];
2044 continue;
2045 }
2046 /* Fall through: isolated surrogates are copied as-is */
2047 s--;
2048 size++;
2049 }
2050
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002052 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 *p++ = '\\';
2054 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002055 *p++ = hexdigit[(ch >> 12) & 0x000F];
2056 *p++ = hexdigit[(ch >> 8) & 0x000F];
2057 *p++ = hexdigit[(ch >> 4) & 0x000F];
2058 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002060
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002061 /* Map special whitespace to '\t', \n', '\r' */
2062 else if (ch == '\t') {
2063 *p++ = '\\';
2064 *p++ = 't';
2065 }
2066 else if (ch == '\n') {
2067 *p++ = '\\';
2068 *p++ = 'n';
2069 }
2070 else if (ch == '\r') {
2071 *p++ = '\\';
2072 *p++ = 'r';
2073 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002074
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002075 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002076 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002077 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002078 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002079 *p++ = hexdigit[(ch >> 4) & 0x000F];
2080 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002081 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002082
Guido van Rossumd57fd912000-03-10 22:53:23 +00002083 /* Copy everything else as-is */
2084 else
2085 *p++ = (char) ch;
2086 }
2087 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002088 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002089
2090 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002091 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092 return repr;
2093}
2094
2095PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2096 int size)
2097{
2098 return unicodeescape_string(s, size, 0);
2099}
2100
2101PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2102{
2103 if (!PyUnicode_Check(unicode)) {
2104 PyErr_BadArgument();
2105 return NULL;
2106 }
2107 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2108 PyUnicode_GET_SIZE(unicode));
2109}
2110
2111/* --- Raw Unicode Escape Codec ------------------------------------------- */
2112
2113PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2114 int size,
2115 const char *errors)
2116{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002117 const char *starts = s;
2118 int startinpos;
2119 int endinpos;
2120 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002122 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 const char *end;
2124 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002125 PyObject *errorHandler = NULL;
2126 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002127
Guido van Rossumd57fd912000-03-10 22:53:23 +00002128 /* Escaped strings will always be longer than the resulting
2129 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002130 length after conversion to the true value. (But decoding error
2131 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002132 v = _PyUnicode_New(size);
2133 if (v == NULL)
2134 goto onError;
2135 if (size == 0)
2136 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002137 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002138 end = s + size;
2139 while (s < end) {
2140 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002141 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002143 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144
2145 /* Non-escape characters are interpreted as Unicode ordinals */
2146 if (*s != '\\') {
2147 *p++ = (unsigned char)*s++;
2148 continue;
2149 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002150 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002151
2152 /* \u-escapes are only interpreted iff the number of leading
2153 backslashes if odd */
2154 bs = s;
2155 for (;s < end;) {
2156 if (*s != '\\')
2157 break;
2158 *p++ = (unsigned char)*s++;
2159 }
2160 if (((s - bs) & 1) == 0 ||
2161 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002162 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163 continue;
2164 }
2165 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002166 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167 s++;
2168
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002169 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002170 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002171 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002172 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002173 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002174 endinpos = s-starts;
2175 if (unicode_decode_call_errorhandler(
2176 errors, &errorHandler,
2177 "rawunicodeescape", "truncated \\uXXXX",
2178 starts, size, &startinpos, &endinpos, &exc, &s,
2179 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002181 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 }
2183 x = (x<<4) & ~0xF;
2184 if (c >= '0' && c <= '9')
2185 x += c - '0';
2186 else if (c >= 'a' && c <= 'f')
2187 x += 10 + c - 'a';
2188 else
2189 x += 10 + c - 'A';
2190 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002191#ifndef Py_UNICODE_WIDE
2192 if (x > 0x10000) {
2193 if (unicode_decode_call_errorhandler(
2194 errors, &errorHandler,
2195 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2196 starts, size, &startinpos, &endinpos, &exc, &s,
2197 (PyObject **)&v, &outpos, &p))
2198 goto onError;
2199 }
2200#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002201 *p++ = x;
2202 nextByte:
2203 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002205 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002206 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002207 Py_XDECREF(errorHandler);
2208 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002210
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211 onError:
2212 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002213 Py_XDECREF(errorHandler);
2214 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215 return NULL;
2216}
2217
2218PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2219 int size)
2220{
2221 PyObject *repr;
2222 char *p;
2223 char *q;
2224
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002225 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002227#ifdef Py_UNICODE_WIDE
2228 repr = PyString_FromStringAndSize(NULL, 10 * size);
2229#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002231#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232 if (repr == NULL)
2233 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002234 if (size == 0)
2235 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236
2237 p = q = PyString_AS_STRING(repr);
2238 while (size-- > 0) {
2239 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002240#ifdef Py_UNICODE_WIDE
2241 /* Map 32-bit characters to '\Uxxxxxxxx' */
2242 if (ch >= 0x10000) {
2243 *p++ = '\\';
2244 *p++ = 'U';
2245 *p++ = hexdigit[(ch >> 28) & 0xf];
2246 *p++ = hexdigit[(ch >> 24) & 0xf];
2247 *p++ = hexdigit[(ch >> 20) & 0xf];
2248 *p++ = hexdigit[(ch >> 16) & 0xf];
2249 *p++ = hexdigit[(ch >> 12) & 0xf];
2250 *p++ = hexdigit[(ch >> 8) & 0xf];
2251 *p++ = hexdigit[(ch >> 4) & 0xf];
2252 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002253 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002254 else
2255#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002256 /* Map 16-bit characters to '\uxxxx' */
2257 if (ch >= 256) {
2258 *p++ = '\\';
2259 *p++ = 'u';
2260 *p++ = hexdigit[(ch >> 12) & 0xf];
2261 *p++ = hexdigit[(ch >> 8) & 0xf];
2262 *p++ = hexdigit[(ch >> 4) & 0xf];
2263 *p++ = hexdigit[ch & 15];
2264 }
2265 /* Copy everything else as-is */
2266 else
2267 *p++ = (char) ch;
2268 }
2269 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002270 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002271 return repr;
2272}
2273
2274PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2275{
2276 if (!PyUnicode_Check(unicode)) {
2277 PyErr_BadArgument();
2278 return NULL;
2279 }
2280 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2281 PyUnicode_GET_SIZE(unicode));
2282}
2283
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002284/* --- Unicode Internal Codec ------------------------------------------- */
2285
2286PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
2287 int size,
2288 const char *errors)
2289{
2290 const char *starts = s;
2291 int startinpos;
2292 int endinpos;
2293 int outpos;
2294 Py_UNICODE unimax;
2295 PyUnicodeObject *v;
2296 Py_UNICODE *p;
2297 const char *end;
2298 const char *reason;
2299 PyObject *errorHandler = NULL;
2300 PyObject *exc = NULL;
2301
2302 unimax = PyUnicode_GetMax();
2303 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2304 if (v == NULL)
2305 goto onError;
2306 if (PyUnicode_GetSize((PyObject *)v) == 0)
2307 return (PyObject *)v;
2308 p = PyUnicode_AS_UNICODE(v);
2309 end = s + size;
2310
2311 while (s < end) {
2312 *p = *(Py_UNICODE *)s;
2313 /* We have to sanity check the raw data, otherwise doom looms for
2314 some malformed UCS-4 data. */
2315 if (
2316 #ifdef Py_UNICODE_WIDE
2317 *p > unimax || *p < 0 ||
2318 #endif
2319 end-s < Py_UNICODE_SIZE
2320 )
2321 {
2322 startinpos = s - starts;
2323 if (end-s < Py_UNICODE_SIZE) {
2324 endinpos = end-starts;
2325 reason = "truncated input";
2326 }
2327 else {
2328 endinpos = s - starts + Py_UNICODE_SIZE;
2329 reason = "illegal code point (> 0x10FFFF)";
2330 }
2331 outpos = p - PyUnicode_AS_UNICODE(v);
2332 if (unicode_decode_call_errorhandler(
2333 errors, &errorHandler,
2334 "unicode_internal", reason,
2335 starts, size, &startinpos, &endinpos, &exc, &s,
2336 (PyObject **)&v, &outpos, &p)) {
2337 goto onError;
2338 }
2339 }
2340 else {
2341 p++;
2342 s += Py_UNICODE_SIZE;
2343 }
2344 }
2345
2346 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
2347 goto onError;
2348 Py_XDECREF(errorHandler);
2349 Py_XDECREF(exc);
2350 return (PyObject *)v;
2351
2352 onError:
2353 Py_XDECREF(v);
2354 Py_XDECREF(errorHandler);
2355 Py_XDECREF(exc);
2356 return NULL;
2357}
2358
Guido van Rossumd57fd912000-03-10 22:53:23 +00002359/* --- Latin-1 Codec ------------------------------------------------------ */
2360
2361PyObject *PyUnicode_DecodeLatin1(const char *s,
2362 int size,
2363 const char *errors)
2364{
2365 PyUnicodeObject *v;
2366 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002367
Guido van Rossumd57fd912000-03-10 22:53:23 +00002368 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002369 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002370 Py_UNICODE r = *(unsigned char*)s;
2371 return PyUnicode_FromUnicode(&r, 1);
2372 }
2373
Guido van Rossumd57fd912000-03-10 22:53:23 +00002374 v = _PyUnicode_New(size);
2375 if (v == NULL)
2376 goto onError;
2377 if (size == 0)
2378 return (PyObject *)v;
2379 p = PyUnicode_AS_UNICODE(v);
2380 while (size-- > 0)
2381 *p++ = (unsigned char)*s++;
2382 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002383
Guido van Rossumd57fd912000-03-10 22:53:23 +00002384 onError:
2385 Py_XDECREF(v);
2386 return NULL;
2387}
2388
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002389/* create or adjust a UnicodeEncodeError */
2390static void make_encode_exception(PyObject **exceptionObject,
2391 const char *encoding,
2392 const Py_UNICODE *unicode, int size,
2393 int startpos, int endpos,
2394 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002395{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002396 if (*exceptionObject == NULL) {
2397 *exceptionObject = PyUnicodeEncodeError_Create(
2398 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002399 }
2400 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002401 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2402 goto onError;
2403 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2404 goto onError;
2405 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2406 goto onError;
2407 return;
2408 onError:
2409 Py_DECREF(*exceptionObject);
2410 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002411 }
2412}
2413
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002414/* raises a UnicodeEncodeError */
2415static void raise_encode_exception(PyObject **exceptionObject,
2416 const char *encoding,
2417 const Py_UNICODE *unicode, int size,
2418 int startpos, int endpos,
2419 const char *reason)
2420{
2421 make_encode_exception(exceptionObject,
2422 encoding, unicode, size, startpos, endpos, reason);
2423 if (*exceptionObject != NULL)
2424 PyCodec_StrictErrors(*exceptionObject);
2425}
2426
2427/* error handling callback helper:
2428 build arguments, call the callback and check the arguments,
2429 put the result into newpos and return the replacement string, which
2430 has to be freed by the caller */
2431static PyObject *unicode_encode_call_errorhandler(const char *errors,
2432 PyObject **errorHandler,
2433 const char *encoding, const char *reason,
2434 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2435 int startpos, int endpos,
2436 int *newpos)
2437{
2438 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2439
2440 PyObject *restuple;
2441 PyObject *resunicode;
2442
2443 if (*errorHandler == NULL) {
2444 *errorHandler = PyCodec_LookupError(errors);
2445 if (*errorHandler == NULL)
2446 return NULL;
2447 }
2448
2449 make_encode_exception(exceptionObject,
2450 encoding, unicode, size, startpos, endpos, reason);
2451 if (*exceptionObject == NULL)
2452 return NULL;
2453
2454 restuple = PyObject_CallFunctionObjArgs(
2455 *errorHandler, *exceptionObject, NULL);
2456 if (restuple == NULL)
2457 return NULL;
2458 if (!PyTuple_Check(restuple)) {
2459 PyErr_Format(PyExc_TypeError, &argparse[4]);
2460 Py_DECREF(restuple);
2461 return NULL;
2462 }
2463 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2464 &resunicode, newpos)) {
2465 Py_DECREF(restuple);
2466 return NULL;
2467 }
2468 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002469 *newpos = size+*newpos;
2470 if (*newpos<0 || *newpos>size) {
2471 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2472 Py_DECREF(restuple);
2473 return NULL;
2474 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002475 Py_INCREF(resunicode);
2476 Py_DECREF(restuple);
2477 return resunicode;
2478}
2479
2480static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2481 int size,
2482 const char *errors,
2483 int limit)
2484{
2485 /* output object */
2486 PyObject *res;
2487 /* pointers to the beginning and end+1 of input */
2488 const Py_UNICODE *startp = p;
2489 const Py_UNICODE *endp = p + size;
2490 /* pointer to the beginning of the unencodable characters */
2491 /* const Py_UNICODE *badp = NULL; */
2492 /* pointer into the output */
2493 char *str;
2494 /* current output position */
2495 int respos = 0;
2496 int ressize;
2497 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2498 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2499 PyObject *errorHandler = NULL;
2500 PyObject *exc = NULL;
2501 /* the following variable is used for caching string comparisons
2502 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2503 int known_errorHandler = -1;
2504
2505 /* allocate enough for a simple encoding without
2506 replacements, if we need more, we'll resize */
2507 res = PyString_FromStringAndSize(NULL, size);
2508 if (res == NULL)
2509 goto onError;
2510 if (size == 0)
2511 return res;
2512 str = PyString_AS_STRING(res);
2513 ressize = size;
2514
2515 while (p<endp) {
2516 Py_UNICODE c = *p;
2517
2518 /* can we encode this? */
2519 if (c<limit) {
2520 /* no overflow check, because we know that the space is enough */
2521 *str++ = (char)c;
2522 ++p;
2523 }
2524 else {
2525 int unicodepos = p-startp;
2526 int requiredsize;
2527 PyObject *repunicode;
2528 int repsize;
2529 int newpos;
2530 int respos;
2531 Py_UNICODE *uni2;
2532 /* startpos for collecting unencodable chars */
2533 const Py_UNICODE *collstart = p;
2534 const Py_UNICODE *collend = p;
2535 /* find all unecodable characters */
2536 while ((collend < endp) && ((*collend)>=limit))
2537 ++collend;
2538 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2539 if (known_errorHandler==-1) {
2540 if ((errors==NULL) || (!strcmp(errors, "strict")))
2541 known_errorHandler = 1;
2542 else if (!strcmp(errors, "replace"))
2543 known_errorHandler = 2;
2544 else if (!strcmp(errors, "ignore"))
2545 known_errorHandler = 3;
2546 else if (!strcmp(errors, "xmlcharrefreplace"))
2547 known_errorHandler = 4;
2548 else
2549 known_errorHandler = 0;
2550 }
2551 switch (known_errorHandler) {
2552 case 1: /* strict */
2553 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2554 goto onError;
2555 case 2: /* replace */
2556 while (collstart++<collend)
2557 *str++ = '?'; /* fall through */
2558 case 3: /* ignore */
2559 p = collend;
2560 break;
2561 case 4: /* xmlcharrefreplace */
2562 respos = str-PyString_AS_STRING(res);
2563 /* determine replacement size (temporarily (mis)uses p) */
2564 for (p = collstart, repsize = 0; p < collend; ++p) {
2565 if (*p<10)
2566 repsize += 2+1+1;
2567 else if (*p<100)
2568 repsize += 2+2+1;
2569 else if (*p<1000)
2570 repsize += 2+3+1;
2571 else if (*p<10000)
2572 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002573#ifndef Py_UNICODE_WIDE
2574 else
2575 repsize += 2+5+1;
2576#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002577 else if (*p<100000)
2578 repsize += 2+5+1;
2579 else if (*p<1000000)
2580 repsize += 2+6+1;
2581 else
2582 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002583#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002584 }
2585 requiredsize = respos+repsize+(endp-collend);
2586 if (requiredsize > ressize) {
2587 if (requiredsize<2*ressize)
2588 requiredsize = 2*ressize;
2589 if (_PyString_Resize(&res, requiredsize))
2590 goto onError;
2591 str = PyString_AS_STRING(res) + respos;
2592 ressize = requiredsize;
2593 }
2594 /* generate replacement (temporarily (mis)uses p) */
2595 for (p = collstart; p < collend; ++p) {
2596 str += sprintf(str, "&#%d;", (int)*p);
2597 }
2598 p = collend;
2599 break;
2600 default:
2601 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2602 encoding, reason, startp, size, &exc,
2603 collstart-startp, collend-startp, &newpos);
2604 if (repunicode == NULL)
2605 goto onError;
2606 /* need more space? (at least enough for what we
2607 have+the replacement+the rest of the string, so
2608 we won't have to check space for encodable characters) */
2609 respos = str-PyString_AS_STRING(res);
2610 repsize = PyUnicode_GET_SIZE(repunicode);
2611 requiredsize = respos+repsize+(endp-collend);
2612 if (requiredsize > ressize) {
2613 if (requiredsize<2*ressize)
2614 requiredsize = 2*ressize;
2615 if (_PyString_Resize(&res, requiredsize)) {
2616 Py_DECREF(repunicode);
2617 goto onError;
2618 }
2619 str = PyString_AS_STRING(res) + respos;
2620 ressize = requiredsize;
2621 }
2622 /* check if there is anything unencodable in the replacement
2623 and copy it to the output */
2624 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2625 c = *uni2;
2626 if (c >= limit) {
2627 raise_encode_exception(&exc, encoding, startp, size,
2628 unicodepos, unicodepos+1, reason);
2629 Py_DECREF(repunicode);
2630 goto onError;
2631 }
2632 *str = (char)c;
2633 }
2634 p = startp + newpos;
2635 Py_DECREF(repunicode);
2636 }
2637 }
2638 }
2639 /* Resize if we allocated to much */
2640 respos = str-PyString_AS_STRING(res);
2641 if (respos<ressize)
2642 /* If this falls res will be NULL */
2643 _PyString_Resize(&res, respos);
2644 Py_XDECREF(errorHandler);
2645 Py_XDECREF(exc);
2646 return res;
2647
2648 onError:
2649 Py_XDECREF(res);
2650 Py_XDECREF(errorHandler);
2651 Py_XDECREF(exc);
2652 return NULL;
2653}
2654
Guido van Rossumd57fd912000-03-10 22:53:23 +00002655PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2656 int size,
2657 const char *errors)
2658{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002659 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002660}
2661
2662PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2663{
2664 if (!PyUnicode_Check(unicode)) {
2665 PyErr_BadArgument();
2666 return NULL;
2667 }
2668 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2669 PyUnicode_GET_SIZE(unicode),
2670 NULL);
2671}
2672
2673/* --- 7-bit ASCII Codec -------------------------------------------------- */
2674
Guido van Rossumd57fd912000-03-10 22:53:23 +00002675PyObject *PyUnicode_DecodeASCII(const char *s,
2676 int size,
2677 const char *errors)
2678{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002679 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002680 PyUnicodeObject *v;
2681 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002682 int startinpos;
2683 int endinpos;
2684 int outpos;
2685 const char *e;
2686 PyObject *errorHandler = NULL;
2687 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002688
Guido van Rossumd57fd912000-03-10 22:53:23 +00002689 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002690 if (size == 1 && *(unsigned char*)s < 128) {
2691 Py_UNICODE r = *(unsigned char*)s;
2692 return PyUnicode_FromUnicode(&r, 1);
2693 }
Tim Petersced69f82003-09-16 20:30:58 +00002694
Guido van Rossumd57fd912000-03-10 22:53:23 +00002695 v = _PyUnicode_New(size);
2696 if (v == NULL)
2697 goto onError;
2698 if (size == 0)
2699 return (PyObject *)v;
2700 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002701 e = s + size;
2702 while (s < e) {
2703 register unsigned char c = (unsigned char)*s;
2704 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002705 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002706 ++s;
2707 }
2708 else {
2709 startinpos = s-starts;
2710 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002711 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002712 if (unicode_decode_call_errorhandler(
2713 errors, &errorHandler,
2714 "ascii", "ordinal not in range(128)",
2715 starts, size, &startinpos, &endinpos, &exc, &s,
2716 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002717 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002718 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002719 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002720 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002721 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002722 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002723 Py_XDECREF(errorHandler);
2724 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002725 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002726
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727 onError:
2728 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002729 Py_XDECREF(errorHandler);
2730 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731 return NULL;
2732}
2733
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2735 int size,
2736 const char *errors)
2737{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002738 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739}
2740
2741PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2742{
2743 if (!PyUnicode_Check(unicode)) {
2744 PyErr_BadArgument();
2745 return NULL;
2746 }
2747 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2748 PyUnicode_GET_SIZE(unicode),
2749 NULL);
2750}
2751
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002752#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002753
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002754/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002755
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002756PyObject *PyUnicode_DecodeMBCS(const char *s,
2757 int size,
2758 const char *errors)
2759{
2760 PyUnicodeObject *v;
2761 Py_UNICODE *p;
2762
2763 /* First get the size of the result */
2764 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002765 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002766 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2767
2768 v = _PyUnicode_New(usize);
2769 if (v == NULL)
2770 return NULL;
2771 if (usize == 0)
2772 return (PyObject *)v;
2773 p = PyUnicode_AS_UNICODE(v);
2774 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2775 Py_DECREF(v);
2776 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2777 }
2778
2779 return (PyObject *)v;
2780}
2781
2782PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2783 int size,
2784 const char *errors)
2785{
2786 PyObject *repr;
2787 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002788 DWORD mbcssize;
2789
2790 /* If there are no characters, bail now! */
2791 if (size==0)
2792 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002793
2794 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002795 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002796 if (mbcssize==0)
2797 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2798
2799 repr = PyString_FromStringAndSize(NULL, mbcssize);
2800 if (repr == NULL)
2801 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002802 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002803 return repr;
2804
2805 /* Do the conversion */
2806 s = PyString_AS_STRING(repr);
2807 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2808 Py_DECREF(repr);
2809 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2810 }
2811 return repr;
2812}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002813
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002814PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2815{
2816 if (!PyUnicode_Check(unicode)) {
2817 PyErr_BadArgument();
2818 return NULL;
2819 }
2820 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2821 PyUnicode_GET_SIZE(unicode),
2822 NULL);
2823}
2824
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002825#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002826
Guido van Rossumd57fd912000-03-10 22:53:23 +00002827/* --- Character Mapping Codec -------------------------------------------- */
2828
Guido van Rossumd57fd912000-03-10 22:53:23 +00002829PyObject *PyUnicode_DecodeCharmap(const char *s,
2830 int size,
2831 PyObject *mapping,
2832 const char *errors)
2833{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002834 const char *starts = s;
2835 int startinpos;
2836 int endinpos;
2837 int outpos;
2838 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002839 PyUnicodeObject *v;
2840 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002841 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002842 PyObject *errorHandler = NULL;
2843 PyObject *exc = NULL;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002844 Py_UNICODE *mapstring = NULL;
2845 int maplen = 0;
Tim Petersced69f82003-09-16 20:30:58 +00002846
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 /* Default to Latin-1 */
2848 if (mapping == NULL)
2849 return PyUnicode_DecodeLatin1(s, size, errors);
2850
2851 v = _PyUnicode_New(size);
2852 if (v == NULL)
2853 goto onError;
2854 if (size == 0)
2855 return (PyObject *)v;
2856 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002857 e = s + size;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002858 if (PyUnicode_CheckExact(mapping)) {
2859 mapstring = PyUnicode_AS_UNICODE(mapping);
2860 maplen = PyUnicode_GET_SIZE(mapping);
2861 while (s < e) {
2862 unsigned char ch = *s;
2863 Py_UNICODE x = 0xfffe; /* illegal value */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002864
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002865 if (ch < maplen)
2866 x = mapstring[ch];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002868 if (x == 0xfffe) {
2869 /* undefined mapping */
2870 outpos = p-PyUnicode_AS_UNICODE(v);
2871 startinpos = s-starts;
2872 endinpos = startinpos+1;
2873 if (unicode_decode_call_errorhandler(
2874 errors, &errorHandler,
2875 "charmap", "character maps to <undefined>",
2876 starts, size, &startinpos, &endinpos, &exc, &s,
2877 (PyObject **)&v, &outpos, &p)) {
2878 goto onError;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002879 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002880 continue;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002881 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002882 *p++ = x;
2883 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002884 }
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002885 }
2886 else {
2887 while (s < e) {
2888 unsigned char ch = *s;
2889 PyObject *w, *x;
2890
2891 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2892 w = PyInt_FromLong((long)ch);
2893 if (w == NULL)
2894 goto onError;
2895 x = PyObject_GetItem(mapping, w);
2896 Py_DECREF(w);
2897 if (x == NULL) {
2898 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2899 /* No mapping found means: mapping is undefined. */
2900 PyErr_Clear();
2901 x = Py_None;
2902 Py_INCREF(x);
2903 } else
2904 goto onError;
2905 }
2906
2907 /* Apply mapping */
2908 if (PyInt_Check(x)) {
2909 long value = PyInt_AS_LONG(x);
2910 if (value < 0 || value > 65535) {
2911 PyErr_SetString(PyExc_TypeError,
2912 "character mapping must be in range(65536)");
2913 Py_DECREF(x);
2914 goto onError;
2915 }
2916 *p++ = (Py_UNICODE)value;
2917 }
2918 else if (x == Py_None) {
2919 /* undefined mapping */
2920 outpos = p-PyUnicode_AS_UNICODE(v);
2921 startinpos = s-starts;
2922 endinpos = startinpos+1;
2923 if (unicode_decode_call_errorhandler(
2924 errors, &errorHandler,
2925 "charmap", "character maps to <undefined>",
2926 starts, size, &startinpos, &endinpos, &exc, &s,
2927 (PyObject **)&v, &outpos, &p)) {
2928 Py_DECREF(x);
2929 goto onError;
2930 }
2931 continue;
2932 }
2933 else if (PyUnicode_Check(x)) {
2934 int targetsize = PyUnicode_GET_SIZE(x);
2935
2936 if (targetsize == 1)
2937 /* 1-1 mapping */
2938 *p++ = *PyUnicode_AS_UNICODE(x);
2939
2940 else if (targetsize > 1) {
2941 /* 1-n mapping */
2942 if (targetsize > extrachars) {
2943 /* resize first */
2944 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2945 int needed = (targetsize - extrachars) + \
2946 (targetsize << 2);
2947 extrachars += needed;
2948 if (_PyUnicode_Resize(&v,
2949 PyUnicode_GET_SIZE(v) + needed) < 0) {
2950 Py_DECREF(x);
2951 goto onError;
2952 }
2953 p = PyUnicode_AS_UNICODE(v) + oldpos;
2954 }
2955 Py_UNICODE_COPY(p,
2956 PyUnicode_AS_UNICODE(x),
2957 targetsize);
2958 p += targetsize;
2959 extrachars -= targetsize;
2960 }
2961 /* 1-0 mapping: skip the character */
2962 }
2963 else {
2964 /* wrong return value */
2965 PyErr_SetString(PyExc_TypeError,
2966 "character mapping must return integer, None or unicode");
2967 Py_DECREF(x);
2968 goto onError;
2969 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002970 Py_DECREF(x);
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00002971 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002973 }
2974 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002975 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002976 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002977 Py_XDECREF(errorHandler);
2978 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002980
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002982 Py_XDECREF(errorHandler);
2983 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984 Py_XDECREF(v);
2985 return NULL;
2986}
2987
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002988/* Lookup the character ch in the mapping. If the character
2989 can't be found, Py_None is returned (or NULL, if another
2990 error occured). */
2991static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002992{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002993 PyObject *w = PyInt_FromLong((long)c);
2994 PyObject *x;
2995
2996 if (w == NULL)
2997 return NULL;
2998 x = PyObject_GetItem(mapping, w);
2999 Py_DECREF(w);
3000 if (x == NULL) {
3001 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3002 /* No mapping found means: mapping is undefined. */
3003 PyErr_Clear();
3004 x = Py_None;
3005 Py_INCREF(x);
3006 return x;
3007 } else
3008 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003009 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00003010 else if (x == Py_None)
3011 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003012 else if (PyInt_Check(x)) {
3013 long value = PyInt_AS_LONG(x);
3014 if (value < 0 || value > 255) {
3015 PyErr_SetString(PyExc_TypeError,
3016 "character mapping must be in range(256)");
3017 Py_DECREF(x);
3018 return NULL;
3019 }
3020 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003021 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003022 else if (PyString_Check(x))
3023 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003024 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003025 /* wrong return value */
3026 PyErr_SetString(PyExc_TypeError,
3027 "character mapping must return integer, None or str");
3028 Py_DECREF(x);
3029 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 }
3031}
3032
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003033/* lookup the character, put the result in the output string and adjust
3034 various state variables. Reallocate the output string if not enough
3035 space is available. Return a new reference to the object that
3036 was put in the output buffer, or Py_None, if the mapping was undefined
3037 (in which case no character was written) or NULL, if a
3038 reallocation error ocurred. The called must decref the result */
3039static
3040PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
3041 PyObject **outobj, int *outpos)
3042{
3043 PyObject *rep = charmapencode_lookup(c, mapping);
3044
3045 if (rep==NULL)
3046 return NULL;
3047 else if (rep==Py_None)
3048 return rep;
3049 else {
3050 char *outstart = PyString_AS_STRING(*outobj);
3051 int outsize = PyString_GET_SIZE(*outobj);
3052 if (PyInt_Check(rep)) {
3053 int requiredsize = *outpos+1;
3054 if (outsize<requiredsize) {
3055 /* exponentially overallocate to minimize reallocations */
3056 if (requiredsize < 2*outsize)
3057 requiredsize = 2*outsize;
3058 if (_PyString_Resize(outobj, requiredsize)) {
3059 Py_DECREF(rep);
3060 return NULL;
3061 }
3062 outstart = PyString_AS_STRING(*outobj);
3063 }
3064 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3065 }
3066 else {
3067 const char *repchars = PyString_AS_STRING(rep);
3068 int repsize = PyString_GET_SIZE(rep);
3069 int requiredsize = *outpos+repsize;
3070 if (outsize<requiredsize) {
3071 /* exponentially overallocate to minimize reallocations */
3072 if (requiredsize < 2*outsize)
3073 requiredsize = 2*outsize;
3074 if (_PyString_Resize(outobj, requiredsize)) {
3075 Py_DECREF(rep);
3076 return NULL;
3077 }
3078 outstart = PyString_AS_STRING(*outobj);
3079 }
3080 memcpy(outstart + *outpos, repchars, repsize);
3081 *outpos += repsize;
3082 }
3083 }
3084 return rep;
3085}
3086
3087/* handle an error in PyUnicode_EncodeCharmap
3088 Return 0 on success, -1 on error */
3089static
3090int charmap_encoding_error(
3091 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
3092 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003093 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003094 PyObject **res, int *respos)
3095{
3096 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3097 int repsize;
3098 int newpos;
3099 Py_UNICODE *uni2;
3100 /* startpos for collecting unencodable chars */
3101 int collstartpos = *inpos;
3102 int collendpos = *inpos+1;
3103 int collpos;
3104 char *encoding = "charmap";
3105 char *reason = "character maps to <undefined>";
3106
3107 PyObject *x;
3108 /* find all unencodable characters */
3109 while (collendpos < size) {
3110 x = charmapencode_lookup(p[collendpos], mapping);
3111 if (x==NULL)
3112 return -1;
3113 else if (x!=Py_None) {
3114 Py_DECREF(x);
3115 break;
3116 }
3117 Py_DECREF(x);
3118 ++collendpos;
3119 }
3120 /* cache callback name lookup
3121 * (if not done yet, i.e. it's the first error) */
3122 if (*known_errorHandler==-1) {
3123 if ((errors==NULL) || (!strcmp(errors, "strict")))
3124 *known_errorHandler = 1;
3125 else if (!strcmp(errors, "replace"))
3126 *known_errorHandler = 2;
3127 else if (!strcmp(errors, "ignore"))
3128 *known_errorHandler = 3;
3129 else if (!strcmp(errors, "xmlcharrefreplace"))
3130 *known_errorHandler = 4;
3131 else
3132 *known_errorHandler = 0;
3133 }
3134 switch (*known_errorHandler) {
3135 case 1: /* strict */
3136 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3137 return -1;
3138 case 2: /* replace */
3139 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3140 x = charmapencode_output('?', mapping, res, respos);
3141 if (x==NULL) {
3142 return -1;
3143 }
3144 else if (x==Py_None) {
3145 Py_DECREF(x);
3146 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3147 return -1;
3148 }
3149 Py_DECREF(x);
3150 }
3151 /* fall through */
3152 case 3: /* ignore */
3153 *inpos = collendpos;
3154 break;
3155 case 4: /* xmlcharrefreplace */
3156 /* generate replacement (temporarily (mis)uses p) */
3157 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3158 char buffer[2+29+1+1];
3159 char *cp;
3160 sprintf(buffer, "&#%d;", (int)p[collpos]);
3161 for (cp = buffer; *cp; ++cp) {
3162 x = charmapencode_output(*cp, mapping, res, respos);
3163 if (x==NULL)
3164 return -1;
3165 else if (x==Py_None) {
3166 Py_DECREF(x);
3167 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3168 return -1;
3169 }
3170 Py_DECREF(x);
3171 }
3172 }
3173 *inpos = collendpos;
3174 break;
3175 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003176 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003177 encoding, reason, p, size, exceptionObject,
3178 collstartpos, collendpos, &newpos);
3179 if (repunicode == NULL)
3180 return -1;
3181 /* generate replacement */
3182 repsize = PyUnicode_GET_SIZE(repunicode);
3183 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3184 x = charmapencode_output(*uni2, mapping, res, respos);
3185 if (x==NULL) {
3186 Py_DECREF(repunicode);
3187 return -1;
3188 }
3189 else if (x==Py_None) {
3190 Py_DECREF(repunicode);
3191 Py_DECREF(x);
3192 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3193 return -1;
3194 }
3195 Py_DECREF(x);
3196 }
3197 *inpos = newpos;
3198 Py_DECREF(repunicode);
3199 }
3200 return 0;
3201}
3202
Guido van Rossumd57fd912000-03-10 22:53:23 +00003203PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3204 int size,
3205 PyObject *mapping,
3206 const char *errors)
3207{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003208 /* output object */
3209 PyObject *res = NULL;
3210 /* current input position */
3211 int inpos = 0;
3212 /* current output position */
3213 int respos = 0;
3214 PyObject *errorHandler = NULL;
3215 PyObject *exc = NULL;
3216 /* the following variable is used for caching string comparisons
3217 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3218 * 3=ignore, 4=xmlcharrefreplace */
3219 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220
3221 /* Default to Latin-1 */
3222 if (mapping == NULL)
3223 return PyUnicode_EncodeLatin1(p, size, errors);
3224
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003225 /* allocate enough for a simple encoding without
3226 replacements, if we need more, we'll resize */
3227 res = PyString_FromStringAndSize(NULL, size);
3228 if (res == NULL)
3229 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003230 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003231 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003232
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003233 while (inpos<size) {
3234 /* try to encode it */
3235 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3236 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003238 if (x==Py_None) { /* unencodable character */
3239 if (charmap_encoding_error(p, size, &inpos, mapping,
3240 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003241 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003242 &res, &respos)) {
3243 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003244 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003245 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003247 else
3248 /* done with this character => adjust input position */
3249 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003250 Py_DECREF(x);
3251 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003252
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003253 /* Resize if we allocated to much */
3254 if (respos<PyString_GET_SIZE(res)) {
3255 if (_PyString_Resize(&res, respos))
3256 goto onError;
3257 }
3258 Py_XDECREF(exc);
3259 Py_XDECREF(errorHandler);
3260 return res;
3261
3262 onError:
3263 Py_XDECREF(res);
3264 Py_XDECREF(exc);
3265 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266 return NULL;
3267}
3268
3269PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3270 PyObject *mapping)
3271{
3272 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3273 PyErr_BadArgument();
3274 return NULL;
3275 }
3276 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3277 PyUnicode_GET_SIZE(unicode),
3278 mapping,
3279 NULL);
3280}
3281
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003282/* create or adjust a UnicodeTranslateError */
3283static void make_translate_exception(PyObject **exceptionObject,
3284 const Py_UNICODE *unicode, int size,
3285 int startpos, int endpos,
3286 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003288 if (*exceptionObject == NULL) {
3289 *exceptionObject = PyUnicodeTranslateError_Create(
3290 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003291 }
3292 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003293 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3294 goto onError;
3295 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3296 goto onError;
3297 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3298 goto onError;
3299 return;
3300 onError:
3301 Py_DECREF(*exceptionObject);
3302 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303 }
3304}
3305
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003306/* raises a UnicodeTranslateError */
3307static void raise_translate_exception(PyObject **exceptionObject,
3308 const Py_UNICODE *unicode, int size,
3309 int startpos, int endpos,
3310 const char *reason)
3311{
3312 make_translate_exception(exceptionObject,
3313 unicode, size, startpos, endpos, reason);
3314 if (*exceptionObject != NULL)
3315 PyCodec_StrictErrors(*exceptionObject);
3316}
3317
3318/* error handling callback helper:
3319 build arguments, call the callback and check the arguments,
3320 put the result into newpos and return the replacement string, which
3321 has to be freed by the caller */
3322static PyObject *unicode_translate_call_errorhandler(const char *errors,
3323 PyObject **errorHandler,
3324 const char *reason,
3325 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3326 int startpos, int endpos,
3327 int *newpos)
3328{
3329 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3330
3331 PyObject *restuple;
3332 PyObject *resunicode;
3333
3334 if (*errorHandler == NULL) {
3335 *errorHandler = PyCodec_LookupError(errors);
3336 if (*errorHandler == NULL)
3337 return NULL;
3338 }
3339
3340 make_translate_exception(exceptionObject,
3341 unicode, size, startpos, endpos, reason);
3342 if (*exceptionObject == NULL)
3343 return NULL;
3344
3345 restuple = PyObject_CallFunctionObjArgs(
3346 *errorHandler, *exceptionObject, NULL);
3347 if (restuple == NULL)
3348 return NULL;
3349 if (!PyTuple_Check(restuple)) {
3350 PyErr_Format(PyExc_TypeError, &argparse[4]);
3351 Py_DECREF(restuple);
3352 return NULL;
3353 }
3354 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3355 &resunicode, newpos)) {
3356 Py_DECREF(restuple);
3357 return NULL;
3358 }
3359 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003360 *newpos = size+*newpos;
3361 if (*newpos<0 || *newpos>size) {
3362 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3363 Py_DECREF(restuple);
3364 return NULL;
3365 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003366 Py_INCREF(resunicode);
3367 Py_DECREF(restuple);
3368 return resunicode;
3369}
3370
3371/* Lookup the character ch in the mapping and put the result in result,
3372 which must be decrefed by the caller.
3373 Return 0 on success, -1 on error */
3374static
3375int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3376{
3377 PyObject *w = PyInt_FromLong((long)c);
3378 PyObject *x;
3379
3380 if (w == NULL)
3381 return -1;
3382 x = PyObject_GetItem(mapping, w);
3383 Py_DECREF(w);
3384 if (x == NULL) {
3385 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3386 /* No mapping found means: use 1:1 mapping. */
3387 PyErr_Clear();
3388 *result = NULL;
3389 return 0;
3390 } else
3391 return -1;
3392 }
3393 else if (x == Py_None) {
3394 *result = x;
3395 return 0;
3396 }
3397 else if (PyInt_Check(x)) {
3398 long value = PyInt_AS_LONG(x);
3399 long max = PyUnicode_GetMax();
3400 if (value < 0 || value > max) {
3401 PyErr_Format(PyExc_TypeError,
3402 "character mapping must be in range(0x%lx)", max+1);
3403 Py_DECREF(x);
3404 return -1;
3405 }
3406 *result = x;
3407 return 0;
3408 }
3409 else if (PyUnicode_Check(x)) {
3410 *result = x;
3411 return 0;
3412 }
3413 else {
3414 /* wrong return value */
3415 PyErr_SetString(PyExc_TypeError,
3416 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003417 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003418 return -1;
3419 }
3420}
3421/* ensure that *outobj is at least requiredsize characters long,
3422if not reallocate and adjust various state variables.
3423Return 0 on success, -1 on error */
3424static
Walter Dörwald4894c302003-10-24 14:25:28 +00003425int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003426 int requiredsize)
3427{
Walter Dörwald4894c302003-10-24 14:25:28 +00003428 int oldsize = PyUnicode_GET_SIZE(*outobj);
3429 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003430 /* remember old output position */
3431 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3432 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003433 if (requiredsize < 2 * oldsize)
3434 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003435 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003436 return -1;
3437 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003438 }
3439 return 0;
3440}
3441/* lookup the character, put the result in the output string and adjust
3442 various state variables. Return a new reference to the object that
3443 was put in the output buffer in *result, or Py_None, if the mapping was
3444 undefined (in which case no character was written).
3445 The called must decref result.
3446 Return 0 on success, -1 on error. */
3447static
Walter Dörwald4894c302003-10-24 14:25:28 +00003448int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3449 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3450 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003451{
Walter Dörwald4894c302003-10-24 14:25:28 +00003452 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003453 return -1;
3454 if (*res==NULL) {
3455 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003456 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003457 }
3458 else if (*res==Py_None)
3459 ;
3460 else if (PyInt_Check(*res)) {
3461 /* no overflow check, because we know that the space is enough */
3462 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3463 }
3464 else if (PyUnicode_Check(*res)) {
3465 int repsize = PyUnicode_GET_SIZE(*res);
3466 if (repsize==1) {
3467 /* no overflow check, because we know that the space is enough */
3468 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3469 }
3470 else if (repsize!=0) {
3471 /* more than one character */
Walter Dörwald4894c302003-10-24 14:25:28 +00003472 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003473 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003474 repsize - 1;
3475 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003476 return -1;
3477 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3478 *outp += repsize;
3479 }
3480 }
3481 else
3482 return -1;
3483 return 0;
3484}
3485
3486PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003487 int size,
3488 PyObject *mapping,
3489 const char *errors)
3490{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003491 /* output object */
3492 PyObject *res = NULL;
3493 /* pointers to the beginning and end+1 of input */
3494 const Py_UNICODE *startp = p;
3495 const Py_UNICODE *endp = p + size;
3496 /* pointer into the output */
3497 Py_UNICODE *str;
3498 /* current output position */
3499 int respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003500 char *reason = "character maps to <undefined>";
3501 PyObject *errorHandler = NULL;
3502 PyObject *exc = NULL;
3503 /* the following variable is used for caching string comparisons
3504 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3505 * 3=ignore, 4=xmlcharrefreplace */
3506 int known_errorHandler = -1;
3507
Guido van Rossumd57fd912000-03-10 22:53:23 +00003508 if (mapping == NULL) {
3509 PyErr_BadArgument();
3510 return NULL;
3511 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003512
3513 /* allocate enough for a simple 1:1 translation without
3514 replacements, if we need more, we'll resize */
3515 res = PyUnicode_FromUnicode(NULL, size);
3516 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003517 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003518 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003519 return res;
3520 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003521
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003522 while (p<endp) {
3523 /* try to encode it */
3524 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003525 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003526 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527 goto onError;
3528 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003529 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003530 if (x!=Py_None) /* it worked => adjust input pointer */
3531 ++p;
3532 else { /* untranslatable character */
3533 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3534 int repsize;
3535 int newpos;
3536 Py_UNICODE *uni2;
3537 /* startpos for collecting untranslatable chars */
3538 const Py_UNICODE *collstart = p;
3539 const Py_UNICODE *collend = p+1;
3540 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003541
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003542 /* find all untranslatable characters */
3543 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003544 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 goto onError;
3546 Py_XDECREF(x);
3547 if (x!=Py_None)
3548 break;
3549 ++collend;
3550 }
3551 /* cache callback name lookup
3552 * (if not done yet, i.e. it's the first error) */
3553 if (known_errorHandler==-1) {
3554 if ((errors==NULL) || (!strcmp(errors, "strict")))
3555 known_errorHandler = 1;
3556 else if (!strcmp(errors, "replace"))
3557 known_errorHandler = 2;
3558 else if (!strcmp(errors, "ignore"))
3559 known_errorHandler = 3;
3560 else if (!strcmp(errors, "xmlcharrefreplace"))
3561 known_errorHandler = 4;
3562 else
3563 known_errorHandler = 0;
3564 }
3565 switch (known_errorHandler) {
3566 case 1: /* strict */
3567 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3568 goto onError;
3569 case 2: /* replace */
3570 /* No need to check for space, this is a 1:1 replacement */
3571 for (coll = collstart; coll<collend; ++coll)
3572 *str++ = '?';
3573 /* fall through */
3574 case 3: /* ignore */
3575 p = collend;
3576 break;
3577 case 4: /* xmlcharrefreplace */
3578 /* generate replacement (temporarily (mis)uses p) */
3579 for (p = collstart; p < collend; ++p) {
3580 char buffer[2+29+1+1];
3581 char *cp;
3582 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003583 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003584 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3585 goto onError;
3586 for (cp = buffer; *cp; ++cp)
3587 *str++ = *cp;
3588 }
3589 p = collend;
3590 break;
3591 default:
3592 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3593 reason, startp, size, &exc,
3594 collstart-startp, collend-startp, &newpos);
3595 if (repunicode == NULL)
3596 goto onError;
3597 /* generate replacement */
3598 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003599 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003600 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3601 Py_DECREF(repunicode);
3602 goto onError;
3603 }
3604 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3605 *str++ = *uni2;
3606 p = startp + newpos;
3607 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608 }
3609 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003610 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003611 /* Resize if we allocated to much */
3612 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003613 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003614 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003615 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003616 }
3617 Py_XDECREF(exc);
3618 Py_XDECREF(errorHandler);
3619 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003620
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003621 onError:
3622 Py_XDECREF(res);
3623 Py_XDECREF(exc);
3624 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003625 return NULL;
3626}
3627
3628PyObject *PyUnicode_Translate(PyObject *str,
3629 PyObject *mapping,
3630 const char *errors)
3631{
3632 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003633
Guido van Rossumd57fd912000-03-10 22:53:23 +00003634 str = PyUnicode_FromObject(str);
3635 if (str == NULL)
3636 goto onError;
3637 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3638 PyUnicode_GET_SIZE(str),
3639 mapping,
3640 errors);
3641 Py_DECREF(str);
3642 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003643
Guido van Rossumd57fd912000-03-10 22:53:23 +00003644 onError:
3645 Py_XDECREF(str);
3646 return NULL;
3647}
Tim Petersced69f82003-09-16 20:30:58 +00003648
Guido van Rossum9e896b32000-04-05 20:11:21 +00003649/* --- Decimal Encoder ---------------------------------------------------- */
3650
3651int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3652 int length,
3653 char *output,
3654 const char *errors)
3655{
3656 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003657 PyObject *errorHandler = NULL;
3658 PyObject *exc = NULL;
3659 const char *encoding = "decimal";
3660 const char *reason = "invalid decimal Unicode string";
3661 /* the following variable is used for caching string comparisons
3662 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3663 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003664
3665 if (output == NULL) {
3666 PyErr_BadArgument();
3667 return -1;
3668 }
3669
3670 p = s;
3671 end = s + length;
3672 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003673 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003674 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003675 PyObject *repunicode;
3676 int repsize;
3677 int newpos;
3678 Py_UNICODE *uni2;
3679 Py_UNICODE *collstart;
3680 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003681
Guido van Rossum9e896b32000-04-05 20:11:21 +00003682 if (Py_UNICODE_ISSPACE(ch)) {
3683 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003684 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003685 continue;
3686 }
3687 decimal = Py_UNICODE_TODECIMAL(ch);
3688 if (decimal >= 0) {
3689 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003690 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003691 continue;
3692 }
Guido van Rossumba477042000-04-06 18:18:10 +00003693 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003694 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003695 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003696 continue;
3697 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003698 /* All other characters are considered unencodable */
3699 collstart = p;
3700 collend = p+1;
3701 while (collend < end) {
3702 if ((0 < *collend && *collend < 256) ||
3703 !Py_UNICODE_ISSPACE(*collend) ||
3704 Py_UNICODE_TODECIMAL(*collend))
3705 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003706 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003707 /* cache callback name lookup
3708 * (if not done yet, i.e. it's the first error) */
3709 if (known_errorHandler==-1) {
3710 if ((errors==NULL) || (!strcmp(errors, "strict")))
3711 known_errorHandler = 1;
3712 else if (!strcmp(errors, "replace"))
3713 known_errorHandler = 2;
3714 else if (!strcmp(errors, "ignore"))
3715 known_errorHandler = 3;
3716 else if (!strcmp(errors, "xmlcharrefreplace"))
3717 known_errorHandler = 4;
3718 else
3719 known_errorHandler = 0;
3720 }
3721 switch (known_errorHandler) {
3722 case 1: /* strict */
3723 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3724 goto onError;
3725 case 2: /* replace */
3726 for (p = collstart; p < collend; ++p)
3727 *output++ = '?';
3728 /* fall through */
3729 case 3: /* ignore */
3730 p = collend;
3731 break;
3732 case 4: /* xmlcharrefreplace */
3733 /* generate replacement (temporarily (mis)uses p) */
3734 for (p = collstart; p < collend; ++p)
3735 output += sprintf(output, "&#%d;", (int)*p);
3736 p = collend;
3737 break;
3738 default:
3739 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3740 encoding, reason, s, length, &exc,
3741 collstart-s, collend-s, &newpos);
3742 if (repunicode == NULL)
3743 goto onError;
3744 /* generate replacement */
3745 repsize = PyUnicode_GET_SIZE(repunicode);
3746 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3747 Py_UNICODE ch = *uni2;
3748 if (Py_UNICODE_ISSPACE(ch))
3749 *output++ = ' ';
3750 else {
3751 decimal = Py_UNICODE_TODECIMAL(ch);
3752 if (decimal >= 0)
3753 *output++ = '0' + decimal;
3754 else if (0 < ch && ch < 256)
3755 *output++ = (char)ch;
3756 else {
3757 Py_DECREF(repunicode);
3758 raise_encode_exception(&exc, encoding,
3759 s, length, collstart-s, collend-s, reason);
3760 goto onError;
3761 }
3762 }
3763 }
3764 p = s + newpos;
3765 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003766 }
3767 }
3768 /* 0-terminate the output string */
3769 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003770 Py_XDECREF(exc);
3771 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003772 return 0;
3773
3774 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003775 Py_XDECREF(exc);
3776 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003777 return -1;
3778}
3779
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780/* --- Helpers ------------------------------------------------------------ */
3781
Tim Petersced69f82003-09-16 20:30:58 +00003782static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783int count(PyUnicodeObject *self,
3784 int start,
3785 int end,
3786 PyUnicodeObject *substring)
3787{
3788 int count = 0;
3789
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003790 if (start < 0)
3791 start += self->length;
3792 if (start < 0)
3793 start = 0;
3794 if (end > self->length)
3795 end = self->length;
3796 if (end < 0)
3797 end += self->length;
3798 if (end < 0)
3799 end = 0;
3800
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003801 if (substring->length == 0)
3802 return (end - start + 1);
3803
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804 end -= substring->length;
3805
3806 while (start <= end)
3807 if (Py_UNICODE_MATCH(self, start, substring)) {
3808 count++;
3809 start += substring->length;
3810 } else
3811 start++;
3812
3813 return count;
3814}
3815
3816int PyUnicode_Count(PyObject *str,
3817 PyObject *substr,
3818 int start,
3819 int end)
3820{
3821 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003822
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823 str = PyUnicode_FromObject(str);
3824 if (str == NULL)
3825 return -1;
3826 substr = PyUnicode_FromObject(substr);
3827 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003828 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003829 return -1;
3830 }
Tim Petersced69f82003-09-16 20:30:58 +00003831
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832 result = count((PyUnicodeObject *)str,
3833 start, end,
3834 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003835
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836 Py_DECREF(str);
3837 Py_DECREF(substr);
3838 return result;
3839}
3840
Tim Petersced69f82003-09-16 20:30:58 +00003841static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003842int findstring(PyUnicodeObject *self,
3843 PyUnicodeObject *substring,
3844 int start,
3845 int end,
3846 int direction)
3847{
3848 if (start < 0)
3849 start += self->length;
3850 if (start < 0)
3851 start = 0;
3852
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853 if (end > self->length)
3854 end = self->length;
3855 if (end < 0)
3856 end += self->length;
3857 if (end < 0)
3858 end = 0;
3859
Guido van Rossum76afbd92002-08-20 17:29:29 +00003860 if (substring->length == 0)
3861 return (direction > 0) ? start : end;
3862
Guido van Rossumd57fd912000-03-10 22:53:23 +00003863 end -= substring->length;
3864
3865 if (direction < 0) {
3866 for (; end >= start; end--)
3867 if (Py_UNICODE_MATCH(self, end, substring))
3868 return end;
3869 } else {
3870 for (; start <= end; start++)
3871 if (Py_UNICODE_MATCH(self, start, substring))
3872 return start;
3873 }
3874
3875 return -1;
3876}
3877
3878int PyUnicode_Find(PyObject *str,
3879 PyObject *substr,
3880 int start,
3881 int end,
3882 int direction)
3883{
3884 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003885
Guido van Rossumd57fd912000-03-10 22:53:23 +00003886 str = PyUnicode_FromObject(str);
3887 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003888 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003889 substr = PyUnicode_FromObject(substr);
3890 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003891 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003892 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003893 }
Tim Petersced69f82003-09-16 20:30:58 +00003894
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895 result = findstring((PyUnicodeObject *)str,
3896 (PyUnicodeObject *)substr,
3897 start, end, direction);
3898 Py_DECREF(str);
3899 Py_DECREF(substr);
3900 return result;
3901}
3902
Tim Petersced69f82003-09-16 20:30:58 +00003903static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003904int tailmatch(PyUnicodeObject *self,
3905 PyUnicodeObject *substring,
3906 int start,
3907 int end,
3908 int direction)
3909{
3910 if (start < 0)
3911 start += self->length;
3912 if (start < 0)
3913 start = 0;
3914
3915 if (substring->length == 0)
3916 return 1;
3917
3918 if (end > self->length)
3919 end = self->length;
3920 if (end < 0)
3921 end += self->length;
3922 if (end < 0)
3923 end = 0;
3924
3925 end -= substring->length;
3926 if (end < start)
3927 return 0;
3928
3929 if (direction > 0) {
3930 if (Py_UNICODE_MATCH(self, end, substring))
3931 return 1;
3932 } else {
3933 if (Py_UNICODE_MATCH(self, start, substring))
3934 return 1;
3935 }
3936
3937 return 0;
3938}
3939
3940int PyUnicode_Tailmatch(PyObject *str,
3941 PyObject *substr,
3942 int start,
3943 int end,
3944 int direction)
3945{
3946 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003947
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948 str = PyUnicode_FromObject(str);
3949 if (str == NULL)
3950 return -1;
3951 substr = PyUnicode_FromObject(substr);
3952 if (substr == NULL) {
3953 Py_DECREF(substr);
3954 return -1;
3955 }
Tim Petersced69f82003-09-16 20:30:58 +00003956
Guido van Rossumd57fd912000-03-10 22:53:23 +00003957 result = tailmatch((PyUnicodeObject *)str,
3958 (PyUnicodeObject *)substr,
3959 start, end, direction);
3960 Py_DECREF(str);
3961 Py_DECREF(substr);
3962 return result;
3963}
3964
Tim Petersced69f82003-09-16 20:30:58 +00003965static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003966const Py_UNICODE *findchar(const Py_UNICODE *s,
3967 int size,
3968 Py_UNICODE ch)
3969{
3970 /* like wcschr, but doesn't stop at NULL characters */
3971
3972 while (size-- > 0) {
3973 if (*s == ch)
3974 return s;
3975 s++;
3976 }
3977
3978 return NULL;
3979}
3980
3981/* Apply fixfct filter to the Unicode object self and return a
3982 reference to the modified object */
3983
Tim Petersced69f82003-09-16 20:30:58 +00003984static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003985PyObject *fixup(PyUnicodeObject *self,
3986 int (*fixfct)(PyUnicodeObject *s))
3987{
3988
3989 PyUnicodeObject *u;
3990
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003991 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992 if (u == NULL)
3993 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003994
3995 Py_UNICODE_COPY(u->str, self->str, self->length);
3996
Tim Peters7a29bd52001-09-12 03:03:31 +00003997 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998 /* fixfct should return TRUE if it modified the buffer. If
3999 FALSE, return a reference to the original buffer instead
4000 (to save space, not time) */
4001 Py_INCREF(self);
4002 Py_DECREF(u);
4003 return (PyObject*) self;
4004 }
4005 return (PyObject*) u;
4006}
4007
Tim Petersced69f82003-09-16 20:30:58 +00004008static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009int fixupper(PyUnicodeObject *self)
4010{
4011 int len = self->length;
4012 Py_UNICODE *s = self->str;
4013 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004014
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015 while (len-- > 0) {
4016 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004017
Guido van Rossumd57fd912000-03-10 22:53:23 +00004018 ch = Py_UNICODE_TOUPPER(*s);
4019 if (ch != *s) {
4020 status = 1;
4021 *s = ch;
4022 }
4023 s++;
4024 }
4025
4026 return status;
4027}
4028
Tim Petersced69f82003-09-16 20:30:58 +00004029static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004030int fixlower(PyUnicodeObject *self)
4031{
4032 int len = self->length;
4033 Py_UNICODE *s = self->str;
4034 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004035
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 while (len-- > 0) {
4037 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00004038
Guido van Rossumd57fd912000-03-10 22:53:23 +00004039 ch = Py_UNICODE_TOLOWER(*s);
4040 if (ch != *s) {
4041 status = 1;
4042 *s = ch;
4043 }
4044 s++;
4045 }
4046
4047 return status;
4048}
4049
Tim Petersced69f82003-09-16 20:30:58 +00004050static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051int fixswapcase(PyUnicodeObject *self)
4052{
4053 int len = self->length;
4054 Py_UNICODE *s = self->str;
4055 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004056
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057 while (len-- > 0) {
4058 if (Py_UNICODE_ISUPPER(*s)) {
4059 *s = Py_UNICODE_TOLOWER(*s);
4060 status = 1;
4061 } else if (Py_UNICODE_ISLOWER(*s)) {
4062 *s = Py_UNICODE_TOUPPER(*s);
4063 status = 1;
4064 }
4065 s++;
4066 }
4067
4068 return status;
4069}
4070
Tim Petersced69f82003-09-16 20:30:58 +00004071static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072int fixcapitalize(PyUnicodeObject *self)
4073{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004074 int len = self->length;
4075 Py_UNICODE *s = self->str;
4076 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004077
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004078 if (len == 0)
4079 return 0;
4080 if (Py_UNICODE_ISLOWER(*s)) {
4081 *s = Py_UNICODE_TOUPPER(*s);
4082 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004083 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004084 s++;
4085 while (--len > 0) {
4086 if (Py_UNICODE_ISUPPER(*s)) {
4087 *s = Py_UNICODE_TOLOWER(*s);
4088 status = 1;
4089 }
4090 s++;
4091 }
4092 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004093}
4094
4095static
4096int fixtitle(PyUnicodeObject *self)
4097{
4098 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4099 register Py_UNICODE *e;
4100 int previous_is_cased;
4101
4102 /* Shortcut for single character strings */
4103 if (PyUnicode_GET_SIZE(self) == 1) {
4104 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4105 if (*p != ch) {
4106 *p = ch;
4107 return 1;
4108 }
4109 else
4110 return 0;
4111 }
Tim Petersced69f82003-09-16 20:30:58 +00004112
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113 e = p + PyUnicode_GET_SIZE(self);
4114 previous_is_cased = 0;
4115 for (; p < e; p++) {
4116 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004117
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118 if (previous_is_cased)
4119 *p = Py_UNICODE_TOLOWER(ch);
4120 else
4121 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004122
4123 if (Py_UNICODE_ISLOWER(ch) ||
4124 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 Py_UNICODE_ISTITLE(ch))
4126 previous_is_cased = 1;
4127 else
4128 previous_is_cased = 0;
4129 }
4130 return 1;
4131}
4132
Tim Peters8ce9f162004-08-27 01:49:32 +00004133PyObject *
4134PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135{
Tim Peters8ce9f162004-08-27 01:49:32 +00004136 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004137 const Py_UNICODE blank = ' ';
4138 const Py_UNICODE *sep = &blank;
4139 size_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004140 PyUnicodeObject *res = NULL; /* the result */
4141 size_t res_alloc = 100; /* # allocated bytes for string in res */
4142 size_t res_used; /* # used bytes */
4143 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4144 PyObject *fseq; /* PySequence_Fast(seq) */
4145 int seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004146 PyObject *item;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147 int i;
4148
Tim Peters05eba1f2004-08-27 21:32:02 +00004149 fseq = PySequence_Fast(seq, "");
4150 if (fseq == NULL) {
Tim Peters05eba1f2004-08-27 21:32:02 +00004151 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004152 }
4153
Tim Peters91879ab2004-08-27 22:35:44 +00004154 /* Grrrr. A codec may be invoked to convert str objects to
4155 * Unicode, and so it's possible to call back into Python code
4156 * during PyUnicode_FromObject(), and so it's possible for a sick
4157 * codec to change the size of fseq (if seq is a list). Therefore
4158 * we have to keep refetching the size -- can't assume seqlen
4159 * is invariant.
4160 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004161 seqlen = PySequence_Fast_GET_SIZE(fseq);
4162 /* If empty sequence, return u"". */
4163 if (seqlen == 0) {
4164 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4165 goto Done;
4166 }
4167 /* If singleton sequence with an exact Unicode, return that. */
4168 if (seqlen == 1) {
4169 item = PySequence_Fast_GET_ITEM(fseq, 0);
4170 if (PyUnicode_CheckExact(item)) {
4171 Py_INCREF(item);
4172 res = (PyUnicodeObject *)item;
4173 goto Done;
4174 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004175 }
4176
Tim Peters05eba1f2004-08-27 21:32:02 +00004177 /* At least two items to join, or one that isn't exact Unicode. */
4178 if (seqlen > 1) {
4179 /* Set up sep and seplen -- they're needed. */
4180 if (separator == NULL) {
4181 sep = &blank;
4182 seplen = 1;
4183 }
4184 else {
4185 internal_separator = PyUnicode_FromObject(separator);
4186 if (internal_separator == NULL)
4187 goto onError;
4188 sep = PyUnicode_AS_UNICODE(internal_separator);
4189 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004190 /* In case PyUnicode_FromObject() mutated seq. */
4191 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004192 }
4193 }
4194
4195 /* Get space. */
4196 res = _PyUnicode_New((int)res_alloc);
4197 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004198 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004199 res_p = PyUnicode_AS_UNICODE(res);
4200 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004201
Tim Peters05eba1f2004-08-27 21:32:02 +00004202 for (i = 0; i < seqlen; ++i) {
4203 size_t itemlen;
4204 size_t new_res_used;
4205
4206 item = PySequence_Fast_GET_ITEM(fseq, i);
4207 /* Convert item to Unicode. */
4208 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4209 PyErr_Format(PyExc_TypeError,
4210 "sequence item %i: expected string or Unicode,"
4211 " %.80s found",
4212 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004213 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004214 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004215 item = PyUnicode_FromObject(item);
4216 if (item == NULL)
4217 goto onError;
4218 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004219
Tim Peters91879ab2004-08-27 22:35:44 +00004220 /* In case PyUnicode_FromObject() mutated seq. */
4221 seqlen = PySequence_Fast_GET_SIZE(fseq);
4222
Tim Peters8ce9f162004-08-27 01:49:32 +00004223 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004224 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004225 new_res_used = res_used + itemlen;
4226 if (new_res_used < res_used || new_res_used > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004227 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004228 if (i < seqlen - 1) {
4229 new_res_used += seplen;
4230 if (new_res_used < res_used || new_res_used > INT_MAX)
4231 goto Overflow;
4232 }
4233 if (new_res_used > res_alloc) {
4234 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004235 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004236 size_t oldsize = res_alloc;
4237 res_alloc += res_alloc;
4238 if (res_alloc < oldsize || res_alloc > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004239 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004240 } while (new_res_used > res_alloc);
4241 if (_PyUnicode_Resize(&res, (int)res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004242 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004243 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004244 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004245 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004246 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004247
4248 /* Copy item, and maybe the separator. */
4249 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), (int)itemlen);
4250 res_p += itemlen;
4251 if (i < seqlen - 1) {
4252 Py_UNICODE_COPY(res_p, sep, (int)seplen);
4253 res_p += seplen;
4254 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004255 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004256 res_used = new_res_used;
4257 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004258
Tim Peters05eba1f2004-08-27 21:32:02 +00004259 /* Shrink res to match the used area; this probably can't fail,
4260 * but it's cheap to check.
4261 */
4262 if (_PyUnicode_Resize(&res, (int)res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004263 goto onError;
4264
4265 Done:
4266 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004267 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004268 return (PyObject *)res;
4269
Tim Peters8ce9f162004-08-27 01:49:32 +00004270 Overflow:
4271 PyErr_SetString(PyExc_OverflowError,
4272 "join() is too long for a Python string");
4273 Py_DECREF(item);
4274 /* fall through */
4275
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004277 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004278 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004279 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280 return NULL;
4281}
4282
Tim Petersced69f82003-09-16 20:30:58 +00004283static
4284PyUnicodeObject *pad(PyUnicodeObject *self,
4285 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286 int right,
4287 Py_UNICODE fill)
4288{
4289 PyUnicodeObject *u;
4290
4291 if (left < 0)
4292 left = 0;
4293 if (right < 0)
4294 right = 0;
4295
Tim Peters7a29bd52001-09-12 03:03:31 +00004296 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 Py_INCREF(self);
4298 return self;
4299 }
4300
4301 u = _PyUnicode_New(left + self->length + right);
4302 if (u) {
4303 if (left)
4304 Py_UNICODE_FILL(u->str, fill, left);
4305 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4306 if (right)
4307 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4308 }
4309
4310 return u;
4311}
4312
/* Helpers for the split functions below.  Both expect the enclosing
   function to provide a `PyObject *str` scratch variable, a `list`
   variable holding the result list, and an `onError` label.

   Each macro builds a Unicode object from data[left:right] and adds it to
   `list`, jumping to onError on failure.  The bodies are wrapped in
   do { } while (0) so that an invocation followed by a semicolon is a
   single statement and cannot capture a following `else`. */

/* Append data[left:right] to the end of list. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)

/* Insert data[left:right] at the front of list. */
#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Insert(list, 0, str)) {                              \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
4334
Guido van Rossumd57fd912000-03-10 22:53:23 +00004335static
4336PyObject *split_whitespace(PyUnicodeObject *self,
4337 PyObject *list,
4338 int maxcount)
4339{
4340 register int i;
4341 register int j;
4342 int len = self->length;
4343 PyObject *str;
4344
4345 for (i = j = 0; i < len; ) {
4346 /* find a token */
4347 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4348 i++;
4349 j = i;
4350 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4351 i++;
4352 if (j < i) {
4353 if (maxcount-- <= 0)
4354 break;
4355 SPLIT_APPEND(self->str, j, i);
4356 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4357 i++;
4358 j = i;
4359 }
4360 }
4361 if (j < len) {
4362 SPLIT_APPEND(self->str, j, len);
4363 }
4364 return list;
4365
4366 onError:
4367 Py_DECREF(list);
4368 return NULL;
4369}
4370
4371PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004372 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004373{
4374 register int i;
4375 register int j;
4376 int len;
4377 PyObject *list;
4378 PyObject *str;
4379 Py_UNICODE *data;
4380
4381 string = PyUnicode_FromObject(string);
4382 if (string == NULL)
4383 return NULL;
4384 data = PyUnicode_AS_UNICODE(string);
4385 len = PyUnicode_GET_SIZE(string);
4386
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387 list = PyList_New(0);
4388 if (!list)
4389 goto onError;
4390
4391 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004392 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004393
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394 /* Find a line and append it */
4395 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4396 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004397
4398 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004399 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400 if (i < len) {
4401 if (data[i] == '\r' && i + 1 < len &&
4402 data[i+1] == '\n')
4403 i += 2;
4404 else
4405 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004406 if (keepends)
4407 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408 }
Guido van Rossum86662912000-04-11 15:38:46 +00004409 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004410 j = i;
4411 }
4412 if (j < len) {
4413 SPLIT_APPEND(data, j, len);
4414 }
4415
4416 Py_DECREF(string);
4417 return list;
4418
4419 onError:
4420 Py_DECREF(list);
4421 Py_DECREF(string);
4422 return NULL;
4423}
4424
Tim Petersced69f82003-09-16 20:30:58 +00004425static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426PyObject *split_char(PyUnicodeObject *self,
4427 PyObject *list,
4428 Py_UNICODE ch,
4429 int maxcount)
4430{
4431 register int i;
4432 register int j;
4433 int len = self->length;
4434 PyObject *str;
4435
4436 for (i = j = 0; i < len; ) {
4437 if (self->str[i] == ch) {
4438 if (maxcount-- <= 0)
4439 break;
4440 SPLIT_APPEND(self->str, j, i);
4441 i = j = i + 1;
4442 } else
4443 i++;
4444 }
4445 if (j <= len) {
4446 SPLIT_APPEND(self->str, j, len);
4447 }
4448 return list;
4449
4450 onError:
4451 Py_DECREF(list);
4452 return NULL;
4453}
4454
Tim Petersced69f82003-09-16 20:30:58 +00004455static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004456PyObject *split_substring(PyUnicodeObject *self,
4457 PyObject *list,
4458 PyUnicodeObject *substring,
4459 int maxcount)
4460{
4461 register int i;
4462 register int j;
4463 int len = self->length;
4464 int sublen = substring->length;
4465 PyObject *str;
4466
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004467 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468 if (Py_UNICODE_MATCH(self, i, substring)) {
4469 if (maxcount-- <= 0)
4470 break;
4471 SPLIT_APPEND(self->str, j, i);
4472 i = j = i + sublen;
4473 } else
4474 i++;
4475 }
4476 if (j <= len) {
4477 SPLIT_APPEND(self->str, j, len);
4478 }
4479 return list;
4480
4481 onError:
4482 Py_DECREF(list);
4483 return NULL;
4484}
4485
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004486static
4487PyObject *rsplit_whitespace(PyUnicodeObject *self,
4488 PyObject *list,
4489 int maxcount)
4490{
4491 register int i;
4492 register int j;
4493 int len = self->length;
4494 PyObject *str;
4495
4496 for (i = j = len - 1; i >= 0; ) {
4497 /* find a token */
4498 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4499 i--;
4500 j = i;
4501 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4502 i--;
4503 if (j > i) {
4504 if (maxcount-- <= 0)
4505 break;
4506 SPLIT_INSERT(self->str, i + 1, j + 1);
4507 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4508 i--;
4509 j = i;
4510 }
4511 }
4512 if (j >= 0) {
4513 SPLIT_INSERT(self->str, 0, j + 1);
4514 }
4515 return list;
4516
4517 onError:
4518 Py_DECREF(list);
4519 return NULL;
4520}
4521
4522static
4523PyObject *rsplit_char(PyUnicodeObject *self,
4524 PyObject *list,
4525 Py_UNICODE ch,
4526 int maxcount)
4527{
4528 register int i;
4529 register int j;
4530 int len = self->length;
4531 PyObject *str;
4532
4533 for (i = j = len - 1; i >= 0; ) {
4534 if (self->str[i] == ch) {
4535 if (maxcount-- <= 0)
4536 break;
4537 SPLIT_INSERT(self->str, i + 1, j + 1);
4538 j = i = i - 1;
4539 } else
4540 i--;
4541 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004542 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004543 SPLIT_INSERT(self->str, 0, j + 1);
4544 }
4545 return list;
4546
4547 onError:
4548 Py_DECREF(list);
4549 return NULL;
4550}
4551
4552static
4553PyObject *rsplit_substring(PyUnicodeObject *self,
4554 PyObject *list,
4555 PyUnicodeObject *substring,
4556 int maxcount)
4557{
4558 register int i;
4559 register int j;
4560 int len = self->length;
4561 int sublen = substring->length;
4562 PyObject *str;
4563
4564 for (i = len - sublen, j = len; i >= 0; ) {
4565 if (Py_UNICODE_MATCH(self, i, substring)) {
4566 if (maxcount-- <= 0)
4567 break;
4568 SPLIT_INSERT(self->str, i + sublen, j);
4569 j = i;
4570 i -= sublen;
4571 } else
4572 i--;
4573 }
4574 if (j >= 0) {
4575 SPLIT_INSERT(self->str, 0, j);
4576 }
4577 return list;
4578
4579 onError:
4580 Py_DECREF(list);
4581 return NULL;
4582}
4583
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004585#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004586
4587static
4588PyObject *split(PyUnicodeObject *self,
4589 PyUnicodeObject *substring,
4590 int maxcount)
4591{
4592 PyObject *list;
4593
4594 if (maxcount < 0)
4595 maxcount = INT_MAX;
4596
4597 list = PyList_New(0);
4598 if (!list)
4599 return NULL;
4600
4601 if (substring == NULL)
4602 return split_whitespace(self,list,maxcount);
4603
4604 else if (substring->length == 1)
4605 return split_char(self,list,substring->str[0],maxcount);
4606
4607 else if (substring->length == 0) {
4608 Py_DECREF(list);
4609 PyErr_SetString(PyExc_ValueError, "empty separator");
4610 return NULL;
4611 }
4612 else
4613 return split_substring(self,list,substring,maxcount);
4614}
4615
Tim Petersced69f82003-09-16 20:30:58 +00004616static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004617PyObject *rsplit(PyUnicodeObject *self,
4618 PyUnicodeObject *substring,
4619 int maxcount)
4620{
4621 PyObject *list;
4622
4623 if (maxcount < 0)
4624 maxcount = INT_MAX;
4625
4626 list = PyList_New(0);
4627 if (!list)
4628 return NULL;
4629
4630 if (substring == NULL)
4631 return rsplit_whitespace(self,list,maxcount);
4632
4633 else if (substring->length == 1)
4634 return rsplit_char(self,list,substring->str[0],maxcount);
4635
4636 else if (substring->length == 0) {
4637 Py_DECREF(list);
4638 PyErr_SetString(PyExc_ValueError, "empty separator");
4639 return NULL;
4640 }
4641 else
4642 return rsplit_substring(self,list,substring,maxcount);
4643}
4644
4645static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646PyObject *replace(PyUnicodeObject *self,
4647 PyUnicodeObject *str1,
4648 PyUnicodeObject *str2,
4649 int maxcount)
4650{
4651 PyUnicodeObject *u;
4652
4653 if (maxcount < 0)
4654 maxcount = INT_MAX;
4655
4656 if (str1->length == 1 && str2->length == 1) {
4657 int i;
4658
4659 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004660 if (!findchar(self->str, self->length, str1->str[0]) &&
4661 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004662 /* nothing to replace, return original string */
4663 Py_INCREF(self);
4664 u = self;
4665 } else {
4666 Py_UNICODE u1 = str1->str[0];
4667 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004668
Guido van Rossumd57fd912000-03-10 22:53:23 +00004669 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004670 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004671 self->length
4672 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004673 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004674 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004675 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004676 for (i = 0; i < u->length; i++)
4677 if (u->str[i] == u1) {
4678 if (--maxcount < 0)
4679 break;
4680 u->str[i] = u2;
4681 }
4682 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004683 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684
4685 } else {
4686 int n, i;
4687 Py_UNICODE *p;
4688
4689 /* replace strings */
4690 n = count(self, 0, self->length, str1);
4691 if (n > maxcount)
4692 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004693 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004695 if (PyUnicode_CheckExact(self)) {
4696 Py_INCREF(self);
4697 u = self;
4698 }
4699 else {
4700 u = (PyUnicodeObject *)
4701 PyUnicode_FromUnicode(self->str, self->length);
4702 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004703 } else {
4704 u = _PyUnicode_New(
4705 self->length + n * (str2->length - str1->length));
4706 if (u) {
4707 i = 0;
4708 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004709 if (str1->length > 0) {
4710 while (i <= self->length - str1->length)
4711 if (Py_UNICODE_MATCH(self, i, str1)) {
4712 /* replace string segment */
4713 Py_UNICODE_COPY(p, str2->str, str2->length);
4714 p += str2->length;
4715 i += str1->length;
4716 if (--n <= 0) {
4717 /* copy remaining part */
4718 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4719 break;
4720 }
4721 } else
4722 *p++ = self->str[i++];
4723 } else {
4724 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725 Py_UNICODE_COPY(p, str2->str, str2->length);
4726 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004727 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004730 }
4731 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4732 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733 }
4734 }
4735 }
Tim Petersced69f82003-09-16 20:30:58 +00004736
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737 return (PyObject *) u;
4738}
4739
4740/* --- Unicode Object Methods --------------------------------------------- */
4741
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004742PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743"S.title() -> unicode\n\
4744\n\
4745Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004746characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747
4748static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004749unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751 return fixup(self, fixtitle);
4752}
4753
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004754PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755"S.capitalize() -> unicode\n\
4756\n\
4757Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004758have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759
4760static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004761unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763 return fixup(self, fixcapitalize);
4764}
4765
#if 0
/* Disabled code: kept out of the build by the surrounding #if 0. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, run fixup(..., fixcapitalize) over every
   word and join the words again with the default separator of
   PyUnicode_Join().  Returns NULL on failure. */
static PyObject *
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Replace each word by its capitalized version */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (result == NULL)
            goto done;
        /* Release the old word before its slot is overwritten. */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
4803
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004804/* Argument converter. Coerces to a single unicode character */
4805
4806static int
4807convert_uc(PyObject *obj, void *addr)
4808{
4809 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4810 PyObject *uniobj;
4811 Py_UNICODE *unistr;
4812
4813 uniobj = PyUnicode_FromObject(obj);
4814 if (uniobj == NULL) {
4815 PyErr_SetString(PyExc_TypeError,
4816 "The fill character cannot be converted to Unicode");
4817 return 0;
4818 }
4819 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4820 PyErr_SetString(PyExc_TypeError,
4821 "The fill character must be exactly one character long");
4822 Py_DECREF(uniobj);
4823 return 0;
4824 }
4825 unistr = PyUnicode_AS_UNICODE(uniobj);
4826 *fillcharloc = unistr[0];
4827 Py_DECREF(uniobj);
4828 return 1;
4829}
4830
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004831PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004832"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004834Return S centered in a Unicode string of length width. Padding is\n\
4835done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836
4837static PyObject *
4838unicode_center(PyUnicodeObject *self, PyObject *args)
4839{
4840 int marg, left;
4841 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004842 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004844 if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845 return NULL;
4846
Tim Peters7a29bd52001-09-12 03:03:31 +00004847 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 Py_INCREF(self);
4849 return (PyObject*) self;
4850 }
4851
4852 marg = width - self->length;
4853 left = marg / 2 + (marg & width & 1);
4854
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004855 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856}
4857
Marc-André Lemburge5034372000-08-08 08:04:29 +00004858#if 0
4859
4860/* This code should go into some future Unicode collation support
4861 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004862 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004863
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004864/* speedy UTF-16 code point order comparison */
4865/* gleaned from: */
4866/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4867
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004868static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004869{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004870 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004871 0, 0, 0, 0, 0, 0, 0, 0,
4872 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004873 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004874};
4875
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876static int
4877unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4878{
4879 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004880
Guido van Rossumd57fd912000-03-10 22:53:23 +00004881 Py_UNICODE *s1 = str1->str;
4882 Py_UNICODE *s2 = str2->str;
4883
4884 len1 = str1->length;
4885 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004886
Guido van Rossumd57fd912000-03-10 22:53:23 +00004887 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004888 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004889
4890 c1 = *s1++;
4891 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004892
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004893 if (c1 > (1<<11) * 26)
4894 c1 += utf16Fixup[c1>>11];
4895 if (c2 > (1<<11) * 26)
4896 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004897 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004898
4899 if (c1 != c2)
4900 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004901
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004902 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903 }
4904
4905 return (len1 < len2) ? -1 : (len1 != len2);
4906}
4907
Marc-André Lemburge5034372000-08-08 08:04:29 +00004908#else
4909
4910static int
4911unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4912{
4913 register int len1, len2;
4914
4915 Py_UNICODE *s1 = str1->str;
4916 Py_UNICODE *s2 = str2->str;
4917
4918 len1 = str1->length;
4919 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004920
Marc-André Lemburge5034372000-08-08 08:04:29 +00004921 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004922 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004923
Fredrik Lundh45714e92001-06-26 16:39:36 +00004924 c1 = *s1++;
4925 c2 = *s2++;
4926
4927 if (c1 != c2)
4928 return (c1 < c2) ? -1 : 1;
4929
Marc-André Lemburge5034372000-08-08 08:04:29 +00004930 len1--; len2--;
4931 }
4932
4933 return (len1 < len2) ? -1 : (len1 != len2);
4934}
4935
4936#endif
4937
Guido van Rossumd57fd912000-03-10 22:53:23 +00004938int PyUnicode_Compare(PyObject *left,
4939 PyObject *right)
4940{
4941 PyUnicodeObject *u = NULL, *v = NULL;
4942 int result;
4943
4944 /* Coerce the two arguments */
4945 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4946 if (u == NULL)
4947 goto onError;
4948 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4949 if (v == NULL)
4950 goto onError;
4951
Thomas Wouters7e474022000-07-16 12:04:32 +00004952 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004953 if (v == u) {
4954 Py_DECREF(u);
4955 Py_DECREF(v);
4956 return 0;
4957 }
4958
4959 result = unicode_compare(u, v);
4960
4961 Py_DECREF(u);
4962 Py_DECREF(v);
4963 return result;
4964
4965onError:
4966 Py_XDECREF(u);
4967 Py_XDECREF(v);
4968 return -1;
4969}
4970
Guido van Rossum403d68b2000-03-13 15:55:09 +00004971int PyUnicode_Contains(PyObject *container,
4972 PyObject *element)
4973{
4974 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004975 int result, size;
4976 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004977
4978 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004979 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004980 if (v == NULL) {
4981 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004982 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004983 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004984 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004985 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004986 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004987 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004988
Barry Warsaw817918c2002-08-06 16:58:21 +00004989 size = PyUnicode_GET_SIZE(v);
4990 rhs = PyUnicode_AS_UNICODE(v);
4991 lhs = PyUnicode_AS_UNICODE(u);
4992
Guido van Rossum403d68b2000-03-13 15:55:09 +00004993 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004994 if (size == 1) {
4995 end = lhs + PyUnicode_GET_SIZE(u);
4996 while (lhs < end) {
4997 if (*lhs++ == *rhs) {
4998 result = 1;
4999 break;
5000 }
5001 }
5002 }
5003 else {
5004 end = lhs + (PyUnicode_GET_SIZE(u) - size);
5005 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00005006 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00005007 result = 1;
5008 break;
5009 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00005010 }
5011 }
5012
5013 Py_DECREF(u);
5014 Py_DECREF(v);
5015 return result;
5016
5017onError:
5018 Py_XDECREF(u);
5019 Py_XDECREF(v);
5020 return -1;
5021}
5022
Guido van Rossumd57fd912000-03-10 22:53:23 +00005023/* Concat to string or Unicode object giving a new Unicode object. */
5024
5025PyObject *PyUnicode_Concat(PyObject *left,
5026 PyObject *right)
5027{
5028 PyUnicodeObject *u = NULL, *v = NULL, *w;
5029
5030 /* Coerce the two arguments */
5031 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
5032 if (u == NULL)
5033 goto onError;
5034 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
5035 if (v == NULL)
5036 goto onError;
5037
5038 /* Shortcuts */
5039 if (v == unicode_empty) {
5040 Py_DECREF(v);
5041 return (PyObject *)u;
5042 }
5043 if (u == unicode_empty) {
5044 Py_DECREF(u);
5045 return (PyObject *)v;
5046 }
5047
5048 /* Concat the two Unicode strings */
5049 w = _PyUnicode_New(u->length + v->length);
5050 if (w == NULL)
5051 goto onError;
5052 Py_UNICODE_COPY(w->str, u->str, u->length);
5053 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5054
5055 Py_DECREF(u);
5056 Py_DECREF(v);
5057 return (PyObject *)w;
5058
5059onError:
5060 Py_XDECREF(u);
5061 Py_XDECREF(v);
5062 return NULL;
5063}
5064
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005065PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066"S.count(sub[, start[, end]]) -> int\n\
5067\n\
5068Return the number of occurrences of substring sub in Unicode string\n\
5069S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005070interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071
5072static PyObject *
5073unicode_count(PyUnicodeObject *self, PyObject *args)
5074{
5075 PyUnicodeObject *substring;
5076 int start = 0;
5077 int end = INT_MAX;
5078 PyObject *result;
5079
Guido van Rossumb8872e62000-05-09 14:14:27 +00005080 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5081 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082 return NULL;
5083
5084 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5085 (PyObject *)substring);
5086 if (substring == NULL)
5087 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005088
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089 if (start < 0)
5090 start += self->length;
5091 if (start < 0)
5092 start = 0;
5093 if (end > self->length)
5094 end = self->length;
5095 if (end < 0)
5096 end += self->length;
5097 if (end < 0)
5098 end = 0;
5099
5100 result = PyInt_FromLong((long) count(self, start, end, substring));
5101
5102 Py_DECREF(substring);
5103 return result;
5104}
5105
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005106PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005107"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005109Encodes S using the codec registered for encoding. encoding defaults\n\
5110to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005111handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005112a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5113'xmlcharrefreplace' as well as any other name registered with\n\
5114codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115
5116static PyObject *
5117unicode_encode(PyUnicodeObject *self, PyObject *args)
5118{
5119 char *encoding = NULL;
5120 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005121 PyObject *v;
5122
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5124 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005125 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005126 if (v == NULL)
5127 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005128 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5129 PyErr_Format(PyExc_TypeError,
5130 "encoder did not return a string/unicode object "
5131 "(type=%.400s)",
5132 v->ob_type->tp_name);
5133 Py_DECREF(v);
5134 return NULL;
5135 }
5136 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005137
5138 onError:
5139 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005140}
5141
5142PyDoc_STRVAR(decode__doc__,
5143"S.decode([encoding[,errors]]) -> string or unicode\n\
5144\n\
5145Decodes S using the codec registered for encoding. encoding defaults\n\
5146to the default encoding. errors may be given to set a different error\n\
5147handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5148a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5149as well as any other name registerd with codecs.register_error that is\n\
5150able to handle UnicodeDecodeErrors.");
5151
5152static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005153unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005154{
5155 char *encoding = NULL;
5156 char *errors = NULL;
5157 PyObject *v;
5158
5159 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5160 return NULL;
5161 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005162 if (v == NULL)
5163 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005164 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5165 PyErr_Format(PyExc_TypeError,
5166 "decoder did not return a string/unicode object "
5167 "(type=%.400s)",
5168 v->ob_type->tp_name);
5169 Py_DECREF(v);
5170 return NULL;
5171 }
5172 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005173
5174 onError:
5175 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176}
5177
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005178PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179"S.expandtabs([tabsize]) -> unicode\n\
5180\n\
5181Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005182If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183
5184static PyObject*
5185unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5186{
5187 Py_UNICODE *e;
5188 Py_UNICODE *p;
5189 Py_UNICODE *q;
5190 int i, j;
5191 PyUnicodeObject *u;
5192 int tabsize = 8;
5193
5194 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5195 return NULL;
5196
Thomas Wouters7e474022000-07-16 12:04:32 +00005197 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198 i = j = 0;
5199 e = self->str + self->length;
5200 for (p = self->str; p < e; p++)
5201 if (*p == '\t') {
5202 if (tabsize > 0)
5203 j += tabsize - (j % tabsize);
5204 }
5205 else {
5206 j++;
5207 if (*p == '\n' || *p == '\r') {
5208 i += j;
5209 j = 0;
5210 }
5211 }
5212
5213 /* Second pass: create output string and fill it */
5214 u = _PyUnicode_New(i + j);
5215 if (!u)
5216 return NULL;
5217
5218 j = 0;
5219 q = u->str;
5220
5221 for (p = self->str; p < e; p++)
5222 if (*p == '\t') {
5223 if (tabsize > 0) {
5224 i = tabsize - (j % tabsize);
5225 j += i;
5226 while (i--)
5227 *q++ = ' ';
5228 }
5229 }
5230 else {
5231 j++;
5232 *q++ = *p;
5233 if (*p == '\n' || *p == '\r')
5234 j = 0;
5235 }
5236
5237 return (PyObject*) u;
5238}
5239
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005240PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241"S.find(sub [,start [,end]]) -> int\n\
5242\n\
5243Return the lowest index in S where substring sub is found,\n\
5244such that sub is contained within s[start,end]. Optional\n\
5245arguments start and end are interpreted as in slice notation.\n\
5246\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005247Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005248
5249static PyObject *
5250unicode_find(PyUnicodeObject *self, PyObject *args)
5251{
5252 PyUnicodeObject *substring;
5253 int start = 0;
5254 int end = INT_MAX;
5255 PyObject *result;
5256
Guido van Rossumb8872e62000-05-09 14:14:27 +00005257 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5258 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259 return NULL;
5260 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5261 (PyObject *)substring);
5262 if (substring == NULL)
5263 return NULL;
5264
5265 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
5266
5267 Py_DECREF(substring);
5268 return result;
5269}
5270
5271static PyObject *
5272unicode_getitem(PyUnicodeObject *self, int index)
5273{
5274 if (index < 0 || index >= self->length) {
5275 PyErr_SetString(PyExc_IndexError, "string index out of range");
5276 return NULL;
5277 }
5278
5279 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5280}
5281
5282static long
5283unicode_hash(PyUnicodeObject *self)
5284{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005285 /* Since Unicode objects compare equal to their ASCII string
5286 counterparts, they should use the individual character values
5287 as basis for their hash value. This is needed to assure that
5288 strings and Unicode objects behave in the same way as
5289 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290
Fredrik Lundhdde61642000-07-10 18:27:47 +00005291 register int len;
5292 register Py_UNICODE *p;
5293 register long x;
5294
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295 if (self->hash != -1)
5296 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005297 len = PyUnicode_GET_SIZE(self);
5298 p = PyUnicode_AS_UNICODE(self);
5299 x = *p << 7;
5300 while (--len >= 0)
5301 x = (1000003*x) ^ *p++;
5302 x ^= PyUnicode_GET_SIZE(self);
5303 if (x == -1)
5304 x = -2;
5305 self->hash = x;
5306 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307}
5308
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005309PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310"S.index(sub [,start [,end]]) -> int\n\
5311\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005312Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313
5314static PyObject *
5315unicode_index(PyUnicodeObject *self, PyObject *args)
5316{
5317 int result;
5318 PyUnicodeObject *substring;
5319 int start = 0;
5320 int end = INT_MAX;
5321
Guido van Rossumb8872e62000-05-09 14:14:27 +00005322 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5323 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005325
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5327 (PyObject *)substring);
5328 if (substring == NULL)
5329 return NULL;
5330
5331 result = findstring(self, substring, start, end, 1);
5332
5333 Py_DECREF(substring);
5334 if (result < 0) {
5335 PyErr_SetString(PyExc_ValueError, "substring not found");
5336 return NULL;
5337 }
5338 return PyInt_FromLong(result);
5339}
5340
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005341PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005342"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005343\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005344Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005345at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346
5347static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005348unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349{
5350 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5351 register const Py_UNICODE *e;
5352 int cased;
5353
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354 /* Shortcut for single character strings */
5355 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005356 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005358 /* Special case for empty strings */
5359 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005360 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005361
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 e = p + PyUnicode_GET_SIZE(self);
5363 cased = 0;
5364 for (; p < e; p++) {
5365 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005366
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005368 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369 else if (!cased && Py_UNICODE_ISLOWER(ch))
5370 cased = 1;
5371 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005372 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373}
5374
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005375PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005376"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005378Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005379at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380
5381static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005382unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383{
5384 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5385 register const Py_UNICODE *e;
5386 int cased;
5387
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388 /* Shortcut for single character strings */
5389 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005390 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005392 /* Special case for empty strings */
5393 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005394 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005395
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396 e = p + PyUnicode_GET_SIZE(self);
5397 cased = 0;
5398 for (; p < e; p++) {
5399 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005400
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005402 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 else if (!cased && Py_UNICODE_ISUPPER(ch))
5404 cased = 1;
5405 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005406 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005407}
5408
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005409PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005410"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005412Return True if S is a titlecased string and there is at least one\n\
5413character in S, i.e. upper- and titlecase characters may only\n\
5414follow uncased characters and lowercase characters only cased ones.\n\
5415Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416
5417static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005418unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419{
5420 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5421 register const Py_UNICODE *e;
5422 int cased, previous_is_cased;
5423
Guido van Rossumd57fd912000-03-10 22:53:23 +00005424 /* Shortcut for single character strings */
5425 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005426 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5427 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005428
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005429 /* Special case for empty strings */
5430 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005431 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005432
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433 e = p + PyUnicode_GET_SIZE(self);
5434 cased = 0;
5435 previous_is_cased = 0;
5436 for (; p < e; p++) {
5437 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005438
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5440 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005441 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 previous_is_cased = 1;
5443 cased = 1;
5444 }
5445 else if (Py_UNICODE_ISLOWER(ch)) {
5446 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005447 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448 previous_is_cased = 1;
5449 cased = 1;
5450 }
5451 else
5452 previous_is_cased = 0;
5453 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005454 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455}
5456
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005457PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005458"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005460Return True if all characters in S are whitespace\n\
5461and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462
5463static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005464unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005465{
5466 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5467 register const Py_UNICODE *e;
5468
Guido van Rossumd57fd912000-03-10 22:53:23 +00005469 /* Shortcut for single character strings */
5470 if (PyUnicode_GET_SIZE(self) == 1 &&
5471 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005472 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005474 /* Special case for empty strings */
5475 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005476 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005477
Guido van Rossumd57fd912000-03-10 22:53:23 +00005478 e = p + PyUnicode_GET_SIZE(self);
5479 for (; p < e; p++) {
5480 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005481 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005483 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484}
5485
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005486PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005487"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005488\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005489Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005490and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005491
5492static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005493unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005494{
5495 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5496 register const Py_UNICODE *e;
5497
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005498 /* Shortcut for single character strings */
5499 if (PyUnicode_GET_SIZE(self) == 1 &&
5500 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005501 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005502
5503 /* Special case for empty strings */
5504 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005505 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005506
5507 e = p + PyUnicode_GET_SIZE(self);
5508 for (; p < e; p++) {
5509 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005510 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005511 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005512 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005513}
5514
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005515PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005516"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005517\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005518Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005519and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005520
5521static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005522unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005523{
5524 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5525 register const Py_UNICODE *e;
5526
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005527 /* Shortcut for single character strings */
5528 if (PyUnicode_GET_SIZE(self) == 1 &&
5529 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005530 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005531
5532 /* Special case for empty strings */
5533 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005534 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005535
5536 e = p + PyUnicode_GET_SIZE(self);
5537 for (; p < e; p++) {
5538 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005539 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005540 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005541 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005542}
5543
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005544PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005545"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005546\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005547Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005548False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549
5550static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005551unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552{
5553 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5554 register const Py_UNICODE *e;
5555
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556 /* Shortcut for single character strings */
5557 if (PyUnicode_GET_SIZE(self) == 1 &&
5558 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005559 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005560
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005561 /* Special case for empty strings */
5562 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005563 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005564
Guido van Rossumd57fd912000-03-10 22:53:23 +00005565 e = p + PyUnicode_GET_SIZE(self);
5566 for (; p < e; p++) {
5567 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005568 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005569 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005570 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571}
5572
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005573PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005574"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005575\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005576Return True if all characters in S are digits\n\
5577and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578
5579static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005580unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581{
5582 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5583 register const Py_UNICODE *e;
5584
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585 /* Shortcut for single character strings */
5586 if (PyUnicode_GET_SIZE(self) == 1 &&
5587 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005588 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005590 /* Special case for empty strings */
5591 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005592 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005593
Guido van Rossumd57fd912000-03-10 22:53:23 +00005594 e = p + PyUnicode_GET_SIZE(self);
5595 for (; p < e; p++) {
5596 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005597 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005598 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005599 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600}
5601
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005602PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005603"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005605Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005606False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607
5608static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005609unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005610{
5611 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5612 register const Py_UNICODE *e;
5613
Guido van Rossumd57fd912000-03-10 22:53:23 +00005614 /* Shortcut for single character strings */
5615 if (PyUnicode_GET_SIZE(self) == 1 &&
5616 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005617 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005619 /* Special case for empty strings */
5620 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005621 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005622
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623 e = p + PyUnicode_GET_SIZE(self);
5624 for (; p < e; p++) {
5625 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005626 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005628 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629}
5630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005631PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005632"S.join(sequence) -> unicode\n\
5633\n\
5634Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005635sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636
5637static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005638unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005639{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005640 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641}
5642
5643static int
5644unicode_length(PyUnicodeObject *self)
5645{
5646 return self->length;
5647}
5648
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005649PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005650"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651\n\
5652Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005653done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654
5655static PyObject *
5656unicode_ljust(PyUnicodeObject *self, PyObject *args)
5657{
5658 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005659 Py_UNICODE fillchar = ' ';
5660
5661 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005662 return NULL;
5663
Tim Peters7a29bd52001-09-12 03:03:31 +00005664 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005665 Py_INCREF(self);
5666 return (PyObject*) self;
5667 }
5668
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005669 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670}
5671
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005672PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673"S.lower() -> unicode\n\
5674\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005675Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005676
5677static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005678unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680 return fixup(self, fixlower);
5681}
5682
/* Strip modes; also used as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, one per strip mode
   (indexed by LEFTSTRIP / RIGHTSTRIP / BOTHSTRIP). */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string with its leading
   "|O:" (3 characters) skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
5691
5692static const Py_UNICODE *
5693unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5694{
Tim Peters030a5ce2002-04-22 19:00:10 +00005695 size_t i;
5696 for (i = 0; i < n; ++i)
5697 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005698 return s+i;
5699 return NULL;
5700}
5701
5702/* externally visible for str.strip(unicode) */
5703PyObject *
5704_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5705{
5706 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5707 int len = PyUnicode_GET_SIZE(self);
5708 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5709 int seplen = PyUnicode_GET_SIZE(sepobj);
5710 int i, j;
5711
5712 i = 0;
5713 if (striptype != RIGHTSTRIP) {
5714 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5715 i++;
5716 }
5717 }
5718
5719 j = len;
5720 if (striptype != LEFTSTRIP) {
5721 do {
5722 j--;
5723 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5724 j++;
5725 }
5726
5727 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5728 Py_INCREF(self);
5729 return (PyObject*)self;
5730 }
5731 else
5732 return PyUnicode_FromUnicode(s+i, j-i);
5733}
5734
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735
5736static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005737do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005739 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5740 int len = PyUnicode_GET_SIZE(self), i, j;
5741
5742 i = 0;
5743 if (striptype != RIGHTSTRIP) {
5744 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5745 i++;
5746 }
5747 }
5748
5749 j = len;
5750 if (striptype != LEFTSTRIP) {
5751 do {
5752 j--;
5753 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5754 j++;
5755 }
5756
5757 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5758 Py_INCREF(self);
5759 return (PyObject*)self;
5760 }
5761 else
5762 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005763}
5764
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005765
5766static PyObject *
5767do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5768{
5769 PyObject *sep = NULL;
5770
5771 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5772 return NULL;
5773
5774 if (sep != NULL && sep != Py_None) {
5775 if (PyUnicode_Check(sep))
5776 return _PyUnicode_XStrip(self, striptype, sep);
5777 else if (PyString_Check(sep)) {
5778 PyObject *res;
5779 sep = PyUnicode_FromObject(sep);
5780 if (sep==NULL)
5781 return NULL;
5782 res = _PyUnicode_XStrip(self, striptype, sep);
5783 Py_DECREF(sep);
5784 return res;
5785 }
5786 else {
5787 PyErr_Format(PyExc_TypeError,
5788 "%s arg must be None, unicode or str",
5789 STRIPNAME(striptype));
5790 return NULL;
5791 }
5792 }
5793
5794 return do_strip(self, striptype);
5795}
5796
5797
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005798PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005799"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005800\n\
5801Return a copy of the string S with leading and trailing\n\
5802whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005803If chars is given and not None, remove characters in chars instead.\n\
5804If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005805
5806static PyObject *
5807unicode_strip(PyUnicodeObject *self, PyObject *args)
5808{
5809 if (PyTuple_GET_SIZE(args) == 0)
5810 return do_strip(self, BOTHSTRIP); /* Common case */
5811 else
5812 return do_argstrip(self, BOTHSTRIP, args);
5813}
5814
5815
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005816PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005817"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005818\n\
5819Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005820If chars is given and not None, remove characters in chars instead.\n\
5821If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005822
5823static PyObject *
5824unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5825{
5826 if (PyTuple_GET_SIZE(args) == 0)
5827 return do_strip(self, LEFTSTRIP); /* Common case */
5828 else
5829 return do_argstrip(self, LEFTSTRIP, args);
5830}
5831
5832
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005833PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005834"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005835\n\
5836Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005837If chars is given and not None, remove characters in chars instead.\n\
5838If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005839
5840static PyObject *
5841unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5842{
5843 if (PyTuple_GET_SIZE(args) == 0)
5844 return do_strip(self, RIGHTSTRIP); /* Common case */
5845 else
5846 return do_argstrip(self, RIGHTSTRIP, args);
5847}
5848
5849
Guido van Rossumd57fd912000-03-10 22:53:23 +00005850static PyObject*
5851unicode_repeat(PyUnicodeObject *str, int len)
5852{
5853 PyUnicodeObject *u;
5854 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005855 int nchars;
5856 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857
5858 if (len < 0)
5859 len = 0;
5860
Tim Peters7a29bd52001-09-12 03:03:31 +00005861 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862 /* no repeat, return original string */
5863 Py_INCREF(str);
5864 return (PyObject*) str;
5865 }
Tim Peters8f422462000-09-09 06:13:41 +00005866
5867 /* ensure # of chars needed doesn't overflow int and # of bytes
5868 * needed doesn't overflow size_t
5869 */
5870 nchars = len * str->length;
5871 if (len && nchars / len != str->length) {
5872 PyErr_SetString(PyExc_OverflowError,
5873 "repeated string is too long");
5874 return NULL;
5875 }
5876 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5877 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5878 PyErr_SetString(PyExc_OverflowError,
5879 "repeated string is too long");
5880 return NULL;
5881 }
5882 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005883 if (!u)
5884 return NULL;
5885
5886 p = u->str;
5887
5888 while (len-- > 0) {
5889 Py_UNICODE_COPY(p, str->str, str->length);
5890 p += str->length;
5891 }
5892
5893 return (PyObject*) u;
5894}
5895
5896PyObject *PyUnicode_Replace(PyObject *obj,
5897 PyObject *subobj,
5898 PyObject *replobj,
5899 int maxcount)
5900{
5901 PyObject *self;
5902 PyObject *str1;
5903 PyObject *str2;
5904 PyObject *result;
5905
5906 self = PyUnicode_FromObject(obj);
5907 if (self == NULL)
5908 return NULL;
5909 str1 = PyUnicode_FromObject(subobj);
5910 if (str1 == NULL) {
5911 Py_DECREF(self);
5912 return NULL;
5913 }
5914 str2 = PyUnicode_FromObject(replobj);
5915 if (str2 == NULL) {
5916 Py_DECREF(self);
5917 Py_DECREF(str1);
5918 return NULL;
5919 }
Tim Petersced69f82003-09-16 20:30:58 +00005920 result = replace((PyUnicodeObject *)self,
5921 (PyUnicodeObject *)str1,
5922 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005923 maxcount);
5924 Py_DECREF(self);
5925 Py_DECREF(str1);
5926 Py_DECREF(str2);
5927 return result;
5928}
5929
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005930PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931"S.replace (old, new[, maxsplit]) -> unicode\n\
5932\n\
5933Return a copy of S with all occurrences of substring\n\
5934old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005935given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936
5937static PyObject*
5938unicode_replace(PyUnicodeObject *self, PyObject *args)
5939{
5940 PyUnicodeObject *str1;
5941 PyUnicodeObject *str2;
5942 int maxcount = -1;
5943 PyObject *result;
5944
5945 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5946 return NULL;
5947 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5948 if (str1 == NULL)
5949 return NULL;
5950 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005951 if (str2 == NULL) {
5952 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005954 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005955
5956 result = replace(self, str1, str2, maxcount);
5957
5958 Py_DECREF(str1);
5959 Py_DECREF(str2);
5960 return result;
5961}
5962
5963static
5964PyObject *unicode_repr(PyObject *unicode)
5965{
5966 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5967 PyUnicode_GET_SIZE(unicode),
5968 1);
5969}
5970
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005971PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005972"S.rfind(sub [,start [,end]]) -> int\n\
5973\n\
5974Return the highest index in S where substring sub is found,\n\
5975such that sub is contained within s[start,end]. Optional\n\
5976arguments start and end are interpreted as in slice notation.\n\
5977\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005978Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979
5980static PyObject *
5981unicode_rfind(PyUnicodeObject *self, PyObject *args)
5982{
5983 PyUnicodeObject *substring;
5984 int start = 0;
5985 int end = INT_MAX;
5986 PyObject *result;
5987
Guido van Rossumb8872e62000-05-09 14:14:27 +00005988 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5989 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005990 return NULL;
5991 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5992 (PyObject *)substring);
5993 if (substring == NULL)
5994 return NULL;
5995
5996 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5997
5998 Py_DECREF(substring);
5999 return result;
6000}
6001
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006002PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006003"S.rindex(sub [,start [,end]]) -> int\n\
6004\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006005Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006
6007static PyObject *
6008unicode_rindex(PyUnicodeObject *self, PyObject *args)
6009{
6010 int result;
6011 PyUnicodeObject *substring;
6012 int start = 0;
6013 int end = INT_MAX;
6014
Guido van Rossumb8872e62000-05-09 14:14:27 +00006015 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
6016 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006017 return NULL;
6018 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6019 (PyObject *)substring);
6020 if (substring == NULL)
6021 return NULL;
6022
6023 result = findstring(self, substring, start, end, -1);
6024
6025 Py_DECREF(substring);
6026 if (result < 0) {
6027 PyErr_SetString(PyExc_ValueError, "substring not found");
6028 return NULL;
6029 }
6030 return PyInt_FromLong(result);
6031}
6032
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006033PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006034"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006035\n\
6036Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006037done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006038
6039static PyObject *
6040unicode_rjust(PyUnicodeObject *self, PyObject *args)
6041{
6042 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006043 Py_UNICODE fillchar = ' ';
6044
6045 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006046 return NULL;
6047
Tim Peters7a29bd52001-09-12 03:03:31 +00006048 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049 Py_INCREF(self);
6050 return (PyObject*) self;
6051 }
6052
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006053 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006054}
6055
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056static PyObject*
6057unicode_slice(PyUnicodeObject *self, int start, int end)
6058{
6059 /* standard clamping */
6060 if (start < 0)
6061 start = 0;
6062 if (end < 0)
6063 end = 0;
6064 if (end > self->length)
6065 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006066 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067 /* full slice, return original string */
6068 Py_INCREF(self);
6069 return (PyObject*) self;
6070 }
6071 if (start > end)
6072 start = end;
6073 /* copy slice */
6074 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6075 end - start);
6076}
6077
6078PyObject *PyUnicode_Split(PyObject *s,
6079 PyObject *sep,
6080 int maxsplit)
6081{
6082 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006083
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084 s = PyUnicode_FromObject(s);
6085 if (s == NULL)
6086 return NULL;
6087 if (sep != NULL) {
6088 sep = PyUnicode_FromObject(sep);
6089 if (sep == NULL) {
6090 Py_DECREF(s);
6091 return NULL;
6092 }
6093 }
6094
6095 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6096
6097 Py_DECREF(s);
6098 Py_XDECREF(sep);
6099 return result;
6100}
6101
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006102PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103"S.split([sep [,maxsplit]]) -> list of strings\n\
6104\n\
6105Return a list of the words in S, using sep as the\n\
6106delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006107splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006108any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109
6110static PyObject*
6111unicode_split(PyUnicodeObject *self, PyObject *args)
6112{
6113 PyObject *substring = Py_None;
6114 int maxcount = -1;
6115
6116 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
6117 return NULL;
6118
6119 if (substring == Py_None)
6120 return split(self, NULL, maxcount);
6121 else if (PyUnicode_Check(substring))
6122 return split(self, (PyUnicodeObject *)substring, maxcount);
6123 else
6124 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6125}
6126
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006127PyObject *PyUnicode_RSplit(PyObject *s,
6128 PyObject *sep,
6129 int maxsplit)
6130{
6131 PyObject *result;
6132
6133 s = PyUnicode_FromObject(s);
6134 if (s == NULL)
6135 return NULL;
6136 if (sep != NULL) {
6137 sep = PyUnicode_FromObject(sep);
6138 if (sep == NULL) {
6139 Py_DECREF(s);
6140 return NULL;
6141 }
6142 }
6143
6144 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6145
6146 Py_DECREF(s);
6147 Py_XDECREF(sep);
6148 return result;
6149}
6150
6151PyDoc_STRVAR(rsplit__doc__,
6152"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6153\n\
6154Return a list of the words in S, using sep as the\n\
6155delimiter string, starting at the end of the string and\n\
6156working to the front. If maxsplit is given, at most maxsplit\n\
6157splits are done. If sep is not specified, any whitespace string\n\
6158is a separator.");
6159
6160static PyObject*
6161unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6162{
6163 PyObject *substring = Py_None;
6164 int maxcount = -1;
6165
6166 if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
6167 return NULL;
6168
6169 if (substring == Py_None)
6170 return rsplit(self, NULL, maxcount);
6171 else if (PyUnicode_Check(substring))
6172 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6173 else
6174 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6175}
6176
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006177PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006178"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006179\n\
6180Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006181Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006182is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183
6184static PyObject*
6185unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6186{
Guido van Rossum86662912000-04-11 15:38:46 +00006187 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188
Guido van Rossum86662912000-04-11 15:38:46 +00006189 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 return NULL;
6191
Guido van Rossum86662912000-04-11 15:38:46 +00006192 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193}
6194
6195static
6196PyObject *unicode_str(PyUnicodeObject *self)
6197{
Fred Drakee4315f52000-05-09 19:53:39 +00006198 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199}
6200
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006201PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202"S.swapcase() -> unicode\n\
6203\n\
6204Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006205and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206
6207static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006208unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006209{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210 return fixup(self, fixswapcase);
6211}
6212
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006213PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006214"S.translate(table) -> unicode\n\
6215\n\
6216Return a copy of the string S, where all characters have been mapped\n\
6217through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006218Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6219Unmapped characters are left untouched. Characters mapped to None\n\
6220are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006221
6222static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006223unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224{
Tim Petersced69f82003-09-16 20:30:58 +00006225 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006227 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228 "ignore");
6229}
6230
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006231PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232"S.upper() -> unicode\n\
6233\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006234Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006235
6236static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006237unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006239 return fixup(self, fixupper);
6240}
6241
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006242PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006243"S.zfill(width) -> unicode\n\
6244\n\
6245Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006246of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247
6248static PyObject *
6249unicode_zfill(PyUnicodeObject *self, PyObject *args)
6250{
6251 int fill;
6252 PyUnicodeObject *u;
6253
6254 int width;
6255 if (!PyArg_ParseTuple(args, "i:zfill", &width))
6256 return NULL;
6257
6258 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006259 if (PyUnicode_CheckExact(self)) {
6260 Py_INCREF(self);
6261 return (PyObject*) self;
6262 }
6263 else
6264 return PyUnicode_FromUnicode(
6265 PyUnicode_AS_UNICODE(self),
6266 PyUnicode_GET_SIZE(self)
6267 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006268 }
6269
6270 fill = width - self->length;
6271
6272 u = pad(self, fill, 0, '0');
6273
Walter Dörwald068325e2002-04-15 13:36:47 +00006274 if (u == NULL)
6275 return NULL;
6276
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277 if (u->str[fill] == '+' || u->str[fill] == '-') {
6278 /* move sign to beginning of string */
6279 u->str[0] = u->str[fill];
6280 u->str[fill] = '0';
6281 }
6282
6283 return (PyObject*) u;
6284}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006285
#if 0
/* Debugging aid, compiled out: report unicode_freelist_size as a Python
   int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6293
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006294PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006295"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006297Return True if S starts with the specified prefix, False otherwise.\n\
6298With optional start, test S beginning at that position.\n\
6299With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006300
6301static PyObject *
6302unicode_startswith(PyUnicodeObject *self,
6303 PyObject *args)
6304{
6305 PyUnicodeObject *substring;
6306 int start = 0;
6307 int end = INT_MAX;
6308 PyObject *result;
6309
Guido van Rossumb8872e62000-05-09 14:14:27 +00006310 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6311 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312 return NULL;
6313 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6314 (PyObject *)substring);
6315 if (substring == NULL)
6316 return NULL;
6317
Guido van Rossum77f6a652002-04-03 22:41:51 +00006318 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006319
6320 Py_DECREF(substring);
6321 return result;
6322}
6323
6324
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006325PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006326"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006327\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006328Return True if S ends with the specified suffix, False otherwise.\n\
6329With optional start, test S beginning at that position.\n\
6330With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006331
6332static PyObject *
6333unicode_endswith(PyUnicodeObject *self,
6334 PyObject *args)
6335{
6336 PyUnicodeObject *substring;
6337 int start = 0;
6338 int end = INT_MAX;
6339 PyObject *result;
6340
Guido van Rossumb8872e62000-05-09 14:14:27 +00006341 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6342 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006343 return NULL;
6344 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6345 (PyObject *)substring);
6346 if (substring == NULL)
6347 return NULL;
6348
Guido van Rossum77f6a652002-04-03 22:41:51 +00006349 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006350
6351 Py_DECREF(substring);
6352 return result;
6353}
6354
6355
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006356
6357static PyObject *
6358unicode_getnewargs(PyUnicodeObject *v)
6359{
6360 return Py_BuildValue("(u#)", v->str, v->length);
6361}
6362
6363
Guido van Rossumd57fd912000-03-10 22:53:23 +00006364static PyMethodDef unicode_methods[] = {
6365
6366 /* Order is according to common usage: often used methods should
6367 appear first, since lookup is done sequentially. */
6368
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006369 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6370 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6371 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006372 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006373 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6374 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6375 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6376 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6377 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6378 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6379 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6380 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6381 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6382 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006383 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006384 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006385/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6386 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6387 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6388 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006389 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006390 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006391 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006392 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6393 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6394 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6395 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6396 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6397 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6398 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6399 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6400 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6401 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6402 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6403 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6404 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6405 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006406 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006407#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006408 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409#endif
6410
6411#if 0
6412 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006413 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414#endif
6415
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006416 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006417 {NULL, NULL}
6418};
6419
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006420static PyObject *
6421unicode_mod(PyObject *v, PyObject *w)
6422{
6423 if (!PyUnicode_Check(v)) {
6424 Py_INCREF(Py_NotImplemented);
6425 return Py_NotImplemented;
6426 }
6427 return PyUnicode_Format(v, w);
6428}
6429
6430static PyNumberMethods unicode_as_number = {
6431 0, /*nb_add*/
6432 0, /*nb_subtract*/
6433 0, /*nb_multiply*/
6434 0, /*nb_divide*/
6435 unicode_mod, /*nb_remainder*/
6436};
6437
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438static PySequenceMethods unicode_as_sequence = {
6439 (inquiry) unicode_length, /* sq_length */
6440 (binaryfunc) PyUnicode_Concat, /* sq_concat */
6441 (intargfunc) unicode_repeat, /* sq_repeat */
6442 (intargfunc) unicode_getitem, /* sq_item */
6443 (intintargfunc) unicode_slice, /* sq_slice */
6444 0, /* sq_ass_item */
6445 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00006446 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447};
6448
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006449static PyObject*
6450unicode_subscript(PyUnicodeObject* self, PyObject* item)
6451{
6452 if (PyInt_Check(item)) {
6453 long i = PyInt_AS_LONG(item);
6454 if (i < 0)
6455 i += PyString_GET_SIZE(self);
6456 return unicode_getitem(self, i);
6457 } else if (PyLong_Check(item)) {
6458 long i = PyLong_AsLong(item);
6459 if (i == -1 && PyErr_Occurred())
6460 return NULL;
6461 if (i < 0)
6462 i += PyString_GET_SIZE(self);
6463 return unicode_getitem(self, i);
6464 } else if (PySlice_Check(item)) {
6465 int start, stop, step, slicelength, cur, i;
6466 Py_UNICODE* source_buf;
6467 Py_UNICODE* result_buf;
6468 PyObject* result;
6469
6470 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
6471 &start, &stop, &step, &slicelength) < 0) {
6472 return NULL;
6473 }
6474
6475 if (slicelength <= 0) {
6476 return PyUnicode_FromUnicode(NULL, 0);
6477 } else {
6478 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6479 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
6480
6481 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6482 result_buf[i] = source_buf[cur];
6483 }
Tim Petersced69f82003-09-16 20:30:58 +00006484
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006485 result = PyUnicode_FromUnicode(result_buf, slicelength);
6486 PyMem_FREE(result_buf);
6487 return result;
6488 }
6489 } else {
6490 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6491 return NULL;
6492 }
6493}
6494
6495static PyMappingMethods unicode_as_mapping = {
6496 (inquiry)unicode_length, /* mp_length */
6497 (binaryfunc)unicode_subscript, /* mp_subscript */
6498 (objobjargproc)0, /* mp_ass_subscript */
6499};
6500
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501static int
6502unicode_buffer_getreadbuf(PyUnicodeObject *self,
6503 int index,
6504 const void **ptr)
6505{
6506 if (index != 0) {
6507 PyErr_SetString(PyExc_SystemError,
6508 "accessing non-existent unicode segment");
6509 return -1;
6510 }
6511 *ptr = (void *) self->str;
6512 return PyUnicode_GET_DATA_SIZE(self);
6513}
6514
6515static int
6516unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6517 const void **ptr)
6518{
6519 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006520 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006521 return -1;
6522}
6523
6524static int
6525unicode_buffer_getsegcount(PyUnicodeObject *self,
6526 int *lenp)
6527{
6528 if (lenp)
6529 *lenp = PyUnicode_GET_DATA_SIZE(self);
6530 return 1;
6531}
6532
6533static int
6534unicode_buffer_getcharbuf(PyUnicodeObject *self,
6535 int index,
6536 const void **ptr)
6537{
6538 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006539
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540 if (index != 0) {
6541 PyErr_SetString(PyExc_SystemError,
6542 "accessing non-existent unicode segment");
6543 return -1;
6544 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006545 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 if (str == NULL)
6547 return -1;
6548 *ptr = (void *) PyString_AS_STRING(str);
6549 return PyString_GET_SIZE(str);
6550}
6551
6552/* Helpers for PyUnicode_Format() */
6553
6554static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006555getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006556{
6557 int argidx = *p_argidx;
6558 if (argidx < arglen) {
6559 (*p_argidx)++;
6560 if (arglen < 0)
6561 return args;
6562 else
6563 return PyTuple_GetItem(args, argidx);
6564 }
6565 PyErr_SetString(PyExc_TypeError,
6566 "not enough arguments for format string");
6567 return NULL;
6568}
6569
/* Conversion flag bits for the formatting helpers.  Each flag has its
   own bit so several can be OR-ed together; formatfloat() tests F_ALT to
   decide whether to emit '#'.  The others presumably mirror the printf
   flags '-', '+', ' ' and '0' -- confirm at their use sites. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
6575
6576static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578{
6579 register int i;
6580 int len;
6581 va_list va;
6582 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006583 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584
6585 /* First, format the string as char array, then expand to Py_UNICODE
6586 array. */
6587 charbuffer = (char *)buffer;
6588 len = vsprintf(charbuffer, format, va);
6589 for (i = len - 1; i >= 0; i--)
6590 buffer[i] = (Py_UNICODE) charbuffer[i];
6591
6592 va_end(va);
6593 return len;
6594}
6595
Guido van Rossum078151d2002-08-11 04:24:12 +00006596/* XXX To save some code duplication, formatfloat/long/int could have been
6597 shared with stringobject.c, converting from 8-bit to Unicode after the
6598 formatting is done. */
6599
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600static int
6601formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006602 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006603 int flags,
6604 int prec,
6605 int type,
6606 PyObject *v)
6607{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006608 /* fmt = '%#.' + `prec` + `type`
6609 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006610 char fmt[20];
6611 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006612
Guido van Rossumd57fd912000-03-10 22:53:23 +00006613 x = PyFloat_AsDouble(v);
6614 if (x == -1.0 && PyErr_Occurred())
6615 return -1;
6616 if (prec < 0)
6617 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6619 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006620 /* Worst case length calc to ensure no buffer overrun:
6621
6622 'g' formats:
6623 fmt = %#.<prec>g
6624 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6625 for any double rep.)
6626 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6627
6628 'f' formats:
6629 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6630 len = 1 + 50 + 1 + prec = 52 + prec
6631
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006632 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006633 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006634
6635 */
6636 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6637 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006638 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006639 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006640 return -1;
6641 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006642 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6643 (flags&F_ALT) ? "#" : "",
6644 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645 return usprintf(buf, fmt, x);
6646}
6647
Tim Peters38fd5b62000-09-21 05:43:11 +00006648static PyObject*
6649formatlong(PyObject *val, int flags, int prec, int type)
6650{
6651 char *buf;
6652 int i, len;
6653 PyObject *str; /* temporary string object. */
6654 PyUnicodeObject *result;
6655
6656 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6657 if (!str)
6658 return NULL;
6659 result = _PyUnicode_New(len);
6660 for (i = 0; i < len; i++)
6661 result->str[i] = buf[i];
6662 result->str[len] = 0;
6663 Py_DECREF(str);
6664 return (PyObject*)result;
6665}
6666
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667static int
6668formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006669 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 int flags,
6671 int prec,
6672 int type,
6673 PyObject *v)
6674{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006675 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006676 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6677 * + 1 + 1
6678 * = 24
6679 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006680 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006681 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006682 long x;
6683
6684 x = PyInt_AsLong(v);
6685 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006686 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006687 if (x < 0 && type == 'u') {
6688 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006689 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006690 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6691 sign = "-";
6692 else
6693 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006695 prec = 1;
6696
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006697 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6698 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006699 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006700 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006701 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006702 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006703 return -1;
6704 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006705
6706 if ((flags & F_ALT) &&
6707 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006708 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006709 * of issues that cause pain:
6710 * - when 0 is being converted, the C standard leaves off
6711 * the '0x' or '0X', which is inconsistent with other
6712 * %#x/%#X conversions and inconsistent with Python's
6713 * hex() function
6714 * - there are platforms that violate the standard and
6715 * convert 0 with the '0x' or '0X'
6716 * (Metrowerks, Compaq Tru64)
6717 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006718 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006719 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006720 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006721 * We can achieve the desired consistency by inserting our
6722 * own '0x' or '0X' prefix, and substituting %x/%X in place
6723 * of %#x/%#X.
6724 *
6725 * Note that this is the same approach as used in
6726 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006727 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006728 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6729 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006730 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006731 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006732 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6733 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006734 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006735 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006736 if (sign[0])
6737 return usprintf(buf, fmt, -x);
6738 else
6739 return usprintf(buf, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740}
6741
6742static int
6743formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006744 size_t buflen,
6745 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006747 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006748 if (PyUnicode_Check(v)) {
6749 if (PyUnicode_GET_SIZE(v) != 1)
6750 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006751 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006752 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006753
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006754 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006755 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006756 goto onError;
6757 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6758 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006759
6760 else {
6761 /* Integer input truncated to a character */
6762 long x;
6763 x = PyInt_AsLong(v);
6764 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006765 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006766#ifdef Py_UNICODE_WIDE
6767 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006768 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006769 "%c arg not in range(0x110000) "
6770 "(wide Python build)");
6771 return -1;
6772 }
6773#else
6774 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006775 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006776 "%c arg not in range(0x10000) "
6777 "(narrow Python build)");
6778 return -1;
6779 }
6780#endif
6781 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006782 }
6783 buf[1] = '\0';
6784 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006785
6786 onError:
6787 PyErr_SetString(PyExc_TypeError,
6788 "%c requires int or char");
6789 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006790}
6791
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006792/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6793
6794 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6795 chars are formatted. XXX This is a magic number. Each formatting
6796 routine does bounds checking to ensure no overflow, but a better
6797 solution may be to malloc a buffer of appropriate size for each
6798 format. For now, the current solution is sufficient.
6799*/
/* Number of Py_UNICODE elements in the scratch buffer; the expansion is
   parenthesized so that it behaves as a single operand everywhere. */
#define FORMATBUFLEN ((size_t)120)
6801
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802PyObject *PyUnicode_Format(PyObject *format,
6803 PyObject *args)
6804{
6805 Py_UNICODE *fmt, *res;
6806 int fmtcnt, rescnt, reslen, arglen, argidx;
6807 int args_owned = 0;
6808 PyUnicodeObject *result = NULL;
6809 PyObject *dict = NULL;
6810 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006811
Guido van Rossumd57fd912000-03-10 22:53:23 +00006812 if (format == NULL || args == NULL) {
6813 PyErr_BadInternalCall();
6814 return NULL;
6815 }
6816 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006817 if (uformat == NULL)
6818 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006819 fmt = PyUnicode_AS_UNICODE(uformat);
6820 fmtcnt = PyUnicode_GET_SIZE(uformat);
6821
6822 reslen = rescnt = fmtcnt + 100;
6823 result = _PyUnicode_New(reslen);
6824 if (result == NULL)
6825 goto onError;
6826 res = PyUnicode_AS_UNICODE(result);
6827
6828 if (PyTuple_Check(args)) {
6829 arglen = PyTuple_Size(args);
6830 argidx = 0;
6831 }
6832 else {
6833 arglen = -1;
6834 argidx = -2;
6835 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006836 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6837 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006838 dict = args;
6839
6840 while (--fmtcnt >= 0) {
6841 if (*fmt != '%') {
6842 if (--rescnt < 0) {
6843 rescnt = fmtcnt + 100;
6844 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006845 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006846 return NULL;
6847 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6848 --rescnt;
6849 }
6850 *res++ = *fmt++;
6851 }
6852 else {
6853 /* Got a format specifier */
6854 int flags = 0;
6855 int width = -1;
6856 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006857 Py_UNICODE c = '\0';
6858 Py_UNICODE fill;
6859 PyObject *v = NULL;
6860 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006861 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006862 Py_UNICODE sign;
6863 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006864 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006865
6866 fmt++;
6867 if (*fmt == '(') {
6868 Py_UNICODE *keystart;
6869 int keylen;
6870 PyObject *key;
6871 int pcount = 1;
6872
6873 if (dict == NULL) {
6874 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006875 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876 goto onError;
6877 }
6878 ++fmt;
6879 --fmtcnt;
6880 keystart = fmt;
6881 /* Skip over balanced parentheses */
6882 while (pcount > 0 && --fmtcnt >= 0) {
6883 if (*fmt == ')')
6884 --pcount;
6885 else if (*fmt == '(')
6886 ++pcount;
6887 fmt++;
6888 }
6889 keylen = fmt - keystart - 1;
6890 if (fmtcnt < 0 || pcount > 0) {
6891 PyErr_SetString(PyExc_ValueError,
6892 "incomplete format key");
6893 goto onError;
6894 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006895#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006896 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006897 then looked up since Python uses strings to hold
6898 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006899 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006900 key = PyUnicode_EncodeUTF8(keystart,
6901 keylen,
6902 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006903#else
6904 key = PyUnicode_FromUnicode(keystart, keylen);
6905#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006906 if (key == NULL)
6907 goto onError;
6908 if (args_owned) {
6909 Py_DECREF(args);
6910 args_owned = 0;
6911 }
6912 args = PyObject_GetItem(dict, key);
6913 Py_DECREF(key);
6914 if (args == NULL) {
6915 goto onError;
6916 }
6917 args_owned = 1;
6918 arglen = -1;
6919 argidx = -2;
6920 }
6921 while (--fmtcnt >= 0) {
6922 switch (c = *fmt++) {
6923 case '-': flags |= F_LJUST; continue;
6924 case '+': flags |= F_SIGN; continue;
6925 case ' ': flags |= F_BLANK; continue;
6926 case '#': flags |= F_ALT; continue;
6927 case '0': flags |= F_ZERO; continue;
6928 }
6929 break;
6930 }
6931 if (c == '*') {
6932 v = getnextarg(args, arglen, &argidx);
6933 if (v == NULL)
6934 goto onError;
6935 if (!PyInt_Check(v)) {
6936 PyErr_SetString(PyExc_TypeError,
6937 "* wants int");
6938 goto onError;
6939 }
6940 width = PyInt_AsLong(v);
6941 if (width < 0) {
6942 flags |= F_LJUST;
6943 width = -width;
6944 }
6945 if (--fmtcnt >= 0)
6946 c = *fmt++;
6947 }
6948 else if (c >= '0' && c <= '9') {
6949 width = c - '0';
6950 while (--fmtcnt >= 0) {
6951 c = *fmt++;
6952 if (c < '0' || c > '9')
6953 break;
6954 if ((width*10) / 10 != width) {
6955 PyErr_SetString(PyExc_ValueError,
6956 "width too big");
6957 goto onError;
6958 }
6959 width = width*10 + (c - '0');
6960 }
6961 }
6962 if (c == '.') {
6963 prec = 0;
6964 if (--fmtcnt >= 0)
6965 c = *fmt++;
6966 if (c == '*') {
6967 v = getnextarg(args, arglen, &argidx);
6968 if (v == NULL)
6969 goto onError;
6970 if (!PyInt_Check(v)) {
6971 PyErr_SetString(PyExc_TypeError,
6972 "* wants int");
6973 goto onError;
6974 }
6975 prec = PyInt_AsLong(v);
6976 if (prec < 0)
6977 prec = 0;
6978 if (--fmtcnt >= 0)
6979 c = *fmt++;
6980 }
6981 else if (c >= '0' && c <= '9') {
6982 prec = c - '0';
6983 while (--fmtcnt >= 0) {
6984 c = Py_CHARMASK(*fmt++);
6985 if (c < '0' || c > '9')
6986 break;
6987 if ((prec*10) / 10 != prec) {
6988 PyErr_SetString(PyExc_ValueError,
6989 "prec too big");
6990 goto onError;
6991 }
6992 prec = prec*10 + (c - '0');
6993 }
6994 }
6995 } /* prec */
6996 if (fmtcnt >= 0) {
6997 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006998 if (--fmtcnt >= 0)
6999 c = *fmt++;
7000 }
7001 }
7002 if (fmtcnt < 0) {
7003 PyErr_SetString(PyExc_ValueError,
7004 "incomplete format");
7005 goto onError;
7006 }
7007 if (c != '%') {
7008 v = getnextarg(args, arglen, &argidx);
7009 if (v == NULL)
7010 goto onError;
7011 }
7012 sign = 0;
7013 fill = ' ';
7014 switch (c) {
7015
7016 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007017 pbuf = formatbuf;
7018 /* presume that buffer length is at least 1 */
7019 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007020 len = 1;
7021 break;
7022
7023 case 's':
7024 case 'r':
7025 if (PyUnicode_Check(v) && c == 's') {
7026 temp = v;
7027 Py_INCREF(temp);
7028 }
7029 else {
7030 PyObject *unicode;
7031 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007032 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007033 else
7034 temp = PyObject_Repr(v);
7035 if (temp == NULL)
7036 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007037 if (PyUnicode_Check(temp))
7038 /* nothing to do */;
7039 else if (PyString_Check(temp)) {
7040 /* convert to string to Unicode */
Fred Drakee4315f52000-05-09 19:53:39 +00007041 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00007043 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007044 "strict");
7045 Py_DECREF(temp);
7046 temp = unicode;
7047 if (temp == NULL)
7048 goto onError;
7049 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007050 else {
7051 Py_DECREF(temp);
7052 PyErr_SetString(PyExc_TypeError,
7053 "%s argument has non-string str()");
7054 goto onError;
7055 }
7056 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007057 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007058 len = PyUnicode_GET_SIZE(temp);
7059 if (prec >= 0 && len > prec)
7060 len = prec;
7061 break;
7062
7063 case 'i':
7064 case 'd':
7065 case 'u':
7066 case 'o':
7067 case 'x':
7068 case 'X':
7069 if (c == 'i')
7070 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007071 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007072 temp = formatlong(v, flags, prec, c);
7073 if (!temp)
7074 goto onError;
7075 pbuf = PyUnicode_AS_UNICODE(temp);
7076 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007077 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007079 else {
7080 pbuf = formatbuf;
7081 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7082 flags, prec, c, v);
7083 if (len < 0)
7084 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007085 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007086 }
7087 if (flags & F_ZERO)
7088 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089 break;
7090
7091 case 'e':
7092 case 'E':
7093 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007094 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007095 case 'g':
7096 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007097 if (c == 'F')
7098 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007099 pbuf = formatbuf;
7100 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7101 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102 if (len < 0)
7103 goto onError;
7104 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007105 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007106 fill = '0';
7107 break;
7108
7109 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007110 pbuf = formatbuf;
7111 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112 if (len < 0)
7113 goto onError;
7114 break;
7115
7116 default:
7117 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007118 "unsupported format character '%c' (0x%x) "
7119 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007120 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007121 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007122 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007123 goto onError;
7124 }
7125 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007126 if (*pbuf == '-' || *pbuf == '+') {
7127 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007128 len--;
7129 }
7130 else if (flags & F_SIGN)
7131 sign = '+';
7132 else if (flags & F_BLANK)
7133 sign = ' ';
7134 else
7135 sign = 0;
7136 }
7137 if (width < len)
7138 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007139 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007140 reslen -= rescnt;
7141 rescnt = width + fmtcnt + 100;
7142 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007143 if (reslen < 0) {
7144 Py_DECREF(result);
7145 return PyErr_NoMemory();
7146 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007147 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007148 return NULL;
7149 res = PyUnicode_AS_UNICODE(result)
7150 + reslen - rescnt;
7151 }
7152 if (sign) {
7153 if (fill != ' ')
7154 *res++ = sign;
7155 rescnt--;
7156 if (width > len)
7157 width--;
7158 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007159 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7160 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007161 assert(pbuf[1] == c);
7162 if (fill != ' ') {
7163 *res++ = *pbuf++;
7164 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007165 }
Tim Petersfff53252001-04-12 18:38:48 +00007166 rescnt -= 2;
7167 width -= 2;
7168 if (width < 0)
7169 width = 0;
7170 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007171 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007172 if (width > len && !(flags & F_LJUST)) {
7173 do {
7174 --rescnt;
7175 *res++ = fill;
7176 } while (--width > len);
7177 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007178 if (fill == ' ') {
7179 if (sign)
7180 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007181 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007182 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007183 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007184 *res++ = *pbuf++;
7185 *res++ = *pbuf++;
7186 }
7187 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007188 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007189 res += len;
7190 rescnt -= len;
7191 while (--width >= len) {
7192 --rescnt;
7193 *res++ = ' ';
7194 }
7195 if (dict && (argidx < arglen) && c != '%') {
7196 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007197 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198 goto onError;
7199 }
7200 Py_XDECREF(temp);
7201 } /* '%' */
7202 } /* until end */
7203 if (argidx < arglen && !dict) {
7204 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007205 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007206 goto onError;
7207 }
7208
7209 if (args_owned) {
7210 Py_DECREF(args);
7211 }
7212 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00007213 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007214 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007215 return (PyObject *)result;
7216
7217 onError:
7218 Py_XDECREF(result);
7219 Py_DECREF(uformat);
7220 if (args_owned) {
7221 Py_DECREF(args);
7222 }
7223 return NULL;
7224}
7225
/* Buffer interface slots for unicode objects; installed as
   tp_as_buffer in PyUnicode_Type. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
7232
Jeremy Hylton938ace62002-07-17 16:30:39 +00007233static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007234unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7235
Tim Peters6d6c1a32001-08-02 04:15:00 +00007236static PyObject *
7237unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7238{
7239 PyObject *x = NULL;
7240 static char *kwlist[] = {"string", "encoding", "errors", 0};
7241 char *encoding = NULL;
7242 char *errors = NULL;
7243
Guido van Rossume023fe02001-08-30 03:12:59 +00007244 if (type != &PyUnicode_Type)
7245 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007246 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7247 kwlist, &x, &encoding, &errors))
7248 return NULL;
7249 if (x == NULL)
7250 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007251 if (encoding == NULL && errors == NULL)
7252 return PyObject_Unicode(x);
7253 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007254 return PyUnicode_FromEncodedObject(x, encoding, errors);
7255}
7256
Guido van Rossume023fe02001-08-30 03:12:59 +00007257static PyObject *
7258unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7259{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007260 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007261 int n;
7262
7263 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7264 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7265 if (tmp == NULL)
7266 return NULL;
7267 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007268 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007269 if (pnew == NULL) {
7270 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007271 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007272 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007273 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7274 if (pnew->str == NULL) {
7275 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007276 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007277 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007278 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007279 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007280 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7281 pnew->length = n;
7282 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007283 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007284 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007285}
7286
/* Docstring of the unicode type (tp_doc of PyUnicode_Type). */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007293
/* Type object for unicode.  It derives from PyBaseString_Type, can be
   subclassed (Py_TPFLAGS_BASETYPE) and creates instances through
   unicode_new(). */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    &unicode_as_number,                 /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
        Py_TPFLAGS_BASETYPE,            /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
7338
7339/* Initialize the Unicode implementation */
7340
Thomas Wouters78890102000-07-22 19:25:51 +00007341void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007342{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007343 int i;
7344
Fred Drakee4315f52000-05-09 19:53:39 +00007345 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007346 unicode_freelist = NULL;
7347 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007349 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007350 for (i = 0; i < 256; i++)
7351 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007352 if (PyType_Ready(&PyUnicode_Type) < 0)
7353 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007354}
7355
7356/* Finalize the Unicode implementation */
7357
7358void
Thomas Wouters78890102000-07-22 19:25:51 +00007359_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007360{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007361 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007362 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007363
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007364 Py_XDECREF(unicode_empty);
7365 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007366
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007367 for (i = 0; i < 256; i++) {
7368 if (unicode_latin1[i]) {
7369 Py_DECREF(unicode_latin1[i]);
7370 unicode_latin1[i] = NULL;
7371 }
7372 }
7373
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007374 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007375 PyUnicodeObject *v = u;
7376 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007377 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007378 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007379 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007380 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007381 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007382 unicode_freelist = NULL;
7383 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007384}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007385
7386/*
7387Local variables:
7388c-basic-offset: 4
7389indent-tabs-mode: nil
7390End:
7391*/