blob: 5d096edee63d4073a7a9395ba63f373330b5f4db [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000056 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000057
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
/* Free list for Unicode objects.

   Freed objects are chained through their first pointer-sized slot
   (see unicode_dealloc() and _PyUnicode_New()); unicode_freelist_size
   counts the entries and is kept below MAX_UNICODE_FREELIST_SIZE. */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  Entries are filled in on demand by
   PyUnicode_FromUnicode(); an unused slot is NULL. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

   The buffer holds the encoding name as a C string.

*/
static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
157 unicode->length = length;
158
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000159 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000160 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 }
165 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000166
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 return 0;
168}
169
170/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000171 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
175
176*/
177
178static
179PyUnicodeObject *_PyUnicode_New(int length)
180{
181 register PyUnicodeObject *unicode;
182
Tim Petersced69f82003-09-16 20:30:58 +0000183 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
187 }
188
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000192 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000197 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000198 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000199 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000201 }
202 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000203 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000205 }
206 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213 }
214
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000215 if (!unicode->str) {
216 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000217 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000219 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
225 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000226 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000230 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000235 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
Guido van Rossum9475a232001-10-05 20:51:39 +0000240void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000261 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000262 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264}
265
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000266int PyUnicode_Resize(PyObject **unicode, int length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000267{
268 register PyUnicodeObject *v;
269
270 /* Argument checks */
271 if (unicode == NULL) {
272 PyErr_BadInternalCall();
273 return -1;
274 }
275 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000277 PyErr_BadInternalCall();
278 return -1;
279 }
280
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000284 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285 (v == unicode_empty || v->length == 1)) {
286 PyUnicodeObject *w = _PyUnicode_New(length);
287 if (w == NULL)
288 return -1;
289 Py_UNICODE_COPY(w->str, v->str,
290 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000291 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000292 *unicode = (PyObject *)w;
293 return 0;
294 }
295
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v, length);
299}
300
301/* Internal API for use in unicodeobject.c only ! */
/* Convenience wrapper: casts the address of a PyUnicodeObject*
   variable to the PyObject** that PyUnicode_Resize() expects. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
306 int size)
307{
308 PyUnicodeObject *unicode;
309
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
312 if (u != NULL) {
313
314 /* Optimization for empty strings */
315 if (size == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return (PyObject *)unicode_empty;
318 }
319
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size == 1 && *u < 256) {
323 unicode = unicode_latin1[*u];
324 if (!unicode) {
325 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 if (!unicode)
327 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000328 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 unicode_latin1[*u] = unicode;
330 }
331 Py_INCREF(unicode);
332 return (PyObject *)unicode;
333 }
334 }
Tim Petersced69f82003-09-16 20:30:58 +0000335
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 unicode = _PyUnicode_New(size);
337 if (!unicode)
338 return NULL;
339
340 /* Copy the Unicode data into the new object */
341 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343
344 return (PyObject *)unicode;
345}
346
347#ifdef HAVE_WCHAR_H
348
349PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
350 int size)
351{
352 PyUnicodeObject *unicode;
353
354 if (w == NULL) {
355 PyErr_BadInternalCall();
356 return NULL;
357 }
358
359 unicode = _PyUnicode_New(size);
360 if (!unicode)
361 return NULL;
362
363 /* Copy the wchar_t data into the new object */
364#ifdef HAVE_USABLE_WCHAR_T
365 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000366#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 {
368 register Py_UNICODE *u;
369 register int i;
370 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000371 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000372 *u++ = *w++;
373 }
374#endif
375
376 return (PyObject *)unicode;
377}
378
379int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
380 register wchar_t *w,
381 int size)
382{
383 if (unicode == NULL) {
384 PyErr_BadInternalCall();
385 return -1;
386 }
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000387
388 /* If possible, try to copy the 0-termination as well */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 if (size > PyUnicode_GET_SIZE(unicode))
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000390 size = PyUnicode_GET_SIZE(unicode) + 1;
391
Guido van Rossumd57fd912000-03-10 22:53:23 +0000392#ifdef HAVE_USABLE_WCHAR_T
393 memcpy(w, unicode->str, size * sizeof(wchar_t));
394#else
395 {
396 register Py_UNICODE *u;
397 register int i;
398 u = PyUnicode_AS_UNICODE(unicode);
Marc-André Lemburg204bd6d2004-10-15 07:45:05 +0000399 for (i = size; i > 0; i--)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400 *w++ = *u++;
401 }
402#endif
403
Marc-André Lemburga9cadcd2004-11-22 13:02:31 +0000404 if (size > PyUnicode_GET_SIZE(unicode))
405 return PyUnicode_GET_SIZE(unicode);
406 else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000407 return size;
408}
409
410#endif
411
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000412PyObject *PyUnicode_FromOrdinal(int ordinal)
413{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000414 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000415
416#ifdef Py_UNICODE_WIDE
417 if (ordinal < 0 || ordinal > 0x10ffff) {
418 PyErr_SetString(PyExc_ValueError,
419 "unichr() arg not in range(0x110000) "
420 "(wide Python build)");
421 return NULL;
422 }
423#else
424 if (ordinal < 0 || ordinal > 0xffff) {
425 PyErr_SetString(PyExc_ValueError,
426 "unichr() arg not in range(0x10000) "
427 "(narrow Python build)");
428 return NULL;
429 }
430#endif
431
Hye-Shik Chang40574832004-04-06 07:24:51 +0000432 s[0] = (Py_UNICODE)ordinal;
433 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000434}
435
Guido van Rossumd57fd912000-03-10 22:53:23 +0000436PyObject *PyUnicode_FromObject(register PyObject *obj)
437{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000438 /* XXX Perhaps we should make this API an alias of
439 PyObject_Unicode() instead ?! */
440 if (PyUnicode_CheckExact(obj)) {
441 Py_INCREF(obj);
442 return obj;
443 }
444 if (PyUnicode_Check(obj)) {
445 /* For a Unicode subtype that's not a Unicode object,
446 return a true Unicode object with the same data. */
447 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
448 PyUnicode_GET_SIZE(obj));
449 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000450 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
451}
452
453PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
454 const char *encoding,
455 const char *errors)
456{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000457 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000458 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000460
Guido van Rossumd57fd912000-03-10 22:53:23 +0000461 if (obj == NULL) {
462 PyErr_BadInternalCall();
463 return NULL;
464 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000465
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000466#if 0
467 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000468 that no encodings is given and then redirect to
469 PyObject_Unicode() which then applies the additional logic for
470 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000471
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000472 NOTE: This API should really only be used for object which
473 represent *encoded* Unicode !
474
475 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000476 if (PyUnicode_Check(obj)) {
477 if (encoding) {
478 PyErr_SetString(PyExc_TypeError,
479 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000480 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000481 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000482 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000483 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000484#else
485 if (PyUnicode_Check(obj)) {
486 PyErr_SetString(PyExc_TypeError,
487 "decoding Unicode is not supported");
488 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000489 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000490#endif
491
492 /* Coerce object */
493 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000494 s = PyString_AS_STRING(obj);
495 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000496 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000497 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
498 /* Overwrite the error message with something more useful in
499 case of a TypeError. */
500 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000501 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000502 "coercing to Unicode: need string or buffer, "
503 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000504 obj->ob_type->tp_name);
505 goto onError;
506 }
Tim Petersced69f82003-09-16 20:30:58 +0000507
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000508 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000509 if (len == 0) {
510 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000511 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000512 }
Tim Petersced69f82003-09-16 20:30:58 +0000513 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000514 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000515
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000516 return v;
517
518 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000519 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000520}
521
522PyObject *PyUnicode_Decode(const char *s,
523 int size,
524 const char *encoding,
525 const char *errors)
526{
527 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000528
529 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000530 encoding = PyUnicode_GetDefaultEncoding();
531
532 /* Shortcuts for common default encodings */
533 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000534 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000535 else if (strcmp(encoding, "latin-1") == 0)
536 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000537#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
538 else if (strcmp(encoding, "mbcs") == 0)
539 return PyUnicode_DecodeMBCS(s, size, errors);
540#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000541 else if (strcmp(encoding, "ascii") == 0)
542 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543
544 /* Decode via the codec registry */
545 buffer = PyBuffer_FromMemory((void *)s, size);
546 if (buffer == NULL)
547 goto onError;
548 unicode = PyCodec_Decode(buffer, encoding, errors);
549 if (unicode == NULL)
550 goto onError;
551 if (!PyUnicode_Check(unicode)) {
552 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000553 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000554 unicode->ob_type->tp_name);
555 Py_DECREF(unicode);
556 goto onError;
557 }
558 Py_DECREF(buffer);
559 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000560
Guido van Rossumd57fd912000-03-10 22:53:23 +0000561 onError:
562 Py_XDECREF(buffer);
563 return NULL;
564}
565
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000566PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
567 const char *encoding,
568 const char *errors)
569{
570 PyObject *v;
571
572 if (!PyUnicode_Check(unicode)) {
573 PyErr_BadArgument();
574 goto onError;
575 }
576
577 if (encoding == NULL)
578 encoding = PyUnicode_GetDefaultEncoding();
579
580 /* Decode via the codec registry */
581 v = PyCodec_Decode(unicode, encoding, errors);
582 if (v == NULL)
583 goto onError;
584 return v;
585
586 onError:
587 return NULL;
588}
589
Guido van Rossumd57fd912000-03-10 22:53:23 +0000590PyObject *PyUnicode_Encode(const Py_UNICODE *s,
591 int size,
592 const char *encoding,
593 const char *errors)
594{
595 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000596
Guido van Rossumd57fd912000-03-10 22:53:23 +0000597 unicode = PyUnicode_FromUnicode(s, size);
598 if (unicode == NULL)
599 return NULL;
600 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
601 Py_DECREF(unicode);
602 return v;
603}
604
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000605PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
606 const char *encoding,
607 const char *errors)
608{
609 PyObject *v;
610
611 if (!PyUnicode_Check(unicode)) {
612 PyErr_BadArgument();
613 goto onError;
614 }
615
616 if (encoding == NULL)
617 encoding = PyUnicode_GetDefaultEncoding();
618
619 /* Encode via the codec registry */
620 v = PyCodec_Encode(unicode, encoding, errors);
621 if (v == NULL)
622 goto onError;
623 return v;
624
625 onError:
626 return NULL;
627}
628
Guido van Rossumd57fd912000-03-10 22:53:23 +0000629PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
630 const char *encoding,
631 const char *errors)
632{
633 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000634
Guido van Rossumd57fd912000-03-10 22:53:23 +0000635 if (!PyUnicode_Check(unicode)) {
636 PyErr_BadArgument();
637 goto onError;
638 }
Fred Drakee4315f52000-05-09 19:53:39 +0000639
Tim Petersced69f82003-09-16 20:30:58 +0000640 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000641 encoding = PyUnicode_GetDefaultEncoding();
642
643 /* Shortcuts for common default encodings */
644 if (errors == NULL) {
645 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000646 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000647 else if (strcmp(encoding, "latin-1") == 0)
648 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000649#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
650 else if (strcmp(encoding, "mbcs") == 0)
651 return PyUnicode_AsMBCSString(unicode);
652#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000653 else if (strcmp(encoding, "ascii") == 0)
654 return PyUnicode_AsASCIIString(unicode);
655 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000656
657 /* Encode via the codec registry */
658 v = PyCodec_Encode(unicode, encoding, errors);
659 if (v == NULL)
660 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000661 if (!PyString_Check(v)) {
662 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000663 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000664 v->ob_type->tp_name);
665 Py_DECREF(v);
666 goto onError;
667 }
668 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000669
Guido van Rossumd57fd912000-03-10 22:53:23 +0000670 onError:
671 return NULL;
672}
673
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000674PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
675 const char *errors)
676{
677 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
678
679 if (v)
680 return v;
681 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
682 if (v && errors == NULL)
683 ((PyUnicodeObject *)unicode)->defenc = v;
684 return v;
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
688{
689 if (!PyUnicode_Check(unicode)) {
690 PyErr_BadArgument();
691 goto onError;
692 }
693 return PyUnicode_AS_UNICODE(unicode);
694
695 onError:
696 return NULL;
697}
698
699int PyUnicode_GetSize(PyObject *unicode)
700{
701 if (!PyUnicode_Check(unicode)) {
702 PyErr_BadArgument();
703 goto onError;
704 }
705 return PyUnicode_GET_SIZE(unicode);
706
707 onError:
708 return -1;
709}
710
Thomas Wouters78890102000-07-22 19:25:51 +0000711const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000712{
713 return unicode_default_encoding;
714}
715
716int PyUnicode_SetDefaultEncoding(const char *encoding)
717{
718 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000719
Fred Drakee4315f52000-05-09 19:53:39 +0000720 /* Make sure the encoding is valid. As side effect, this also
721 loads the encoding into the codec registry cache. */
722 v = _PyCodec_Lookup(encoding);
723 if (v == NULL)
724 goto onError;
725 Py_DECREF(v);
726 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000727 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000728 sizeof(unicode_default_encoding));
729 return 0;
730
731 onError:
732 return -1;
733}
734
/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error
*/
741
742static
743int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
744 const char *encoding, const char *reason,
745 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
746 PyObject **output, int *outpos, Py_UNICODE **outptr)
747{
748 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
749
750 PyObject *restuple = NULL;
751 PyObject *repunicode = NULL;
752 int outsize = PyUnicode_GET_SIZE(*output);
753 int requiredsize;
754 int newpos;
755 Py_UNICODE *repptr;
756 int repsize;
757 int res = -1;
758
759 if (*errorHandler == NULL) {
760 *errorHandler = PyCodec_LookupError(errors);
761 if (*errorHandler == NULL)
762 goto onError;
763 }
764
765 if (*exceptionObject == NULL) {
766 *exceptionObject = PyUnicodeDecodeError_Create(
767 encoding, input, insize, *startinpos, *endinpos, reason);
768 if (*exceptionObject == NULL)
769 goto onError;
770 }
771 else {
772 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
773 goto onError;
774 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
775 goto onError;
776 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
777 goto onError;
778 }
779
780 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
781 if (restuple == NULL)
782 goto onError;
783 if (!PyTuple_Check(restuple)) {
784 PyErr_Format(PyExc_TypeError, &argparse[4]);
785 goto onError;
786 }
787 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
788 goto onError;
789 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000790 newpos = insize+newpos;
791 if (newpos<0 || newpos>insize) {
792 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
793 goto onError;
794 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000795
796 /* need more space? (at least enough for what we
797 have+the replacement+the rest of the string (starting
798 at the new input position), so we won't have to check space
799 when there are no errors in the rest of the string) */
800 repptr = PyUnicode_AS_UNICODE(repunicode);
801 repsize = PyUnicode_GET_SIZE(repunicode);
802 requiredsize = *outpos + repsize + insize-newpos;
803 if (requiredsize > outsize) {
804 if (requiredsize<2*outsize)
805 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000806 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000807 goto onError;
808 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
809 }
810 *endinpos = newpos;
811 *inptr = input + newpos;
812 Py_UNICODE_COPY(*outptr, repptr, repsize);
813 *outptr += repsize;
814 *outpos += repsize;
815 /* we made it! */
816 res = 0;
817
818 onError:
819 Py_XDECREF(restuple);
820 return res;
821}
822
/* --- UTF-7 Codec -------------------------------------------------------- */

/* see RFC2152 for details */
826
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional)
       The table is indexed by the 7-bit character code; each row below
       covers 16 consecutive codes. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,   /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 - 0x1f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,   /* 0x20 - 0x2f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,   /* 0x30 - 0x3f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x40 - 0x4f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,   /* 0x50 - 0x5f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x60 - 0x6f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,   /* 0x70 - 0x7f */
};
845
/* True if c cannot be written directly in UTF-7 output: any code above 127
   and every class-1 entry of utf7_special is always special; whitespace
   (class 2) and RFC2152 Set O (class 3) are special only when the matching
   flag is non-zero.  c is evaluated more than once. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Base64 character for the low 6 bits of n. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 characters.  The ranges are spelled
   out (ASCII, as UB64 below already assumes) instead of calling isalnum():
   c may be a Py_UNICODE value outside the unsigned char range, for which
   isalnum() is undefined, and isalnum() is locale-dependent.
   c is evaluated more than once. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* 6-bit value of the base64 character c (c must satisfy B64CHAR). */
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit one base64 character to *out for every complete group of 6 bits
   held in the low `bits` bits of ch; out and bits are updated in place. */
#define ENCODE(out, ch, bits) \
    do { \
        while (bits >= 6) { \
            *out++ = B64(ch >> (bits-6)); \
            bits -= 6; \
        } \
    } while (0)

/* Emit one Py_UNICODE to *out for every complete group of 16 bits held in
   the low `bits` bits of ch; out, bits and surrogate are updated in place.
   The expansion assigns to a variable named errmsg and jumps to a label
   named utf7Error, both of which must exist in the calling function. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while (bits >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
            bits -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high surrogate \
                   so let's not bother seeing if the low surrogate is correct or not */ \
                surrogate = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't represent \
                   it in a 16-bit character */ \
                surrogate = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *out++ = outCh; \
            } \
        } \
    } while (0)

Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000881PyObject *PyUnicode_DecodeUTF7(const char *s,
882 int size,
883 const char *errors)
884{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000885 const char *starts = s;
886 int startinpos;
887 int endinpos;
888 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000889 const char *e;
890 PyUnicodeObject *unicode;
891 Py_UNICODE *p;
892 const char *errmsg = "";
893 int inShift = 0;
894 unsigned int bitsleft = 0;
895 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000896 int surrogate = 0;
897 PyObject *errorHandler = NULL;
898 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000899
900 unicode = _PyUnicode_New(size);
901 if (!unicode)
902 return NULL;
903 if (size == 0)
904 return (PyObject *)unicode;
905
906 p = unicode->str;
907 e = s + size;
908
909 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000910 Py_UNICODE ch;
911 restart:
912 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000913
914 if (inShift) {
915 if ((ch == '-') || !B64CHAR(ch)) {
916 inShift = 0;
917 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000918
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000919 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
920 if (bitsleft >= 6) {
921 /* The shift sequence has a partial character in it. If
922 bitsleft < 6 then we could just classify it as padding
923 but that is not the case here */
924
925 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000926 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000927 }
928 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000929 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000930 here so indicate the potential of a misencoded character. */
931
932 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
933 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
934 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000935 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000936 }
937
938 if (ch == '-') {
939 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000940 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000941 inShift = 1;
942 }
943 } else if (SPECIAL(ch,0,0)) {
944 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000945 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000946 } else {
947 *p++ = ch;
948 }
949 } else {
950 charsleft = (charsleft << 6) | UB64(ch);
951 bitsleft += 6;
952 s++;
953 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
954 }
955 }
956 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000957 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000958 s++;
959 if (s < e && *s == '-') {
960 s++;
961 *p++ = '+';
962 } else
963 {
964 inShift = 1;
965 bitsleft = 0;
966 }
967 }
968 else if (SPECIAL(ch,0,0)) {
969 errmsg = "unexpected special character";
970 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000971 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000972 }
973 else {
974 *p++ = ch;
975 s++;
976 }
977 continue;
978 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000979 outpos = p-PyUnicode_AS_UNICODE(unicode);
980 endinpos = s-starts;
981 if (unicode_decode_call_errorhandler(
982 errors, &errorHandler,
983 "utf7", errmsg,
984 starts, size, &startinpos, &endinpos, &exc, &s,
985 (PyObject **)&unicode, &outpos, &p))
986 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000987 }
988
989 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000990 outpos = p-PyUnicode_AS_UNICODE(unicode);
991 endinpos = size;
992 if (unicode_decode_call_errorhandler(
993 errors, &errorHandler,
994 "utf7", "unterminated shift sequence",
995 starts, size, &startinpos, &endinpos, &exc, &s,
996 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000997 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000998 if (s < e)
999 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001000 }
1001
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001002 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001003 goto onError;
1004
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001005 Py_XDECREF(errorHandler);
1006 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001007 return (PyObject *)unicode;
1008
1009onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001010 Py_XDECREF(errorHandler);
1011 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001012 Py_DECREF(unicode);
1013 return NULL;
1014}
1015
1016
1017PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1018 int size,
1019 int encodeSetO,
1020 int encodeWhiteSpace,
1021 const char *errors)
1022{
1023 PyObject *v;
1024 /* It might be possible to tighten this worst case */
1025 unsigned int cbAllocated = 5 * size;
1026 int inShift = 0;
1027 int i = 0;
1028 unsigned int bitsleft = 0;
1029 unsigned long charsleft = 0;
1030 char * out;
1031 char * start;
1032
1033 if (size == 0)
1034 return PyString_FromStringAndSize(NULL, 0);
1035
1036 v = PyString_FromStringAndSize(NULL, cbAllocated);
1037 if (v == NULL)
1038 return NULL;
1039
1040 start = out = PyString_AS_STRING(v);
1041 for (;i < size; ++i) {
1042 Py_UNICODE ch = s[i];
1043
1044 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001045 if (ch == '+') {
1046 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001047 *out++ = '-';
1048 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1049 charsleft = ch;
1050 bitsleft = 16;
1051 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001052 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001053 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001054 } else {
1055 *out++ = (char) ch;
1056 }
1057 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001058 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1059 *out++ = B64(charsleft << (6-bitsleft));
1060 charsleft = 0;
1061 bitsleft = 0;
1062 /* Characters not in the BASE64 set implicitly unshift the sequence
1063 so no '-' is required, except if the character is itself a '-' */
1064 if (B64CHAR(ch) || ch == '-') {
1065 *out++ = '-';
1066 }
1067 inShift = 0;
1068 *out++ = (char) ch;
1069 } else {
1070 bitsleft += 16;
1071 charsleft = (charsleft << 16) | ch;
1072 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1073
1074 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001075 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001076 or '-' then the shift sequence will be terminated implicitly and we
1077 don't have to insert a '-'. */
1078
1079 if (bitsleft == 0) {
1080 if (i + 1 < size) {
1081 Py_UNICODE ch2 = s[i+1];
1082
1083 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001084
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001085 } else if (B64CHAR(ch2) || ch2 == '-') {
1086 *out++ = '-';
1087 inShift = 0;
1088 } else {
1089 inShift = 0;
1090 }
1091
1092 }
1093 else {
1094 *out++ = '-';
1095 inShift = 0;
1096 }
1097 }
Tim Petersced69f82003-09-16 20:30:58 +00001098 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001099 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001100 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001101 if (bitsleft) {
1102 *out++= B64(charsleft << (6-bitsleft) );
1103 *out++ = '-';
1104 }
1105
Tim Peters5de98422002-04-27 18:44:32 +00001106 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001107 return v;
1108}
1109
/* The UTF-7 helper macros are local to the codec above. */
#undef SPECIAL
#undef B64
#undef B64CHAR
#undef UB64
#undef ENCODE
#undef DECODE
1116
/* --- UTF-8 Codec -------------------------------------------------------- */
1118
static
char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details.
       The table is indexed by the byte value; each row below covers 16
       consecutive values. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20 - 0x2f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30 - 0x3f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40 - 0x4f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50 - 0x5f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 - 0x6f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x80 - 0x8f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90 - 0x9f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xa0 - 0xaf */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xb0 - 0xbf */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xc0 - 0xcf */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xd0 - 0xdf */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   /* 0xe0 - 0xef */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0    /* 0xf0 - 0xff */
};
1140
Guido van Rossumd57fd912000-03-10 22:53:23 +00001141PyObject *PyUnicode_DecodeUTF8(const char *s,
1142 int size,
1143 const char *errors)
1144{
Walter Dörwald69652032004-09-07 20:24:22 +00001145 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1146}
1147
1148PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1149 int size,
1150 const char *errors,
1151 int *consumed)
1152{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001153 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001154 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001155 int startinpos;
1156 int endinpos;
1157 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158 const char *e;
1159 PyUnicodeObject *unicode;
1160 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001161 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001162 PyObject *errorHandler = NULL;
1163 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164
1165 /* Note: size will always be longer than the resulting Unicode
1166 character count */
1167 unicode = _PyUnicode_New(size);
1168 if (!unicode)
1169 return NULL;
Walter Dörwald69652032004-09-07 20:24:22 +00001170 if (size == 0) {
1171 if (consumed)
1172 *consumed = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173 return (PyObject *)unicode;
Walter Dörwald69652032004-09-07 20:24:22 +00001174 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001175
1176 /* Unpack UTF-8 encoded data */
1177 p = unicode->str;
1178 e = s + size;
1179
1180 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001181 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182
1183 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001184 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185 s++;
1186 continue;
1187 }
1188
1189 n = utf8_code_length[ch];
1190
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001191 if (s + n > e) {
Walter Dörwald69652032004-09-07 20:24:22 +00001192 if (consumed)
1193 break;
1194 else {
1195 errmsg = "unexpected end of data";
1196 startinpos = s-starts;
1197 endinpos = size;
1198 goto utf8Error;
1199 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001200 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201
1202 switch (n) {
1203
1204 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001205 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001206 startinpos = s-starts;
1207 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001208 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209
1210 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001211 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001212 startinpos = s-starts;
1213 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001214 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215
1216 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001217 if ((s[1] & 0xc0) != 0x80) {
1218 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001219 startinpos = s-starts;
1220 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001221 goto utf8Error;
1222 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001224 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001225 startinpos = s-starts;
1226 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001227 errmsg = "illegal encoding";
1228 goto utf8Error;
1229 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001231 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001232 break;
1233
1234 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001235 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001236 (s[2] & 0xc0) != 0x80) {
1237 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001238 startinpos = s-starts;
1239 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001240 goto utf8Error;
1241 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001243 if (ch < 0x0800) {
1244 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001245 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001246
1247 XXX For wide builds (UCS-4) we should probably try
1248 to recombine the surrogates into a single code
1249 unit.
1250 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001251 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001252 startinpos = s-starts;
1253 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001254 goto utf8Error;
1255 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001257 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001258 break;
1259
1260 case 4:
1261 if ((s[1] & 0xc0) != 0x80 ||
1262 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001263 (s[3] & 0xc0) != 0x80) {
1264 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001265 startinpos = s-starts;
1266 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001267 goto utf8Error;
1268 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001269 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1270 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1271 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001272 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001273 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001274 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001275 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001276 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001277 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001278 startinpos = s-starts;
1279 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001280 goto utf8Error;
1281 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001282#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001283 *p++ = (Py_UNICODE)ch;
1284#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001285 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001286
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001287 /* translate from 10000..10FFFF to 0..FFFF */
1288 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001289
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001290 /* high surrogate = top 10 bits added to D800 */
1291 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001292
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001293 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001294 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001295#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001296 break;
1297
1298 default:
1299 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001300 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001301 startinpos = s-starts;
1302 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001303 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001304 }
1305 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001306 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001307
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001308 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001309 outpos = p-PyUnicode_AS_UNICODE(unicode);
1310 if (unicode_decode_call_errorhandler(
1311 errors, &errorHandler,
1312 "utf8", errmsg,
1313 starts, size, &startinpos, &endinpos, &exc, &s,
1314 (PyObject **)&unicode, &outpos, &p))
1315 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001316 }
Walter Dörwald69652032004-09-07 20:24:22 +00001317 if (consumed)
1318 *consumed = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001319
1320 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001321 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001322 goto onError;
1323
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001324 Py_XDECREF(errorHandler);
1325 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001326 return (PyObject *)unicode;
1327
1328onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001329 Py_XDECREF(errorHandler);
1330 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001331 Py_DECREF(unicode);
1332 return NULL;
1333}
1334
Tim Peters602f7402002-04-27 18:03:26 +00001335/* Allocation strategy: if the string is short, convert into a stack buffer
1336 and allocate exactly as much space needed at the end. Else allocate the
1337 maximum possible needed (4 result bytes per Unicode character), and return
1338 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001339*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001340PyObject *
1341PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1342 int size,
1343 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001344{
Tim Peters602f7402002-04-27 18:03:26 +00001345#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001346
Tim Peters602f7402002-04-27 18:03:26 +00001347 int i; /* index into s of next input byte */
1348 PyObject *v; /* result string object */
1349 char *p; /* next free byte in output buffer */
1350 int nallocated; /* number of result bytes allocated */
1351 int nneeded; /* number of result bytes needed */
1352 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001353
Tim Peters602f7402002-04-27 18:03:26 +00001354 assert(s != NULL);
1355 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001356
Tim Peters602f7402002-04-27 18:03:26 +00001357 if (size <= MAX_SHORT_UNICHARS) {
1358 /* Write into the stack buffer; nallocated can't overflow.
1359 * At the end, we'll allocate exactly as much heap space as it
1360 * turns out we need.
1361 */
1362 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1363 v = NULL; /* will allocate after we're done */
1364 p = stackbuf;
1365 }
1366 else {
1367 /* Overallocate on the heap, and give the excess back at the end. */
1368 nallocated = size * 4;
1369 if (nallocated / 4 != size) /* overflow! */
1370 return PyErr_NoMemory();
1371 v = PyString_FromStringAndSize(NULL, nallocated);
1372 if (v == NULL)
1373 return NULL;
1374 p = PyString_AS_STRING(v);
1375 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001376
Tim Peters602f7402002-04-27 18:03:26 +00001377 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001378 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001379
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001380 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001381 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001383
Guido van Rossumd57fd912000-03-10 22:53:23 +00001384 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001385 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001386 *p++ = (char)(0xc0 | (ch >> 6));
1387 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001388 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001389 else {
Tim Peters602f7402002-04-27 18:03:26 +00001390 /* Encode UCS2 Unicode ordinals */
1391 if (ch < 0x10000) {
1392 /* Special case: check for high surrogate */
1393 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1394 Py_UCS4 ch2 = s[i];
1395 /* Check for low surrogate and combine the two to
1396 form a UCS4 value */
1397 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001398 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001399 i++;
1400 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001401 }
Tim Peters602f7402002-04-27 18:03:26 +00001402 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001403 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001404 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001405 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1406 *p++ = (char)(0x80 | (ch & 0x3f));
1407 continue;
1408 }
1409encodeUCS4:
1410 /* Encode UCS4 Unicode ordinals */
1411 *p++ = (char)(0xf0 | (ch >> 18));
1412 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1413 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1414 *p++ = (char)(0x80 | (ch & 0x3f));
1415 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001416 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001417
Tim Peters602f7402002-04-27 18:03:26 +00001418 if (v == NULL) {
1419 /* This was stack allocated. */
1420 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1421 assert(nneeded <= nallocated);
1422 v = PyString_FromStringAndSize(stackbuf, nneeded);
1423 }
1424 else {
1425 /* Cut back to size actually needed. */
1426 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1427 assert(nneeded <= nallocated);
1428 _PyString_Resize(&v, nneeded);
1429 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001431
Tim Peters602f7402002-04-27 18:03:26 +00001432#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001433}
1434
Guido van Rossumd57fd912000-03-10 22:53:23 +00001435PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1436{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001437 if (!PyUnicode_Check(unicode)) {
1438 PyErr_BadArgument();
1439 return NULL;
1440 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001441 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1442 PyUnicode_GET_SIZE(unicode),
1443 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444}
1445
/* --- UTF-16 Codec ------------------------------------------------------- */
1447
Tim Peters772747b2001-08-09 22:21:55 +00001448PyObject *
1449PyUnicode_DecodeUTF16(const char *s,
1450 int size,
1451 const char *errors,
1452 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001453{
Walter Dörwald69652032004-09-07 20:24:22 +00001454 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
1455}
1456
1457PyObject *
1458PyUnicode_DecodeUTF16Stateful(const char *s,
1459 int size,
1460 const char *errors,
1461 int *byteorder,
1462 int *consumed)
1463{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001464 const char *starts = s;
1465 int startinpos;
1466 int endinpos;
1467 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001468 PyUnicodeObject *unicode;
1469 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001470 const unsigned char *q, *e;
1471 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001472 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001473 /* Offsets from q for retrieving byte pairs in the right order. */
1474#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1475 int ihi = 1, ilo = 0;
1476#else
1477 int ihi = 0, ilo = 1;
1478#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001479 PyObject *errorHandler = NULL;
1480 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001481
1482 /* Note: size will always be longer than the resulting Unicode
1483 character count */
1484 unicode = _PyUnicode_New(size);
1485 if (!unicode)
1486 return NULL;
1487 if (size == 0)
1488 return (PyObject *)unicode;
1489
1490 /* Unpack UTF-16 encoded data */
1491 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001492 q = (unsigned char *)s;
1493 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494
1495 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001496 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001497
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001498 /* Check for BOM marks (U+FEFF) in the input and adjust current
1499 byte order setting accordingly. In native mode, the leading BOM
1500 mark is skipped, in all other modes, it is copied to the output
1501 stream as-is (giving a ZWNBSP character). */
1502 if (bo == 0) {
Walter Dörwald69652032004-09-07 20:24:22 +00001503 if (size >= 2) {
1504 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001505#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Walter Dörwald69652032004-09-07 20:24:22 +00001506 if (bom == 0xFEFF) {
1507 q += 2;
1508 bo = -1;
1509 }
1510 else if (bom == 0xFFFE) {
1511 q += 2;
1512 bo = 1;
1513 }
Tim Petersced69f82003-09-16 20:30:58 +00001514#else
Walter Dörwald69652032004-09-07 20:24:22 +00001515 if (bom == 0xFEFF) {
1516 q += 2;
1517 bo = 1;
1518 }
1519 else if (bom == 0xFFFE) {
1520 q += 2;
1521 bo = -1;
1522 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001523#endif
Walter Dörwald69652032004-09-07 20:24:22 +00001524 }
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001525 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526
Tim Peters772747b2001-08-09 22:21:55 +00001527 if (bo == -1) {
1528 /* force LE */
1529 ihi = 1;
1530 ilo = 0;
1531 }
1532 else if (bo == 1) {
1533 /* force BE */
1534 ihi = 0;
1535 ilo = 1;
1536 }
1537
1538 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001539 Py_UNICODE ch;
Walter Dörwald69652032004-09-07 20:24:22 +00001540 /* remaining bytes at the end? (size should be even) */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001541 if (e-q<2) {
Walter Dörwald69652032004-09-07 20:24:22 +00001542 if (consumed)
1543 break;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001544 errmsg = "truncated data";
1545 startinpos = ((const char *)q)-starts;
1546 endinpos = ((const char *)e)-starts;
1547 goto utf16Error;
1548 /* The remaining input chars are ignored if the callback
1549 chooses to skip the input */
1550 }
1551 ch = (q[ihi] << 8) | q[ilo];
1552
Tim Peters772747b2001-08-09 22:21:55 +00001553 q += 2;
1554
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555 if (ch < 0xD800 || ch > 0xDFFF) {
1556 *p++ = ch;
1557 continue;
1558 }
1559
1560 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001561 if (q >= e) {
1562 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001563 startinpos = (((const char *)q)-2)-starts;
1564 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001565 goto utf16Error;
1566 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001567 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001568 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1569 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001570 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001571#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001572 *p++ = ch;
1573 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001574#else
1575 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001576#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001577 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001578 }
1579 else {
1580 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001581 startinpos = (((const char *)q)-4)-starts;
1582 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001583 goto utf16Error;
1584 }
1585
Guido van Rossumd57fd912000-03-10 22:53:23 +00001586 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001587 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001588 startinpos = (((const char *)q)-2)-starts;
1589 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001590 /* Fall through to report the error */
1591
1592 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001593 outpos = p-PyUnicode_AS_UNICODE(unicode);
1594 if (unicode_decode_call_errorhandler(
1595 errors, &errorHandler,
1596 "utf16", errmsg,
1597 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1598 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001599 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001600 }
1601
1602 if (byteorder)
1603 *byteorder = bo;
1604
Walter Dörwald69652032004-09-07 20:24:22 +00001605 if (consumed)
1606 *consumed = (const char *)q-starts;
1607
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001609 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001610 goto onError;
1611
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001612 Py_XDECREF(errorHandler);
1613 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001614 return (PyObject *)unicode;
1615
1616onError:
1617 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001618 Py_XDECREF(errorHandler);
1619 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620 return NULL;
1621}
1622
Tim Peters772747b2001-08-09 22:21:55 +00001623PyObject *
1624PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1625 int size,
1626 const char *errors,
1627 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001628{
1629 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001630 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001631#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001632 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001633#else
1634 const int pairs = 0;
1635#endif
Tim Peters772747b2001-08-09 22:21:55 +00001636 /* Offsets from p for storing byte pairs in the right order. */
1637#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1638 int ihi = 1, ilo = 0;
1639#else
1640 int ihi = 0, ilo = 1;
1641#endif
1642
1643#define STORECHAR(CH) \
1644 do { \
1645 p[ihi] = ((CH) >> 8) & 0xff; \
1646 p[ilo] = (CH) & 0xff; \
1647 p += 2; \
1648 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001649
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001650#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001651 for (i = pairs = 0; i < size; i++)
1652 if (s[i] >= 0x10000)
1653 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001654#endif
Tim Petersced69f82003-09-16 20:30:58 +00001655 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001656 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657 if (v == NULL)
1658 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001659
Tim Peters772747b2001-08-09 22:21:55 +00001660 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001662 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001663 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001664 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001665
1666 if (byteorder == -1) {
1667 /* force LE */
1668 ihi = 1;
1669 ilo = 0;
1670 }
1671 else if (byteorder == 1) {
1672 /* force BE */
1673 ihi = 0;
1674 ilo = 1;
1675 }
1676
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001677 while (size-- > 0) {
1678 Py_UNICODE ch = *s++;
1679 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001680#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001681 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001682 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1683 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001685#endif
Tim Peters772747b2001-08-09 22:21:55 +00001686 STORECHAR(ch);
1687 if (ch2)
1688 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001689 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001690 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001691#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692}
1693
1694PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1695{
1696 if (!PyUnicode_Check(unicode)) {
1697 PyErr_BadArgument();
1698 return NULL;
1699 }
1700 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1701 PyUnicode_GET_SIZE(unicode),
1702 NULL,
1703 0);
1704}
1705
1706/* --- Unicode Escape Codec ----------------------------------------------- */
1707
Fredrik Lundh06d12682001-01-24 07:59:11 +00001708static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001709
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1711 int size,
1712 const char *errors)
1713{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001714 const char *starts = s;
1715 int startinpos;
1716 int endinpos;
1717 int outpos;
1718 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001719 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001720 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001721 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001722 char* message;
1723 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001724 PyObject *errorHandler = NULL;
1725 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001726
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727 /* Escaped strings will always be longer than the resulting
1728 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001729 length after conversion to the true value.
1730 (but if the error callback returns a long replacement string
1731 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732 v = _PyUnicode_New(size);
1733 if (v == NULL)
1734 goto onError;
1735 if (size == 0)
1736 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001737
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001738 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001740
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741 while (s < end) {
1742 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001743 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001744 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745
1746 /* Non-escape characters are interpreted as Unicode ordinals */
1747 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001748 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749 continue;
1750 }
1751
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001752 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753 /* \ - Escapes */
1754 s++;
1755 switch (*s++) {
1756
1757 /* \x escapes */
1758 case '\n': break;
1759 case '\\': *p++ = '\\'; break;
1760 case '\'': *p++ = '\''; break;
1761 case '\"': *p++ = '\"'; break;
1762 case 'b': *p++ = '\b'; break;
1763 case 'f': *p++ = '\014'; break; /* FF */
1764 case 't': *p++ = '\t'; break;
1765 case 'n': *p++ = '\n'; break;
1766 case 'r': *p++ = '\r'; break;
1767 case 'v': *p++ = '\013'; break; /* VT */
1768 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1769
1770 /* \OOO (octal) escapes */
1771 case '0': case '1': case '2': case '3':
1772 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001773 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001775 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001776 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001777 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001778 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001779 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001780 break;
1781
Fredrik Lundhccc74732001-02-18 22:13:49 +00001782 /* hex escapes */
1783 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001785 digits = 2;
1786 message = "truncated \\xXX escape";
1787 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001788
Fredrik Lundhccc74732001-02-18 22:13:49 +00001789 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001791 digits = 4;
1792 message = "truncated \\uXXXX escape";
1793 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794
Fredrik Lundhccc74732001-02-18 22:13:49 +00001795 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001796 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001797 digits = 8;
1798 message = "truncated \\UXXXXXXXX escape";
1799 hexescape:
1800 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001801 outpos = p-PyUnicode_AS_UNICODE(v);
1802 if (s+digits>end) {
1803 endinpos = size;
1804 if (unicode_decode_call_errorhandler(
1805 errors, &errorHandler,
1806 "unicodeescape", "end of string in escape sequence",
1807 starts, size, &startinpos, &endinpos, &exc, &s,
1808 (PyObject **)&v, &outpos, &p))
1809 goto onError;
1810 goto nextByte;
1811 }
1812 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001813 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001814 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001815 endinpos = (s+i+1)-starts;
1816 if (unicode_decode_call_errorhandler(
1817 errors, &errorHandler,
1818 "unicodeescape", message,
1819 starts, size, &startinpos, &endinpos, &exc, &s,
1820 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001821 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001822 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001823 }
1824 chr = (chr<<4) & ~0xF;
1825 if (c >= '0' && c <= '9')
1826 chr += c - '0';
1827 else if (c >= 'a' && c <= 'f')
1828 chr += 10 + c - 'a';
1829 else
1830 chr += 10 + c - 'A';
1831 }
1832 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001833 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001834 /* _decoding_error will have already written into the
1835 target buffer. */
1836 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001837 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001838 /* when we get here, chr is a 32-bit unicode character */
1839 if (chr <= 0xffff)
1840 /* UCS-2 character */
1841 *p++ = (Py_UNICODE) chr;
1842 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001843 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001844 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001845#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001846 *p++ = chr;
1847#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001848 chr -= 0x10000L;
1849 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001850 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001851#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001852 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001853 endinpos = s-starts;
1854 outpos = p-PyUnicode_AS_UNICODE(v);
1855 if (unicode_decode_call_errorhandler(
1856 errors, &errorHandler,
1857 "unicodeescape", "illegal Unicode character",
1858 starts, size, &startinpos, &endinpos, &exc, &s,
1859 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001860 goto onError;
1861 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001862 break;
1863
1864 /* \N{name} */
1865 case 'N':
1866 message = "malformed \\N character escape";
1867 if (ucnhash_CAPI == NULL) {
1868 /* load the unicode data module */
1869 PyObject *m, *v;
1870 m = PyImport_ImportModule("unicodedata");
1871 if (m == NULL)
1872 goto ucnhashError;
1873 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1874 Py_DECREF(m);
1875 if (v == NULL)
1876 goto ucnhashError;
1877 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1878 Py_DECREF(v);
1879 if (ucnhash_CAPI == NULL)
1880 goto ucnhashError;
1881 }
1882 if (*s == '{') {
1883 const char *start = s+1;
1884 /* look for the closing brace */
1885 while (*s != '}' && s < end)
1886 s++;
1887 if (s > start && s < end && *s == '}') {
1888 /* found a name. look it up in the unicode database */
1889 message = "unknown Unicode character name";
1890 s++;
1891 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1892 goto store;
1893 }
1894 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001895 endinpos = s-starts;
1896 outpos = p-PyUnicode_AS_UNICODE(v);
1897 if (unicode_decode_call_errorhandler(
1898 errors, &errorHandler,
1899 "unicodeescape", message,
1900 starts, size, &startinpos, &endinpos, &exc, &s,
1901 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001902 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001903 break;
1904
1905 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001906 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001907 message = "\\ at end of string";
1908 s--;
1909 endinpos = s-starts;
1910 outpos = p-PyUnicode_AS_UNICODE(v);
1911 if (unicode_decode_call_errorhandler(
1912 errors, &errorHandler,
1913 "unicodeescape", message,
1914 starts, size, &startinpos, &endinpos, &exc, &s,
1915 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001916 goto onError;
1917 }
1918 else {
1919 *p++ = '\\';
1920 *p++ = (unsigned char)s[-1];
1921 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001922 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001923 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001924 nextByte:
1925 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001926 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001927 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001928 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001929 Py_XDECREF(errorHandler);
1930 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001931 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001932
Fredrik Lundhccc74732001-02-18 22:13:49 +00001933ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001934 PyErr_SetString(
1935 PyExc_UnicodeError,
1936 "\\N escapes not supported (can't load unicodedata module)"
1937 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001938 Py_XDECREF(errorHandler);
1939 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001940 return NULL;
1941
Fredrik Lundhccc74732001-02-18 22:13:49 +00001942onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001943 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001944 Py_XDECREF(errorHandler);
1945 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001946 return NULL;
1947}
1948
1949/* Return a Unicode-Escape string version of the Unicode object.
1950
1951 If quotes is true, the string is enclosed in u"" or u'' quotes as
1952 appropriate.
1953
1954*/
1955
Barry Warsaw51ac5802000-03-20 16:36:48 +00001956static const Py_UNICODE *findchar(const Py_UNICODE *s,
1957 int size,
1958 Py_UNICODE ch);
1959
Guido van Rossumd57fd912000-03-10 22:53:23 +00001960static
1961PyObject *unicodeescape_string(const Py_UNICODE *s,
1962 int size,
1963 int quotes)
1964{
1965 PyObject *repr;
1966 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001968 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969
1970 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1971 if (repr == NULL)
1972 return NULL;
1973
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001974 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975
1976 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001978 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979 !findchar(s, size, '"')) ? '"' : '\'';
1980 }
1981 while (size-- > 0) {
1982 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001983
Guido van Rossumd57fd912000-03-10 22:53:23 +00001984 /* Escape quotes */
Tim Petersced69f82003-09-16 20:30:58 +00001985 if (quotes &&
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001986 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 *p++ = '\\';
1988 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001989 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001990 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001991
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001992#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001993 /* Map 21-bit characters to '\U00xxxxxx' */
1994 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001995 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00001996
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001997 /* Resize the string if necessary */
1998 if (offset + 12 > PyString_GET_SIZE(repr)) {
1999 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00002000 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002001 p = PyString_AS_STRING(repr) + offset;
2002 }
2003
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002004 *p++ = '\\';
2005 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002006 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
2007 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
2008 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
2009 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
2010 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
2011 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
2012 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002013 *p++ = hexdigit[ch & 0x0000000F];
2014 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002015 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002016#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002017 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2018 else if (ch >= 0xD800 && ch < 0xDC00) {
2019 Py_UNICODE ch2;
2020 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002021
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002022 ch2 = *s++;
2023 size--;
2024 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2025 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2026 *p++ = '\\';
2027 *p++ = 'U';
2028 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2029 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2030 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2031 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2032 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2033 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2034 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2035 *p++ = hexdigit[ucs & 0x0000000F];
2036 continue;
2037 }
2038 /* Fall through: isolated surrogates are copied as-is */
2039 s--;
2040 size++;
2041 }
2042
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002044 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045 *p++ = '\\';
2046 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002047 *p++ = hexdigit[(ch >> 12) & 0x000F];
2048 *p++ = hexdigit[(ch >> 8) & 0x000F];
2049 *p++ = hexdigit[(ch >> 4) & 0x000F];
2050 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002052
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002053 /* Map special whitespace to '\t', \n', '\r' */
2054 else if (ch == '\t') {
2055 *p++ = '\\';
2056 *p++ = 't';
2057 }
2058 else if (ch == '\n') {
2059 *p++ = '\\';
2060 *p++ = 'n';
2061 }
2062 else if (ch == '\r') {
2063 *p++ = '\\';
2064 *p++ = 'r';
2065 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002066
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002067 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002068 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002070 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002071 *p++ = hexdigit[(ch >> 4) & 0x000F];
2072 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002073 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002074
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075 /* Copy everything else as-is */
2076 else
2077 *p++ = (char) ch;
2078 }
2079 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002080 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002081
2082 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002083 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002084 return repr;
2085}
2086
2087PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2088 int size)
2089{
2090 return unicodeescape_string(s, size, 0);
2091}
2092
2093PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2094{
2095 if (!PyUnicode_Check(unicode)) {
2096 PyErr_BadArgument();
2097 return NULL;
2098 }
2099 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2100 PyUnicode_GET_SIZE(unicode));
2101}
2102
2103/* --- Raw Unicode Escape Codec ------------------------------------------- */
2104
2105PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2106 int size,
2107 const char *errors)
2108{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002109 const char *starts = s;
2110 int startinpos;
2111 int endinpos;
2112 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002114 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002115 const char *end;
2116 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002117 PyObject *errorHandler = NULL;
2118 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002119
Guido van Rossumd57fd912000-03-10 22:53:23 +00002120 /* Escaped strings will always be longer than the resulting
2121 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002122 length after conversion to the true value. (But decoding error
2123 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124 v = _PyUnicode_New(size);
2125 if (v == NULL)
2126 goto onError;
2127 if (size == 0)
2128 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002129 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002130 end = s + size;
2131 while (s < end) {
2132 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002133 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002134 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002135 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136
2137 /* Non-escape characters are interpreted as Unicode ordinals */
2138 if (*s != '\\') {
2139 *p++ = (unsigned char)*s++;
2140 continue;
2141 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002142 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002143
2144 /* \u-escapes are only interpreted iff the number of leading
2145 backslashes if odd */
2146 bs = s;
2147 for (;s < end;) {
2148 if (*s != '\\')
2149 break;
2150 *p++ = (unsigned char)*s++;
2151 }
2152 if (((s - bs) & 1) == 0 ||
2153 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002154 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002155 continue;
2156 }
2157 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002158 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159 s++;
2160
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002161 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002162 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002163 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002164 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002166 endinpos = s-starts;
2167 if (unicode_decode_call_errorhandler(
2168 errors, &errorHandler,
2169 "rawunicodeescape", "truncated \\uXXXX",
2170 starts, size, &startinpos, &endinpos, &exc, &s,
2171 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002172 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002173 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002174 }
2175 x = (x<<4) & ~0xF;
2176 if (c >= '0' && c <= '9')
2177 x += c - '0';
2178 else if (c >= 'a' && c <= 'f')
2179 x += 10 + c - 'a';
2180 else
2181 x += 10 + c - 'A';
2182 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002183#ifndef Py_UNICODE_WIDE
2184 if (x > 0x10000) {
2185 if (unicode_decode_call_errorhandler(
2186 errors, &errorHandler,
2187 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2188 starts, size, &startinpos, &endinpos, &exc, &s,
2189 (PyObject **)&v, &outpos, &p))
2190 goto onError;
2191 }
2192#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002193 *p++ = x;
2194 nextByte:
2195 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002197 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002198 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002199 Py_XDECREF(errorHandler);
2200 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002202
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203 onError:
2204 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002205 Py_XDECREF(errorHandler);
2206 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 return NULL;
2208}
2209
2210PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2211 int size)
2212{
2213 PyObject *repr;
2214 char *p;
2215 char *q;
2216
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002217 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002218
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002219#ifdef Py_UNICODE_WIDE
2220 repr = PyString_FromStringAndSize(NULL, 10 * size);
2221#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002223#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224 if (repr == NULL)
2225 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002226 if (size == 0)
2227 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228
2229 p = q = PyString_AS_STRING(repr);
2230 while (size-- > 0) {
2231 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002232#ifdef Py_UNICODE_WIDE
2233 /* Map 32-bit characters to '\Uxxxxxxxx' */
2234 if (ch >= 0x10000) {
2235 *p++ = '\\';
2236 *p++ = 'U';
2237 *p++ = hexdigit[(ch >> 28) & 0xf];
2238 *p++ = hexdigit[(ch >> 24) & 0xf];
2239 *p++ = hexdigit[(ch >> 20) & 0xf];
2240 *p++ = hexdigit[(ch >> 16) & 0xf];
2241 *p++ = hexdigit[(ch >> 12) & 0xf];
2242 *p++ = hexdigit[(ch >> 8) & 0xf];
2243 *p++ = hexdigit[(ch >> 4) & 0xf];
2244 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002245 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002246 else
2247#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248 /* Map 16-bit characters to '\uxxxx' */
2249 if (ch >= 256) {
2250 *p++ = '\\';
2251 *p++ = 'u';
2252 *p++ = hexdigit[(ch >> 12) & 0xf];
2253 *p++ = hexdigit[(ch >> 8) & 0xf];
2254 *p++ = hexdigit[(ch >> 4) & 0xf];
2255 *p++ = hexdigit[ch & 15];
2256 }
2257 /* Copy everything else as-is */
2258 else
2259 *p++ = (char) ch;
2260 }
2261 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002262 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263 return repr;
2264}
2265
2266PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2267{
2268 if (!PyUnicode_Check(unicode)) {
2269 PyErr_BadArgument();
2270 return NULL;
2271 }
2272 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2273 PyUnicode_GET_SIZE(unicode));
2274}
2275
Walter Dörwalda47d1c02005-08-30 10:23:14 +00002276/* --- Unicode Internal Codec ------------------------------------------- */
2277
2278PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
2279 int size,
2280 const char *errors)
2281{
2282 const char *starts = s;
2283 int startinpos;
2284 int endinpos;
2285 int outpos;
2286 Py_UNICODE unimax;
2287 PyUnicodeObject *v;
2288 Py_UNICODE *p;
2289 const char *end;
2290 const char *reason;
2291 PyObject *errorHandler = NULL;
2292 PyObject *exc = NULL;
2293
2294 unimax = PyUnicode_GetMax();
2295 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
2296 if (v == NULL)
2297 goto onError;
2298 if (PyUnicode_GetSize((PyObject *)v) == 0)
2299 return (PyObject *)v;
2300 p = PyUnicode_AS_UNICODE(v);
2301 end = s + size;
2302
2303 while (s < end) {
2304 *p = *(Py_UNICODE *)s;
2305 /* We have to sanity check the raw data, otherwise doom looms for
2306 some malformed UCS-4 data. */
2307 if (
2308 #ifdef Py_UNICODE_WIDE
2309 *p > unimax || *p < 0 ||
2310 #endif
2311 end-s < Py_UNICODE_SIZE
2312 )
2313 {
2314 startinpos = s - starts;
2315 if (end-s < Py_UNICODE_SIZE) {
2316 endinpos = end-starts;
2317 reason = "truncated input";
2318 }
2319 else {
2320 endinpos = s - starts + Py_UNICODE_SIZE;
2321 reason = "illegal code point (> 0x10FFFF)";
2322 }
2323 outpos = p - PyUnicode_AS_UNICODE(v);
2324 if (unicode_decode_call_errorhandler(
2325 errors, &errorHandler,
2326 "unicode_internal", reason,
2327 starts, size, &startinpos, &endinpos, &exc, &s,
2328 (PyObject **)&v, &outpos, &p)) {
2329 goto onError;
2330 }
2331 }
2332 else {
2333 p++;
2334 s += Py_UNICODE_SIZE;
2335 }
2336 }
2337
2338 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
2339 goto onError;
2340 Py_XDECREF(errorHandler);
2341 Py_XDECREF(exc);
2342 return (PyObject *)v;
2343
2344 onError:
2345 Py_XDECREF(v);
2346 Py_XDECREF(errorHandler);
2347 Py_XDECREF(exc);
2348 return NULL;
2349}
2350
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351/* --- Latin-1 Codec ------------------------------------------------------ */
2352
2353PyObject *PyUnicode_DecodeLatin1(const char *s,
2354 int size,
2355 const char *errors)
2356{
2357 PyUnicodeObject *v;
2358 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002359
Guido van Rossumd57fd912000-03-10 22:53:23 +00002360 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002361 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002362 Py_UNICODE r = *(unsigned char*)s;
2363 return PyUnicode_FromUnicode(&r, 1);
2364 }
2365
Guido van Rossumd57fd912000-03-10 22:53:23 +00002366 v = _PyUnicode_New(size);
2367 if (v == NULL)
2368 goto onError;
2369 if (size == 0)
2370 return (PyObject *)v;
2371 p = PyUnicode_AS_UNICODE(v);
2372 while (size-- > 0)
2373 *p++ = (unsigned char)*s++;
2374 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002375
Guido van Rossumd57fd912000-03-10 22:53:23 +00002376 onError:
2377 Py_XDECREF(v);
2378 return NULL;
2379}
2380
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002381/* create or adjust a UnicodeEncodeError */
2382static void make_encode_exception(PyObject **exceptionObject,
2383 const char *encoding,
2384 const Py_UNICODE *unicode, int size,
2385 int startpos, int endpos,
2386 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002387{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002388 if (*exceptionObject == NULL) {
2389 *exceptionObject = PyUnicodeEncodeError_Create(
2390 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002391 }
2392 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002393 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2394 goto onError;
2395 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2396 goto onError;
2397 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2398 goto onError;
2399 return;
2400 onError:
2401 Py_DECREF(*exceptionObject);
2402 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002403 }
2404}
2405
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002406/* raises a UnicodeEncodeError */
2407static void raise_encode_exception(PyObject **exceptionObject,
2408 const char *encoding,
2409 const Py_UNICODE *unicode, int size,
2410 int startpos, int endpos,
2411 const char *reason)
2412{
2413 make_encode_exception(exceptionObject,
2414 encoding, unicode, size, startpos, endpos, reason);
2415 if (*exceptionObject != NULL)
2416 PyCodec_StrictErrors(*exceptionObject);
2417}
2418
2419/* error handling callback helper:
2420 build arguments, call the callback and check the arguments,
2421 put the result into newpos and return the replacement string, which
2422 has to be freed by the caller */
2423static PyObject *unicode_encode_call_errorhandler(const char *errors,
2424 PyObject **errorHandler,
2425 const char *encoding, const char *reason,
2426 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2427 int startpos, int endpos,
2428 int *newpos)
2429{
2430 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2431
2432 PyObject *restuple;
2433 PyObject *resunicode;
2434
2435 if (*errorHandler == NULL) {
2436 *errorHandler = PyCodec_LookupError(errors);
2437 if (*errorHandler == NULL)
2438 return NULL;
2439 }
2440
2441 make_encode_exception(exceptionObject,
2442 encoding, unicode, size, startpos, endpos, reason);
2443 if (*exceptionObject == NULL)
2444 return NULL;
2445
2446 restuple = PyObject_CallFunctionObjArgs(
2447 *errorHandler, *exceptionObject, NULL);
2448 if (restuple == NULL)
2449 return NULL;
2450 if (!PyTuple_Check(restuple)) {
2451 PyErr_Format(PyExc_TypeError, &argparse[4]);
2452 Py_DECREF(restuple);
2453 return NULL;
2454 }
2455 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2456 &resunicode, newpos)) {
2457 Py_DECREF(restuple);
2458 return NULL;
2459 }
2460 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002461 *newpos = size+*newpos;
2462 if (*newpos<0 || *newpos>size) {
2463 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2464 Py_DECREF(restuple);
2465 return NULL;
2466 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002467 Py_INCREF(resunicode);
2468 Py_DECREF(restuple);
2469 return resunicode;
2470}
2471
2472static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2473 int size,
2474 const char *errors,
2475 int limit)
2476{
2477 /* output object */
2478 PyObject *res;
2479 /* pointers to the beginning and end+1 of input */
2480 const Py_UNICODE *startp = p;
2481 const Py_UNICODE *endp = p + size;
2482 /* pointer to the beginning of the unencodable characters */
2483 /* const Py_UNICODE *badp = NULL; */
2484 /* pointer into the output */
2485 char *str;
2486 /* current output position */
2487 int respos = 0;
2488 int ressize;
2489 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2490 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2491 PyObject *errorHandler = NULL;
2492 PyObject *exc = NULL;
2493 /* the following variable is used for caching string comparisons
2494 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2495 int known_errorHandler = -1;
2496
2497 /* allocate enough for a simple encoding without
2498 replacements, if we need more, we'll resize */
2499 res = PyString_FromStringAndSize(NULL, size);
2500 if (res == NULL)
2501 goto onError;
2502 if (size == 0)
2503 return res;
2504 str = PyString_AS_STRING(res);
2505 ressize = size;
2506
2507 while (p<endp) {
2508 Py_UNICODE c = *p;
2509
2510 /* can we encode this? */
2511 if (c<limit) {
2512 /* no overflow check, because we know that the space is enough */
2513 *str++ = (char)c;
2514 ++p;
2515 }
2516 else {
2517 int unicodepos = p-startp;
2518 int requiredsize;
2519 PyObject *repunicode;
2520 int repsize;
2521 int newpos;
2522 int respos;
2523 Py_UNICODE *uni2;
2524 /* startpos for collecting unencodable chars */
2525 const Py_UNICODE *collstart = p;
2526 const Py_UNICODE *collend = p;
2527 /* find all unecodable characters */
2528 while ((collend < endp) && ((*collend)>=limit))
2529 ++collend;
2530 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2531 if (known_errorHandler==-1) {
2532 if ((errors==NULL) || (!strcmp(errors, "strict")))
2533 known_errorHandler = 1;
2534 else if (!strcmp(errors, "replace"))
2535 known_errorHandler = 2;
2536 else if (!strcmp(errors, "ignore"))
2537 known_errorHandler = 3;
2538 else if (!strcmp(errors, "xmlcharrefreplace"))
2539 known_errorHandler = 4;
2540 else
2541 known_errorHandler = 0;
2542 }
2543 switch (known_errorHandler) {
2544 case 1: /* strict */
2545 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2546 goto onError;
2547 case 2: /* replace */
2548 while (collstart++<collend)
2549 *str++ = '?'; /* fall through */
2550 case 3: /* ignore */
2551 p = collend;
2552 break;
2553 case 4: /* xmlcharrefreplace */
2554 respos = str-PyString_AS_STRING(res);
2555 /* determine replacement size (temporarily (mis)uses p) */
2556 for (p = collstart, repsize = 0; p < collend; ++p) {
2557 if (*p<10)
2558 repsize += 2+1+1;
2559 else if (*p<100)
2560 repsize += 2+2+1;
2561 else if (*p<1000)
2562 repsize += 2+3+1;
2563 else if (*p<10000)
2564 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002565#ifndef Py_UNICODE_WIDE
2566 else
2567 repsize += 2+5+1;
2568#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002569 else if (*p<100000)
2570 repsize += 2+5+1;
2571 else if (*p<1000000)
2572 repsize += 2+6+1;
2573 else
2574 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002575#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002576 }
2577 requiredsize = respos+repsize+(endp-collend);
2578 if (requiredsize > ressize) {
2579 if (requiredsize<2*ressize)
2580 requiredsize = 2*ressize;
2581 if (_PyString_Resize(&res, requiredsize))
2582 goto onError;
2583 str = PyString_AS_STRING(res) + respos;
2584 ressize = requiredsize;
2585 }
2586 /* generate replacement (temporarily (mis)uses p) */
2587 for (p = collstart; p < collend; ++p) {
2588 str += sprintf(str, "&#%d;", (int)*p);
2589 }
2590 p = collend;
2591 break;
2592 default:
2593 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2594 encoding, reason, startp, size, &exc,
2595 collstart-startp, collend-startp, &newpos);
2596 if (repunicode == NULL)
2597 goto onError;
2598 /* need more space? (at least enough for what we
2599 have+the replacement+the rest of the string, so
2600 we won't have to check space for encodable characters) */
2601 respos = str-PyString_AS_STRING(res);
2602 repsize = PyUnicode_GET_SIZE(repunicode);
2603 requiredsize = respos+repsize+(endp-collend);
2604 if (requiredsize > ressize) {
2605 if (requiredsize<2*ressize)
2606 requiredsize = 2*ressize;
2607 if (_PyString_Resize(&res, requiredsize)) {
2608 Py_DECREF(repunicode);
2609 goto onError;
2610 }
2611 str = PyString_AS_STRING(res) + respos;
2612 ressize = requiredsize;
2613 }
2614 /* check if there is anything unencodable in the replacement
2615 and copy it to the output */
2616 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2617 c = *uni2;
2618 if (c >= limit) {
2619 raise_encode_exception(&exc, encoding, startp, size,
2620 unicodepos, unicodepos+1, reason);
2621 Py_DECREF(repunicode);
2622 goto onError;
2623 }
2624 *str = (char)c;
2625 }
2626 p = startp + newpos;
2627 Py_DECREF(repunicode);
2628 }
2629 }
2630 }
2631 /* Resize if we allocated to much */
2632 respos = str-PyString_AS_STRING(res);
2633 if (respos<ressize)
2634 /* If this falls res will be NULL */
2635 _PyString_Resize(&res, respos);
2636 Py_XDECREF(errorHandler);
2637 Py_XDECREF(exc);
2638 return res;
2639
2640 onError:
2641 Py_XDECREF(res);
2642 Py_XDECREF(errorHandler);
2643 Py_XDECREF(exc);
2644 return NULL;
2645}
2646
Guido van Rossumd57fd912000-03-10 22:53:23 +00002647PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2648 int size,
2649 const char *errors)
2650{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002651 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002652}
2653
2654PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2655{
2656 if (!PyUnicode_Check(unicode)) {
2657 PyErr_BadArgument();
2658 return NULL;
2659 }
2660 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2661 PyUnicode_GET_SIZE(unicode),
2662 NULL);
2663}
2664
/* --- 7-bit ASCII Codec -------------------------------------------------- */
2666
Guido van Rossumd57fd912000-03-10 22:53:23 +00002667PyObject *PyUnicode_DecodeASCII(const char *s,
2668 int size,
2669 const char *errors)
2670{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002671 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002672 PyUnicodeObject *v;
2673 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002674 int startinpos;
2675 int endinpos;
2676 int outpos;
2677 const char *e;
2678 PyObject *errorHandler = NULL;
2679 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002680
Guido van Rossumd57fd912000-03-10 22:53:23 +00002681 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002682 if (size == 1 && *(unsigned char*)s < 128) {
2683 Py_UNICODE r = *(unsigned char*)s;
2684 return PyUnicode_FromUnicode(&r, 1);
2685 }
Tim Petersced69f82003-09-16 20:30:58 +00002686
Guido van Rossumd57fd912000-03-10 22:53:23 +00002687 v = _PyUnicode_New(size);
2688 if (v == NULL)
2689 goto onError;
2690 if (size == 0)
2691 return (PyObject *)v;
2692 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002693 e = s + size;
2694 while (s < e) {
2695 register unsigned char c = (unsigned char)*s;
2696 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002698 ++s;
2699 }
2700 else {
2701 startinpos = s-starts;
2702 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002703 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002704 if (unicode_decode_call_errorhandler(
2705 errors, &errorHandler,
2706 "ascii", "ordinal not in range(128)",
2707 starts, size, &startinpos, &endinpos, &exc, &s,
2708 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002709 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002710 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002711 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002712 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002713 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002714 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002715 Py_XDECREF(errorHandler);
2716 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002717 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002718
Guido van Rossumd57fd912000-03-10 22:53:23 +00002719 onError:
2720 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002721 Py_XDECREF(errorHandler);
2722 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002723 return NULL;
2724}
2725
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2727 int size,
2728 const char *errors)
2729{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002730 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731}
2732
2733PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2734{
2735 if (!PyUnicode_Check(unicode)) {
2736 PyErr_BadArgument();
2737 return NULL;
2738 }
2739 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2740 PyUnicode_GET_SIZE(unicode),
2741 NULL);
2742}
2743
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002744#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002745
/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002747
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002748PyObject *PyUnicode_DecodeMBCS(const char *s,
2749 int size,
2750 const char *errors)
2751{
2752 PyUnicodeObject *v;
2753 Py_UNICODE *p;
2754
2755 /* First get the size of the result */
2756 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002757 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002758 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2759
2760 v = _PyUnicode_New(usize);
2761 if (v == NULL)
2762 return NULL;
2763 if (usize == 0)
2764 return (PyObject *)v;
2765 p = PyUnicode_AS_UNICODE(v);
2766 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2767 Py_DECREF(v);
2768 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2769 }
2770
2771 return (PyObject *)v;
2772}
2773
2774PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2775 int size,
2776 const char *errors)
2777{
2778 PyObject *repr;
2779 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002780 DWORD mbcssize;
2781
2782 /* If there are no characters, bail now! */
2783 if (size==0)
2784 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002785
2786 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002787 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002788 if (mbcssize==0)
2789 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2790
2791 repr = PyString_FromStringAndSize(NULL, mbcssize);
2792 if (repr == NULL)
2793 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002794 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002795 return repr;
2796
2797 /* Do the conversion */
2798 s = PyString_AS_STRING(repr);
2799 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2800 Py_DECREF(repr);
2801 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2802 }
2803 return repr;
2804}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002805
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002806PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2807{
2808 if (!PyUnicode_Check(unicode)) {
2809 PyErr_BadArgument();
2810 return NULL;
2811 }
2812 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2813 PyUnicode_GET_SIZE(unicode),
2814 NULL);
2815}
2816
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002817#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002818
/* --- Character Mapping Codec -------------------------------------------- */
2820
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821PyObject *PyUnicode_DecodeCharmap(const char *s,
2822 int size,
2823 PyObject *mapping,
2824 const char *errors)
2825{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002826 const char *starts = s;
2827 int startinpos;
2828 int endinpos;
2829 int outpos;
2830 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831 PyUnicodeObject *v;
2832 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002833 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002834 PyObject *errorHandler = NULL;
2835 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002836
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837 /* Default to Latin-1 */
2838 if (mapping == NULL)
2839 return PyUnicode_DecodeLatin1(s, size, errors);
2840
2841 v = _PyUnicode_New(size);
2842 if (v == NULL)
2843 goto onError;
2844 if (size == 0)
2845 return (PyObject *)v;
2846 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002847 e = s + size;
2848 while (s < e) {
2849 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850 PyObject *w, *x;
2851
2852 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2853 w = PyInt_FromLong((long)ch);
2854 if (w == NULL)
2855 goto onError;
2856 x = PyObject_GetItem(mapping, w);
2857 Py_DECREF(w);
2858 if (x == NULL) {
2859 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002860 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002862 x = Py_None;
2863 Py_INCREF(x);
2864 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002865 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866 }
2867
2868 /* Apply mapping */
2869 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002870 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002871 if (value < 0 || value > 65535) {
2872 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002873 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874 Py_DECREF(x);
2875 goto onError;
2876 }
2877 *p++ = (Py_UNICODE)value;
2878 }
2879 else if (x == Py_None) {
2880 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002881 outpos = p-PyUnicode_AS_UNICODE(v);
2882 startinpos = s-starts;
2883 endinpos = startinpos+1;
2884 if (unicode_decode_call_errorhandler(
2885 errors, &errorHandler,
2886 "charmap", "character maps to <undefined>",
2887 starts, size, &startinpos, &endinpos, &exc, &s,
2888 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002889 Py_DECREF(x);
2890 goto onError;
2891 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002892 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002893 }
2894 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002895 int targetsize = PyUnicode_GET_SIZE(x);
2896
2897 if (targetsize == 1)
2898 /* 1-1 mapping */
2899 *p++ = *PyUnicode_AS_UNICODE(x);
2900
2901 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002902 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002903 if (targetsize > extrachars) {
2904 /* resize first */
2905 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2906 int needed = (targetsize - extrachars) + \
2907 (targetsize << 2);
2908 extrachars += needed;
Tim Petersced69f82003-09-16 20:30:58 +00002909 if (_PyUnicode_Resize(&v,
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002910 PyUnicode_GET_SIZE(v) + needed) < 0) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002911 Py_DECREF(x);
2912 goto onError;
2913 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002914 p = PyUnicode_AS_UNICODE(v) + oldpos;
2915 }
2916 Py_UNICODE_COPY(p,
2917 PyUnicode_AS_UNICODE(x),
2918 targetsize);
2919 p += targetsize;
2920 extrachars -= targetsize;
2921 }
2922 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002923 }
2924 else {
2925 /* wrong return value */
2926 PyErr_SetString(PyExc_TypeError,
2927 "character mapping must return integer, None or unicode");
2928 Py_DECREF(x);
2929 goto onError;
2930 }
2931 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002932 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002933 }
2934 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002935 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002936 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002937 Py_XDECREF(errorHandler);
2938 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002939 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002940
Guido van Rossumd57fd912000-03-10 22:53:23 +00002941 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002942 Py_XDECREF(errorHandler);
2943 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002944 Py_XDECREF(v);
2945 return NULL;
2946}
2947
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002948/* Lookup the character ch in the mapping. If the character
2949 can't be found, Py_None is returned (or NULL, if another
2950 error occured). */
2951static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002952{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002953 PyObject *w = PyInt_FromLong((long)c);
2954 PyObject *x;
2955
2956 if (w == NULL)
2957 return NULL;
2958 x = PyObject_GetItem(mapping, w);
2959 Py_DECREF(w);
2960 if (x == NULL) {
2961 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2962 /* No mapping found means: mapping is undefined. */
2963 PyErr_Clear();
2964 x = Py_None;
2965 Py_INCREF(x);
2966 return x;
2967 } else
2968 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002969 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002970 else if (x == Py_None)
2971 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002972 else if (PyInt_Check(x)) {
2973 long value = PyInt_AS_LONG(x);
2974 if (value < 0 || value > 255) {
2975 PyErr_SetString(PyExc_TypeError,
2976 "character mapping must be in range(256)");
2977 Py_DECREF(x);
2978 return NULL;
2979 }
2980 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002981 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002982 else if (PyString_Check(x))
2983 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002985 /* wrong return value */
2986 PyErr_SetString(PyExc_TypeError,
2987 "character mapping must return integer, None or str");
2988 Py_DECREF(x);
2989 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002990 }
2991}
2992
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002993/* lookup the character, put the result in the output string and adjust
2994 various state variables. Reallocate the output string if not enough
2995 space is available. Return a new reference to the object that
2996 was put in the output buffer, or Py_None, if the mapping was undefined
2997 (in which case no character was written) or NULL, if a
2998 reallocation error ocurred. The called must decref the result */
2999static
3000PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
3001 PyObject **outobj, int *outpos)
3002{
3003 PyObject *rep = charmapencode_lookup(c, mapping);
3004
3005 if (rep==NULL)
3006 return NULL;
3007 else if (rep==Py_None)
3008 return rep;
3009 else {
3010 char *outstart = PyString_AS_STRING(*outobj);
3011 int outsize = PyString_GET_SIZE(*outobj);
3012 if (PyInt_Check(rep)) {
3013 int requiredsize = *outpos+1;
3014 if (outsize<requiredsize) {
3015 /* exponentially overallocate to minimize reallocations */
3016 if (requiredsize < 2*outsize)
3017 requiredsize = 2*outsize;
3018 if (_PyString_Resize(outobj, requiredsize)) {
3019 Py_DECREF(rep);
3020 return NULL;
3021 }
3022 outstart = PyString_AS_STRING(*outobj);
3023 }
3024 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
3025 }
3026 else {
3027 const char *repchars = PyString_AS_STRING(rep);
3028 int repsize = PyString_GET_SIZE(rep);
3029 int requiredsize = *outpos+repsize;
3030 if (outsize<requiredsize) {
3031 /* exponentially overallocate to minimize reallocations */
3032 if (requiredsize < 2*outsize)
3033 requiredsize = 2*outsize;
3034 if (_PyString_Resize(outobj, requiredsize)) {
3035 Py_DECREF(rep);
3036 return NULL;
3037 }
3038 outstart = PyString_AS_STRING(*outobj);
3039 }
3040 memcpy(outstart + *outpos, repchars, repsize);
3041 *outpos += repsize;
3042 }
3043 }
3044 return rep;
3045}
3046
3047/* handle an error in PyUnicode_EncodeCharmap
3048 Return 0 on success, -1 on error */
3049static
3050int charmap_encoding_error(
3051 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
3052 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003053 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003054 PyObject **res, int *respos)
3055{
3056 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3057 int repsize;
3058 int newpos;
3059 Py_UNICODE *uni2;
3060 /* startpos for collecting unencodable chars */
3061 int collstartpos = *inpos;
3062 int collendpos = *inpos+1;
3063 int collpos;
3064 char *encoding = "charmap";
3065 char *reason = "character maps to <undefined>";
3066
3067 PyObject *x;
3068 /* find all unencodable characters */
3069 while (collendpos < size) {
3070 x = charmapencode_lookup(p[collendpos], mapping);
3071 if (x==NULL)
3072 return -1;
3073 else if (x!=Py_None) {
3074 Py_DECREF(x);
3075 break;
3076 }
3077 Py_DECREF(x);
3078 ++collendpos;
3079 }
3080 /* cache callback name lookup
3081 * (if not done yet, i.e. it's the first error) */
3082 if (*known_errorHandler==-1) {
3083 if ((errors==NULL) || (!strcmp(errors, "strict")))
3084 *known_errorHandler = 1;
3085 else if (!strcmp(errors, "replace"))
3086 *known_errorHandler = 2;
3087 else if (!strcmp(errors, "ignore"))
3088 *known_errorHandler = 3;
3089 else if (!strcmp(errors, "xmlcharrefreplace"))
3090 *known_errorHandler = 4;
3091 else
3092 *known_errorHandler = 0;
3093 }
3094 switch (*known_errorHandler) {
3095 case 1: /* strict */
3096 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3097 return -1;
3098 case 2: /* replace */
3099 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3100 x = charmapencode_output('?', mapping, res, respos);
3101 if (x==NULL) {
3102 return -1;
3103 }
3104 else if (x==Py_None) {
3105 Py_DECREF(x);
3106 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3107 return -1;
3108 }
3109 Py_DECREF(x);
3110 }
3111 /* fall through */
3112 case 3: /* ignore */
3113 *inpos = collendpos;
3114 break;
3115 case 4: /* xmlcharrefreplace */
3116 /* generate replacement (temporarily (mis)uses p) */
3117 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3118 char buffer[2+29+1+1];
3119 char *cp;
3120 sprintf(buffer, "&#%d;", (int)p[collpos]);
3121 for (cp = buffer; *cp; ++cp) {
3122 x = charmapencode_output(*cp, mapping, res, respos);
3123 if (x==NULL)
3124 return -1;
3125 else if (x==Py_None) {
3126 Py_DECREF(x);
3127 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3128 return -1;
3129 }
3130 Py_DECREF(x);
3131 }
3132 }
3133 *inpos = collendpos;
3134 break;
3135 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003136 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003137 encoding, reason, p, size, exceptionObject,
3138 collstartpos, collendpos, &newpos);
3139 if (repunicode == NULL)
3140 return -1;
3141 /* generate replacement */
3142 repsize = PyUnicode_GET_SIZE(repunicode);
3143 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3144 x = charmapencode_output(*uni2, mapping, res, respos);
3145 if (x==NULL) {
3146 Py_DECREF(repunicode);
3147 return -1;
3148 }
3149 else if (x==Py_None) {
3150 Py_DECREF(repunicode);
3151 Py_DECREF(x);
3152 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3153 return -1;
3154 }
3155 Py_DECREF(x);
3156 }
3157 *inpos = newpos;
3158 Py_DECREF(repunicode);
3159 }
3160 return 0;
3161}
3162
Guido van Rossumd57fd912000-03-10 22:53:23 +00003163PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3164 int size,
3165 PyObject *mapping,
3166 const char *errors)
3167{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003168 /* output object */
3169 PyObject *res = NULL;
3170 /* current input position */
3171 int inpos = 0;
3172 /* current output position */
3173 int respos = 0;
3174 PyObject *errorHandler = NULL;
3175 PyObject *exc = NULL;
3176 /* the following variable is used for caching string comparisons
3177 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3178 * 3=ignore, 4=xmlcharrefreplace */
3179 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003180
3181 /* Default to Latin-1 */
3182 if (mapping == NULL)
3183 return PyUnicode_EncodeLatin1(p, size, errors);
3184
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003185 /* allocate enough for a simple encoding without
3186 replacements, if we need more, we'll resize */
3187 res = PyString_FromStringAndSize(NULL, size);
3188 if (res == NULL)
3189 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003190 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003191 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003192
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003193 while (inpos<size) {
3194 /* try to encode it */
3195 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3196 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003197 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003198 if (x==Py_None) { /* unencodable character */
3199 if (charmap_encoding_error(p, size, &inpos, mapping,
3200 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003201 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003202 &res, &respos)) {
3203 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003204 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003207 else
3208 /* done with this character => adjust input position */
3209 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 Py_DECREF(x);
3211 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003212
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003213 /* Resize if we allocated to much */
3214 if (respos<PyString_GET_SIZE(res)) {
3215 if (_PyString_Resize(&res, respos))
3216 goto onError;
3217 }
3218 Py_XDECREF(exc);
3219 Py_XDECREF(errorHandler);
3220 return res;
3221
3222 onError:
3223 Py_XDECREF(res);
3224 Py_XDECREF(exc);
3225 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 return NULL;
3227}
3228
3229PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3230 PyObject *mapping)
3231{
3232 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3233 PyErr_BadArgument();
3234 return NULL;
3235 }
3236 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3237 PyUnicode_GET_SIZE(unicode),
3238 mapping,
3239 NULL);
3240}
3241
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003242/* create or adjust a UnicodeTranslateError */
3243static void make_translate_exception(PyObject **exceptionObject,
3244 const Py_UNICODE *unicode, int size,
3245 int startpos, int endpos,
3246 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003248 if (*exceptionObject == NULL) {
3249 *exceptionObject = PyUnicodeTranslateError_Create(
3250 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003251 }
3252 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003253 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3254 goto onError;
3255 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3256 goto onError;
3257 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3258 goto onError;
3259 return;
3260 onError:
3261 Py_DECREF(*exceptionObject);
3262 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263 }
3264}
3265
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003266/* raises a UnicodeTranslateError */
3267static void raise_translate_exception(PyObject **exceptionObject,
3268 const Py_UNICODE *unicode, int size,
3269 int startpos, int endpos,
3270 const char *reason)
3271{
3272 make_translate_exception(exceptionObject,
3273 unicode, size, startpos, endpos, reason);
3274 if (*exceptionObject != NULL)
3275 PyCodec_StrictErrors(*exceptionObject);
3276}
3277
3278/* error handling callback helper:
3279 build arguments, call the callback and check the arguments,
3280 put the result into newpos and return the replacement string, which
3281 has to be freed by the caller */
3282static PyObject *unicode_translate_call_errorhandler(const char *errors,
3283 PyObject **errorHandler,
3284 const char *reason,
3285 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3286 int startpos, int endpos,
3287 int *newpos)
3288{
3289 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3290
3291 PyObject *restuple;
3292 PyObject *resunicode;
3293
3294 if (*errorHandler == NULL) {
3295 *errorHandler = PyCodec_LookupError(errors);
3296 if (*errorHandler == NULL)
3297 return NULL;
3298 }
3299
3300 make_translate_exception(exceptionObject,
3301 unicode, size, startpos, endpos, reason);
3302 if (*exceptionObject == NULL)
3303 return NULL;
3304
3305 restuple = PyObject_CallFunctionObjArgs(
3306 *errorHandler, *exceptionObject, NULL);
3307 if (restuple == NULL)
3308 return NULL;
3309 if (!PyTuple_Check(restuple)) {
3310 PyErr_Format(PyExc_TypeError, &argparse[4]);
3311 Py_DECREF(restuple);
3312 return NULL;
3313 }
3314 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3315 &resunicode, newpos)) {
3316 Py_DECREF(restuple);
3317 return NULL;
3318 }
3319 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003320 *newpos = size+*newpos;
3321 if (*newpos<0 || *newpos>size) {
3322 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3323 Py_DECREF(restuple);
3324 return NULL;
3325 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003326 Py_INCREF(resunicode);
3327 Py_DECREF(restuple);
3328 return resunicode;
3329}
3330
3331/* Lookup the character ch in the mapping and put the result in result,
3332 which must be decrefed by the caller.
3333 Return 0 on success, -1 on error */
3334static
3335int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3336{
3337 PyObject *w = PyInt_FromLong((long)c);
3338 PyObject *x;
3339
3340 if (w == NULL)
3341 return -1;
3342 x = PyObject_GetItem(mapping, w);
3343 Py_DECREF(w);
3344 if (x == NULL) {
3345 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3346 /* No mapping found means: use 1:1 mapping. */
3347 PyErr_Clear();
3348 *result = NULL;
3349 return 0;
3350 } else
3351 return -1;
3352 }
3353 else if (x == Py_None) {
3354 *result = x;
3355 return 0;
3356 }
3357 else if (PyInt_Check(x)) {
3358 long value = PyInt_AS_LONG(x);
3359 long max = PyUnicode_GetMax();
3360 if (value < 0 || value > max) {
3361 PyErr_Format(PyExc_TypeError,
3362 "character mapping must be in range(0x%lx)", max+1);
3363 Py_DECREF(x);
3364 return -1;
3365 }
3366 *result = x;
3367 return 0;
3368 }
3369 else if (PyUnicode_Check(x)) {
3370 *result = x;
3371 return 0;
3372 }
3373 else {
3374 /* wrong return value */
3375 PyErr_SetString(PyExc_TypeError,
3376 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003377 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003378 return -1;
3379 }
3380}
3381/* ensure that *outobj is at least requiredsize characters long,
3382if not reallocate and adjust various state variables.
3383Return 0 on success, -1 on error */
3384static
Walter Dörwald4894c302003-10-24 14:25:28 +00003385int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003386 int requiredsize)
3387{
Walter Dörwald4894c302003-10-24 14:25:28 +00003388 int oldsize = PyUnicode_GET_SIZE(*outobj);
3389 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003390 /* remember old output position */
3391 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3392 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003393 if (requiredsize < 2 * oldsize)
3394 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003395 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003396 return -1;
3397 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003398 }
3399 return 0;
3400}
3401/* lookup the character, put the result in the output string and adjust
3402 various state variables. Return a new reference to the object that
3403 was put in the output buffer in *result, or Py_None, if the mapping was
3404 undefined (in which case no character was written).
3405 The called must decref result.
3406 Return 0 on success, -1 on error. */
3407static
Walter Dörwald4894c302003-10-24 14:25:28 +00003408int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3409 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3410 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003411{
Walter Dörwald4894c302003-10-24 14:25:28 +00003412 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003413 return -1;
3414 if (*res==NULL) {
3415 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003416 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003417 }
3418 else if (*res==Py_None)
3419 ;
3420 else if (PyInt_Check(*res)) {
3421 /* no overflow check, because we know that the space is enough */
3422 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3423 }
3424 else if (PyUnicode_Check(*res)) {
3425 int repsize = PyUnicode_GET_SIZE(*res);
3426 if (repsize==1) {
3427 /* no overflow check, because we know that the space is enough */
3428 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3429 }
3430 else if (repsize!=0) {
3431 /* more than one character */
Walter Dörwald4894c302003-10-24 14:25:28 +00003432 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003433 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003434 repsize - 1;
3435 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003436 return -1;
3437 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3438 *outp += repsize;
3439 }
3440 }
3441 else
3442 return -1;
3443 return 0;
3444}
3445
3446PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447 int size,
3448 PyObject *mapping,
3449 const char *errors)
3450{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003451 /* output object */
3452 PyObject *res = NULL;
3453 /* pointers to the beginning and end+1 of input */
3454 const Py_UNICODE *startp = p;
3455 const Py_UNICODE *endp = p + size;
3456 /* pointer into the output */
3457 Py_UNICODE *str;
3458 /* current output position */
3459 int respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003460 char *reason = "character maps to <undefined>";
3461 PyObject *errorHandler = NULL;
3462 PyObject *exc = NULL;
3463 /* the following variable is used for caching string comparisons
3464 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3465 * 3=ignore, 4=xmlcharrefreplace */
3466 int known_errorHandler = -1;
3467
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 if (mapping == NULL) {
3469 PyErr_BadArgument();
3470 return NULL;
3471 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003472
3473 /* allocate enough for a simple 1:1 translation without
3474 replacements, if we need more, we'll resize */
3475 res = PyUnicode_FromUnicode(NULL, size);
3476 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003477 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003479 return res;
3480 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003481
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003482 while (p<endp) {
3483 /* try to encode it */
3484 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003485 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003486 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003487 goto onError;
3488 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003489 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003490 if (x!=Py_None) /* it worked => adjust input pointer */
3491 ++p;
3492 else { /* untranslatable character */
3493 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3494 int repsize;
3495 int newpos;
3496 Py_UNICODE *uni2;
3497 /* startpos for collecting untranslatable chars */
3498 const Py_UNICODE *collstart = p;
3499 const Py_UNICODE *collend = p+1;
3500 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003501
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003502 /* find all untranslatable characters */
3503 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003504 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003505 goto onError;
3506 Py_XDECREF(x);
3507 if (x!=Py_None)
3508 break;
3509 ++collend;
3510 }
3511 /* cache callback name lookup
3512 * (if not done yet, i.e. it's the first error) */
3513 if (known_errorHandler==-1) {
3514 if ((errors==NULL) || (!strcmp(errors, "strict")))
3515 known_errorHandler = 1;
3516 else if (!strcmp(errors, "replace"))
3517 known_errorHandler = 2;
3518 else if (!strcmp(errors, "ignore"))
3519 known_errorHandler = 3;
3520 else if (!strcmp(errors, "xmlcharrefreplace"))
3521 known_errorHandler = 4;
3522 else
3523 known_errorHandler = 0;
3524 }
3525 switch (known_errorHandler) {
3526 case 1: /* strict */
3527 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3528 goto onError;
3529 case 2: /* replace */
3530 /* No need to check for space, this is a 1:1 replacement */
3531 for (coll = collstart; coll<collend; ++coll)
3532 *str++ = '?';
3533 /* fall through */
3534 case 3: /* ignore */
3535 p = collend;
3536 break;
3537 case 4: /* xmlcharrefreplace */
3538 /* generate replacement (temporarily (mis)uses p) */
3539 for (p = collstart; p < collend; ++p) {
3540 char buffer[2+29+1+1];
3541 char *cp;
3542 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003543 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003544 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3545 goto onError;
3546 for (cp = buffer; *cp; ++cp)
3547 *str++ = *cp;
3548 }
3549 p = collend;
3550 break;
3551 default:
3552 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3553 reason, startp, size, &exc,
3554 collstart-startp, collend-startp, &newpos);
3555 if (repunicode == NULL)
3556 goto onError;
3557 /* generate replacement */
3558 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003559 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3561 Py_DECREF(repunicode);
3562 goto onError;
3563 }
3564 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3565 *str++ = *uni2;
3566 p = startp + newpos;
3567 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003568 }
3569 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003570 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003571 /* Resize if we allocated to much */
3572 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003573 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003574 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003575 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003576 }
3577 Py_XDECREF(exc);
3578 Py_XDECREF(errorHandler);
3579 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003580
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003581 onError:
3582 Py_XDECREF(res);
3583 Py_XDECREF(exc);
3584 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003585 return NULL;
3586}
3587
3588PyObject *PyUnicode_Translate(PyObject *str,
3589 PyObject *mapping,
3590 const char *errors)
3591{
3592 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003593
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594 str = PyUnicode_FromObject(str);
3595 if (str == NULL)
3596 goto onError;
3597 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3598 PyUnicode_GET_SIZE(str),
3599 mapping,
3600 errors);
3601 Py_DECREF(str);
3602 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003603
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604 onError:
3605 Py_XDECREF(str);
3606 return NULL;
3607}
Tim Petersced69f82003-09-16 20:30:58 +00003608
Guido van Rossum9e896b32000-04-05 20:11:21 +00003609/* --- Decimal Encoder ---------------------------------------------------- */
3610
3611int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3612 int length,
3613 char *output,
3614 const char *errors)
3615{
3616 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003617 PyObject *errorHandler = NULL;
3618 PyObject *exc = NULL;
3619 const char *encoding = "decimal";
3620 const char *reason = "invalid decimal Unicode string";
3621 /* the following variable is used for caching string comparisons
3622 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3623 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003624
3625 if (output == NULL) {
3626 PyErr_BadArgument();
3627 return -1;
3628 }
3629
3630 p = s;
3631 end = s + length;
3632 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003633 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003634 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003635 PyObject *repunicode;
3636 int repsize;
3637 int newpos;
3638 Py_UNICODE *uni2;
3639 Py_UNICODE *collstart;
3640 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003641
Guido van Rossum9e896b32000-04-05 20:11:21 +00003642 if (Py_UNICODE_ISSPACE(ch)) {
3643 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003644 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003645 continue;
3646 }
3647 decimal = Py_UNICODE_TODECIMAL(ch);
3648 if (decimal >= 0) {
3649 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003650 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003651 continue;
3652 }
Guido van Rossumba477042000-04-06 18:18:10 +00003653 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003654 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003655 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003656 continue;
3657 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003658 /* All other characters are considered unencodable */
3659 collstart = p;
3660 collend = p+1;
3661 while (collend < end) {
3662 if ((0 < *collend && *collend < 256) ||
3663 !Py_UNICODE_ISSPACE(*collend) ||
3664 Py_UNICODE_TODECIMAL(*collend))
3665 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003666 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003667 /* cache callback name lookup
3668 * (if not done yet, i.e. it's the first error) */
3669 if (known_errorHandler==-1) {
3670 if ((errors==NULL) || (!strcmp(errors, "strict")))
3671 known_errorHandler = 1;
3672 else if (!strcmp(errors, "replace"))
3673 known_errorHandler = 2;
3674 else if (!strcmp(errors, "ignore"))
3675 known_errorHandler = 3;
3676 else if (!strcmp(errors, "xmlcharrefreplace"))
3677 known_errorHandler = 4;
3678 else
3679 known_errorHandler = 0;
3680 }
3681 switch (known_errorHandler) {
3682 case 1: /* strict */
3683 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3684 goto onError;
3685 case 2: /* replace */
3686 for (p = collstart; p < collend; ++p)
3687 *output++ = '?';
3688 /* fall through */
3689 case 3: /* ignore */
3690 p = collend;
3691 break;
3692 case 4: /* xmlcharrefreplace */
3693 /* generate replacement (temporarily (mis)uses p) */
3694 for (p = collstart; p < collend; ++p)
3695 output += sprintf(output, "&#%d;", (int)*p);
3696 p = collend;
3697 break;
3698 default:
3699 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3700 encoding, reason, s, length, &exc,
3701 collstart-s, collend-s, &newpos);
3702 if (repunicode == NULL)
3703 goto onError;
3704 /* generate replacement */
3705 repsize = PyUnicode_GET_SIZE(repunicode);
3706 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3707 Py_UNICODE ch = *uni2;
3708 if (Py_UNICODE_ISSPACE(ch))
3709 *output++ = ' ';
3710 else {
3711 decimal = Py_UNICODE_TODECIMAL(ch);
3712 if (decimal >= 0)
3713 *output++ = '0' + decimal;
3714 else if (0 < ch && ch < 256)
3715 *output++ = (char)ch;
3716 else {
3717 Py_DECREF(repunicode);
3718 raise_encode_exception(&exc, encoding,
3719 s, length, collstart-s, collend-s, reason);
3720 goto onError;
3721 }
3722 }
3723 }
3724 p = s + newpos;
3725 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003726 }
3727 }
3728 /* 0-terminate the output string */
3729 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003730 Py_XDECREF(exc);
3731 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003732 return 0;
3733
3734 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003735 Py_XDECREF(exc);
3736 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003737 return -1;
3738}
3739
Guido van Rossumd57fd912000-03-10 22:53:23 +00003740/* --- Helpers ------------------------------------------------------------ */
3741
Tim Petersced69f82003-09-16 20:30:58 +00003742static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003743int count(PyUnicodeObject *self,
3744 int start,
3745 int end,
3746 PyUnicodeObject *substring)
3747{
3748 int count = 0;
3749
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003750 if (start < 0)
3751 start += self->length;
3752 if (start < 0)
3753 start = 0;
3754 if (end > self->length)
3755 end = self->length;
3756 if (end < 0)
3757 end += self->length;
3758 if (end < 0)
3759 end = 0;
3760
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003761 if (substring->length == 0)
3762 return (end - start + 1);
3763
Guido van Rossumd57fd912000-03-10 22:53:23 +00003764 end -= substring->length;
3765
3766 while (start <= end)
3767 if (Py_UNICODE_MATCH(self, start, substring)) {
3768 count++;
3769 start += substring->length;
3770 } else
3771 start++;
3772
3773 return count;
3774}
3775
3776int PyUnicode_Count(PyObject *str,
3777 PyObject *substr,
3778 int start,
3779 int end)
3780{
3781 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003782
Guido van Rossumd57fd912000-03-10 22:53:23 +00003783 str = PyUnicode_FromObject(str);
3784 if (str == NULL)
3785 return -1;
3786 substr = PyUnicode_FromObject(substr);
3787 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003788 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789 return -1;
3790 }
Tim Petersced69f82003-09-16 20:30:58 +00003791
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792 result = count((PyUnicodeObject *)str,
3793 start, end,
3794 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003795
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796 Py_DECREF(str);
3797 Py_DECREF(substr);
3798 return result;
3799}
3800
Tim Petersced69f82003-09-16 20:30:58 +00003801static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802int findstring(PyUnicodeObject *self,
3803 PyUnicodeObject *substring,
3804 int start,
3805 int end,
3806 int direction)
3807{
3808 if (start < 0)
3809 start += self->length;
3810 if (start < 0)
3811 start = 0;
3812
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813 if (end > self->length)
3814 end = self->length;
3815 if (end < 0)
3816 end += self->length;
3817 if (end < 0)
3818 end = 0;
3819
Guido van Rossum76afbd92002-08-20 17:29:29 +00003820 if (substring->length == 0)
3821 return (direction > 0) ? start : end;
3822
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823 end -= substring->length;
3824
3825 if (direction < 0) {
3826 for (; end >= start; end--)
3827 if (Py_UNICODE_MATCH(self, end, substring))
3828 return end;
3829 } else {
3830 for (; start <= end; start++)
3831 if (Py_UNICODE_MATCH(self, start, substring))
3832 return start;
3833 }
3834
3835 return -1;
3836}
3837
3838int PyUnicode_Find(PyObject *str,
3839 PyObject *substr,
3840 int start,
3841 int end,
3842 int direction)
3843{
3844 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003845
Guido van Rossumd57fd912000-03-10 22:53:23 +00003846 str = PyUnicode_FromObject(str);
3847 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003848 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849 substr = PyUnicode_FromObject(substr);
3850 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003851 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003852 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853 }
Tim Petersced69f82003-09-16 20:30:58 +00003854
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855 result = findstring((PyUnicodeObject *)str,
3856 (PyUnicodeObject *)substr,
3857 start, end, direction);
3858 Py_DECREF(str);
3859 Py_DECREF(substr);
3860 return result;
3861}
3862
Tim Petersced69f82003-09-16 20:30:58 +00003863static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864int tailmatch(PyUnicodeObject *self,
3865 PyUnicodeObject *substring,
3866 int start,
3867 int end,
3868 int direction)
3869{
3870 if (start < 0)
3871 start += self->length;
3872 if (start < 0)
3873 start = 0;
3874
3875 if (substring->length == 0)
3876 return 1;
3877
3878 if (end > self->length)
3879 end = self->length;
3880 if (end < 0)
3881 end += self->length;
3882 if (end < 0)
3883 end = 0;
3884
3885 end -= substring->length;
3886 if (end < start)
3887 return 0;
3888
3889 if (direction > 0) {
3890 if (Py_UNICODE_MATCH(self, end, substring))
3891 return 1;
3892 } else {
3893 if (Py_UNICODE_MATCH(self, start, substring))
3894 return 1;
3895 }
3896
3897 return 0;
3898}
3899
3900int PyUnicode_Tailmatch(PyObject *str,
3901 PyObject *substr,
3902 int start,
3903 int end,
3904 int direction)
3905{
3906 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003907
Guido van Rossumd57fd912000-03-10 22:53:23 +00003908 str = PyUnicode_FromObject(str);
3909 if (str == NULL)
3910 return -1;
3911 substr = PyUnicode_FromObject(substr);
3912 if (substr == NULL) {
3913 Py_DECREF(substr);
3914 return -1;
3915 }
Tim Petersced69f82003-09-16 20:30:58 +00003916
Guido van Rossumd57fd912000-03-10 22:53:23 +00003917 result = tailmatch((PyUnicodeObject *)str,
3918 (PyUnicodeObject *)substr,
3919 start, end, direction);
3920 Py_DECREF(str);
3921 Py_DECREF(substr);
3922 return result;
3923}
3924
Tim Petersced69f82003-09-16 20:30:58 +00003925static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003926const Py_UNICODE *findchar(const Py_UNICODE *s,
3927 int size,
3928 Py_UNICODE ch)
3929{
3930 /* like wcschr, but doesn't stop at NULL characters */
3931
3932 while (size-- > 0) {
3933 if (*s == ch)
3934 return s;
3935 s++;
3936 }
3937
3938 return NULL;
3939}
3940
3941/* Apply fixfct filter to the Unicode object self and return a
3942 reference to the modified object */
3943
Tim Petersced69f82003-09-16 20:30:58 +00003944static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945PyObject *fixup(PyUnicodeObject *self,
3946 int (*fixfct)(PyUnicodeObject *s))
3947{
3948
3949 PyUnicodeObject *u;
3950
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003951 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003952 if (u == NULL)
3953 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003954
3955 Py_UNICODE_COPY(u->str, self->str, self->length);
3956
Tim Peters7a29bd52001-09-12 03:03:31 +00003957 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 /* fixfct should return TRUE if it modified the buffer. If
3959 FALSE, return a reference to the original buffer instead
3960 (to save space, not time) */
3961 Py_INCREF(self);
3962 Py_DECREF(u);
3963 return (PyObject*) self;
3964 }
3965 return (PyObject*) u;
3966}
3967
Tim Petersced69f82003-09-16 20:30:58 +00003968static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969int fixupper(PyUnicodeObject *self)
3970{
3971 int len = self->length;
3972 Py_UNICODE *s = self->str;
3973 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003974
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975 while (len-- > 0) {
3976 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003977
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978 ch = Py_UNICODE_TOUPPER(*s);
3979 if (ch != *s) {
3980 status = 1;
3981 *s = ch;
3982 }
3983 s++;
3984 }
3985
3986 return status;
3987}
3988
Tim Petersced69f82003-09-16 20:30:58 +00003989static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003990int fixlower(PyUnicodeObject *self)
3991{
3992 int len = self->length;
3993 Py_UNICODE *s = self->str;
3994 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003995
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996 while (len-- > 0) {
3997 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003998
Guido van Rossumd57fd912000-03-10 22:53:23 +00003999 ch = Py_UNICODE_TOLOWER(*s);
4000 if (ch != *s) {
4001 status = 1;
4002 *s = ch;
4003 }
4004 s++;
4005 }
4006
4007 return status;
4008}
4009
Tim Petersced69f82003-09-16 20:30:58 +00004010static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004011int fixswapcase(PyUnicodeObject *self)
4012{
4013 int len = self->length;
4014 Py_UNICODE *s = self->str;
4015 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004016
Guido van Rossumd57fd912000-03-10 22:53:23 +00004017 while (len-- > 0) {
4018 if (Py_UNICODE_ISUPPER(*s)) {
4019 *s = Py_UNICODE_TOLOWER(*s);
4020 status = 1;
4021 } else if (Py_UNICODE_ISLOWER(*s)) {
4022 *s = Py_UNICODE_TOUPPER(*s);
4023 status = 1;
4024 }
4025 s++;
4026 }
4027
4028 return status;
4029}
4030
Tim Petersced69f82003-09-16 20:30:58 +00004031static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004032int fixcapitalize(PyUnicodeObject *self)
4033{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004034 int len = self->length;
4035 Py_UNICODE *s = self->str;
4036 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00004037
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004038 if (len == 0)
4039 return 0;
4040 if (Py_UNICODE_ISLOWER(*s)) {
4041 *s = Py_UNICODE_TOUPPER(*s);
4042 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004043 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00004044 s++;
4045 while (--len > 0) {
4046 if (Py_UNICODE_ISUPPER(*s)) {
4047 *s = Py_UNICODE_TOLOWER(*s);
4048 status = 1;
4049 }
4050 s++;
4051 }
4052 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004053}
4054
4055static
4056int fixtitle(PyUnicodeObject *self)
4057{
4058 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4059 register Py_UNICODE *e;
4060 int previous_is_cased;
4061
4062 /* Shortcut for single character strings */
4063 if (PyUnicode_GET_SIZE(self) == 1) {
4064 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
4065 if (*p != ch) {
4066 *p = ch;
4067 return 1;
4068 }
4069 else
4070 return 0;
4071 }
Tim Petersced69f82003-09-16 20:30:58 +00004072
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 e = p + PyUnicode_GET_SIZE(self);
4074 previous_is_cased = 0;
4075 for (; p < e; p++) {
4076 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004077
Guido van Rossumd57fd912000-03-10 22:53:23 +00004078 if (previous_is_cased)
4079 *p = Py_UNICODE_TOLOWER(ch);
4080 else
4081 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00004082
4083 if (Py_UNICODE_ISLOWER(ch) ||
4084 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085 Py_UNICODE_ISTITLE(ch))
4086 previous_is_cased = 1;
4087 else
4088 previous_is_cased = 0;
4089 }
4090 return 1;
4091}
4092
Tim Peters8ce9f162004-08-27 01:49:32 +00004093PyObject *
4094PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095{
Tim Peters8ce9f162004-08-27 01:49:32 +00004096 PyObject *internal_separator = NULL;
Skip Montanaro6543b452004-09-16 03:28:13 +00004097 const Py_UNICODE blank = ' ';
4098 const Py_UNICODE *sep = &blank;
4099 size_t seplen = 1;
Tim Peters05eba1f2004-08-27 21:32:02 +00004100 PyUnicodeObject *res = NULL; /* the result */
4101 size_t res_alloc = 100; /* # allocated bytes for string in res */
4102 size_t res_used; /* # used bytes */
4103 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
4104 PyObject *fseq; /* PySequence_Fast(seq) */
4105 int seqlen; /* len(fseq) -- number of items in sequence */
Tim Peters8ce9f162004-08-27 01:49:32 +00004106 PyObject *item;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004107 int i;
4108
Tim Peters05eba1f2004-08-27 21:32:02 +00004109 fseq = PySequence_Fast(seq, "");
4110 if (fseq == NULL) {
4111 if (PyErr_ExceptionMatches(PyExc_TypeError))
4112 PyErr_Format(PyExc_TypeError,
4113 "sequence expected, %.80s found",
4114 seq->ob_type->tp_name);
4115 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00004116 }
4117
Tim Peters91879ab2004-08-27 22:35:44 +00004118 /* Grrrr. A codec may be invoked to convert str objects to
4119 * Unicode, and so it's possible to call back into Python code
4120 * during PyUnicode_FromObject(), and so it's possible for a sick
4121 * codec to change the size of fseq (if seq is a list). Therefore
4122 * we have to keep refetching the size -- can't assume seqlen
4123 * is invariant.
4124 */
Tim Peters05eba1f2004-08-27 21:32:02 +00004125 seqlen = PySequence_Fast_GET_SIZE(fseq);
4126 /* If empty sequence, return u"". */
4127 if (seqlen == 0) {
4128 res = _PyUnicode_New(0); /* empty sequence; return u"" */
4129 goto Done;
4130 }
4131 /* If singleton sequence with an exact Unicode, return that. */
4132 if (seqlen == 1) {
4133 item = PySequence_Fast_GET_ITEM(fseq, 0);
4134 if (PyUnicode_CheckExact(item)) {
4135 Py_INCREF(item);
4136 res = (PyUnicodeObject *)item;
4137 goto Done;
4138 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004139 }
4140
Tim Peters05eba1f2004-08-27 21:32:02 +00004141 /* At least two items to join, or one that isn't exact Unicode. */
4142 if (seqlen > 1) {
4143 /* Set up sep and seplen -- they're needed. */
4144 if (separator == NULL) {
4145 sep = &blank;
4146 seplen = 1;
4147 }
4148 else {
4149 internal_separator = PyUnicode_FromObject(separator);
4150 if (internal_separator == NULL)
4151 goto onError;
4152 sep = PyUnicode_AS_UNICODE(internal_separator);
4153 seplen = PyUnicode_GET_SIZE(internal_separator);
Tim Peters91879ab2004-08-27 22:35:44 +00004154 /* In case PyUnicode_FromObject() mutated seq. */
4155 seqlen = PySequence_Fast_GET_SIZE(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00004156 }
4157 }
4158
4159 /* Get space. */
4160 res = _PyUnicode_New((int)res_alloc);
4161 if (res == NULL)
Tim Peters8ce9f162004-08-27 01:49:32 +00004162 goto onError;
Tim Peters05eba1f2004-08-27 21:32:02 +00004163 res_p = PyUnicode_AS_UNICODE(res);
4164 res_used = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00004165
Tim Peters05eba1f2004-08-27 21:32:02 +00004166 for (i = 0; i < seqlen; ++i) {
4167 size_t itemlen;
4168 size_t new_res_used;
4169
4170 item = PySequence_Fast_GET_ITEM(fseq, i);
4171 /* Convert item to Unicode. */
4172 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
4173 PyErr_Format(PyExc_TypeError,
4174 "sequence item %i: expected string or Unicode,"
4175 " %.80s found",
4176 i, item->ob_type->tp_name);
Tim Peters2cfe3682001-05-05 05:36:48 +00004177 goto onError;
Tim Peters8ce9f162004-08-27 01:49:32 +00004178 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004179 item = PyUnicode_FromObject(item);
4180 if (item == NULL)
4181 goto onError;
4182 /* We own a reference to item from here on. */
Tim Petersced69f82003-09-16 20:30:58 +00004183
Tim Peters91879ab2004-08-27 22:35:44 +00004184 /* In case PyUnicode_FromObject() mutated seq. */
4185 seqlen = PySequence_Fast_GET_SIZE(fseq);
4186
Tim Peters8ce9f162004-08-27 01:49:32 +00004187 /* Make sure we have enough space for the separator and the item. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004188 itemlen = PyUnicode_GET_SIZE(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004189 new_res_used = res_used + itemlen;
4190 if (new_res_used < res_used || new_res_used > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004191 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004192 if (i < seqlen - 1) {
4193 new_res_used += seplen;
4194 if (new_res_used < res_used || new_res_used > INT_MAX)
4195 goto Overflow;
4196 }
4197 if (new_res_used > res_alloc) {
4198 /* double allocated size until it's big enough */
Tim Peters8ce9f162004-08-27 01:49:32 +00004199 do {
Tim Peters05eba1f2004-08-27 21:32:02 +00004200 size_t oldsize = res_alloc;
4201 res_alloc += res_alloc;
4202 if (res_alloc < oldsize || res_alloc > INT_MAX)
Tim Peters8ce9f162004-08-27 01:49:32 +00004203 goto Overflow;
Tim Peters05eba1f2004-08-27 21:32:02 +00004204 } while (new_res_used > res_alloc);
4205 if (_PyUnicode_Resize(&res, (int)res_alloc) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004206 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004207 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004208 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004209 res_p = PyUnicode_AS_UNICODE(res) + res_used;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004210 }
Tim Peters05eba1f2004-08-27 21:32:02 +00004211
4212 /* Copy item, and maybe the separator. */
4213 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), (int)itemlen);
4214 res_p += itemlen;
4215 if (i < seqlen - 1) {
4216 Py_UNICODE_COPY(res_p, sep, (int)seplen);
4217 res_p += seplen;
4218 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004219 Py_DECREF(item);
Tim Peters05eba1f2004-08-27 21:32:02 +00004220 res_used = new_res_used;
4221 }
Tim Peters8ce9f162004-08-27 01:49:32 +00004222
Tim Peters05eba1f2004-08-27 21:32:02 +00004223 /* Shrink res to match the used area; this probably can't fail,
4224 * but it's cheap to check.
4225 */
4226 if (_PyUnicode_Resize(&res, (int)res_used) < 0)
Tim Peters8ce9f162004-08-27 01:49:32 +00004227 goto onError;
4228
4229 Done:
4230 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004231 Py_DECREF(fseq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004232 return (PyObject *)res;
4233
Tim Peters8ce9f162004-08-27 01:49:32 +00004234 Overflow:
4235 PyErr_SetString(PyExc_OverflowError,
4236 "join() is too long for a Python string");
4237 Py_DECREF(item);
4238 /* fall through */
4239
Guido van Rossumd57fd912000-03-10 22:53:23 +00004240 onError:
Tim Peters8ce9f162004-08-27 01:49:32 +00004241 Py_XDECREF(internal_separator);
Tim Peters05eba1f2004-08-27 21:32:02 +00004242 Py_DECREF(fseq);
Tim Peters8ce9f162004-08-27 01:49:32 +00004243 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004244 return NULL;
4245}
4246
Tim Petersced69f82003-09-16 20:30:58 +00004247static
4248PyUnicodeObject *pad(PyUnicodeObject *self,
4249 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004250 int right,
4251 Py_UNICODE fill)
4252{
4253 PyUnicodeObject *u;
4254
4255 if (left < 0)
4256 left = 0;
4257 if (right < 0)
4258 right = 0;
4259
Tim Peters7a29bd52001-09-12 03:03:31 +00004260 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004261 Py_INCREF(self);
4262 return self;
4263 }
4264
4265 u = _PyUnicode_New(left + self->length + right);
4266 if (u) {
4267 if (left)
4268 Py_UNICODE_FILL(u->str, fill, left);
4269 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4270 if (right)
4271 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4272 }
4273
4274 return u;
4275}
4276
/* Append the slice data[left:right] to `list` as a new Unicode object.
 * The expanding function must declare `PyObject *str`, have `list` in
 * scope and provide an `onError` label, which is jumped to on failure.
 * Wrapped in do/while(0) so the macro behaves as a single statement.
 */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list holds its own reference now */                      \
        Py_DECREF(str);                                                 \
    } while (0)
4287
/* Insert the slice data[left:right] at the front of `list` as a new
 * Unicode object.  The expanding function must declare `PyObject *str`,
 * have `list` in scope and provide an `onError` label, which is jumped to
 * on failure.  Wrapped in do/while(0) so the macro behaves as a single
 * statement.
 */
#define SPLIT_INSERT(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Insert(list, 0, str)) {                              \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list holds its own reference now */                      \
        Py_DECREF(str);                                                 \
    } while (0)
4298
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299static
4300PyObject *split_whitespace(PyUnicodeObject *self,
4301 PyObject *list,
4302 int maxcount)
4303{
4304 register int i;
4305 register int j;
4306 int len = self->length;
4307 PyObject *str;
4308
4309 for (i = j = 0; i < len; ) {
4310 /* find a token */
4311 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4312 i++;
4313 j = i;
4314 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4315 i++;
4316 if (j < i) {
4317 if (maxcount-- <= 0)
4318 break;
4319 SPLIT_APPEND(self->str, j, i);
4320 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4321 i++;
4322 j = i;
4323 }
4324 }
4325 if (j < len) {
4326 SPLIT_APPEND(self->str, j, len);
4327 }
4328 return list;
4329
4330 onError:
4331 Py_DECREF(list);
4332 return NULL;
4333}
4334
4335PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004336 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004337{
4338 register int i;
4339 register int j;
4340 int len;
4341 PyObject *list;
4342 PyObject *str;
4343 Py_UNICODE *data;
4344
4345 string = PyUnicode_FromObject(string);
4346 if (string == NULL)
4347 return NULL;
4348 data = PyUnicode_AS_UNICODE(string);
4349 len = PyUnicode_GET_SIZE(string);
4350
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351 list = PyList_New(0);
4352 if (!list)
4353 goto onError;
4354
4355 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004356 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004357
Guido van Rossumd57fd912000-03-10 22:53:23 +00004358 /* Find a line and append it */
4359 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4360 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361
4362 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004363 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364 if (i < len) {
4365 if (data[i] == '\r' && i + 1 < len &&
4366 data[i+1] == '\n')
4367 i += 2;
4368 else
4369 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004370 if (keepends)
4371 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004372 }
Guido van Rossum86662912000-04-11 15:38:46 +00004373 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004374 j = i;
4375 }
4376 if (j < len) {
4377 SPLIT_APPEND(data, j, len);
4378 }
4379
4380 Py_DECREF(string);
4381 return list;
4382
4383 onError:
4384 Py_DECREF(list);
4385 Py_DECREF(string);
4386 return NULL;
4387}
4388
Tim Petersced69f82003-09-16 20:30:58 +00004389static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390PyObject *split_char(PyUnicodeObject *self,
4391 PyObject *list,
4392 Py_UNICODE ch,
4393 int maxcount)
4394{
4395 register int i;
4396 register int j;
4397 int len = self->length;
4398 PyObject *str;
4399
4400 for (i = j = 0; i < len; ) {
4401 if (self->str[i] == ch) {
4402 if (maxcount-- <= 0)
4403 break;
4404 SPLIT_APPEND(self->str, j, i);
4405 i = j = i + 1;
4406 } else
4407 i++;
4408 }
4409 if (j <= len) {
4410 SPLIT_APPEND(self->str, j, len);
4411 }
4412 return list;
4413
4414 onError:
4415 Py_DECREF(list);
4416 return NULL;
4417}
4418
Tim Petersced69f82003-09-16 20:30:58 +00004419static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420PyObject *split_substring(PyUnicodeObject *self,
4421 PyObject *list,
4422 PyUnicodeObject *substring,
4423 int maxcount)
4424{
4425 register int i;
4426 register int j;
4427 int len = self->length;
4428 int sublen = substring->length;
4429 PyObject *str;
4430
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004431 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004432 if (Py_UNICODE_MATCH(self, i, substring)) {
4433 if (maxcount-- <= 0)
4434 break;
4435 SPLIT_APPEND(self->str, j, i);
4436 i = j = i + sublen;
4437 } else
4438 i++;
4439 }
4440 if (j <= len) {
4441 SPLIT_APPEND(self->str, j, len);
4442 }
4443 return list;
4444
4445 onError:
4446 Py_DECREF(list);
4447 return NULL;
4448}
4449
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004450static
4451PyObject *rsplit_whitespace(PyUnicodeObject *self,
4452 PyObject *list,
4453 int maxcount)
4454{
4455 register int i;
4456 register int j;
4457 int len = self->length;
4458 PyObject *str;
4459
4460 for (i = j = len - 1; i >= 0; ) {
4461 /* find a token */
4462 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4463 i--;
4464 j = i;
4465 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4466 i--;
4467 if (j > i) {
4468 if (maxcount-- <= 0)
4469 break;
4470 SPLIT_INSERT(self->str, i + 1, j + 1);
4471 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4472 i--;
4473 j = i;
4474 }
4475 }
4476 if (j >= 0) {
4477 SPLIT_INSERT(self->str, 0, j + 1);
4478 }
4479 return list;
4480
4481 onError:
4482 Py_DECREF(list);
4483 return NULL;
4484}
4485
4486static
4487PyObject *rsplit_char(PyUnicodeObject *self,
4488 PyObject *list,
4489 Py_UNICODE ch,
4490 int maxcount)
4491{
4492 register int i;
4493 register int j;
4494 int len = self->length;
4495 PyObject *str;
4496
4497 for (i = j = len - 1; i >= 0; ) {
4498 if (self->str[i] == ch) {
4499 if (maxcount-- <= 0)
4500 break;
4501 SPLIT_INSERT(self->str, i + 1, j + 1);
4502 j = i = i - 1;
4503 } else
4504 i--;
4505 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004506 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004507 SPLIT_INSERT(self->str, 0, j + 1);
4508 }
4509 return list;
4510
4511 onError:
4512 Py_DECREF(list);
4513 return NULL;
4514}
4515
4516static
4517PyObject *rsplit_substring(PyUnicodeObject *self,
4518 PyObject *list,
4519 PyUnicodeObject *substring,
4520 int maxcount)
4521{
4522 register int i;
4523 register int j;
4524 int len = self->length;
4525 int sublen = substring->length;
4526 PyObject *str;
4527
4528 for (i = len - sublen, j = len; i >= 0; ) {
4529 if (Py_UNICODE_MATCH(self, i, substring)) {
4530 if (maxcount-- <= 0)
4531 break;
4532 SPLIT_INSERT(self->str, i + sublen, j);
4533 j = i;
4534 i -= sublen;
4535 } else
4536 i--;
4537 }
4538 if (j >= 0) {
4539 SPLIT_INSERT(self->str, 0, j);
4540 }
4541 return list;
4542
4543 onError:
4544 Py_DECREF(list);
4545 return NULL;
4546}
4547
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004549#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550
4551static
4552PyObject *split(PyUnicodeObject *self,
4553 PyUnicodeObject *substring,
4554 int maxcount)
4555{
4556 PyObject *list;
4557
4558 if (maxcount < 0)
4559 maxcount = INT_MAX;
4560
4561 list = PyList_New(0);
4562 if (!list)
4563 return NULL;
4564
4565 if (substring == NULL)
4566 return split_whitespace(self,list,maxcount);
4567
4568 else if (substring->length == 1)
4569 return split_char(self,list,substring->str[0],maxcount);
4570
4571 else if (substring->length == 0) {
4572 Py_DECREF(list);
4573 PyErr_SetString(PyExc_ValueError, "empty separator");
4574 return NULL;
4575 }
4576 else
4577 return split_substring(self,list,substring,maxcount);
4578}
4579
Tim Petersced69f82003-09-16 20:30:58 +00004580static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004581PyObject *rsplit(PyUnicodeObject *self,
4582 PyUnicodeObject *substring,
4583 int maxcount)
4584{
4585 PyObject *list;
4586
4587 if (maxcount < 0)
4588 maxcount = INT_MAX;
4589
4590 list = PyList_New(0);
4591 if (!list)
4592 return NULL;
4593
4594 if (substring == NULL)
4595 return rsplit_whitespace(self,list,maxcount);
4596
4597 else if (substring->length == 1)
4598 return rsplit_char(self,list,substring->str[0],maxcount);
4599
4600 else if (substring->length == 0) {
4601 Py_DECREF(list);
4602 PyErr_SetString(PyExc_ValueError, "empty separator");
4603 return NULL;
4604 }
4605 else
4606 return rsplit_substring(self,list,substring,maxcount);
4607}
4608
4609static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004610PyObject *replace(PyUnicodeObject *self,
4611 PyUnicodeObject *str1,
4612 PyUnicodeObject *str2,
4613 int maxcount)
4614{
4615 PyUnicodeObject *u;
4616
4617 if (maxcount < 0)
4618 maxcount = INT_MAX;
4619
4620 if (str1->length == 1 && str2->length == 1) {
4621 int i;
4622
4623 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004624 if (!findchar(self->str, self->length, str1->str[0]) &&
4625 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004626 /* nothing to replace, return original string */
4627 Py_INCREF(self);
4628 u = self;
4629 } else {
4630 Py_UNICODE u1 = str1->str[0];
4631 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004632
Guido van Rossumd57fd912000-03-10 22:53:23 +00004633 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004634 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004635 self->length
4636 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004637 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004638 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004639 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004640 for (i = 0; i < u->length; i++)
4641 if (u->str[i] == u1) {
4642 if (--maxcount < 0)
4643 break;
4644 u->str[i] = u2;
4645 }
4646 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004647 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004648
4649 } else {
4650 int n, i;
4651 Py_UNICODE *p;
4652
4653 /* replace strings */
4654 n = count(self, 0, self->length, str1);
4655 if (n > maxcount)
4656 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004657 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004658 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004659 if (PyUnicode_CheckExact(self)) {
4660 Py_INCREF(self);
4661 u = self;
4662 }
4663 else {
4664 u = (PyUnicodeObject *)
4665 PyUnicode_FromUnicode(self->str, self->length);
4666 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667 } else {
4668 u = _PyUnicode_New(
4669 self->length + n * (str2->length - str1->length));
4670 if (u) {
4671 i = 0;
4672 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004673 if (str1->length > 0) {
4674 while (i <= self->length - str1->length)
4675 if (Py_UNICODE_MATCH(self, i, str1)) {
4676 /* replace string segment */
4677 Py_UNICODE_COPY(p, str2->str, str2->length);
4678 p += str2->length;
4679 i += str1->length;
4680 if (--n <= 0) {
4681 /* copy remaining part */
4682 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4683 break;
4684 }
4685 } else
4686 *p++ = self->str[i++];
4687 } else {
4688 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689 Py_UNICODE_COPY(p, str2->str, str2->length);
4690 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004691 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004694 }
4695 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4696 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004697 }
4698 }
4699 }
Tim Petersced69f82003-09-16 20:30:58 +00004700
Guido van Rossumd57fd912000-03-10 22:53:23 +00004701 return (PyObject *) u;
4702}
4703
4704/* --- Unicode Object Methods --------------------------------------------- */
4705
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004706PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707"S.title() -> unicode\n\
4708\n\
4709Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004710characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711
4712static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004713unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004714{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004715 return fixup(self, fixtitle);
4716}
4717
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004718PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719"S.capitalize() -> unicode\n\
4720\n\
4721Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004722have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004723
4724static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004725unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004727 return fixup(self, fixcapitalize);
4728}
4729
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled code: split self on whitespace, run every word through
 * fixup(..., fixcapitalize) and join the words again with the default
 * separator of PyUnicode_Join().
 */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *word;
    PyObject *joined;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                     fixcapitalize);
        if (word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
4767
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004768/* Argument converter. Coerces to a single unicode character */
4769
4770static int
4771convert_uc(PyObject *obj, void *addr)
4772{
4773 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4774 PyObject *uniobj;
4775 Py_UNICODE *unistr;
4776
4777 uniobj = PyUnicode_FromObject(obj);
4778 if (uniobj == NULL) {
4779 PyErr_SetString(PyExc_TypeError,
4780 "The fill character cannot be converted to Unicode");
4781 return 0;
4782 }
4783 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4784 PyErr_SetString(PyExc_TypeError,
4785 "The fill character must be exactly one character long");
4786 Py_DECREF(uniobj);
4787 return 0;
4788 }
4789 unistr = PyUnicode_AS_UNICODE(uniobj);
4790 *fillcharloc = unistr[0];
4791 Py_DECREF(uniobj);
4792 return 1;
4793}
4794
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004795PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004796"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004798Return S centered in a Unicode string of length width. Padding is\n\
4799done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800
4801static PyObject *
4802unicode_center(PyUnicodeObject *self, PyObject *args)
4803{
4804 int marg, left;
4805 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004806 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004808 if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809 return NULL;
4810
Tim Peters7a29bd52001-09-12 03:03:31 +00004811 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812 Py_INCREF(self);
4813 return (PyObject*) self;
4814 }
4815
4816 marg = width - self->length;
4817 left = marg / 2 + (marg & width & 1);
4818
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004819 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004820}
4821
Marc-André Lemburge5034372000-08-08 08:04:29 +00004822#if 0
4823
4824/* This code should go into some future Unicode collation support
4825 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004826 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004827
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004828/* speedy UTF-16 code point order comparison */
4829/* gleaned from: */
4830/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4831
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004832static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004833{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004834 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004835 0, 0, 0, 0, 0, 0, 0, 0,
4836 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004837 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004838};
4839
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840static int
4841unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4842{
4843 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004844
Guido van Rossumd57fd912000-03-10 22:53:23 +00004845 Py_UNICODE *s1 = str1->str;
4846 Py_UNICODE *s2 = str2->str;
4847
4848 len1 = str1->length;
4849 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004850
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004852 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004853
4854 c1 = *s1++;
4855 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004856
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004857 if (c1 > (1<<11) * 26)
4858 c1 += utf16Fixup[c1>>11];
4859 if (c2 > (1<<11) * 26)
4860 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004861 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004862
4863 if (c1 != c2)
4864 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004865
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004866 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867 }
4868
4869 return (len1 < len2) ? -1 : (len1 != len2);
4870}
4871
Marc-André Lemburge5034372000-08-08 08:04:29 +00004872#else
4873
4874static int
4875unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4876{
4877 register int len1, len2;
4878
4879 Py_UNICODE *s1 = str1->str;
4880 Py_UNICODE *s2 = str2->str;
4881
4882 len1 = str1->length;
4883 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004884
Marc-André Lemburge5034372000-08-08 08:04:29 +00004885 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004886 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004887
Fredrik Lundh45714e92001-06-26 16:39:36 +00004888 c1 = *s1++;
4889 c2 = *s2++;
4890
4891 if (c1 != c2)
4892 return (c1 < c2) ? -1 : 1;
4893
Marc-André Lemburge5034372000-08-08 08:04:29 +00004894 len1--; len2--;
4895 }
4896
4897 return (len1 < len2) ? -1 : (len1 != len2);
4898}
4899
4900#endif
4901
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902int PyUnicode_Compare(PyObject *left,
4903 PyObject *right)
4904{
4905 PyUnicodeObject *u = NULL, *v = NULL;
4906 int result;
4907
4908 /* Coerce the two arguments */
4909 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4910 if (u == NULL)
4911 goto onError;
4912 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4913 if (v == NULL)
4914 goto onError;
4915
Thomas Wouters7e474022000-07-16 12:04:32 +00004916 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917 if (v == u) {
4918 Py_DECREF(u);
4919 Py_DECREF(v);
4920 return 0;
4921 }
4922
4923 result = unicode_compare(u, v);
4924
4925 Py_DECREF(u);
4926 Py_DECREF(v);
4927 return result;
4928
4929onError:
4930 Py_XDECREF(u);
4931 Py_XDECREF(v);
4932 return -1;
4933}
4934
Guido van Rossum403d68b2000-03-13 15:55:09 +00004935int PyUnicode_Contains(PyObject *container,
4936 PyObject *element)
4937{
4938 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004939 int result, size;
4940 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004941
4942 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004943 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004944 if (v == NULL) {
4945 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004946 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004947 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004948 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004949 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004950 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004951 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004952
Barry Warsaw817918c2002-08-06 16:58:21 +00004953 size = PyUnicode_GET_SIZE(v);
4954 rhs = PyUnicode_AS_UNICODE(v);
4955 lhs = PyUnicode_AS_UNICODE(u);
4956
Guido van Rossum403d68b2000-03-13 15:55:09 +00004957 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004958 if (size == 1) {
4959 end = lhs + PyUnicode_GET_SIZE(u);
4960 while (lhs < end) {
4961 if (*lhs++ == *rhs) {
4962 result = 1;
4963 break;
4964 }
4965 }
4966 }
4967 else {
4968 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4969 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004970 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004971 result = 1;
4972 break;
4973 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004974 }
4975 }
4976
4977 Py_DECREF(u);
4978 Py_DECREF(v);
4979 return result;
4980
4981onError:
4982 Py_XDECREF(u);
4983 Py_XDECREF(v);
4984 return -1;
4985}
4986
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987/* Concat to string or Unicode object giving a new Unicode object. */
4988
4989PyObject *PyUnicode_Concat(PyObject *left,
4990 PyObject *right)
4991{
4992 PyUnicodeObject *u = NULL, *v = NULL, *w;
4993
4994 /* Coerce the two arguments */
4995 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4996 if (u == NULL)
4997 goto onError;
4998 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4999 if (v == NULL)
5000 goto onError;
5001
5002 /* Shortcuts */
5003 if (v == unicode_empty) {
5004 Py_DECREF(v);
5005 return (PyObject *)u;
5006 }
5007 if (u == unicode_empty) {
5008 Py_DECREF(u);
5009 return (PyObject *)v;
5010 }
5011
5012 /* Concat the two Unicode strings */
5013 w = _PyUnicode_New(u->length + v->length);
5014 if (w == NULL)
5015 goto onError;
5016 Py_UNICODE_COPY(w->str, u->str, u->length);
5017 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
5018
5019 Py_DECREF(u);
5020 Py_DECREF(v);
5021 return (PyObject *)w;
5022
5023onError:
5024 Py_XDECREF(u);
5025 Py_XDECREF(v);
5026 return NULL;
5027}
5028
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005029PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030"S.count(sub[, start[, end]]) -> int\n\
5031\n\
5032Return the number of occurrences of substring sub in Unicode string\n\
5033S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005034interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035
5036static PyObject *
5037unicode_count(PyUnicodeObject *self, PyObject *args)
5038{
5039 PyUnicodeObject *substring;
5040 int start = 0;
5041 int end = INT_MAX;
5042 PyObject *result;
5043
Guido van Rossumb8872e62000-05-09 14:14:27 +00005044 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
5045 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005046 return NULL;
5047
5048 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5049 (PyObject *)substring);
5050 if (substring == NULL)
5051 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005052
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053 if (start < 0)
5054 start += self->length;
5055 if (start < 0)
5056 start = 0;
5057 if (end > self->length)
5058 end = self->length;
5059 if (end < 0)
5060 end += self->length;
5061 if (end < 0)
5062 end = 0;
5063
5064 result = PyInt_FromLong((long) count(self, start, end, substring));
5065
5066 Py_DECREF(substring);
5067 return result;
5068}
5069
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005070PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005071"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005073Encodes S using the codec registered for encoding. encoding defaults\n\
5074to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00005075handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005076a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
5077'xmlcharrefreplace' as well as any other name registered with\n\
5078codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079
5080static PyObject *
5081unicode_encode(PyUnicodeObject *self, PyObject *args)
5082{
5083 char *encoding = NULL;
5084 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005085 PyObject *v;
5086
Guido van Rossumd57fd912000-03-10 22:53:23 +00005087 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
5088 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005089 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005090 if (v == NULL)
5091 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005092 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5093 PyErr_Format(PyExc_TypeError,
5094 "encoder did not return a string/unicode object "
5095 "(type=%.400s)",
5096 v->ob_type->tp_name);
5097 Py_DECREF(v);
5098 return NULL;
5099 }
5100 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005101
5102 onError:
5103 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005104}
5105
5106PyDoc_STRVAR(decode__doc__,
5107"S.decode([encoding[,errors]]) -> string or unicode\n\
5108\n\
5109Decodes S using the codec registered for encoding. encoding defaults\n\
5110to the default encoding. errors may be given to set a different error\n\
5111handling scheme. Default is 'strict' meaning that encoding errors raise\n\
5112a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
5113as well as any other name registerd with codecs.register_error that is\n\
5114able to handle UnicodeDecodeErrors.");
5115
5116static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00005117unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005118{
5119 char *encoding = NULL;
5120 char *errors = NULL;
5121 PyObject *v;
5122
5123 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
5124 return NULL;
5125 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005126 if (v == NULL)
5127 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00005128 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
5129 PyErr_Format(PyExc_TypeError,
5130 "decoder did not return a string/unicode object "
5131 "(type=%.400s)",
5132 v->ob_type->tp_name);
5133 Py_DECREF(v);
5134 return NULL;
5135 }
5136 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00005137
5138 onError:
5139 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005140}
5141
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005142PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143"S.expandtabs([tabsize]) -> unicode\n\
5144\n\
5145Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005146If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147
5148static PyObject*
5149unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
5150{
5151 Py_UNICODE *e;
5152 Py_UNICODE *p;
5153 Py_UNICODE *q;
5154 int i, j;
5155 PyUnicodeObject *u;
5156 int tabsize = 8;
5157
5158 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5159 return NULL;
5160
Thomas Wouters7e474022000-07-16 12:04:32 +00005161 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162 i = j = 0;
5163 e = self->str + self->length;
5164 for (p = self->str; p < e; p++)
5165 if (*p == '\t') {
5166 if (tabsize > 0)
5167 j += tabsize - (j % tabsize);
5168 }
5169 else {
5170 j++;
5171 if (*p == '\n' || *p == '\r') {
5172 i += j;
5173 j = 0;
5174 }
5175 }
5176
5177 /* Second pass: create output string and fill it */
5178 u = _PyUnicode_New(i + j);
5179 if (!u)
5180 return NULL;
5181
5182 j = 0;
5183 q = u->str;
5184
5185 for (p = self->str; p < e; p++)
5186 if (*p == '\t') {
5187 if (tabsize > 0) {
5188 i = tabsize - (j % tabsize);
5189 j += i;
5190 while (i--)
5191 *q++ = ' ';
5192 }
5193 }
5194 else {
5195 j++;
5196 *q++ = *p;
5197 if (*p == '\n' || *p == '\r')
5198 j = 0;
5199 }
5200
5201 return (PyObject*) u;
5202}
5203
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005204PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205"S.find(sub [,start [,end]]) -> int\n\
5206\n\
5207Return the lowest index in S where substring sub is found,\n\
5208such that sub is contained within s[start,end]. Optional\n\
5209arguments start and end are interpreted as in slice notation.\n\
5210\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005211Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005212
5213static PyObject *
5214unicode_find(PyUnicodeObject *self, PyObject *args)
5215{
5216 PyUnicodeObject *substring;
5217 int start = 0;
5218 int end = INT_MAX;
5219 PyObject *result;
5220
Guido van Rossumb8872e62000-05-09 14:14:27 +00005221 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5222 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223 return NULL;
5224 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5225 (PyObject *)substring);
5226 if (substring == NULL)
5227 return NULL;
5228
5229 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
5230
5231 Py_DECREF(substring);
5232 return result;
5233}
5234
5235static PyObject *
5236unicode_getitem(PyUnicodeObject *self, int index)
5237{
5238 if (index < 0 || index >= self->length) {
5239 PyErr_SetString(PyExc_IndexError, "string index out of range");
5240 return NULL;
5241 }
5242
5243 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5244}
5245
5246static long
5247unicode_hash(PyUnicodeObject *self)
5248{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005249 /* Since Unicode objects compare equal to their ASCII string
5250 counterparts, they should use the individual character values
5251 as basis for their hash value. This is needed to assure that
5252 strings and Unicode objects behave in the same way as
5253 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254
Fredrik Lundhdde61642000-07-10 18:27:47 +00005255 register int len;
5256 register Py_UNICODE *p;
5257 register long x;
5258
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259 if (self->hash != -1)
5260 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005261 len = PyUnicode_GET_SIZE(self);
5262 p = PyUnicode_AS_UNICODE(self);
5263 x = *p << 7;
5264 while (--len >= 0)
5265 x = (1000003*x) ^ *p++;
5266 x ^= PyUnicode_GET_SIZE(self);
5267 if (x == -1)
5268 x = -2;
5269 self->hash = x;
5270 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271}
5272
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005273PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274"S.index(sub [,start [,end]]) -> int\n\
5275\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005276Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277
5278static PyObject *
5279unicode_index(PyUnicodeObject *self, PyObject *args)
5280{
5281 int result;
5282 PyUnicodeObject *substring;
5283 int start = 0;
5284 int end = INT_MAX;
5285
Guido van Rossumb8872e62000-05-09 14:14:27 +00005286 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5287 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005289
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5291 (PyObject *)substring);
5292 if (substring == NULL)
5293 return NULL;
5294
5295 result = findstring(self, substring, start, end, 1);
5296
5297 Py_DECREF(substring);
5298 if (result < 0) {
5299 PyErr_SetString(PyExc_ValueError, "substring not found");
5300 return NULL;
5301 }
5302 return PyInt_FromLong(result);
5303}
5304
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005305PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005306"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005308Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005309at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310
5311static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005312unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313{
5314 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5315 register const Py_UNICODE *e;
5316 int cased;
5317
Guido van Rossumd57fd912000-03-10 22:53:23 +00005318 /* Shortcut for single character strings */
5319 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005320 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005321
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005322 /* Special case for empty strings */
5323 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005324 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005325
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326 e = p + PyUnicode_GET_SIZE(self);
5327 cased = 0;
5328 for (; p < e; p++) {
5329 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005330
Guido van Rossumd57fd912000-03-10 22:53:23 +00005331 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005332 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333 else if (!cased && Py_UNICODE_ISLOWER(ch))
5334 cased = 1;
5335 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005336 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005337}
5338
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005339PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005340"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005342Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005343at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344
5345static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005346unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347{
5348 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5349 register const Py_UNICODE *e;
5350 int cased;
5351
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352 /* Shortcut for single character strings */
5353 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005354 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005356 /* Special case for empty strings */
5357 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005358 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005359
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 e = p + PyUnicode_GET_SIZE(self);
5361 cased = 0;
5362 for (; p < e; p++) {
5363 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005364
Guido van Rossumd57fd912000-03-10 22:53:23 +00005365 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005366 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005367 else if (!cased && Py_UNICODE_ISUPPER(ch))
5368 cased = 1;
5369 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005370 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371}
5372
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005373PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005374"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005375\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005376Return True if S is a titlecased string and there is at least one\n\
5377character in S, i.e. upper- and titlecase characters may only\n\
5378follow uncased characters and lowercase characters only cased ones.\n\
5379Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005380
5381static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005382unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383{
5384 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5385 register const Py_UNICODE *e;
5386 int cased, previous_is_cased;
5387
Guido van Rossumd57fd912000-03-10 22:53:23 +00005388 /* Shortcut for single character strings */
5389 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005390 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5391 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005393 /* Special case for empty strings */
5394 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005395 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005396
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 e = p + PyUnicode_GET_SIZE(self);
5398 cased = 0;
5399 previous_is_cased = 0;
5400 for (; p < e; p++) {
5401 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005402
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5404 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005405 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 previous_is_cased = 1;
5407 cased = 1;
5408 }
5409 else if (Py_UNICODE_ISLOWER(ch)) {
5410 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005411 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412 previous_is_cased = 1;
5413 cased = 1;
5414 }
5415 else
5416 previous_is_cased = 0;
5417 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005418 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419}
5420
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005421PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005422"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005424Return True if all characters in S are whitespace\n\
5425and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426
5427static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005428unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429{
5430 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5431 register const Py_UNICODE *e;
5432
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433 /* Shortcut for single character strings */
5434 if (PyUnicode_GET_SIZE(self) == 1 &&
5435 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005436 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005437
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005438 /* Special case for empty strings */
5439 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005440 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005441
Guido van Rossumd57fd912000-03-10 22:53:23 +00005442 e = p + PyUnicode_GET_SIZE(self);
5443 for (; p < e; p++) {
5444 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005445 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005447 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005448}
5449
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005450PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005451"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005452\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005453Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005454and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005455
5456static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005457unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005458{
5459 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5460 register const Py_UNICODE *e;
5461
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005462 /* Shortcut for single character strings */
5463 if (PyUnicode_GET_SIZE(self) == 1 &&
5464 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005465 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005466
5467 /* Special case for empty strings */
5468 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005469 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005470
5471 e = p + PyUnicode_GET_SIZE(self);
5472 for (; p < e; p++) {
5473 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005474 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005475 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005476 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005477}
5478
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005479PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005480"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005481\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005482Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005483and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005484
5485static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005486unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005487{
5488 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5489 register const Py_UNICODE *e;
5490
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005491 /* Shortcut for single character strings */
5492 if (PyUnicode_GET_SIZE(self) == 1 &&
5493 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005494 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005495
5496 /* Special case for empty strings */
5497 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005498 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005499
5500 e = p + PyUnicode_GET_SIZE(self);
5501 for (; p < e; p++) {
5502 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005503 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005504 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005505 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005506}
5507
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005508PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005509"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005510\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005511Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005512False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513
5514static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005515unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005516{
5517 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5518 register const Py_UNICODE *e;
5519
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520 /* Shortcut for single character strings */
5521 if (PyUnicode_GET_SIZE(self) == 1 &&
5522 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005523 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005524
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005525 /* Special case for empty strings */
5526 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005527 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005528
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529 e = p + PyUnicode_GET_SIZE(self);
5530 for (; p < e; p++) {
5531 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005532 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005533 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005534 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005535}
5536
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005537PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005538"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005539\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005540Return True if all characters in S are digits\n\
5541and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542
5543static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005544unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005545{
5546 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5547 register const Py_UNICODE *e;
5548
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 /* Shortcut for single character strings */
5550 if (PyUnicode_GET_SIZE(self) == 1 &&
5551 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005552 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005553
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005554 /* Special case for empty strings */
5555 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005556 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005557
Guido van Rossumd57fd912000-03-10 22:53:23 +00005558 e = p + PyUnicode_GET_SIZE(self);
5559 for (; p < e; p++) {
5560 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005561 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005563 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005564}
5565
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005566PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005567"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005569Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005570False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571
5572static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005573unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005574{
5575 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5576 register const Py_UNICODE *e;
5577
Guido van Rossumd57fd912000-03-10 22:53:23 +00005578 /* Shortcut for single character strings */
5579 if (PyUnicode_GET_SIZE(self) == 1 &&
5580 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005581 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005583 /* Special case for empty strings */
5584 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005585 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005586
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587 e = p + PyUnicode_GET_SIZE(self);
5588 for (; p < e; p++) {
5589 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005590 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005592 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593}
5594
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005595PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005596"S.join(sequence) -> unicode\n\
5597\n\
5598Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005599sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600
5601static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005602unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005604 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605}
5606
5607static int
5608unicode_length(PyUnicodeObject *self)
5609{
5610 return self->length;
5611}
5612
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005613PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005614"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615\n\
5616Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005617done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618
5619static PyObject *
5620unicode_ljust(PyUnicodeObject *self, PyObject *args)
5621{
5622 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005623 Py_UNICODE fillchar = ' ';
5624
5625 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626 return NULL;
5627
Tim Peters7a29bd52001-09-12 03:03:31 +00005628 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 Py_INCREF(self);
5630 return (PyObject*) self;
5631 }
5632
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005633 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634}
5635
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005636PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637"S.lower() -> unicode\n\
5638\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005639Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005640
5641static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005642unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005643{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 return fixup(self, fixlower);
5645}
5646
/* Selectors for the strip family: which end(s) of the string to trim.
   The values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above.
   The table is read-only, so both the strings and the pointers to them
   are const. */
static const char *const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for selector i: the format string with its leading "|O:"
   (3 characters) skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
5655
5656static const Py_UNICODE *
5657unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5658{
Tim Peters030a5ce2002-04-22 19:00:10 +00005659 size_t i;
5660 for (i = 0; i < n; ++i)
5661 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005662 return s+i;
5663 return NULL;
5664}
5665
5666/* externally visible for str.strip(unicode) */
5667PyObject *
5668_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5669{
5670 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5671 int len = PyUnicode_GET_SIZE(self);
5672 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5673 int seplen = PyUnicode_GET_SIZE(sepobj);
5674 int i, j;
5675
5676 i = 0;
5677 if (striptype != RIGHTSTRIP) {
5678 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5679 i++;
5680 }
5681 }
5682
5683 j = len;
5684 if (striptype != LEFTSTRIP) {
5685 do {
5686 j--;
5687 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5688 j++;
5689 }
5690
5691 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5692 Py_INCREF(self);
5693 return (PyObject*)self;
5694 }
5695 else
5696 return PyUnicode_FromUnicode(s+i, j-i);
5697}
5698
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699
5700static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005701do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005702{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005703 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5704 int len = PyUnicode_GET_SIZE(self), i, j;
5705
5706 i = 0;
5707 if (striptype != RIGHTSTRIP) {
5708 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5709 i++;
5710 }
5711 }
5712
5713 j = len;
5714 if (striptype != LEFTSTRIP) {
5715 do {
5716 j--;
5717 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5718 j++;
5719 }
5720
5721 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5722 Py_INCREF(self);
5723 return (PyObject*)self;
5724 }
5725 else
5726 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727}
5728
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005729
5730static PyObject *
5731do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5732{
5733 PyObject *sep = NULL;
5734
5735 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5736 return NULL;
5737
5738 if (sep != NULL && sep != Py_None) {
5739 if (PyUnicode_Check(sep))
5740 return _PyUnicode_XStrip(self, striptype, sep);
5741 else if (PyString_Check(sep)) {
5742 PyObject *res;
5743 sep = PyUnicode_FromObject(sep);
5744 if (sep==NULL)
5745 return NULL;
5746 res = _PyUnicode_XStrip(self, striptype, sep);
5747 Py_DECREF(sep);
5748 return res;
5749 }
5750 else {
5751 PyErr_Format(PyExc_TypeError,
5752 "%s arg must be None, unicode or str",
5753 STRIPNAME(striptype));
5754 return NULL;
5755 }
5756 }
5757
5758 return do_strip(self, striptype);
5759}
5760
5761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005762PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005763"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005764\n\
5765Return a copy of the string S with leading and trailing\n\
5766whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005767If chars is given and not None, remove characters in chars instead.\n\
5768If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005769
5770static PyObject *
5771unicode_strip(PyUnicodeObject *self, PyObject *args)
5772{
5773 if (PyTuple_GET_SIZE(args) == 0)
5774 return do_strip(self, BOTHSTRIP); /* Common case */
5775 else
5776 return do_argstrip(self, BOTHSTRIP, args);
5777}
5778
5779
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005780PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005781"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005782\n\
5783Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005784If chars is given and not None, remove characters in chars instead.\n\
5785If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005786
5787static PyObject *
5788unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5789{
5790 if (PyTuple_GET_SIZE(args) == 0)
5791 return do_strip(self, LEFTSTRIP); /* Common case */
5792 else
5793 return do_argstrip(self, LEFTSTRIP, args);
5794}
5795
5796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005797PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005798"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005799\n\
5800Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005801If chars is given and not None, remove characters in chars instead.\n\
5802If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005803
5804static PyObject *
5805unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5806{
5807 if (PyTuple_GET_SIZE(args) == 0)
5808 return do_strip(self, RIGHTSTRIP); /* Common case */
5809 else
5810 return do_argstrip(self, RIGHTSTRIP, args);
5811}
5812
5813
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814static PyObject*
5815unicode_repeat(PyUnicodeObject *str, int len)
5816{
5817 PyUnicodeObject *u;
5818 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005819 int nchars;
5820 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005821
5822 if (len < 0)
5823 len = 0;
5824
Tim Peters7a29bd52001-09-12 03:03:31 +00005825 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005826 /* no repeat, return original string */
5827 Py_INCREF(str);
5828 return (PyObject*) str;
5829 }
Tim Peters8f422462000-09-09 06:13:41 +00005830
5831 /* ensure # of chars needed doesn't overflow int and # of bytes
5832 * needed doesn't overflow size_t
5833 */
5834 nchars = len * str->length;
5835 if (len && nchars / len != str->length) {
5836 PyErr_SetString(PyExc_OverflowError,
5837 "repeated string is too long");
5838 return NULL;
5839 }
5840 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5841 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5842 PyErr_SetString(PyExc_OverflowError,
5843 "repeated string is too long");
5844 return NULL;
5845 }
5846 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 if (!u)
5848 return NULL;
5849
5850 p = u->str;
5851
5852 while (len-- > 0) {
5853 Py_UNICODE_COPY(p, str->str, str->length);
5854 p += str->length;
5855 }
5856
5857 return (PyObject*) u;
5858}
5859
5860PyObject *PyUnicode_Replace(PyObject *obj,
5861 PyObject *subobj,
5862 PyObject *replobj,
5863 int maxcount)
5864{
5865 PyObject *self;
5866 PyObject *str1;
5867 PyObject *str2;
5868 PyObject *result;
5869
5870 self = PyUnicode_FromObject(obj);
5871 if (self == NULL)
5872 return NULL;
5873 str1 = PyUnicode_FromObject(subobj);
5874 if (str1 == NULL) {
5875 Py_DECREF(self);
5876 return NULL;
5877 }
5878 str2 = PyUnicode_FromObject(replobj);
5879 if (str2 == NULL) {
5880 Py_DECREF(self);
5881 Py_DECREF(str1);
5882 return NULL;
5883 }
Tim Petersced69f82003-09-16 20:30:58 +00005884 result = replace((PyUnicodeObject *)self,
5885 (PyUnicodeObject *)str1,
5886 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887 maxcount);
5888 Py_DECREF(self);
5889 Py_DECREF(str1);
5890 Py_DECREF(str2);
5891 return result;
5892}
5893
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005894PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895"S.replace (old, new[, maxsplit]) -> unicode\n\
5896\n\
5897Return a copy of S with all occurrences of substring\n\
5898old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005899given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005900
5901static PyObject*
5902unicode_replace(PyUnicodeObject *self, PyObject *args)
5903{
5904 PyUnicodeObject *str1;
5905 PyUnicodeObject *str2;
5906 int maxcount = -1;
5907 PyObject *result;
5908
5909 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5910 return NULL;
5911 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5912 if (str1 == NULL)
5913 return NULL;
5914 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005915 if (str2 == NULL) {
5916 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005918 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005919
5920 result = replace(self, str1, str2, maxcount);
5921
5922 Py_DECREF(str1);
5923 Py_DECREF(str2);
5924 return result;
5925}
5926
5927static
5928PyObject *unicode_repr(PyObject *unicode)
5929{
5930 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5931 PyUnicode_GET_SIZE(unicode),
5932 1);
5933}
5934
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005935PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005936"S.rfind(sub [,start [,end]]) -> int\n\
5937\n\
5938Return the highest index in S where substring sub is found,\n\
5939such that sub is contained within s[start,end]. Optional\n\
5940arguments start and end are interpreted as in slice notation.\n\
5941\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005942Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943
5944static PyObject *
5945unicode_rfind(PyUnicodeObject *self, PyObject *args)
5946{
5947 PyUnicodeObject *substring;
5948 int start = 0;
5949 int end = INT_MAX;
5950 PyObject *result;
5951
Guido van Rossumb8872e62000-05-09 14:14:27 +00005952 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5953 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005954 return NULL;
5955 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5956 (PyObject *)substring);
5957 if (substring == NULL)
5958 return NULL;
5959
5960 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5961
5962 Py_DECREF(substring);
5963 return result;
5964}
5965
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005966PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005967"S.rindex(sub [,start [,end]]) -> int\n\
5968\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005969Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005970
5971static PyObject *
5972unicode_rindex(PyUnicodeObject *self, PyObject *args)
5973{
5974 int result;
5975 PyUnicodeObject *substring;
5976 int start = 0;
5977 int end = INT_MAX;
5978
Guido van Rossumb8872e62000-05-09 14:14:27 +00005979 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5980 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005981 return NULL;
5982 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5983 (PyObject *)substring);
5984 if (substring == NULL)
5985 return NULL;
5986
5987 result = findstring(self, substring, start, end, -1);
5988
5989 Py_DECREF(substring);
5990 if (result < 0) {
5991 PyErr_SetString(PyExc_ValueError, "substring not found");
5992 return NULL;
5993 }
5994 return PyInt_FromLong(result);
5995}
5996
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005997PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005998"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999\n\
6000Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006001done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006002
6003static PyObject *
6004unicode_rjust(PyUnicodeObject *self, PyObject *args)
6005{
6006 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006007 Py_UNICODE fillchar = ' ';
6008
6009 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006010 return NULL;
6011
Tim Peters7a29bd52001-09-12 03:03:31 +00006012 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006013 Py_INCREF(self);
6014 return (PyObject*) self;
6015 }
6016
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00006017 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006018}
6019
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020static PyObject*
6021unicode_slice(PyUnicodeObject *self, int start, int end)
6022{
6023 /* standard clamping */
6024 if (start < 0)
6025 start = 0;
6026 if (end < 0)
6027 end = 0;
6028 if (end > self->length)
6029 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00006030 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 /* full slice, return original string */
6032 Py_INCREF(self);
6033 return (PyObject*) self;
6034 }
6035 if (start > end)
6036 start = end;
6037 /* copy slice */
6038 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
6039 end - start);
6040}
6041
6042PyObject *PyUnicode_Split(PyObject *s,
6043 PyObject *sep,
6044 int maxsplit)
6045{
6046 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00006047
Guido van Rossumd57fd912000-03-10 22:53:23 +00006048 s = PyUnicode_FromObject(s);
6049 if (s == NULL)
6050 return NULL;
6051 if (sep != NULL) {
6052 sep = PyUnicode_FromObject(sep);
6053 if (sep == NULL) {
6054 Py_DECREF(s);
6055 return NULL;
6056 }
6057 }
6058
6059 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6060
6061 Py_DECREF(s);
6062 Py_XDECREF(sep);
6063 return result;
6064}
6065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006066PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067"S.split([sep [,maxsplit]]) -> list of strings\n\
6068\n\
6069Return a list of the words in S, using sep as the\n\
6070delimiter string. If maxsplit is given, at most maxsplit\n\
Thomas Hellerca0d2cb2004-09-15 11:41:32 +00006071splits are done. If sep is not specified or is None,\n\
Walter Dörwald782afc52004-09-14 09:40:45 +00006072any whitespace string is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073
6074static PyObject*
6075unicode_split(PyUnicodeObject *self, PyObject *args)
6076{
6077 PyObject *substring = Py_None;
6078 int maxcount = -1;
6079
6080 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
6081 return NULL;
6082
6083 if (substring == Py_None)
6084 return split(self, NULL, maxcount);
6085 else if (PyUnicode_Check(substring))
6086 return split(self, (PyUnicodeObject *)substring, maxcount);
6087 else
6088 return PyUnicode_Split((PyObject *)self, substring, maxcount);
6089}
6090
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006091PyObject *PyUnicode_RSplit(PyObject *s,
6092 PyObject *sep,
6093 int maxsplit)
6094{
6095 PyObject *result;
6096
6097 s = PyUnicode_FromObject(s);
6098 if (s == NULL)
6099 return NULL;
6100 if (sep != NULL) {
6101 sep = PyUnicode_FromObject(sep);
6102 if (sep == NULL) {
6103 Py_DECREF(s);
6104 return NULL;
6105 }
6106 }
6107
6108 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
6109
6110 Py_DECREF(s);
6111 Py_XDECREF(sep);
6112 return result;
6113}
6114
6115PyDoc_STRVAR(rsplit__doc__,
6116"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
6117\n\
6118Return a list of the words in S, using sep as the\n\
6119delimiter string, starting at the end of the string and\n\
6120working to the front. If maxsplit is given, at most maxsplit\n\
6121splits are done. If sep is not specified, any whitespace string\n\
6122is a separator.");
6123
6124static PyObject*
6125unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6126{
6127 PyObject *substring = Py_None;
6128 int maxcount = -1;
6129
6130 if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
6131 return NULL;
6132
6133 if (substring == Py_None)
6134 return rsplit(self, NULL, maxcount);
6135 else if (PyUnicode_Check(substring))
6136 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6137 else
6138 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6139}
6140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006141PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006142"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143\n\
6144Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006145Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006146is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006147
6148static PyObject*
6149unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6150{
Guido van Rossum86662912000-04-11 15:38:46 +00006151 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152
Guido van Rossum86662912000-04-11 15:38:46 +00006153 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006154 return NULL;
6155
Guido van Rossum86662912000-04-11 15:38:46 +00006156 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006157}
6158
6159static
6160PyObject *unicode_str(PyUnicodeObject *self)
6161{
Fred Drakee4315f52000-05-09 19:53:39 +00006162 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163}
6164
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006165PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166"S.swapcase() -> unicode\n\
6167\n\
6168Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006169and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006170
6171static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006172unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006174 return fixup(self, fixswapcase);
6175}
6176
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006177PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178"S.translate(table) -> unicode\n\
6179\n\
6180Return a copy of the string S, where all characters have been mapped\n\
6181through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006182Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6183Unmapped characters are left untouched. Characters mapped to None\n\
6184are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006185
6186static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006187unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006188{
Tim Petersced69f82003-09-16 20:30:58 +00006189 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006190 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006191 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006192 "ignore");
6193}
6194
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006195PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006196"S.upper() -> unicode\n\
6197\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006198Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199
6200static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006201unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203 return fixup(self, fixupper);
6204}
6205
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006206PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006207"S.zfill(width) -> unicode\n\
6208\n\
6209Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006210of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006211
6212static PyObject *
6213unicode_zfill(PyUnicodeObject *self, PyObject *args)
6214{
6215 int fill;
6216 PyUnicodeObject *u;
6217
6218 int width;
6219 if (!PyArg_ParseTuple(args, "i:zfill", &width))
6220 return NULL;
6221
6222 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006223 if (PyUnicode_CheckExact(self)) {
6224 Py_INCREF(self);
6225 return (PyObject*) self;
6226 }
6227 else
6228 return PyUnicode_FromUnicode(
6229 PyUnicode_AS_UNICODE(self),
6230 PyUnicode_GET_SIZE(self)
6231 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 }
6233
6234 fill = width - self->length;
6235
6236 u = pad(self, fill, 0, '0');
6237
Walter Dörwald068325e2002-04-15 13:36:47 +00006238 if (u == NULL)
6239 return NULL;
6240
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241 if (u->str[fill] == '+' || u->str[fill] == '-') {
6242 /* move sign to beginning of string */
6243 u->str[0] = u->str[fill];
6244 u->str[fill] = '0';
6245 }
6246
6247 return (PyObject*) u;
6248}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006249
#if 0
/* Disabled helper: return unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
6257
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006258PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006259"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006260\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006261Return True if S starts with the specified prefix, False otherwise.\n\
6262With optional start, test S beginning at that position.\n\
6263With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006264
6265static PyObject *
6266unicode_startswith(PyUnicodeObject *self,
6267 PyObject *args)
6268{
6269 PyUnicodeObject *substring;
6270 int start = 0;
6271 int end = INT_MAX;
6272 PyObject *result;
6273
Guido van Rossumb8872e62000-05-09 14:14:27 +00006274 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6275 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276 return NULL;
6277 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6278 (PyObject *)substring);
6279 if (substring == NULL)
6280 return NULL;
6281
Guido van Rossum77f6a652002-04-03 22:41:51 +00006282 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283
6284 Py_DECREF(substring);
6285 return result;
6286}
6287
6288
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006289PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006290"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006292Return True if S ends with the specified suffix, False otherwise.\n\
6293With optional start, test S beginning at that position.\n\
6294With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006295
6296static PyObject *
6297unicode_endswith(PyUnicodeObject *self,
6298 PyObject *args)
6299{
6300 PyUnicodeObject *substring;
6301 int start = 0;
6302 int end = INT_MAX;
6303 PyObject *result;
6304
Guido van Rossumb8872e62000-05-09 14:14:27 +00006305 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6306 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006307 return NULL;
6308 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6309 (PyObject *)substring);
6310 if (substring == NULL)
6311 return NULL;
6312
Guido van Rossum77f6a652002-04-03 22:41:51 +00006313 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314
6315 Py_DECREF(substring);
6316 return result;
6317}
6318
6319
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006320
6321static PyObject *
6322unicode_getnewargs(PyUnicodeObject *v)
6323{
6324 return Py_BuildValue("(u#)", v->str, v->length);
6325}
6326
6327
Guido van Rossumd57fd912000-03-10 22:53:23 +00006328static PyMethodDef unicode_methods[] = {
6329
6330 /* Order is according to common usage: often used methods should
6331 appear first, since lookup is done sequentially. */
6332
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006333 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
6334 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
6335 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00006336 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006337 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
6338 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
6339 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
6340 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
6341 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
6342 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
6343 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
6344 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
6345 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
6346 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006347 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00006348 {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006349/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
6350 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
6351 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
6352 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006353 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006354 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00006355 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006356 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
6357 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
6358 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
6359 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
6360 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
6361 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
6362 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
6363 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
6364 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
6365 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
6366 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
6367 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
6368 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
6369 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006370 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00006371#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006372 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006373#endif
6374
6375#if 0
6376 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006377 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006378#endif
6379
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006380 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00006381 {NULL, NULL}
6382};
6383
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006384static PyObject *
6385unicode_mod(PyObject *v, PyObject *w)
6386{
6387 if (!PyUnicode_Check(v)) {
6388 Py_INCREF(Py_NotImplemented);
6389 return Py_NotImplemented;
6390 }
6391 return PyUnicode_Format(v, w);
6392}
6393
6394static PyNumberMethods unicode_as_number = {
6395 0, /*nb_add*/
6396 0, /*nb_subtract*/
6397 0, /*nb_multiply*/
6398 0, /*nb_divide*/
6399 unicode_mod, /*nb_remainder*/
6400};
6401
Guido van Rossumd57fd912000-03-10 22:53:23 +00006402static PySequenceMethods unicode_as_sequence = {
6403 (inquiry) unicode_length, /* sq_length */
6404 (binaryfunc) PyUnicode_Concat, /* sq_concat */
6405 (intargfunc) unicode_repeat, /* sq_repeat */
6406 (intargfunc) unicode_getitem, /* sq_item */
6407 (intintargfunc) unicode_slice, /* sq_slice */
6408 0, /* sq_ass_item */
6409 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00006410 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411};
6412
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006413static PyObject*
6414unicode_subscript(PyUnicodeObject* self, PyObject* item)
6415{
6416 if (PyInt_Check(item)) {
6417 long i = PyInt_AS_LONG(item);
6418 if (i < 0)
6419 i += PyString_GET_SIZE(self);
6420 return unicode_getitem(self, i);
6421 } else if (PyLong_Check(item)) {
6422 long i = PyLong_AsLong(item);
6423 if (i == -1 && PyErr_Occurred())
6424 return NULL;
6425 if (i < 0)
6426 i += PyString_GET_SIZE(self);
6427 return unicode_getitem(self, i);
6428 } else if (PySlice_Check(item)) {
6429 int start, stop, step, slicelength, cur, i;
6430 Py_UNICODE* source_buf;
6431 Py_UNICODE* result_buf;
6432 PyObject* result;
6433
6434 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
6435 &start, &stop, &step, &slicelength) < 0) {
6436 return NULL;
6437 }
6438
6439 if (slicelength <= 0) {
6440 return PyUnicode_FromUnicode(NULL, 0);
6441 } else {
6442 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6443 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
6444
6445 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6446 result_buf[i] = source_buf[cur];
6447 }
Tim Petersced69f82003-09-16 20:30:58 +00006448
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006449 result = PyUnicode_FromUnicode(result_buf, slicelength);
6450 PyMem_FREE(result_buf);
6451 return result;
6452 }
6453 } else {
6454 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6455 return NULL;
6456 }
6457}
6458
6459static PyMappingMethods unicode_as_mapping = {
6460 (inquiry)unicode_length, /* mp_length */
6461 (binaryfunc)unicode_subscript, /* mp_subscript */
6462 (objobjargproc)0, /* mp_ass_subscript */
6463};
6464
Guido van Rossumd57fd912000-03-10 22:53:23 +00006465static int
6466unicode_buffer_getreadbuf(PyUnicodeObject *self,
6467 int index,
6468 const void **ptr)
6469{
6470 if (index != 0) {
6471 PyErr_SetString(PyExc_SystemError,
6472 "accessing non-existent unicode segment");
6473 return -1;
6474 }
6475 *ptr = (void *) self->str;
6476 return PyUnicode_GET_DATA_SIZE(self);
6477}
6478
6479static int
6480unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6481 const void **ptr)
6482{
6483 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006484 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006485 return -1;
6486}
6487
6488static int
6489unicode_buffer_getsegcount(PyUnicodeObject *self,
6490 int *lenp)
6491{
6492 if (lenp)
6493 *lenp = PyUnicode_GET_DATA_SIZE(self);
6494 return 1;
6495}
6496
6497static int
6498unicode_buffer_getcharbuf(PyUnicodeObject *self,
6499 int index,
6500 const void **ptr)
6501{
6502 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006503
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504 if (index != 0) {
6505 PyErr_SetString(PyExc_SystemError,
6506 "accessing non-existent unicode segment");
6507 return -1;
6508 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006509 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006510 if (str == NULL)
6511 return -1;
6512 *ptr = (void *) PyString_AS_STRING(str);
6513 return PyString_GET_SIZE(str);
6514}
6515
6516/* Helpers for PyUnicode_Format() */
6517
6518static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006519getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006520{
6521 int argidx = *p_argidx;
6522 if (argidx < arglen) {
6523 (*p_argidx)++;
6524 if (arglen < 0)
6525 return args;
6526 else
6527 return PyTuple_GetItem(args, argidx);
6528 }
6529 PyErr_SetString(PyExc_TypeError,
6530 "not enough arguments for format string");
6531 return NULL;
6532}
6533
/* Bit flags used by the formatting helpers; each one is a distinct
   single bit so they can be OR-ed together.  (Presumably they mirror
   the '-', '+', ' ', '#' and '0' conversion flags -- confirm against
   the format parser.) */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
6539
6540static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542{
6543 register int i;
6544 int len;
6545 va_list va;
6546 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006547 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548
6549 /* First, format the string as char array, then expand to Py_UNICODE
6550 array. */
6551 charbuffer = (char *)buffer;
6552 len = vsprintf(charbuffer, format, va);
6553 for (i = len - 1; i >= 0; i--)
6554 buffer[i] = (Py_UNICODE) charbuffer[i];
6555
6556 va_end(va);
6557 return len;
6558}
6559
Guido van Rossum078151d2002-08-11 04:24:12 +00006560/* XXX To save some code duplication, formatfloat/long/int could have been
6561 shared with stringobject.c, converting from 8-bit to Unicode after the
6562 formatting is done. */
6563
Guido van Rossumd57fd912000-03-10 22:53:23 +00006564static int
6565formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006566 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 int flags,
6568 int prec,
6569 int type,
6570 PyObject *v)
6571{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006572 /* fmt = '%#.' + `prec` + `type`
6573 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006574 char fmt[20];
6575 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006576
Guido van Rossumd57fd912000-03-10 22:53:23 +00006577 x = PyFloat_AsDouble(v);
6578 if (x == -1.0 && PyErr_Occurred())
6579 return -1;
6580 if (prec < 0)
6581 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006582 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6583 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006584 /* Worst case length calc to ensure no buffer overrun:
6585
6586 'g' formats:
6587 fmt = %#.<prec>g
6588 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6589 for any double rep.)
6590 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6591
6592 'f' formats:
6593 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6594 len = 1 + 50 + 1 + prec = 52 + prec
6595
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006596 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006597 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006598
6599 */
6600 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6601 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006602 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006603 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006604 return -1;
6605 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006606 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6607 (flags&F_ALT) ? "#" : "",
6608 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 return usprintf(buf, fmt, x);
6610}
6611
Tim Peters38fd5b62000-09-21 05:43:11 +00006612static PyObject*
6613formatlong(PyObject *val, int flags, int prec, int type)
6614{
6615 char *buf;
6616 int i, len;
6617 PyObject *str; /* temporary string object. */
6618 PyUnicodeObject *result;
6619
6620 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6621 if (!str)
6622 return NULL;
6623 result = _PyUnicode_New(len);
6624 for (i = 0; i < len; i++)
6625 result->str[i] = buf[i];
6626 result->str[len] = 0;
6627 Py_DECREF(str);
6628 return (PyObject*)result;
6629}
6630
Guido van Rossumd57fd912000-03-10 22:53:23 +00006631static int
6632formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006633 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634 int flags,
6635 int prec,
6636 int type,
6637 PyObject *v)
6638{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006639 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006640 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6641 * + 1 + 1
6642 * = 24
6643 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006644 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006645 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 long x;
6647
6648 x = PyInt_AsLong(v);
6649 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006650 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006651 if (x < 0 && type == 'u') {
6652 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006653 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006654 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6655 sign = "-";
6656 else
6657 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006659 prec = 1;
6660
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006661 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6662 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006663 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006664 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006665 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006666 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006667 return -1;
6668 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006669
6670 if ((flags & F_ALT) &&
6671 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006672 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006673 * of issues that cause pain:
6674 * - when 0 is being converted, the C standard leaves off
6675 * the '0x' or '0X', which is inconsistent with other
6676 * %#x/%#X conversions and inconsistent with Python's
6677 * hex() function
6678 * - there are platforms that violate the standard and
6679 * convert 0 with the '0x' or '0X'
6680 * (Metrowerks, Compaq Tru64)
6681 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006682 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006683 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006684 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006685 * We can achieve the desired consistency by inserting our
6686 * own '0x' or '0X' prefix, and substituting %x/%X in place
6687 * of %#x/%#X.
6688 *
6689 * Note that this is the same approach as used in
6690 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006691 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006692 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6693 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006694 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006695 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006696 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6697 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006698 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006699 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006700 if (sign[0])
6701 return usprintf(buf, fmt, -x);
6702 else
6703 return usprintf(buf, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704}
6705
6706static int
6707formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006708 size_t buflen,
6709 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006711 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006712 if (PyUnicode_Check(v)) {
6713 if (PyUnicode_GET_SIZE(v) != 1)
6714 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006716 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006717
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006718 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006719 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006720 goto onError;
6721 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6722 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723
6724 else {
6725 /* Integer input truncated to a character */
6726 long x;
6727 x = PyInt_AsLong(v);
6728 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006729 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006730#ifdef Py_UNICODE_WIDE
6731 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006732 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006733 "%c arg not in range(0x110000) "
6734 "(wide Python build)");
6735 return -1;
6736 }
6737#else
6738 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006739 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006740 "%c arg not in range(0x10000) "
6741 "(narrow Python build)");
6742 return -1;
6743 }
6744#endif
6745 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746 }
6747 buf[1] = '\0';
6748 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006749
6750 onError:
6751 PyErr_SetString(PyExc_TypeError,
6752 "%c requires int or char");
6753 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006754}
6755
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
6765
Guido van Rossumd57fd912000-03-10 22:53:23 +00006766PyObject *PyUnicode_Format(PyObject *format,
6767 PyObject *args)
6768{
6769 Py_UNICODE *fmt, *res;
6770 int fmtcnt, rescnt, reslen, arglen, argidx;
6771 int args_owned = 0;
6772 PyUnicodeObject *result = NULL;
6773 PyObject *dict = NULL;
6774 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006775
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776 if (format == NULL || args == NULL) {
6777 PyErr_BadInternalCall();
6778 return NULL;
6779 }
6780 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006781 if (uformat == NULL)
6782 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006783 fmt = PyUnicode_AS_UNICODE(uformat);
6784 fmtcnt = PyUnicode_GET_SIZE(uformat);
6785
6786 reslen = rescnt = fmtcnt + 100;
6787 result = _PyUnicode_New(reslen);
6788 if (result == NULL)
6789 goto onError;
6790 res = PyUnicode_AS_UNICODE(result);
6791
6792 if (PyTuple_Check(args)) {
6793 arglen = PyTuple_Size(args);
6794 argidx = 0;
6795 }
6796 else {
6797 arglen = -1;
6798 argidx = -2;
6799 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006800 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6801 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006802 dict = args;
6803
6804 while (--fmtcnt >= 0) {
6805 if (*fmt != '%') {
6806 if (--rescnt < 0) {
6807 rescnt = fmtcnt + 100;
6808 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006809 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810 return NULL;
6811 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6812 --rescnt;
6813 }
6814 *res++ = *fmt++;
6815 }
6816 else {
6817 /* Got a format specifier */
6818 int flags = 0;
6819 int width = -1;
6820 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006821 Py_UNICODE c = '\0';
6822 Py_UNICODE fill;
6823 PyObject *v = NULL;
6824 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006825 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006826 Py_UNICODE sign;
6827 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006828 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006829
6830 fmt++;
6831 if (*fmt == '(') {
6832 Py_UNICODE *keystart;
6833 int keylen;
6834 PyObject *key;
6835 int pcount = 1;
6836
6837 if (dict == NULL) {
6838 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006839 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006840 goto onError;
6841 }
6842 ++fmt;
6843 --fmtcnt;
6844 keystart = fmt;
6845 /* Skip over balanced parentheses */
6846 while (pcount > 0 && --fmtcnt >= 0) {
6847 if (*fmt == ')')
6848 --pcount;
6849 else if (*fmt == '(')
6850 ++pcount;
6851 fmt++;
6852 }
6853 keylen = fmt - keystart - 1;
6854 if (fmtcnt < 0 || pcount > 0) {
6855 PyErr_SetString(PyExc_ValueError,
6856 "incomplete format key");
6857 goto onError;
6858 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006859#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006860 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006861 then looked up since Python uses strings to hold
6862 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006863 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006864 key = PyUnicode_EncodeUTF8(keystart,
6865 keylen,
6866 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006867#else
6868 key = PyUnicode_FromUnicode(keystart, keylen);
6869#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 if (key == NULL)
6871 goto onError;
6872 if (args_owned) {
6873 Py_DECREF(args);
6874 args_owned = 0;
6875 }
6876 args = PyObject_GetItem(dict, key);
6877 Py_DECREF(key);
6878 if (args == NULL) {
6879 goto onError;
6880 }
6881 args_owned = 1;
6882 arglen = -1;
6883 argidx = -2;
6884 }
6885 while (--fmtcnt >= 0) {
6886 switch (c = *fmt++) {
6887 case '-': flags |= F_LJUST; continue;
6888 case '+': flags |= F_SIGN; continue;
6889 case ' ': flags |= F_BLANK; continue;
6890 case '#': flags |= F_ALT; continue;
6891 case '0': flags |= F_ZERO; continue;
6892 }
6893 break;
6894 }
6895 if (c == '*') {
6896 v = getnextarg(args, arglen, &argidx);
6897 if (v == NULL)
6898 goto onError;
6899 if (!PyInt_Check(v)) {
6900 PyErr_SetString(PyExc_TypeError,
6901 "* wants int");
6902 goto onError;
6903 }
6904 width = PyInt_AsLong(v);
6905 if (width < 0) {
6906 flags |= F_LJUST;
6907 width = -width;
6908 }
6909 if (--fmtcnt >= 0)
6910 c = *fmt++;
6911 }
6912 else if (c >= '0' && c <= '9') {
6913 width = c - '0';
6914 while (--fmtcnt >= 0) {
6915 c = *fmt++;
6916 if (c < '0' || c > '9')
6917 break;
6918 if ((width*10) / 10 != width) {
6919 PyErr_SetString(PyExc_ValueError,
6920 "width too big");
6921 goto onError;
6922 }
6923 width = width*10 + (c - '0');
6924 }
6925 }
6926 if (c == '.') {
6927 prec = 0;
6928 if (--fmtcnt >= 0)
6929 c = *fmt++;
6930 if (c == '*') {
6931 v = getnextarg(args, arglen, &argidx);
6932 if (v == NULL)
6933 goto onError;
6934 if (!PyInt_Check(v)) {
6935 PyErr_SetString(PyExc_TypeError,
6936 "* wants int");
6937 goto onError;
6938 }
6939 prec = PyInt_AsLong(v);
6940 if (prec < 0)
6941 prec = 0;
6942 if (--fmtcnt >= 0)
6943 c = *fmt++;
6944 }
6945 else if (c >= '0' && c <= '9') {
6946 prec = c - '0';
6947 while (--fmtcnt >= 0) {
6948 c = Py_CHARMASK(*fmt++);
6949 if (c < '0' || c > '9')
6950 break;
6951 if ((prec*10) / 10 != prec) {
6952 PyErr_SetString(PyExc_ValueError,
6953 "prec too big");
6954 goto onError;
6955 }
6956 prec = prec*10 + (c - '0');
6957 }
6958 }
6959 } /* prec */
6960 if (fmtcnt >= 0) {
6961 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 if (--fmtcnt >= 0)
6963 c = *fmt++;
6964 }
6965 }
6966 if (fmtcnt < 0) {
6967 PyErr_SetString(PyExc_ValueError,
6968 "incomplete format");
6969 goto onError;
6970 }
6971 if (c != '%') {
6972 v = getnextarg(args, arglen, &argidx);
6973 if (v == NULL)
6974 goto onError;
6975 }
6976 sign = 0;
6977 fill = ' ';
6978 switch (c) {
6979
6980 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006981 pbuf = formatbuf;
6982 /* presume that buffer length is at least 1 */
6983 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984 len = 1;
6985 break;
6986
6987 case 's':
6988 case 'r':
6989 if (PyUnicode_Check(v) && c == 's') {
6990 temp = v;
6991 Py_INCREF(temp);
6992 }
6993 else {
6994 PyObject *unicode;
6995 if (c == 's')
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00006996 temp = PyObject_Unicode(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006997 else
6998 temp = PyObject_Repr(v);
6999 if (temp == NULL)
7000 goto onError;
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007001 if (PyUnicode_Check(temp))
7002 /* nothing to do */;
7003 else if (PyString_Check(temp)) {
7004 /* convert to string to Unicode */
Fred Drakee4315f52000-05-09 19:53:39 +00007005 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00007006 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00007007 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00007008 "strict");
7009 Py_DECREF(temp);
7010 temp = unicode;
7011 if (temp == NULL)
7012 goto onError;
7013 }
Marc-André Lemburgd25c6502004-07-23 16:13:25 +00007014 else {
7015 Py_DECREF(temp);
7016 PyErr_SetString(PyExc_TypeError,
7017 "%s argument has non-string str()");
7018 goto onError;
7019 }
7020 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007021 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007022 len = PyUnicode_GET_SIZE(temp);
7023 if (prec >= 0 && len > prec)
7024 len = prec;
7025 break;
7026
7027 case 'i':
7028 case 'd':
7029 case 'u':
7030 case 'o':
7031 case 'x':
7032 case 'X':
7033 if (c == 'i')
7034 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00007035 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007036 temp = formatlong(v, flags, prec, c);
7037 if (!temp)
7038 goto onError;
7039 pbuf = PyUnicode_AS_UNICODE(temp);
7040 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00007041 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007042 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007043 else {
7044 pbuf = formatbuf;
7045 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7046 flags, prec, c, v);
7047 if (len < 0)
7048 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00007049 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007050 }
7051 if (flags & F_ZERO)
7052 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00007053 break;
7054
7055 case 'e':
7056 case 'E':
7057 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007058 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059 case 'g':
7060 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00007061 if (c == 'F')
7062 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007063 pbuf = formatbuf;
7064 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
7065 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007066 if (len < 0)
7067 goto onError;
7068 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00007069 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070 fill = '0';
7071 break;
7072
7073 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007074 pbuf = formatbuf;
7075 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076 if (len < 0)
7077 goto onError;
7078 break;
7079
7080 default:
7081 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00007082 "unsupported format character '%c' (0x%x) "
7083 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00007084 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00007085 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00007086 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00007087 goto onError;
7088 }
7089 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00007090 if (*pbuf == '-' || *pbuf == '+') {
7091 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092 len--;
7093 }
7094 else if (flags & F_SIGN)
7095 sign = '+';
7096 else if (flags & F_BLANK)
7097 sign = ' ';
7098 else
7099 sign = 0;
7100 }
7101 if (width < len)
7102 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007103 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104 reslen -= rescnt;
7105 rescnt = width + fmtcnt + 100;
7106 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00007107 if (reslen < 0) {
7108 Py_DECREF(result);
7109 return PyErr_NoMemory();
7110 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007111 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007112 return NULL;
7113 res = PyUnicode_AS_UNICODE(result)
7114 + reslen - rescnt;
7115 }
7116 if (sign) {
7117 if (fill != ' ')
7118 *res++ = sign;
7119 rescnt--;
7120 if (width > len)
7121 width--;
7122 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007123 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7124 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007125 assert(pbuf[1] == c);
7126 if (fill != ' ') {
7127 *res++ = *pbuf++;
7128 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007129 }
Tim Petersfff53252001-04-12 18:38:48 +00007130 rescnt -= 2;
7131 width -= 2;
7132 if (width < 0)
7133 width = 0;
7134 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007135 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007136 if (width > len && !(flags & F_LJUST)) {
7137 do {
7138 --rescnt;
7139 *res++ = fill;
7140 } while (--width > len);
7141 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007142 if (fill == ' ') {
7143 if (sign)
7144 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007145 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007146 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007147 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007148 *res++ = *pbuf++;
7149 *res++ = *pbuf++;
7150 }
7151 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007152 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007153 res += len;
7154 rescnt -= len;
7155 while (--width >= len) {
7156 --rescnt;
7157 *res++ = ' ';
7158 }
7159 if (dict && (argidx < arglen) && c != '%') {
7160 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007161 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007162 goto onError;
7163 }
7164 Py_XDECREF(temp);
7165 } /* '%' */
7166 } /* until end */
7167 if (argidx < arglen && !dict) {
7168 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007169 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007170 goto onError;
7171 }
7172
7173 if (args_owned) {
7174 Py_DECREF(args);
7175 }
7176 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00007177 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007178 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179 return (PyObject *)result;
7180
7181 onError:
7182 Py_XDECREF(result);
7183 Py_DECREF(uformat);
7184 if (args_owned) {
7185 Py_DECREF(args);
7186 }
7187 return NULL;
7188}
7189
7190static PyBufferProcs unicode_as_buffer = {
7191 (getreadbufferproc) unicode_buffer_getreadbuf,
7192 (getwritebufferproc) unicode_buffer_getwritebuf,
7193 (getsegcountproc) unicode_buffer_getsegcount,
7194 (getcharbufferproc) unicode_buffer_getcharbuf,
7195};
7196
Jeremy Hylton938ace62002-07-17 16:30:39 +00007197static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007198unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7199
Tim Peters6d6c1a32001-08-02 04:15:00 +00007200static PyObject *
7201unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7202{
7203 PyObject *x = NULL;
7204 static char *kwlist[] = {"string", "encoding", "errors", 0};
7205 char *encoding = NULL;
7206 char *errors = NULL;
7207
Guido van Rossume023fe02001-08-30 03:12:59 +00007208 if (type != &PyUnicode_Type)
7209 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007210 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7211 kwlist, &x, &encoding, &errors))
7212 return NULL;
7213 if (x == NULL)
7214 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007215 if (encoding == NULL && errors == NULL)
7216 return PyObject_Unicode(x);
7217 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007218 return PyUnicode_FromEncodedObject(x, encoding, errors);
7219}
7220
Guido van Rossume023fe02001-08-30 03:12:59 +00007221static PyObject *
7222unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7223{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007224 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007225 int n;
7226
7227 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7228 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7229 if (tmp == NULL)
7230 return NULL;
7231 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007232 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007233 if (pnew == NULL) {
7234 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007235 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007236 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007237 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7238 if (pnew->str == NULL) {
7239 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007240 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007241 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007242 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007243 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007244 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7245 pnew->length = n;
7246 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007247 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007248 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007249}
7250
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007251PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007252"unicode(string [, encoding[, errors]]) -> object\n\
7253\n\
7254Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007255encoding defaults to the current default string encoding.\n\
7256errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007257
Guido van Rossumd57fd912000-03-10 22:53:23 +00007258PyTypeObject PyUnicode_Type = {
7259 PyObject_HEAD_INIT(&PyType_Type)
7260 0, /* ob_size */
7261 "unicode", /* tp_name */
7262 sizeof(PyUnicodeObject), /* tp_size */
7263 0, /* tp_itemsize */
7264 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007265 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007266 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007267 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007268 0, /* tp_setattr */
7269 (cmpfunc) unicode_compare, /* tp_compare */
7270 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007271 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007272 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007273 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007274 (hashfunc) unicode_hash, /* tp_hash*/
7275 0, /* tp_call*/
7276 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007277 PyObject_GenericGetAttr, /* tp_getattro */
7278 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007279 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007280 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7281 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007282 unicode_doc, /* tp_doc */
7283 0, /* tp_traverse */
7284 0, /* tp_clear */
7285 0, /* tp_richcompare */
7286 0, /* tp_weaklistoffset */
7287 0, /* tp_iter */
7288 0, /* tp_iternext */
7289 unicode_methods, /* tp_methods */
7290 0, /* tp_members */
7291 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007292 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007293 0, /* tp_dict */
7294 0, /* tp_descr_get */
7295 0, /* tp_descr_set */
7296 0, /* tp_dictoffset */
7297 0, /* tp_init */
7298 0, /* tp_alloc */
7299 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007300 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007301};
7302
7303/* Initialize the Unicode implementation */
7304
Thomas Wouters78890102000-07-22 19:25:51 +00007305void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007306{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007307 int i;
7308
Fred Drakee4315f52000-05-09 19:53:39 +00007309 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007310 unicode_freelist = NULL;
7311 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007312 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007313 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007314 for (i = 0; i < 256; i++)
7315 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007316 if (PyType_Ready(&PyUnicode_Type) < 0)
7317 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007318}
7319
7320/* Finalize the Unicode implementation */
7321
7322void
Thomas Wouters78890102000-07-22 19:25:51 +00007323_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007324{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007325 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007326 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007327
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007328 Py_XDECREF(unicode_empty);
7329 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007330
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007331 for (i = 0; i < 256; i++) {
7332 if (unicode_latin1[i]) {
7333 Py_DECREF(unicode_latin1[i]);
7334 unicode_latin1[i] = NULL;
7335 }
7336 }
7337
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007338 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007339 PyUnicodeObject *v = u;
7340 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007341 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007342 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007343 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007344 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007345 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007346 unicode_freelist = NULL;
7347 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007348}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007349
7350/*
7351Local variables:
7352c-basic-offset: 4
7353indent-tabs-mode: nil
7354End:
7355*/