blob: c950f8b1691c712c4385ec78cd71770cc1ccabd4 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

    Copyright (c) 1999 by Secret Labs AB
    Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for Unicode objects sitting on the free list.

   Objects on the free list whose size is below this limit keep their
   character buffer allocated, which saves a malloc() round trip when
   small Unicode objects are created again.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory around.

   A limit of 0 effectively switches the optimization off.

   Note: this is an experimental feature.  If Unicode objects cause
   core dumps, try turning it off first.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order switches; little endian unless WORDS_BIGENDIAN is defined */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000101 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 /* MvL said unicode->str[] may be signed. Python generally assumes
136 * an int contains at least 32 bits, and we don't use more than
137 * 32 bits even in a UCS4 build, so casting to unsigned int should
138 * be correct.
139 */
140 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000141 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000142 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000143 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 return -1;
145 }
146
147 /* We allocate one more byte to make sure the string is
148 Ux0000 terminated -- XXX is this needed ? */
149 oldstr = unicode->str;
150 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
151 if (!unicode->str) {
152 unicode->str = oldstr;
153 PyErr_NoMemory();
154 return -1;
155 }
156 unicode->str[length] = 0;
157 unicode->length = length;
158
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000159 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000160 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000161 if (unicode->defenc) {
162 Py_DECREF(unicode->defenc);
163 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 }
165 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000166
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 return 0;
168}
169
170/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000171 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000172
173 XXX This allocator could further be enhanced by assuring that the
174 free list never reduces its size below 1.
175
176*/
177
178static
179PyUnicodeObject *_PyUnicode_New(int length)
180{
181 register PyUnicodeObject *unicode;
182
Tim Petersced69f82003-09-16 20:30:58 +0000183 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (length == 0 && unicode_empty != NULL) {
185 Py_INCREF(unicode_empty);
186 return unicode_empty;
187 }
188
189 /* Unicode freelist & memory allocation */
190 if (unicode_freelist) {
191 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000192 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000195 /* Keep-Alive optimization: we only upsize the buffer,
196 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000197 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000198 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000199 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000200 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000201 }
202 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000203 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000205 }
206 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 }
208 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000209 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000210 if (unicode == NULL)
211 return NULL;
212 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
213 }
214
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000215 if (!unicode->str) {
216 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000217 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000219 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000220 * the caller fails before initializing str -- unicode_resize()
221 * reads str[0], and the Keep-Alive optimization can keep memory
222 * allocated for str alive across a call to unicode_dealloc(unicode).
223 * We don't want unicode_resize to read uninitialized memory in
224 * that case.
225 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000226 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000230 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000235 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
Guido van Rossum9475a232001-10-05 20:51:39 +0000240void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000242 if (PyUnicode_CheckExact(unicode) &&
243 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000250 if (unicode->defenc) {
251 Py_DECREF(unicode->defenc);
252 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000261 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000262 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000263 }
264}
265
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000266int PyUnicode_Resize(PyObject **unicode, int length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000267{
268 register PyUnicodeObject *v;
269
270 /* Argument checks */
271 if (unicode == NULL) {
272 PyErr_BadInternalCall();
273 return -1;
274 }
275 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000276 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000277 PyErr_BadInternalCall();
278 return -1;
279 }
280
281 /* Resizing unicode_empty and single character objects is not
282 possible since these are being shared. We simply return a fresh
283 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000284 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285 (v == unicode_empty || v->length == 1)) {
286 PyUnicodeObject *w = _PyUnicode_New(length);
287 if (w == NULL)
288 return -1;
289 Py_UNICODE_COPY(w->str, v->str,
290 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000291 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000292 *unicode = (PyObject *)w;
293 return 0;
294 }
295
296 /* Note that we don't have to modify *unicode for unshared Unicode
297 objects, since we can modify them in-place. */
298 return unicode_resize(v, length);
299}
300
301/* Internal API for use in unicodeobject.c only ! */
302#define _PyUnicode_Resize(unicodevar, length) \
303 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
304
Guido van Rossumd57fd912000-03-10 22:53:23 +0000305PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
306 int size)
307{
308 PyUnicodeObject *unicode;
309
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000310 /* If the Unicode data is known at construction time, we can apply
311 some optimizations which share commonly used objects. */
312 if (u != NULL) {
313
314 /* Optimization for empty strings */
315 if (size == 0 && unicode_empty != NULL) {
316 Py_INCREF(unicode_empty);
317 return (PyObject *)unicode_empty;
318 }
319
320 /* Single character Unicode objects in the Latin-1 range are
321 shared when using this constructor */
322 if (size == 1 && *u < 256) {
323 unicode = unicode_latin1[*u];
324 if (!unicode) {
325 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000326 if (!unicode)
327 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000328 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 unicode_latin1[*u] = unicode;
330 }
331 Py_INCREF(unicode);
332 return (PyObject *)unicode;
333 }
334 }
Tim Petersced69f82003-09-16 20:30:58 +0000335
Guido van Rossumd57fd912000-03-10 22:53:23 +0000336 unicode = _PyUnicode_New(size);
337 if (!unicode)
338 return NULL;
339
340 /* Copy the Unicode data into the new object */
341 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000342 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000343
344 return (PyObject *)unicode;
345}
346
347#ifdef HAVE_WCHAR_H
348
349PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
350 int size)
351{
352 PyUnicodeObject *unicode;
353
354 if (w == NULL) {
355 PyErr_BadInternalCall();
356 return NULL;
357 }
358
359 unicode = _PyUnicode_New(size);
360 if (!unicode)
361 return NULL;
362
363 /* Copy the wchar_t data into the new object */
364#ifdef HAVE_USABLE_WCHAR_T
365 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000366#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 {
368 register Py_UNICODE *u;
369 register int i;
370 u = PyUnicode_AS_UNICODE(unicode);
371 for (i = size; i >= 0; i--)
372 *u++ = *w++;
373 }
374#endif
375
376 return (PyObject *)unicode;
377}
378
379int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
380 register wchar_t *w,
381 int size)
382{
383 if (unicode == NULL) {
384 PyErr_BadInternalCall();
385 return -1;
386 }
387 if (size > PyUnicode_GET_SIZE(unicode))
388 size = PyUnicode_GET_SIZE(unicode);
389#ifdef HAVE_USABLE_WCHAR_T
390 memcpy(w, unicode->str, size * sizeof(wchar_t));
391#else
392 {
393 register Py_UNICODE *u;
394 register int i;
395 u = PyUnicode_AS_UNICODE(unicode);
396 for (i = size; i >= 0; i--)
397 *w++ = *u++;
398 }
399#endif
400
401 return size;
402}
403
404#endif
405
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000406PyObject *PyUnicode_FromOrdinal(int ordinal)
407{
408 Py_UNICODE s[2];
409
410#ifdef Py_UNICODE_WIDE
411 if (ordinal < 0 || ordinal > 0x10ffff) {
412 PyErr_SetString(PyExc_ValueError,
413 "unichr() arg not in range(0x110000) "
414 "(wide Python build)");
415 return NULL;
416 }
417#else
418 if (ordinal < 0 || ordinal > 0xffff) {
419 PyErr_SetString(PyExc_ValueError,
420 "unichr() arg not in range(0x10000) "
421 "(narrow Python build)");
422 return NULL;
423 }
424#endif
425
426 if (ordinal <= 0xffff) {
427 /* UCS-2 character */
428 s[0] = (Py_UNICODE) ordinal;
429 return PyUnicode_FromUnicode(s, 1);
430 }
431 else {
432#ifndef Py_UNICODE_WIDE
433 /* UCS-4 character. store as two surrogate characters */
434 ordinal -= 0x10000L;
435 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
436 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
437 return PyUnicode_FromUnicode(s, 2);
438#else
439 s[0] = (Py_UNICODE)ordinal;
440 return PyUnicode_FromUnicode(s, 1);
441#endif
442 }
443}
444
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445PyObject *PyUnicode_FromObject(register PyObject *obj)
446{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000447 /* XXX Perhaps we should make this API an alias of
448 PyObject_Unicode() instead ?! */
449 if (PyUnicode_CheckExact(obj)) {
450 Py_INCREF(obj);
451 return obj;
452 }
453 if (PyUnicode_Check(obj)) {
454 /* For a Unicode subtype that's not a Unicode object,
455 return a true Unicode object with the same data. */
456 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
457 PyUnicode_GET_SIZE(obj));
458 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
460}
461
462PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
463 const char *encoding,
464 const char *errors)
465{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000466 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000468 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000469
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470 if (obj == NULL) {
471 PyErr_BadInternalCall();
472 return NULL;
473 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000474
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000475#if 0
476 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000477 that no encodings is given and then redirect to
478 PyObject_Unicode() which then applies the additional logic for
479 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000480
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000481 NOTE: This API should really only be used for object which
482 represent *encoded* Unicode !
483
484 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000485 if (PyUnicode_Check(obj)) {
486 if (encoding) {
487 PyErr_SetString(PyExc_TypeError,
488 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000489 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000490 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000491 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000492 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000493#else
494 if (PyUnicode_Check(obj)) {
495 PyErr_SetString(PyExc_TypeError,
496 "decoding Unicode is not supported");
497 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000498 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000499#endif
500
501 /* Coerce object */
502 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000503 s = PyString_AS_STRING(obj);
504 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000505 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000506 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
507 /* Overwrite the error message with something more useful in
508 case of a TypeError. */
509 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000510 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000511 "coercing to Unicode: need string or buffer, "
512 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000513 obj->ob_type->tp_name);
514 goto onError;
515 }
Tim Petersced69f82003-09-16 20:30:58 +0000516
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000517 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000518 if (len == 0) {
519 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000520 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000521 }
Tim Petersced69f82003-09-16 20:30:58 +0000522 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000523 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000524
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000525 return v;
526
527 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000528 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000529}
530
531PyObject *PyUnicode_Decode(const char *s,
532 int size,
533 const char *encoding,
534 const char *errors)
535{
536 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000537
538 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000539 encoding = PyUnicode_GetDefaultEncoding();
540
541 /* Shortcuts for common default encodings */
542 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000543 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000544 else if (strcmp(encoding, "latin-1") == 0)
545 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000546#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
547 else if (strcmp(encoding, "mbcs") == 0)
548 return PyUnicode_DecodeMBCS(s, size, errors);
549#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000550 else if (strcmp(encoding, "ascii") == 0)
551 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000552
553 /* Decode via the codec registry */
554 buffer = PyBuffer_FromMemory((void *)s, size);
555 if (buffer == NULL)
556 goto onError;
557 unicode = PyCodec_Decode(buffer, encoding, errors);
558 if (unicode == NULL)
559 goto onError;
560 if (!PyUnicode_Check(unicode)) {
561 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000562 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000563 unicode->ob_type->tp_name);
564 Py_DECREF(unicode);
565 goto onError;
566 }
567 Py_DECREF(buffer);
568 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000569
Guido van Rossumd57fd912000-03-10 22:53:23 +0000570 onError:
571 Py_XDECREF(buffer);
572 return NULL;
573}
574
575PyObject *PyUnicode_Encode(const Py_UNICODE *s,
576 int size,
577 const char *encoding,
578 const char *errors)
579{
580 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000581
Guido van Rossumd57fd912000-03-10 22:53:23 +0000582 unicode = PyUnicode_FromUnicode(s, size);
583 if (unicode == NULL)
584 return NULL;
585 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
586 Py_DECREF(unicode);
587 return v;
588}
589
590PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
591 const char *encoding,
592 const char *errors)
593{
594 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000595
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596 if (!PyUnicode_Check(unicode)) {
597 PyErr_BadArgument();
598 goto onError;
599 }
Fred Drakee4315f52000-05-09 19:53:39 +0000600
Tim Petersced69f82003-09-16 20:30:58 +0000601 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000602 encoding = PyUnicode_GetDefaultEncoding();
603
604 /* Shortcuts for common default encodings */
605 if (errors == NULL) {
606 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000607 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000608 else if (strcmp(encoding, "latin-1") == 0)
609 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000610#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
611 else if (strcmp(encoding, "mbcs") == 0)
612 return PyUnicode_AsMBCSString(unicode);
613#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000614 else if (strcmp(encoding, "ascii") == 0)
615 return PyUnicode_AsASCIIString(unicode);
616 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617
618 /* Encode via the codec registry */
619 v = PyCodec_Encode(unicode, encoding, errors);
620 if (v == NULL)
621 goto onError;
622 /* XXX Should we really enforce this ? */
623 if (!PyString_Check(v)) {
624 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000625 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 v->ob_type->tp_name);
627 Py_DECREF(v);
628 goto onError;
629 }
630 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000631
Guido van Rossumd57fd912000-03-10 22:53:23 +0000632 onError:
633 return NULL;
634}
635
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000636PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
637 const char *errors)
638{
639 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
640
641 if (v)
642 return v;
643 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
644 if (v && errors == NULL)
645 ((PyUnicodeObject *)unicode)->defenc = v;
646 return v;
647}
648
Guido van Rossumd57fd912000-03-10 22:53:23 +0000649Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
650{
651 if (!PyUnicode_Check(unicode)) {
652 PyErr_BadArgument();
653 goto onError;
654 }
655 return PyUnicode_AS_UNICODE(unicode);
656
657 onError:
658 return NULL;
659}
660
661int PyUnicode_GetSize(PyObject *unicode)
662{
663 if (!PyUnicode_Check(unicode)) {
664 PyErr_BadArgument();
665 goto onError;
666 }
667 return PyUnicode_GET_SIZE(unicode);
668
669 onError:
670 return -1;
671}
672
Thomas Wouters78890102000-07-22 19:25:51 +0000673const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000674{
675 return unicode_default_encoding;
676}
677
678int PyUnicode_SetDefaultEncoding(const char *encoding)
679{
680 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000681
Fred Drakee4315f52000-05-09 19:53:39 +0000682 /* Make sure the encoding is valid. As side effect, this also
683 loads the encoding into the codec registry cache. */
684 v = _PyCodec_Lookup(encoding);
685 if (v == NULL)
686 goto onError;
687 Py_DECREF(v);
688 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000689 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000690 sizeof(unicode_default_encoding));
691 return 0;
692
693 onError:
694 return -1;
695}
696
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000697/* error handling callback helper:
698 build arguments, call the callback and check the arguments,
699 if no exception occured, copy the replacement to the output
700 and adjust various state variables.
701 return 0 on success, -1 on error
702*/
703
704static
705int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
706 const char *encoding, const char *reason,
707 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
708 PyObject **output, int *outpos, Py_UNICODE **outptr)
709{
710 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
711
712 PyObject *restuple = NULL;
713 PyObject *repunicode = NULL;
714 int outsize = PyUnicode_GET_SIZE(*output);
715 int requiredsize;
716 int newpos;
717 Py_UNICODE *repptr;
718 int repsize;
719 int res = -1;
720
721 if (*errorHandler == NULL) {
722 *errorHandler = PyCodec_LookupError(errors);
723 if (*errorHandler == NULL)
724 goto onError;
725 }
726
727 if (*exceptionObject == NULL) {
728 *exceptionObject = PyUnicodeDecodeError_Create(
729 encoding, input, insize, *startinpos, *endinpos, reason);
730 if (*exceptionObject == NULL)
731 goto onError;
732 }
733 else {
734 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
735 goto onError;
736 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
737 goto onError;
738 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
739 goto onError;
740 }
741
742 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
743 if (restuple == NULL)
744 goto onError;
745 if (!PyTuple_Check(restuple)) {
746 PyErr_Format(PyExc_TypeError, &argparse[4]);
747 goto onError;
748 }
749 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
750 goto onError;
751 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000752 newpos = insize+newpos;
753 if (newpos<0 || newpos>insize) {
754 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
755 goto onError;
756 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000757
758 /* need more space? (at least enough for what we
759 have+the replacement+the rest of the string (starting
760 at the new input position), so we won't have to check space
761 when there are no errors in the rest of the string) */
762 repptr = PyUnicode_AS_UNICODE(repunicode);
763 repsize = PyUnicode_GET_SIZE(repunicode);
764 requiredsize = *outpos + repsize + insize-newpos;
765 if (requiredsize > outsize) {
766 if (requiredsize<2*outsize)
767 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000768 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000769 goto onError;
770 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
771 }
772 *endinpos = newpos;
773 *inptr = input + newpos;
774 Py_UNICODE_COPY(*outptr, repptr, repsize);
775 *outptr += repsize;
776 *outpos += repsize;
777 /* we made it! */
778 res = 0;
779
780 onError:
781 Py_XDECREF(restuple);
782 return res;
783}
784
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000785/* --- UTF-7 Codec -------------------------------------------------------- */
786
787/* see RFC2152 for details */
788
/* Classification of the 128 ASCII characters for the UTF-7 codec,
   indexed by character code.  An entry tells whether the character is
   special, i.e. cannot be directly encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
char utf7_special[128] = {
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
807
/* Nonzero when c cannot be written directly and has to go through a
   base64 shift sequence: anything above 127, every class-1 entry of
   utf7_special, class 2 (whitespace) when encodeWS is set and class 3
   (RFC2152 Set O) when encodeO is set.  c is evaluated more than once. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* Nonzero when c is one of the 64 base64 digits.  Spelled out as ASCII
   range tests instead of isalnum(): the argument is a Py_UNICODE (or a
   sign-extended byte) that may lie outside the unsigned char range
   isalnum() is defined for, and isalnum() is locale dependent.
   c is evaluated more than once. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Six-bit value of the base64 digit c; only meaningful when
   B64CHAR(c) holds. */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit as many complete base64 digits as the bit accumulator ch holds.
   bits is the number of valid low bits in ch; out and bits are updated
   in place, so all three arguments must be plain lvalues. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Drain complete 16-bit units from the bit accumulator ch into out.
   Besides its arguments the macro relies on the enclosing function
   providing a variable `errmsg` and a label `utf7Error`: a unit in the
   range 0xDC00..0xDFFF sets surrogate, stores a message in errmsg and
   jumps to utf7Error; the unit seen on the next expansion is then
   dropped silently and surrogate is cleared. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }

Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000843PyObject *PyUnicode_DecodeUTF7(const char *s,
844 int size,
845 const char *errors)
846{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000847 const char *starts = s;
848 int startinpos;
849 int endinpos;
850 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000851 const char *e;
852 PyUnicodeObject *unicode;
853 Py_UNICODE *p;
854 const char *errmsg = "";
855 int inShift = 0;
856 unsigned int bitsleft = 0;
857 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000858 int surrogate = 0;
859 PyObject *errorHandler = NULL;
860 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000861
862 unicode = _PyUnicode_New(size);
863 if (!unicode)
864 return NULL;
865 if (size == 0)
866 return (PyObject *)unicode;
867
868 p = unicode->str;
869 e = s + size;
870
871 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000872 Py_UNICODE ch;
873 restart:
874 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000875
876 if (inShift) {
877 if ((ch == '-') || !B64CHAR(ch)) {
878 inShift = 0;
879 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000880
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000881 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
882 if (bitsleft >= 6) {
883 /* The shift sequence has a partial character in it. If
884 bitsleft < 6 then we could just classify it as padding
885 but that is not the case here */
886
887 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000888 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000889 }
890 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000891 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000892 here so indicate the potential of a misencoded character. */
893
894 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
895 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
896 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000897 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000898 }
899
900 if (ch == '-') {
901 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000902 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000903 inShift = 1;
904 }
905 } else if (SPECIAL(ch,0,0)) {
906 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000907 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000908 } else {
909 *p++ = ch;
910 }
911 } else {
912 charsleft = (charsleft << 6) | UB64(ch);
913 bitsleft += 6;
914 s++;
915 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
916 }
917 }
918 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000919 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000920 s++;
921 if (s < e && *s == '-') {
922 s++;
923 *p++ = '+';
924 } else
925 {
926 inShift = 1;
927 bitsleft = 0;
928 }
929 }
930 else if (SPECIAL(ch,0,0)) {
931 errmsg = "unexpected special character";
932 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000933 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000934 }
935 else {
936 *p++ = ch;
937 s++;
938 }
939 continue;
940 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000941 outpos = p-PyUnicode_AS_UNICODE(unicode);
942 endinpos = s-starts;
943 if (unicode_decode_call_errorhandler(
944 errors, &errorHandler,
945 "utf7", errmsg,
946 starts, size, &startinpos, &endinpos, &exc, &s,
947 (PyObject **)&unicode, &outpos, &p))
948 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000949 }
950
951 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000952 outpos = p-PyUnicode_AS_UNICODE(unicode);
953 endinpos = size;
954 if (unicode_decode_call_errorhandler(
955 errors, &errorHandler,
956 "utf7", "unterminated shift sequence",
957 starts, size, &startinpos, &endinpos, &exc, &s,
958 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000959 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000960 if (s < e)
961 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000962 }
963
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000964 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000965 goto onError;
966
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000967 Py_XDECREF(errorHandler);
968 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000969 return (PyObject *)unicode;
970
971onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000972 Py_XDECREF(errorHandler);
973 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000974 Py_DECREF(unicode);
975 return NULL;
976}
977
978
979PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
980 int size,
981 int encodeSetO,
982 int encodeWhiteSpace,
983 const char *errors)
984{
985 PyObject *v;
986 /* It might be possible to tighten this worst case */
987 unsigned int cbAllocated = 5 * size;
988 int inShift = 0;
989 int i = 0;
990 unsigned int bitsleft = 0;
991 unsigned long charsleft = 0;
992 char * out;
993 char * start;
994
995 if (size == 0)
996 return PyString_FromStringAndSize(NULL, 0);
997
998 v = PyString_FromStringAndSize(NULL, cbAllocated);
999 if (v == NULL)
1000 return NULL;
1001
1002 start = out = PyString_AS_STRING(v);
1003 for (;i < size; ++i) {
1004 Py_UNICODE ch = s[i];
1005
1006 if (!inShift) {
1007 if (ch == '+') {
1008 *out++ = '+';
1009 *out++ = '-';
1010 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1011 charsleft = ch;
1012 bitsleft = 16;
1013 *out++ = '+';
1014 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1015 inShift = bitsleft > 0;
1016 } else {
1017 *out++ = (char) ch;
1018 }
1019 } else {
1020 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1021 *out++ = B64(charsleft << (6-bitsleft));
1022 charsleft = 0;
1023 bitsleft = 0;
1024 /* Characters not in the BASE64 set implicitly unshift the sequence
1025 so no '-' is required, except if the character is itself a '-' */
1026 if (B64CHAR(ch) || ch == '-') {
1027 *out++ = '-';
1028 }
1029 inShift = 0;
1030 *out++ = (char) ch;
1031 } else {
1032 bitsleft += 16;
1033 charsleft = (charsleft << 16) | ch;
1034 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1035
1036 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001037 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001038 or '-' then the shift sequence will be terminated implicitly and we
1039 don't have to insert a '-'. */
1040
1041 if (bitsleft == 0) {
1042 if (i + 1 < size) {
1043 Py_UNICODE ch2 = s[i+1];
1044
1045 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001046
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001047 } else if (B64CHAR(ch2) || ch2 == '-') {
1048 *out++ = '-';
1049 inShift = 0;
1050 } else {
1051 inShift = 0;
1052 }
1053
1054 }
1055 else {
1056 *out++ = '-';
1057 inShift = 0;
1058 }
1059 }
Tim Petersced69f82003-09-16 20:30:58 +00001060 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001061 }
1062 }
1063 if (bitsleft) {
1064 *out++= B64(charsleft << (6-bitsleft) );
1065 *out++ = '-';
1066 }
1067
Tim Peters5de98422002-04-27 18:44:32 +00001068 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001069 return v;
1070}
1071
1072#undef SPECIAL
1073#undef B64
1074#undef B64CHAR
1075#undef UB64
1076#undef ENCODE
1077#undef DECODE
1078
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079/* --- UTF-8 Codec -------------------------------------------------------- */
1080
/* Sequence length implied by a UTF-8 lead byte, indexed by the byte
   value; zero marks a byte that is not a legal prefix.  See RFC 2279
   for details.  Each row below covers 16 consecutive byte values. */
static
char utf8_code_length[256] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1102
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103PyObject *PyUnicode_DecodeUTF8(const char *s,
1104 int size,
1105 const char *errors)
1106{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001107 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001109 int startinpos;
1110 int endinpos;
1111 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 const char *e;
1113 PyUnicodeObject *unicode;
1114 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001115 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001116 PyObject *errorHandler = NULL;
1117 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001118
1119 /* Note: size will always be longer than the resulting Unicode
1120 character count */
1121 unicode = _PyUnicode_New(size);
1122 if (!unicode)
1123 return NULL;
1124 if (size == 0)
1125 return (PyObject *)unicode;
1126
1127 /* Unpack UTF-8 encoded data */
1128 p = unicode->str;
1129 e = s + size;
1130
1131 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001132 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001133
1134 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001135 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001136 s++;
1137 continue;
1138 }
1139
1140 n = utf8_code_length[ch];
1141
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001142 if (s + n > e) {
1143 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001144 startinpos = s-starts;
1145 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001146 goto utf8Error;
1147 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148
1149 switch (n) {
1150
1151 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001152 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001153 startinpos = s-starts;
1154 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001155 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156
1157 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001158 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001159 startinpos = s-starts;
1160 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001161 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162
1163 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001164 if ((s[1] & 0xc0) != 0x80) {
1165 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001166 startinpos = s-starts;
1167 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001168 goto utf8Error;
1169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001171 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001172 startinpos = s-starts;
1173 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001174 errmsg = "illegal encoding";
1175 goto utf8Error;
1176 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001177 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001178 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 break;
1180
1181 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001182 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001183 (s[2] & 0xc0) != 0x80) {
1184 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001185 startinpos = s-starts;
1186 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001187 goto utf8Error;
1188 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001190 if (ch < 0x0800) {
1191 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001192 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001193
1194 XXX For wide builds (UCS-4) we should probably try
1195 to recombine the surrogates into a single code
1196 unit.
1197 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001198 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001199 startinpos = s-starts;
1200 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001201 goto utf8Error;
1202 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001204 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001205 break;
1206
1207 case 4:
1208 if ((s[1] & 0xc0) != 0x80 ||
1209 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001210 (s[3] & 0xc0) != 0x80) {
1211 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001212 startinpos = s-starts;
1213 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001214 goto utf8Error;
1215 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001216 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1217 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1218 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001219 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001220 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001221 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001222 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001223 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001224 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001225 startinpos = s-starts;
1226 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001227 goto utf8Error;
1228 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001229#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001230 *p++ = (Py_UNICODE)ch;
1231#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001232 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001233
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001234 /* translate from 10000..10FFFF to 0..FFFF */
1235 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001236
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001237 /* high surrogate = top 10 bits added to D800 */
1238 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001239
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001240 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001241 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001242#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243 break;
1244
1245 default:
1246 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001247 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001248 startinpos = s-starts;
1249 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001250 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 }
1252 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001253 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001254
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001255 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001256 outpos = p-PyUnicode_AS_UNICODE(unicode);
1257 if (unicode_decode_call_errorhandler(
1258 errors, &errorHandler,
1259 "utf8", errmsg,
1260 starts, size, &startinpos, &endinpos, &exc, &s,
1261 (PyObject **)&unicode, &outpos, &p))
1262 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001263 }
1264
1265 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001266 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001267 goto onError;
1268
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001269 Py_XDECREF(errorHandler);
1270 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001271 return (PyObject *)unicode;
1272
1273onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001274 Py_XDECREF(errorHandler);
1275 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 Py_DECREF(unicode);
1277 return NULL;
1278}
1279
Tim Peters602f7402002-04-27 18:03:26 +00001280/* Allocation strategy: if the string is short, convert into a stack buffer
1281 and allocate exactly as much space needed at the end. Else allocate the
1282 maximum possible needed (4 result bytes per Unicode character), and return
1283 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001284*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001285PyObject *
1286PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1287 int size,
1288 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001289{
Tim Peters602f7402002-04-27 18:03:26 +00001290#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001291
Tim Peters602f7402002-04-27 18:03:26 +00001292 int i; /* index into s of next input byte */
1293 PyObject *v; /* result string object */
1294 char *p; /* next free byte in output buffer */
1295 int nallocated; /* number of result bytes allocated */
1296 int nneeded; /* number of result bytes needed */
1297 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001298
Tim Peters602f7402002-04-27 18:03:26 +00001299 assert(s != NULL);
1300 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001301
Tim Peters602f7402002-04-27 18:03:26 +00001302 if (size <= MAX_SHORT_UNICHARS) {
1303 /* Write into the stack buffer; nallocated can't overflow.
1304 * At the end, we'll allocate exactly as much heap space as it
1305 * turns out we need.
1306 */
1307 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1308 v = NULL; /* will allocate after we're done */
1309 p = stackbuf;
1310 }
1311 else {
1312 /* Overallocate on the heap, and give the excess back at the end. */
1313 nallocated = size * 4;
1314 if (nallocated / 4 != size) /* overflow! */
1315 return PyErr_NoMemory();
1316 v = PyString_FromStringAndSize(NULL, nallocated);
1317 if (v == NULL)
1318 return NULL;
1319 p = PyString_AS_STRING(v);
1320 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001321
Tim Peters602f7402002-04-27 18:03:26 +00001322 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001323 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001324
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001325 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001326 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001327 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001328
Guido van Rossumd57fd912000-03-10 22:53:23 +00001329 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001330 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001331 *p++ = (char)(0xc0 | (ch >> 6));
1332 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001333 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001334 else {
Tim Peters602f7402002-04-27 18:03:26 +00001335 /* Encode UCS2 Unicode ordinals */
1336 if (ch < 0x10000) {
1337 /* Special case: check for high surrogate */
1338 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1339 Py_UCS4 ch2 = s[i];
1340 /* Check for low surrogate and combine the two to
1341 form a UCS4 value */
1342 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001343 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001344 i++;
1345 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001346 }
Tim Peters602f7402002-04-27 18:03:26 +00001347 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001348 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001349 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001350 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1351 *p++ = (char)(0x80 | (ch & 0x3f));
1352 continue;
1353 }
1354encodeUCS4:
1355 /* Encode UCS4 Unicode ordinals */
1356 *p++ = (char)(0xf0 | (ch >> 18));
1357 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1358 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1359 *p++ = (char)(0x80 | (ch & 0x3f));
1360 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001362
Tim Peters602f7402002-04-27 18:03:26 +00001363 if (v == NULL) {
1364 /* This was stack allocated. */
1365 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1366 assert(nneeded <= nallocated);
1367 v = PyString_FromStringAndSize(stackbuf, nneeded);
1368 }
1369 else {
1370 /* Cut back to size actually needed. */
1371 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1372 assert(nneeded <= nallocated);
1373 _PyString_Resize(&v, nneeded);
1374 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001375 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001376
Tim Peters602f7402002-04-27 18:03:26 +00001377#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001378}
1379
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1381{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382 if (!PyUnicode_Check(unicode)) {
1383 PyErr_BadArgument();
1384 return NULL;
1385 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001386 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1387 PyUnicode_GET_SIZE(unicode),
1388 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389}
1390
1391/* --- UTF-16 Codec ------------------------------------------------------- */
1392
Tim Peters772747b2001-08-09 22:21:55 +00001393PyObject *
1394PyUnicode_DecodeUTF16(const char *s,
1395 int size,
1396 const char *errors,
1397 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001399 const char *starts = s;
1400 int startinpos;
1401 int endinpos;
1402 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001403 PyUnicodeObject *unicode;
1404 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001405 const unsigned char *q, *e;
1406 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001407 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001408 /* Offsets from q for retrieving byte pairs in the right order. */
1409#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1410 int ihi = 1, ilo = 0;
1411#else
1412 int ihi = 0, ilo = 1;
1413#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001414 PyObject *errorHandler = NULL;
1415 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001416
1417 /* Note: size will always be longer than the resulting Unicode
1418 character count */
1419 unicode = _PyUnicode_New(size);
1420 if (!unicode)
1421 return NULL;
1422 if (size == 0)
1423 return (PyObject *)unicode;
1424
1425 /* Unpack UTF-16 encoded data */
1426 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001427 q = (unsigned char *)s;
1428 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001429
1430 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001431 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001433 /* Check for BOM marks (U+FEFF) in the input and adjust current
1434 byte order setting accordingly. In native mode, the leading BOM
1435 mark is skipped, in all other modes, it is copied to the output
1436 stream as-is (giving a ZWNBSP character). */
1437 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001438 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001439#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001440 if (bom == 0xFEFF) {
1441 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001442 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001443 }
1444 else if (bom == 0xFFFE) {
1445 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001446 bo = 1;
1447 }
Tim Petersced69f82003-09-16 20:30:58 +00001448#else
Tim Peters772747b2001-08-09 22:21:55 +00001449 if (bom == 0xFEFF) {
1450 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001451 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001452 }
1453 else if (bom == 0xFFFE) {
1454 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001455 bo = -1;
1456 }
1457#endif
1458 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001459
Tim Peters772747b2001-08-09 22:21:55 +00001460 if (bo == -1) {
1461 /* force LE */
1462 ihi = 1;
1463 ilo = 0;
1464 }
1465 else if (bo == 1) {
1466 /* force BE */
1467 ihi = 0;
1468 ilo = 1;
1469 }
1470
1471 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001472 Py_UNICODE ch;
1473 /* remaing bytes at the end? (size should be even) */
1474 if (e-q<2) {
1475 errmsg = "truncated data";
1476 startinpos = ((const char *)q)-starts;
1477 endinpos = ((const char *)e)-starts;
1478 goto utf16Error;
1479 /* The remaining input chars are ignored if the callback
1480 chooses to skip the input */
1481 }
1482 ch = (q[ihi] << 8) | q[ilo];
1483
Tim Peters772747b2001-08-09 22:21:55 +00001484 q += 2;
1485
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486 if (ch < 0xD800 || ch > 0xDFFF) {
1487 *p++ = ch;
1488 continue;
1489 }
1490
1491 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001492 if (q >= e) {
1493 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001494 startinpos = (((const char *)q)-2)-starts;
1495 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001496 goto utf16Error;
1497 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001498 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001499 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1500 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001501 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001502#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001503 *p++ = ch;
1504 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001505#else
1506 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001507#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001508 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001509 }
1510 else {
1511 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001512 startinpos = (((const char *)q)-4)-starts;
1513 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001514 goto utf16Error;
1515 }
1516
Guido van Rossumd57fd912000-03-10 22:53:23 +00001517 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001518 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001519 startinpos = (((const char *)q)-2)-starts;
1520 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001521 /* Fall through to report the error */
1522
1523 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001524 outpos = p-PyUnicode_AS_UNICODE(unicode);
1525 if (unicode_decode_call_errorhandler(
1526 errors, &errorHandler,
1527 "utf16", errmsg,
1528 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1529 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001530 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001531 }
1532
1533 if (byteorder)
1534 *byteorder = bo;
1535
1536 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001537 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001538 goto onError;
1539
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001540 Py_XDECREF(errorHandler);
1541 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 return (PyObject *)unicode;
1543
1544onError:
1545 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001546 Py_XDECREF(errorHandler);
1547 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001548 return NULL;
1549}
1550
Tim Peters772747b2001-08-09 22:21:55 +00001551PyObject *
1552PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1553 int size,
1554 const char *errors,
1555 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001556{
1557 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001558 unsigned char *p;
1559 int i, pairs;
1560 /* Offsets from p for storing byte pairs in the right order. */
1561#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1562 int ihi = 1, ilo = 0;
1563#else
1564 int ihi = 0, ilo = 1;
1565#endif
1566
1567#define STORECHAR(CH) \
1568 do { \
1569 p[ihi] = ((CH) >> 8) & 0xff; \
1570 p[ilo] = (CH) & 0xff; \
1571 p += 2; \
1572 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001573
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001574 for (i = pairs = 0; i < size; i++)
1575 if (s[i] >= 0x10000)
1576 pairs++;
Tim Petersced69f82003-09-16 20:30:58 +00001577 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001578 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001579 if (v == NULL)
1580 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001581
Tim Peters772747b2001-08-09 22:21:55 +00001582 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001583 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001584 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001585 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001586 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001587
1588 if (byteorder == -1) {
1589 /* force LE */
1590 ihi = 1;
1591 ilo = 0;
1592 }
1593 else if (byteorder == 1) {
1594 /* force BE */
1595 ihi = 0;
1596 ilo = 1;
1597 }
1598
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001599 while (size-- > 0) {
1600 Py_UNICODE ch = *s++;
1601 Py_UNICODE ch2 = 0;
1602 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001603 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1604 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001605 }
Tim Peters772747b2001-08-09 22:21:55 +00001606 STORECHAR(ch);
1607 if (ch2)
1608 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001609 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001610 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001611#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001612}
1613
1614PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1615{
1616 if (!PyUnicode_Check(unicode)) {
1617 PyErr_BadArgument();
1618 return NULL;
1619 }
1620 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1621 PyUnicode_GET_SIZE(unicode),
1622 NULL,
1623 0);
1624}
1625
1626/* --- Unicode Escape Codec ----------------------------------------------- */
1627
Fredrik Lundh06d12682001-01-24 07:59:11 +00001628static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001629
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1631 int size,
1632 const char *errors)
1633{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001634 const char *starts = s;
1635 int startinpos;
1636 int endinpos;
1637 int outpos;
1638 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001639 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001640 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001641 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001642 char* message;
1643 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001644 PyObject *errorHandler = NULL;
1645 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001646
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 /* Escaped strings will always be longer than the resulting
1648 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001649 length after conversion to the true value.
1650 (but if the error callback returns a long replacement string
1651 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652 v = _PyUnicode_New(size);
1653 if (v == NULL)
1654 goto onError;
1655 if (size == 0)
1656 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001657
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001658 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001659 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001660
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661 while (s < end) {
1662 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001663 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001664 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665
1666 /* Non-escape characters are interpreted as Unicode ordinals */
1667 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001668 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 continue;
1670 }
1671
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001672 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 /* \ - Escapes */
1674 s++;
1675 switch (*s++) {
1676
1677 /* \x escapes */
1678 case '\n': break;
1679 case '\\': *p++ = '\\'; break;
1680 case '\'': *p++ = '\''; break;
1681 case '\"': *p++ = '\"'; break;
1682 case 'b': *p++ = '\b'; break;
1683 case 'f': *p++ = '\014'; break; /* FF */
1684 case 't': *p++ = '\t'; break;
1685 case 'n': *p++ = '\n'; break;
1686 case 'r': *p++ = '\r'; break;
1687 case 'v': *p++ = '\013'; break; /* VT */
1688 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1689
1690 /* \OOO (octal) escapes */
1691 case '0': case '1': case '2': case '3':
1692 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001693 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001695 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001696 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001697 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001699 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700 break;
1701
Fredrik Lundhccc74732001-02-18 22:13:49 +00001702 /* hex escapes */
1703 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001704 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001705 digits = 2;
1706 message = "truncated \\xXX escape";
1707 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708
Fredrik Lundhccc74732001-02-18 22:13:49 +00001709 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001710 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001711 digits = 4;
1712 message = "truncated \\uXXXX escape";
1713 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001714
Fredrik Lundhccc74732001-02-18 22:13:49 +00001715 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001716 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001717 digits = 8;
1718 message = "truncated \\UXXXXXXXX escape";
1719 hexescape:
1720 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001721 outpos = p-PyUnicode_AS_UNICODE(v);
1722 if (s+digits>end) {
1723 endinpos = size;
1724 if (unicode_decode_call_errorhandler(
1725 errors, &errorHandler,
1726 "unicodeescape", "end of string in escape sequence",
1727 starts, size, &startinpos, &endinpos, &exc, &s,
1728 (PyObject **)&v, &outpos, &p))
1729 goto onError;
1730 goto nextByte;
1731 }
1732 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001733 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001734 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001735 endinpos = (s+i+1)-starts;
1736 if (unicode_decode_call_errorhandler(
1737 errors, &errorHandler,
1738 "unicodeescape", message,
1739 starts, size, &startinpos, &endinpos, &exc, &s,
1740 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001741 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001742 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001743 }
1744 chr = (chr<<4) & ~0xF;
1745 if (c >= '0' && c <= '9')
1746 chr += c - '0';
1747 else if (c >= 'a' && c <= 'f')
1748 chr += 10 + c - 'a';
1749 else
1750 chr += 10 + c - 'A';
1751 }
1752 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001753 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001754 /* _decoding_error will have already written into the
1755 target buffer. */
1756 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001757 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001758 /* when we get here, chr is a 32-bit unicode character */
1759 if (chr <= 0xffff)
1760 /* UCS-2 character */
1761 *p++ = (Py_UNICODE) chr;
1762 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001763 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001764 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001765#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001766 *p++ = chr;
1767#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001768 chr -= 0x10000L;
1769 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001770 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001771#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001772 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001773 endinpos = s-starts;
1774 outpos = p-PyUnicode_AS_UNICODE(v);
1775 if (unicode_decode_call_errorhandler(
1776 errors, &errorHandler,
1777 "unicodeescape", "illegal Unicode character",
1778 starts, size, &startinpos, &endinpos, &exc, &s,
1779 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001780 goto onError;
1781 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001782 break;
1783
1784 /* \N{name} */
1785 case 'N':
1786 message = "malformed \\N character escape";
1787 if (ucnhash_CAPI == NULL) {
1788 /* load the unicode data module */
1789 PyObject *m, *v;
1790 m = PyImport_ImportModule("unicodedata");
1791 if (m == NULL)
1792 goto ucnhashError;
1793 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1794 Py_DECREF(m);
1795 if (v == NULL)
1796 goto ucnhashError;
1797 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1798 Py_DECREF(v);
1799 if (ucnhash_CAPI == NULL)
1800 goto ucnhashError;
1801 }
1802 if (*s == '{') {
1803 const char *start = s+1;
1804 /* look for the closing brace */
1805 while (*s != '}' && s < end)
1806 s++;
1807 if (s > start && s < end && *s == '}') {
1808 /* found a name. look it up in the unicode database */
1809 message = "unknown Unicode character name";
1810 s++;
1811 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1812 goto store;
1813 }
1814 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001815 endinpos = s-starts;
1816 outpos = p-PyUnicode_AS_UNICODE(v);
1817 if (unicode_decode_call_errorhandler(
1818 errors, &errorHandler,
1819 "unicodeescape", message,
1820 starts, size, &startinpos, &endinpos, &exc, &s,
1821 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001822 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001823 break;
1824
1825 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001826 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001827 message = "\\ at end of string";
1828 s--;
1829 endinpos = s-starts;
1830 outpos = p-PyUnicode_AS_UNICODE(v);
1831 if (unicode_decode_call_errorhandler(
1832 errors, &errorHandler,
1833 "unicodeescape", message,
1834 starts, size, &startinpos, &endinpos, &exc, &s,
1835 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001836 goto onError;
1837 }
1838 else {
1839 *p++ = '\\';
1840 *p++ = (unsigned char)s[-1];
1841 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001842 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001843 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001844 nextByte:
1845 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001847 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001848 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001849 Py_XDECREF(errorHandler);
1850 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001852
Fredrik Lundhccc74732001-02-18 22:13:49 +00001853ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001854 PyErr_SetString(
1855 PyExc_UnicodeError,
1856 "\\N escapes not supported (can't load unicodedata module)"
1857 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001858 Py_XDECREF(errorHandler);
1859 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001860 return NULL;
1861
Fredrik Lundhccc74732001-02-18 22:13:49 +00001862onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001863 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001864 Py_XDECREF(errorHandler);
1865 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866 return NULL;
1867}
1868
1869/* Return a Unicode-Escape string version of the Unicode object.
1870
1871 If quotes is true, the string is enclosed in u"" or u'' quotes as
1872 appropriate.
1873
1874*/
1875
Barry Warsaw51ac5802000-03-20 16:36:48 +00001876static const Py_UNICODE *findchar(const Py_UNICODE *s,
1877 int size,
1878 Py_UNICODE ch);
1879
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880static
1881PyObject *unicodeescape_string(const Py_UNICODE *s,
1882 int size,
1883 int quotes)
1884{
1885 PyObject *repr;
1886 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001887
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001888 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889
1890 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1891 if (repr == NULL)
1892 return NULL;
1893
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001894 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895
1896 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001897 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001898 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899 !findchar(s, size, '"')) ? '"' : '\'';
1900 }
1901 while (size-- > 0) {
1902 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001903
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904 /* Escape quotes */
Tim Petersced69f82003-09-16 20:30:58 +00001905 if (quotes &&
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001906 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001907 *p++ = '\\';
1908 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001909 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001910 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001911
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001912#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001913 /* Map 21-bit characters to '\U00xxxxxx' */
1914 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001915 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00001916
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001917 /* Resize the string if necessary */
1918 if (offset + 12 > PyString_GET_SIZE(repr)) {
1919 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001920 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001921 p = PyString_AS_STRING(repr) + offset;
1922 }
1923
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001924 *p++ = '\\';
1925 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001926 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1927 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1928 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1929 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1930 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1931 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1932 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001933 *p++ = hexdigit[ch & 0x0000000F];
1934 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001935 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001936#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001937 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1938 else if (ch >= 0xD800 && ch < 0xDC00) {
1939 Py_UNICODE ch2;
1940 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00001941
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001942 ch2 = *s++;
1943 size--;
1944 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1945 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1946 *p++ = '\\';
1947 *p++ = 'U';
1948 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1949 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1950 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1951 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1952 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1953 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1954 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1955 *p++ = hexdigit[ucs & 0x0000000F];
1956 continue;
1957 }
1958 /* Fall through: isolated surrogates are copied as-is */
1959 s--;
1960 size++;
1961 }
1962
Guido van Rossumd57fd912000-03-10 22:53:23 +00001963 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001964 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001965 *p++ = '\\';
1966 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001967 *p++ = hexdigit[(ch >> 12) & 0x000F];
1968 *p++ = hexdigit[(ch >> 8) & 0x000F];
1969 *p++ = hexdigit[(ch >> 4) & 0x000F];
1970 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001972
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001973 /* Map special whitespace to '\t', \n', '\r' */
1974 else if (ch == '\t') {
1975 *p++ = '\\';
1976 *p++ = 't';
1977 }
1978 else if (ch == '\n') {
1979 *p++ = '\\';
1980 *p++ = 'n';
1981 }
1982 else if (ch == '\r') {
1983 *p++ = '\\';
1984 *p++ = 'r';
1985 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001986
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001987 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001988 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001990 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001991 *p++ = hexdigit[(ch >> 4) & 0x000F];
1992 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00001993 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001994
Guido van Rossumd57fd912000-03-10 22:53:23 +00001995 /* Copy everything else as-is */
1996 else
1997 *p++ = (char) ch;
1998 }
1999 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002000 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001
2002 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002003 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002004 return repr;
2005}
2006
2007PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2008 int size)
2009{
2010 return unicodeescape_string(s, size, 0);
2011}
2012
2013PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2014{
2015 if (!PyUnicode_Check(unicode)) {
2016 PyErr_BadArgument();
2017 return NULL;
2018 }
2019 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2020 PyUnicode_GET_SIZE(unicode));
2021}
2022
2023/* --- Raw Unicode Escape Codec ------------------------------------------- */
2024
2025PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2026 int size,
2027 const char *errors)
2028{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002029 const char *starts = s;
2030 int startinpos;
2031 int endinpos;
2032 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002034 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 const char *end;
2036 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002037 PyObject *errorHandler = NULL;
2038 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002039
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040 /* Escaped strings will always be longer than the resulting
2041 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002042 length after conversion to the true value. (But decoding error
2043 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044 v = _PyUnicode_New(size);
2045 if (v == NULL)
2046 goto onError;
2047 if (size == 0)
2048 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002049 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050 end = s + size;
2051 while (s < end) {
2052 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002053 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002055 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002056
2057 /* Non-escape characters are interpreted as Unicode ordinals */
2058 if (*s != '\\') {
2059 *p++ = (unsigned char)*s++;
2060 continue;
2061 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002062 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063
2064 /* \u-escapes are only interpreted iff the number of leading
2065 backslashes if odd */
2066 bs = s;
2067 for (;s < end;) {
2068 if (*s != '\\')
2069 break;
2070 *p++ = (unsigned char)*s++;
2071 }
2072 if (((s - bs) & 1) == 0 ||
2073 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002074 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075 continue;
2076 }
2077 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002078 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002079 s++;
2080
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002081 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002082 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002083 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002084 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002086 endinpos = s-starts;
2087 if (unicode_decode_call_errorhandler(
2088 errors, &errorHandler,
2089 "rawunicodeescape", "truncated \\uXXXX",
2090 starts, size, &startinpos, &endinpos, &exc, &s,
2091 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002093 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094 }
2095 x = (x<<4) & ~0xF;
2096 if (c >= '0' && c <= '9')
2097 x += c - '0';
2098 else if (c >= 'a' && c <= 'f')
2099 x += 10 + c - 'a';
2100 else
2101 x += 10 + c - 'A';
2102 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002103#ifndef Py_UNICODE_WIDE
2104 if (x > 0x10000) {
2105 if (unicode_decode_call_errorhandler(
2106 errors, &errorHandler,
2107 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2108 starts, size, &startinpos, &endinpos, &exc, &s,
2109 (PyObject **)&v, &outpos, &p))
2110 goto onError;
2111 }
2112#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002113 *p++ = x;
2114 nextByte:
2115 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002117 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002118 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002119 Py_XDECREF(errorHandler);
2120 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002122
Guido van Rossumd57fd912000-03-10 22:53:23 +00002123 onError:
2124 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002125 Py_XDECREF(errorHandler);
2126 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002127 return NULL;
2128}
2129
2130PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2131 int size)
2132{
2133 PyObject *repr;
2134 char *p;
2135 char *q;
2136
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002137 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002138
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002139#ifdef Py_UNICODE_WIDE
2140 repr = PyString_FromStringAndSize(NULL, 10 * size);
2141#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002142 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002143#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 if (repr == NULL)
2145 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002146 if (size == 0)
2147 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148
2149 p = q = PyString_AS_STRING(repr);
2150 while (size-- > 0) {
2151 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002152#ifdef Py_UNICODE_WIDE
2153 /* Map 32-bit characters to '\Uxxxxxxxx' */
2154 if (ch >= 0x10000) {
2155 *p++ = '\\';
2156 *p++ = 'U';
2157 *p++ = hexdigit[(ch >> 28) & 0xf];
2158 *p++ = hexdigit[(ch >> 24) & 0xf];
2159 *p++ = hexdigit[(ch >> 20) & 0xf];
2160 *p++ = hexdigit[(ch >> 16) & 0xf];
2161 *p++ = hexdigit[(ch >> 12) & 0xf];
2162 *p++ = hexdigit[(ch >> 8) & 0xf];
2163 *p++ = hexdigit[(ch >> 4) & 0xf];
2164 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002165 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002166 else
2167#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168 /* Map 16-bit characters to '\uxxxx' */
2169 if (ch >= 256) {
2170 *p++ = '\\';
2171 *p++ = 'u';
2172 *p++ = hexdigit[(ch >> 12) & 0xf];
2173 *p++ = hexdigit[(ch >> 8) & 0xf];
2174 *p++ = hexdigit[(ch >> 4) & 0xf];
2175 *p++ = hexdigit[ch & 15];
2176 }
2177 /* Copy everything else as-is */
2178 else
2179 *p++ = (char) ch;
2180 }
2181 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002182 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002183 return repr;
2184}
2185
2186PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2187{
2188 if (!PyUnicode_Check(unicode)) {
2189 PyErr_BadArgument();
2190 return NULL;
2191 }
2192 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2193 PyUnicode_GET_SIZE(unicode));
2194}
2195
2196/* --- Latin-1 Codec ------------------------------------------------------ */
2197
2198PyObject *PyUnicode_DecodeLatin1(const char *s,
2199 int size,
2200 const char *errors)
2201{
2202 PyUnicodeObject *v;
2203 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002204
Guido van Rossumd57fd912000-03-10 22:53:23 +00002205 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002206 if (size == 1 && *(unsigned char*)s < 256) {
2207 Py_UNICODE r = *(unsigned char*)s;
2208 return PyUnicode_FromUnicode(&r, 1);
2209 }
2210
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211 v = _PyUnicode_New(size);
2212 if (v == NULL)
2213 goto onError;
2214 if (size == 0)
2215 return (PyObject *)v;
2216 p = PyUnicode_AS_UNICODE(v);
2217 while (size-- > 0)
2218 *p++ = (unsigned char)*s++;
2219 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002220
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221 onError:
2222 Py_XDECREF(v);
2223 return NULL;
2224}
2225
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002226/* create or adjust a UnicodeEncodeError */
2227static void make_encode_exception(PyObject **exceptionObject,
2228 const char *encoding,
2229 const Py_UNICODE *unicode, int size,
2230 int startpos, int endpos,
2231 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002232{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002233 if (*exceptionObject == NULL) {
2234 *exceptionObject = PyUnicodeEncodeError_Create(
2235 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002236 }
2237 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002238 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2239 goto onError;
2240 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2241 goto onError;
2242 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2243 goto onError;
2244 return;
2245 onError:
2246 Py_DECREF(*exceptionObject);
2247 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248 }
2249}
2250
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002251/* raises a UnicodeEncodeError */
2252static void raise_encode_exception(PyObject **exceptionObject,
2253 const char *encoding,
2254 const Py_UNICODE *unicode, int size,
2255 int startpos, int endpos,
2256 const char *reason)
2257{
2258 make_encode_exception(exceptionObject,
2259 encoding, unicode, size, startpos, endpos, reason);
2260 if (*exceptionObject != NULL)
2261 PyCodec_StrictErrors(*exceptionObject);
2262}
2263
2264/* error handling callback helper:
2265 build arguments, call the callback and check the arguments,
2266 put the result into newpos and return the replacement string, which
2267 has to be freed by the caller */
2268static PyObject *unicode_encode_call_errorhandler(const char *errors,
2269 PyObject **errorHandler,
2270 const char *encoding, const char *reason,
2271 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2272 int startpos, int endpos,
2273 int *newpos)
2274{
2275 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2276
2277 PyObject *restuple;
2278 PyObject *resunicode;
2279
2280 if (*errorHandler == NULL) {
2281 *errorHandler = PyCodec_LookupError(errors);
2282 if (*errorHandler == NULL)
2283 return NULL;
2284 }
2285
2286 make_encode_exception(exceptionObject,
2287 encoding, unicode, size, startpos, endpos, reason);
2288 if (*exceptionObject == NULL)
2289 return NULL;
2290
2291 restuple = PyObject_CallFunctionObjArgs(
2292 *errorHandler, *exceptionObject, NULL);
2293 if (restuple == NULL)
2294 return NULL;
2295 if (!PyTuple_Check(restuple)) {
2296 PyErr_Format(PyExc_TypeError, &argparse[4]);
2297 Py_DECREF(restuple);
2298 return NULL;
2299 }
2300 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2301 &resunicode, newpos)) {
2302 Py_DECREF(restuple);
2303 return NULL;
2304 }
2305 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002306 *newpos = size+*newpos;
2307 if (*newpos<0 || *newpos>size) {
2308 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2309 Py_DECREF(restuple);
2310 return NULL;
2311 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002312 Py_INCREF(resunicode);
2313 Py_DECREF(restuple);
2314 return resunicode;
2315}
2316
2317static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2318 int size,
2319 const char *errors,
2320 int limit)
2321{
2322 /* output object */
2323 PyObject *res;
2324 /* pointers to the beginning and end+1 of input */
2325 const Py_UNICODE *startp = p;
2326 const Py_UNICODE *endp = p + size;
2327 /* pointer to the beginning of the unencodable characters */
2328 /* const Py_UNICODE *badp = NULL; */
2329 /* pointer into the output */
2330 char *str;
2331 /* current output position */
2332 int respos = 0;
2333 int ressize;
2334 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2335 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2336 PyObject *errorHandler = NULL;
2337 PyObject *exc = NULL;
2338 /* the following variable is used for caching string comparisons
2339 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2340 int known_errorHandler = -1;
2341
2342 /* allocate enough for a simple encoding without
2343 replacements, if we need more, we'll resize */
2344 res = PyString_FromStringAndSize(NULL, size);
2345 if (res == NULL)
2346 goto onError;
2347 if (size == 0)
2348 return res;
2349 str = PyString_AS_STRING(res);
2350 ressize = size;
2351
2352 while (p<endp) {
2353 Py_UNICODE c = *p;
2354
2355 /* can we encode this? */
2356 if (c<limit) {
2357 /* no overflow check, because we know that the space is enough */
2358 *str++ = (char)c;
2359 ++p;
2360 }
2361 else {
2362 int unicodepos = p-startp;
2363 int requiredsize;
2364 PyObject *repunicode;
2365 int repsize;
2366 int newpos;
2367 int respos;
2368 Py_UNICODE *uni2;
2369 /* startpos for collecting unencodable chars */
2370 const Py_UNICODE *collstart = p;
2371 const Py_UNICODE *collend = p;
2372 /* find all unecodable characters */
2373 while ((collend < endp) && ((*collend)>=limit))
2374 ++collend;
2375 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2376 if (known_errorHandler==-1) {
2377 if ((errors==NULL) || (!strcmp(errors, "strict")))
2378 known_errorHandler = 1;
2379 else if (!strcmp(errors, "replace"))
2380 known_errorHandler = 2;
2381 else if (!strcmp(errors, "ignore"))
2382 known_errorHandler = 3;
2383 else if (!strcmp(errors, "xmlcharrefreplace"))
2384 known_errorHandler = 4;
2385 else
2386 known_errorHandler = 0;
2387 }
2388 switch (known_errorHandler) {
2389 case 1: /* strict */
2390 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2391 goto onError;
2392 case 2: /* replace */
2393 while (collstart++<collend)
2394 *str++ = '?'; /* fall through */
2395 case 3: /* ignore */
2396 p = collend;
2397 break;
2398 case 4: /* xmlcharrefreplace */
2399 respos = str-PyString_AS_STRING(res);
2400 /* determine replacement size (temporarily (mis)uses p) */
2401 for (p = collstart, repsize = 0; p < collend; ++p) {
2402 if (*p<10)
2403 repsize += 2+1+1;
2404 else if (*p<100)
2405 repsize += 2+2+1;
2406 else if (*p<1000)
2407 repsize += 2+3+1;
2408 else if (*p<10000)
2409 repsize += 2+4+1;
2410 else if (*p<100000)
2411 repsize += 2+5+1;
2412 else if (*p<1000000)
2413 repsize += 2+6+1;
2414 else
2415 repsize += 2+7+1;
2416 }
2417 requiredsize = respos+repsize+(endp-collend);
2418 if (requiredsize > ressize) {
2419 if (requiredsize<2*ressize)
2420 requiredsize = 2*ressize;
2421 if (_PyString_Resize(&res, requiredsize))
2422 goto onError;
2423 str = PyString_AS_STRING(res) + respos;
2424 ressize = requiredsize;
2425 }
2426 /* generate replacement (temporarily (mis)uses p) */
2427 for (p = collstart; p < collend; ++p) {
2428 str += sprintf(str, "&#%d;", (int)*p);
2429 }
2430 p = collend;
2431 break;
2432 default:
2433 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2434 encoding, reason, startp, size, &exc,
2435 collstart-startp, collend-startp, &newpos);
2436 if (repunicode == NULL)
2437 goto onError;
2438 /* need more space? (at least enough for what we
2439 have+the replacement+the rest of the string, so
2440 we won't have to check space for encodable characters) */
2441 respos = str-PyString_AS_STRING(res);
2442 repsize = PyUnicode_GET_SIZE(repunicode);
2443 requiredsize = respos+repsize+(endp-collend);
2444 if (requiredsize > ressize) {
2445 if (requiredsize<2*ressize)
2446 requiredsize = 2*ressize;
2447 if (_PyString_Resize(&res, requiredsize)) {
2448 Py_DECREF(repunicode);
2449 goto onError;
2450 }
2451 str = PyString_AS_STRING(res) + respos;
2452 ressize = requiredsize;
2453 }
2454 /* check if there is anything unencodable in the replacement
2455 and copy it to the output */
2456 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2457 c = *uni2;
2458 if (c >= limit) {
2459 raise_encode_exception(&exc, encoding, startp, size,
2460 unicodepos, unicodepos+1, reason);
2461 Py_DECREF(repunicode);
2462 goto onError;
2463 }
2464 *str = (char)c;
2465 }
2466 p = startp + newpos;
2467 Py_DECREF(repunicode);
2468 }
2469 }
2470 }
2471 /* Resize if we allocated to much */
2472 respos = str-PyString_AS_STRING(res);
2473 if (respos<ressize)
2474 /* If this falls res will be NULL */
2475 _PyString_Resize(&res, respos);
2476 Py_XDECREF(errorHandler);
2477 Py_XDECREF(exc);
2478 return res;
2479
2480 onError:
2481 Py_XDECREF(res);
2482 Py_XDECREF(errorHandler);
2483 Py_XDECREF(exc);
2484 return NULL;
2485}
2486
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2488 int size,
2489 const char *errors)
2490{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002491 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002492}
2493
2494PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2495{
2496 if (!PyUnicode_Check(unicode)) {
2497 PyErr_BadArgument();
2498 return NULL;
2499 }
2500 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2501 PyUnicode_GET_SIZE(unicode),
2502 NULL);
2503}
2504
2505/* --- 7-bit ASCII Codec -------------------------------------------------- */
2506
Guido van Rossumd57fd912000-03-10 22:53:23 +00002507PyObject *PyUnicode_DecodeASCII(const char *s,
2508 int size,
2509 const char *errors)
2510{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002511 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512 PyUnicodeObject *v;
2513 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002514 int startinpos;
2515 int endinpos;
2516 int outpos;
2517 const char *e;
2518 PyObject *errorHandler = NULL;
2519 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002520
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002522 if (size == 1 && *(unsigned char*)s < 128) {
2523 Py_UNICODE r = *(unsigned char*)s;
2524 return PyUnicode_FromUnicode(&r, 1);
2525 }
Tim Petersced69f82003-09-16 20:30:58 +00002526
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527 v = _PyUnicode_New(size);
2528 if (v == NULL)
2529 goto onError;
2530 if (size == 0)
2531 return (PyObject *)v;
2532 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002533 e = s + size;
2534 while (s < e) {
2535 register unsigned char c = (unsigned char)*s;
2536 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002537 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002538 ++s;
2539 }
2540 else {
2541 startinpos = s-starts;
2542 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002543 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002544 if (unicode_decode_call_errorhandler(
2545 errors, &errorHandler,
2546 "ascii", "ordinal not in range(128)",
2547 starts, size, &startinpos, &endinpos, &exc, &s,
2548 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002550 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002552 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002553 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002554 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002555 Py_XDECREF(errorHandler);
2556 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002558
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 onError:
2560 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002561 Py_XDECREF(errorHandler);
2562 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563 return NULL;
2564}
2565
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2567 int size,
2568 const char *errors)
2569{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002570 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002571}
2572
2573PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2574{
2575 if (!PyUnicode_Check(unicode)) {
2576 PyErr_BadArgument();
2577 return NULL;
2578 }
2579 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2580 PyUnicode_GET_SIZE(unicode),
2581 NULL);
2582}
2583
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002584#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002585
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002586/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002587
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002588PyObject *PyUnicode_DecodeMBCS(const char *s,
2589 int size,
2590 const char *errors)
2591{
2592 PyUnicodeObject *v;
2593 Py_UNICODE *p;
2594
2595 /* First get the size of the result */
2596 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002597 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002598 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2599
2600 v = _PyUnicode_New(usize);
2601 if (v == NULL)
2602 return NULL;
2603 if (usize == 0)
2604 return (PyObject *)v;
2605 p = PyUnicode_AS_UNICODE(v);
2606 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2607 Py_DECREF(v);
2608 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2609 }
2610
2611 return (PyObject *)v;
2612}
2613
2614PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2615 int size,
2616 const char *errors)
2617{
2618 PyObject *repr;
2619 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002620 DWORD mbcssize;
2621
2622 /* If there are no characters, bail now! */
2623 if (size==0)
2624 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002625
2626 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002627 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002628 if (mbcssize==0)
2629 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2630
2631 repr = PyString_FromStringAndSize(NULL, mbcssize);
2632 if (repr == NULL)
2633 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002634 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002635 return repr;
2636
2637 /* Do the conversion */
2638 s = PyString_AS_STRING(repr);
2639 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2640 Py_DECREF(repr);
2641 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2642 }
2643 return repr;
2644}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002645
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002646PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2647{
2648 if (!PyUnicode_Check(unicode)) {
2649 PyErr_BadArgument();
2650 return NULL;
2651 }
2652 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2653 PyUnicode_GET_SIZE(unicode),
2654 NULL);
2655}
2656
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002657#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002658
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659/* --- Character Mapping Codec -------------------------------------------- */
2660
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661PyObject *PyUnicode_DecodeCharmap(const char *s,
2662 int size,
2663 PyObject *mapping,
2664 const char *errors)
2665{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002666 const char *starts = s;
2667 int startinpos;
2668 int endinpos;
2669 int outpos;
2670 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002671 PyUnicodeObject *v;
2672 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002673 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002674 PyObject *errorHandler = NULL;
2675 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002676
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 /* Default to Latin-1 */
2678 if (mapping == NULL)
2679 return PyUnicode_DecodeLatin1(s, size, errors);
2680
2681 v = _PyUnicode_New(size);
2682 if (v == NULL)
2683 goto onError;
2684 if (size == 0)
2685 return (PyObject *)v;
2686 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002687 e = s + size;
2688 while (s < e) {
2689 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002690 PyObject *w, *x;
2691
2692 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2693 w = PyInt_FromLong((long)ch);
2694 if (w == NULL)
2695 goto onError;
2696 x = PyObject_GetItem(mapping, w);
2697 Py_DECREF(w);
2698 if (x == NULL) {
2699 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002700 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002701 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002702 x = Py_None;
2703 Py_INCREF(x);
2704 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002705 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002706 }
2707
2708 /* Apply mapping */
2709 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002710 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002711 if (value < 0 || value > 65535) {
2712 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002713 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714 Py_DECREF(x);
2715 goto onError;
2716 }
2717 *p++ = (Py_UNICODE)value;
2718 }
2719 else if (x == Py_None) {
2720 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002721 outpos = p-PyUnicode_AS_UNICODE(v);
2722 startinpos = s-starts;
2723 endinpos = startinpos+1;
2724 if (unicode_decode_call_errorhandler(
2725 errors, &errorHandler,
2726 "charmap", "character maps to <undefined>",
2727 starts, size, &startinpos, &endinpos, &exc, &s,
2728 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729 Py_DECREF(x);
2730 goto onError;
2731 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002732 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733 }
2734 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002735 int targetsize = PyUnicode_GET_SIZE(x);
2736
2737 if (targetsize == 1)
2738 /* 1-1 mapping */
2739 *p++ = *PyUnicode_AS_UNICODE(x);
2740
2741 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002743 if (targetsize > extrachars) {
2744 /* resize first */
2745 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2746 int needed = (targetsize - extrachars) + \
2747 (targetsize << 2);
2748 extrachars += needed;
Tim Petersced69f82003-09-16 20:30:58 +00002749 if (_PyUnicode_Resize(&v,
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002750 PyUnicode_GET_SIZE(v) + needed) < 0) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002751 Py_DECREF(x);
2752 goto onError;
2753 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002754 p = PyUnicode_AS_UNICODE(v) + oldpos;
2755 }
2756 Py_UNICODE_COPY(p,
2757 PyUnicode_AS_UNICODE(x),
2758 targetsize);
2759 p += targetsize;
2760 extrachars -= targetsize;
2761 }
2762 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002763 }
2764 else {
2765 /* wrong return value */
2766 PyErr_SetString(PyExc_TypeError,
2767 "character mapping must return integer, None or unicode");
2768 Py_DECREF(x);
2769 goto onError;
2770 }
2771 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002772 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002773 }
2774 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002775 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002777 Py_XDECREF(errorHandler);
2778 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002780
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002782 Py_XDECREF(errorHandler);
2783 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 Py_XDECREF(v);
2785 return NULL;
2786}
2787
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002788/* Lookup the character ch in the mapping. If the character
2789 can't be found, Py_None is returned (or NULL, if another
2790 error occured). */
2791static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002792{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002793 PyObject *w = PyInt_FromLong((long)c);
2794 PyObject *x;
2795
2796 if (w == NULL)
2797 return NULL;
2798 x = PyObject_GetItem(mapping, w);
2799 Py_DECREF(w);
2800 if (x == NULL) {
2801 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2802 /* No mapping found means: mapping is undefined. */
2803 PyErr_Clear();
2804 x = Py_None;
2805 Py_INCREF(x);
2806 return x;
2807 } else
2808 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002809 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002810 else if (x == Py_None)
2811 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002812 else if (PyInt_Check(x)) {
2813 long value = PyInt_AS_LONG(x);
2814 if (value < 0 || value > 255) {
2815 PyErr_SetString(PyExc_TypeError,
2816 "character mapping must be in range(256)");
2817 Py_DECREF(x);
2818 return NULL;
2819 }
2820 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002822 else if (PyString_Check(x))
2823 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002825 /* wrong return value */
2826 PyErr_SetString(PyExc_TypeError,
2827 "character mapping must return integer, None or str");
2828 Py_DECREF(x);
2829 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002830 }
2831}
2832
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002833/* lookup the character, put the result in the output string and adjust
2834 various state variables. Reallocate the output string if not enough
2835 space is available. Return a new reference to the object that
2836 was put in the output buffer, or Py_None, if the mapping was undefined
2837 (in which case no character was written) or NULL, if a
2838 reallocation error ocurred. The called must decref the result */
2839static
2840PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2841 PyObject **outobj, int *outpos)
2842{
2843 PyObject *rep = charmapencode_lookup(c, mapping);
2844
2845 if (rep==NULL)
2846 return NULL;
2847 else if (rep==Py_None)
2848 return rep;
2849 else {
2850 char *outstart = PyString_AS_STRING(*outobj);
2851 int outsize = PyString_GET_SIZE(*outobj);
2852 if (PyInt_Check(rep)) {
2853 int requiredsize = *outpos+1;
2854 if (outsize<requiredsize) {
2855 /* exponentially overallocate to minimize reallocations */
2856 if (requiredsize < 2*outsize)
2857 requiredsize = 2*outsize;
2858 if (_PyString_Resize(outobj, requiredsize)) {
2859 Py_DECREF(rep);
2860 return NULL;
2861 }
2862 outstart = PyString_AS_STRING(*outobj);
2863 }
2864 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2865 }
2866 else {
2867 const char *repchars = PyString_AS_STRING(rep);
2868 int repsize = PyString_GET_SIZE(rep);
2869 int requiredsize = *outpos+repsize;
2870 if (outsize<requiredsize) {
2871 /* exponentially overallocate to minimize reallocations */
2872 if (requiredsize < 2*outsize)
2873 requiredsize = 2*outsize;
2874 if (_PyString_Resize(outobj, requiredsize)) {
2875 Py_DECREF(rep);
2876 return NULL;
2877 }
2878 outstart = PyString_AS_STRING(*outobj);
2879 }
2880 memcpy(outstart + *outpos, repchars, repsize);
2881 *outpos += repsize;
2882 }
2883 }
2884 return rep;
2885}
2886
2887/* handle an error in PyUnicode_EncodeCharmap
2888 Return 0 on success, -1 on error */
2889static
2890int charmap_encoding_error(
2891 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2892 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002893 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002894 PyObject **res, int *respos)
2895{
2896 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2897 int repsize;
2898 int newpos;
2899 Py_UNICODE *uni2;
2900 /* startpos for collecting unencodable chars */
2901 int collstartpos = *inpos;
2902 int collendpos = *inpos+1;
2903 int collpos;
2904 char *encoding = "charmap";
2905 char *reason = "character maps to <undefined>";
2906
2907 PyObject *x;
2908 /* find all unencodable characters */
2909 while (collendpos < size) {
2910 x = charmapencode_lookup(p[collendpos], mapping);
2911 if (x==NULL)
2912 return -1;
2913 else if (x!=Py_None) {
2914 Py_DECREF(x);
2915 break;
2916 }
2917 Py_DECREF(x);
2918 ++collendpos;
2919 }
2920 /* cache callback name lookup
2921 * (if not done yet, i.e. it's the first error) */
2922 if (*known_errorHandler==-1) {
2923 if ((errors==NULL) || (!strcmp(errors, "strict")))
2924 *known_errorHandler = 1;
2925 else if (!strcmp(errors, "replace"))
2926 *known_errorHandler = 2;
2927 else if (!strcmp(errors, "ignore"))
2928 *known_errorHandler = 3;
2929 else if (!strcmp(errors, "xmlcharrefreplace"))
2930 *known_errorHandler = 4;
2931 else
2932 *known_errorHandler = 0;
2933 }
2934 switch (*known_errorHandler) {
2935 case 1: /* strict */
2936 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2937 return -1;
2938 case 2: /* replace */
2939 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2940 x = charmapencode_output('?', mapping, res, respos);
2941 if (x==NULL) {
2942 return -1;
2943 }
2944 else if (x==Py_None) {
2945 Py_DECREF(x);
2946 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2947 return -1;
2948 }
2949 Py_DECREF(x);
2950 }
2951 /* fall through */
2952 case 3: /* ignore */
2953 *inpos = collendpos;
2954 break;
2955 case 4: /* xmlcharrefreplace */
2956 /* generate replacement (temporarily (mis)uses p) */
2957 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2958 char buffer[2+29+1+1];
2959 char *cp;
2960 sprintf(buffer, "&#%d;", (int)p[collpos]);
2961 for (cp = buffer; *cp; ++cp) {
2962 x = charmapencode_output(*cp, mapping, res, respos);
2963 if (x==NULL)
2964 return -1;
2965 else if (x==Py_None) {
2966 Py_DECREF(x);
2967 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2968 return -1;
2969 }
2970 Py_DECREF(x);
2971 }
2972 }
2973 *inpos = collendpos;
2974 break;
2975 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002976 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002977 encoding, reason, p, size, exceptionObject,
2978 collstartpos, collendpos, &newpos);
2979 if (repunicode == NULL)
2980 return -1;
2981 /* generate replacement */
2982 repsize = PyUnicode_GET_SIZE(repunicode);
2983 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
2984 x = charmapencode_output(*uni2, mapping, res, respos);
2985 if (x==NULL) {
2986 Py_DECREF(repunicode);
2987 return -1;
2988 }
2989 else if (x==Py_None) {
2990 Py_DECREF(repunicode);
2991 Py_DECREF(x);
2992 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2993 return -1;
2994 }
2995 Py_DECREF(x);
2996 }
2997 *inpos = newpos;
2998 Py_DECREF(repunicode);
2999 }
3000 return 0;
3001}
3002
Guido van Rossumd57fd912000-03-10 22:53:23 +00003003PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3004 int size,
3005 PyObject *mapping,
3006 const char *errors)
3007{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003008 /* output object */
3009 PyObject *res = NULL;
3010 /* current input position */
3011 int inpos = 0;
3012 /* current output position */
3013 int respos = 0;
3014 PyObject *errorHandler = NULL;
3015 PyObject *exc = NULL;
3016 /* the following variable is used for caching string comparisons
3017 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3018 * 3=ignore, 4=xmlcharrefreplace */
3019 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020
3021 /* Default to Latin-1 */
3022 if (mapping == NULL)
3023 return PyUnicode_EncodeLatin1(p, size, errors);
3024
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003025 /* allocate enough for a simple encoding without
3026 replacements, if we need more, we'll resize */
3027 res = PyString_FromStringAndSize(NULL, size);
3028 if (res == NULL)
3029 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003030 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003031 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003032
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003033 while (inpos<size) {
3034 /* try to encode it */
3035 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3036 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003037 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003038 if (x==Py_None) { /* unencodable character */
3039 if (charmap_encoding_error(p, size, &inpos, mapping,
3040 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003041 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003042 &res, &respos)) {
3043 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003044 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003047 else
3048 /* done with this character => adjust input position */
3049 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003050 Py_DECREF(x);
3051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003052
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003053 /* Resize if we allocated to much */
3054 if (respos<PyString_GET_SIZE(res)) {
3055 if (_PyString_Resize(&res, respos))
3056 goto onError;
3057 }
3058 Py_XDECREF(exc);
3059 Py_XDECREF(errorHandler);
3060 return res;
3061
3062 onError:
3063 Py_XDECREF(res);
3064 Py_XDECREF(exc);
3065 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003066 return NULL;
3067}
3068
3069PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3070 PyObject *mapping)
3071{
3072 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3073 PyErr_BadArgument();
3074 return NULL;
3075 }
3076 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3077 PyUnicode_GET_SIZE(unicode),
3078 mapping,
3079 NULL);
3080}
3081
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003082/* create or adjust a UnicodeTranslateError */
3083static void make_translate_exception(PyObject **exceptionObject,
3084 const Py_UNICODE *unicode, int size,
3085 int startpos, int endpos,
3086 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003087{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003088 if (*exceptionObject == NULL) {
3089 *exceptionObject = PyUnicodeTranslateError_Create(
3090 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003091 }
3092 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003093 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3094 goto onError;
3095 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3096 goto onError;
3097 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3098 goto onError;
3099 return;
3100 onError:
3101 Py_DECREF(*exceptionObject);
3102 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003103 }
3104}
3105
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003106/* raises a UnicodeTranslateError */
3107static void raise_translate_exception(PyObject **exceptionObject,
3108 const Py_UNICODE *unicode, int size,
3109 int startpos, int endpos,
3110 const char *reason)
3111{
3112 make_translate_exception(exceptionObject,
3113 unicode, size, startpos, endpos, reason);
3114 if (*exceptionObject != NULL)
3115 PyCodec_StrictErrors(*exceptionObject);
3116}
3117
3118/* error handling callback helper:
3119 build arguments, call the callback and check the arguments,
3120 put the result into newpos and return the replacement string, which
3121 has to be freed by the caller */
3122static PyObject *unicode_translate_call_errorhandler(const char *errors,
3123 PyObject **errorHandler,
3124 const char *reason,
3125 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3126 int startpos, int endpos,
3127 int *newpos)
3128{
3129 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3130
3131 PyObject *restuple;
3132 PyObject *resunicode;
3133
3134 if (*errorHandler == NULL) {
3135 *errorHandler = PyCodec_LookupError(errors);
3136 if (*errorHandler == NULL)
3137 return NULL;
3138 }
3139
3140 make_translate_exception(exceptionObject,
3141 unicode, size, startpos, endpos, reason);
3142 if (*exceptionObject == NULL)
3143 return NULL;
3144
3145 restuple = PyObject_CallFunctionObjArgs(
3146 *errorHandler, *exceptionObject, NULL);
3147 if (restuple == NULL)
3148 return NULL;
3149 if (!PyTuple_Check(restuple)) {
3150 PyErr_Format(PyExc_TypeError, &argparse[4]);
3151 Py_DECREF(restuple);
3152 return NULL;
3153 }
3154 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3155 &resunicode, newpos)) {
3156 Py_DECREF(restuple);
3157 return NULL;
3158 }
3159 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003160 *newpos = size+*newpos;
3161 if (*newpos<0 || *newpos>size) {
3162 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3163 Py_DECREF(restuple);
3164 return NULL;
3165 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003166 Py_INCREF(resunicode);
3167 Py_DECREF(restuple);
3168 return resunicode;
3169}
3170
3171/* Lookup the character ch in the mapping and put the result in result,
3172 which must be decrefed by the caller.
3173 Return 0 on success, -1 on error */
3174static
3175int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3176{
3177 PyObject *w = PyInt_FromLong((long)c);
3178 PyObject *x;
3179
3180 if (w == NULL)
3181 return -1;
3182 x = PyObject_GetItem(mapping, w);
3183 Py_DECREF(w);
3184 if (x == NULL) {
3185 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3186 /* No mapping found means: use 1:1 mapping. */
3187 PyErr_Clear();
3188 *result = NULL;
3189 return 0;
3190 } else
3191 return -1;
3192 }
3193 else if (x == Py_None) {
3194 *result = x;
3195 return 0;
3196 }
3197 else if (PyInt_Check(x)) {
3198 long value = PyInt_AS_LONG(x);
3199 long max = PyUnicode_GetMax();
3200 if (value < 0 || value > max) {
3201 PyErr_Format(PyExc_TypeError,
3202 "character mapping must be in range(0x%lx)", max+1);
3203 Py_DECREF(x);
3204 return -1;
3205 }
3206 *result = x;
3207 return 0;
3208 }
3209 else if (PyUnicode_Check(x)) {
3210 *result = x;
3211 return 0;
3212 }
3213 else {
3214 /* wrong return value */
3215 PyErr_SetString(PyExc_TypeError,
3216 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003217 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003218 return -1;
3219 }
3220}
3221/* ensure that *outobj is at least requiredsize characters long,
3222if not reallocate and adjust various state variables.
3223Return 0 on success, -1 on error */
3224static
Walter Dörwald4894c302003-10-24 14:25:28 +00003225int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003226 int requiredsize)
3227{
Walter Dörwald4894c302003-10-24 14:25:28 +00003228 int oldsize = PyUnicode_GET_SIZE(*outobj);
3229 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003230 /* remember old output position */
3231 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3232 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003233 if (requiredsize < 2 * oldsize)
3234 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003235 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236 return -1;
3237 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003238 }
3239 return 0;
3240}
3241/* lookup the character, put the result in the output string and adjust
3242 various state variables. Return a new reference to the object that
3243 was put in the output buffer in *result, or Py_None, if the mapping was
3244 undefined (in which case no character was written).
3245 The called must decref result.
3246 Return 0 on success, -1 on error. */
3247static
Walter Dörwald4894c302003-10-24 14:25:28 +00003248int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3249 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3250 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003251{
Walter Dörwald4894c302003-10-24 14:25:28 +00003252 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003253 return -1;
3254 if (*res==NULL) {
3255 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003256 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003257 }
3258 else if (*res==Py_None)
3259 ;
3260 else if (PyInt_Check(*res)) {
3261 /* no overflow check, because we know that the space is enough */
3262 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3263 }
3264 else if (PyUnicode_Check(*res)) {
3265 int repsize = PyUnicode_GET_SIZE(*res);
3266 if (repsize==1) {
3267 /* no overflow check, because we know that the space is enough */
3268 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3269 }
3270 else if (repsize!=0) {
3271 /* more than one character */
Walter Dörwald4894c302003-10-24 14:25:28 +00003272 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
3273 (insize - (*curinp-*startinp)) +
3274 repsize - 1;
3275 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003276 return -1;
3277 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3278 *outp += repsize;
3279 }
3280 }
3281 else
3282 return -1;
3283 return 0;
3284}
3285
3286PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287 int size,
3288 PyObject *mapping,
3289 const char *errors)
3290{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003291 /* output object */
3292 PyObject *res = NULL;
3293 /* pointers to the beginning and end+1 of input */
3294 const Py_UNICODE *startp = p;
3295 const Py_UNICODE *endp = p + size;
3296 /* pointer into the output */
3297 Py_UNICODE *str;
3298 /* current output position */
3299 int respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003300 char *reason = "character maps to <undefined>";
3301 PyObject *errorHandler = NULL;
3302 PyObject *exc = NULL;
3303 /* the following variable is used for caching string comparisons
3304 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3305 * 3=ignore, 4=xmlcharrefreplace */
3306 int known_errorHandler = -1;
3307
Guido van Rossumd57fd912000-03-10 22:53:23 +00003308 if (mapping == NULL) {
3309 PyErr_BadArgument();
3310 return NULL;
3311 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003312
3313 /* allocate enough for a simple 1:1 translation without
3314 replacements, if we need more, we'll resize */
3315 res = PyUnicode_FromUnicode(NULL, size);
3316 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003317 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003318 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003319 return res;
3320 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003322 while (p<endp) {
3323 /* try to encode it */
3324 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003325 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003326 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327 goto onError;
3328 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003329 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003330 if (x!=Py_None) /* it worked => adjust input pointer */
3331 ++p;
3332 else { /* untranslatable character */
3333 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3334 int repsize;
3335 int newpos;
3336 Py_UNICODE *uni2;
3337 /* startpos for collecting untranslatable chars */
3338 const Py_UNICODE *collstart = p;
3339 const Py_UNICODE *collend = p+1;
3340 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003342 /* find all untranslatable characters */
3343 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003344 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003345 goto onError;
3346 Py_XDECREF(x);
3347 if (x!=Py_None)
3348 break;
3349 ++collend;
3350 }
3351 /* cache callback name lookup
3352 * (if not done yet, i.e. it's the first error) */
3353 if (known_errorHandler==-1) {
3354 if ((errors==NULL) || (!strcmp(errors, "strict")))
3355 known_errorHandler = 1;
3356 else if (!strcmp(errors, "replace"))
3357 known_errorHandler = 2;
3358 else if (!strcmp(errors, "ignore"))
3359 known_errorHandler = 3;
3360 else if (!strcmp(errors, "xmlcharrefreplace"))
3361 known_errorHandler = 4;
3362 else
3363 known_errorHandler = 0;
3364 }
3365 switch (known_errorHandler) {
3366 case 1: /* strict */
3367 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3368 goto onError;
3369 case 2: /* replace */
3370 /* No need to check for space, this is a 1:1 replacement */
3371 for (coll = collstart; coll<collend; ++coll)
3372 *str++ = '?';
3373 /* fall through */
3374 case 3: /* ignore */
3375 p = collend;
3376 break;
3377 case 4: /* xmlcharrefreplace */
3378 /* generate replacement (temporarily (mis)uses p) */
3379 for (p = collstart; p < collend; ++p) {
3380 char buffer[2+29+1+1];
3381 char *cp;
3382 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003383 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003384 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3385 goto onError;
3386 for (cp = buffer; *cp; ++cp)
3387 *str++ = *cp;
3388 }
3389 p = collend;
3390 break;
3391 default:
3392 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3393 reason, startp, size, &exc,
3394 collstart-startp, collend-startp, &newpos);
3395 if (repunicode == NULL)
3396 goto onError;
3397 /* generate replacement */
3398 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003399 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003400 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3401 Py_DECREF(repunicode);
3402 goto onError;
3403 }
3404 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3405 *str++ = *uni2;
3406 p = startp + newpos;
3407 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003408 }
3409 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003410 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003411 /* Resize if we allocated to much */
3412 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003413 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003414 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003415 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003416 }
3417 Py_XDECREF(exc);
3418 Py_XDECREF(errorHandler);
3419 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003421 onError:
3422 Py_XDECREF(res);
3423 Py_XDECREF(exc);
3424 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425 return NULL;
3426}
3427
3428PyObject *PyUnicode_Translate(PyObject *str,
3429 PyObject *mapping,
3430 const char *errors)
3431{
3432 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003433
Guido van Rossumd57fd912000-03-10 22:53:23 +00003434 str = PyUnicode_FromObject(str);
3435 if (str == NULL)
3436 goto onError;
3437 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3438 PyUnicode_GET_SIZE(str),
3439 mapping,
3440 errors);
3441 Py_DECREF(str);
3442 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003443
Guido van Rossumd57fd912000-03-10 22:53:23 +00003444 onError:
3445 Py_XDECREF(str);
3446 return NULL;
3447}
Tim Petersced69f82003-09-16 20:30:58 +00003448
Guido van Rossum9e896b32000-04-05 20:11:21 +00003449/* --- Decimal Encoder ---------------------------------------------------- */
3450
3451int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3452 int length,
3453 char *output,
3454 const char *errors)
3455{
3456 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003457 PyObject *errorHandler = NULL;
3458 PyObject *exc = NULL;
3459 const char *encoding = "decimal";
3460 const char *reason = "invalid decimal Unicode string";
3461 /* the following variable is used for caching string comparisons
3462 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3463 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003464
3465 if (output == NULL) {
3466 PyErr_BadArgument();
3467 return -1;
3468 }
3469
3470 p = s;
3471 end = s + length;
3472 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003473 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003474 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003475 PyObject *repunicode;
3476 int repsize;
3477 int newpos;
3478 Py_UNICODE *uni2;
3479 Py_UNICODE *collstart;
3480 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003481
Guido van Rossum9e896b32000-04-05 20:11:21 +00003482 if (Py_UNICODE_ISSPACE(ch)) {
3483 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003484 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003485 continue;
3486 }
3487 decimal = Py_UNICODE_TODECIMAL(ch);
3488 if (decimal >= 0) {
3489 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003490 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003491 continue;
3492 }
Guido van Rossumba477042000-04-06 18:18:10 +00003493 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003494 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003495 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003496 continue;
3497 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003498 /* All other characters are considered unencodable */
3499 collstart = p;
3500 collend = p+1;
3501 while (collend < end) {
3502 if ((0 < *collend && *collend < 256) ||
3503 !Py_UNICODE_ISSPACE(*collend) ||
3504 Py_UNICODE_TODECIMAL(*collend))
3505 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003506 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003507 /* cache callback name lookup
3508 * (if not done yet, i.e. it's the first error) */
3509 if (known_errorHandler==-1) {
3510 if ((errors==NULL) || (!strcmp(errors, "strict")))
3511 known_errorHandler = 1;
3512 else if (!strcmp(errors, "replace"))
3513 known_errorHandler = 2;
3514 else if (!strcmp(errors, "ignore"))
3515 known_errorHandler = 3;
3516 else if (!strcmp(errors, "xmlcharrefreplace"))
3517 known_errorHandler = 4;
3518 else
3519 known_errorHandler = 0;
3520 }
3521 switch (known_errorHandler) {
3522 case 1: /* strict */
3523 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3524 goto onError;
3525 case 2: /* replace */
3526 for (p = collstart; p < collend; ++p)
3527 *output++ = '?';
3528 /* fall through */
3529 case 3: /* ignore */
3530 p = collend;
3531 break;
3532 case 4: /* xmlcharrefreplace */
3533 /* generate replacement (temporarily (mis)uses p) */
3534 for (p = collstart; p < collend; ++p)
3535 output += sprintf(output, "&#%d;", (int)*p);
3536 p = collend;
3537 break;
3538 default:
3539 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3540 encoding, reason, s, length, &exc,
3541 collstart-s, collend-s, &newpos);
3542 if (repunicode == NULL)
3543 goto onError;
3544 /* generate replacement */
3545 repsize = PyUnicode_GET_SIZE(repunicode);
3546 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3547 Py_UNICODE ch = *uni2;
3548 if (Py_UNICODE_ISSPACE(ch))
3549 *output++ = ' ';
3550 else {
3551 decimal = Py_UNICODE_TODECIMAL(ch);
3552 if (decimal >= 0)
3553 *output++ = '0' + decimal;
3554 else if (0 < ch && ch < 256)
3555 *output++ = (char)ch;
3556 else {
3557 Py_DECREF(repunicode);
3558 raise_encode_exception(&exc, encoding,
3559 s, length, collstart-s, collend-s, reason);
3560 goto onError;
3561 }
3562 }
3563 }
3564 p = s + newpos;
3565 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003566 }
3567 }
3568 /* 0-terminate the output string */
3569 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003570 Py_XDECREF(exc);
3571 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003572 return 0;
3573
3574 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003575 Py_XDECREF(exc);
3576 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003577 return -1;
3578}
3579
Guido van Rossumd57fd912000-03-10 22:53:23 +00003580/* --- Helpers ------------------------------------------------------------ */
3581
Tim Petersced69f82003-09-16 20:30:58 +00003582static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583int count(PyUnicodeObject *self,
3584 int start,
3585 int end,
3586 PyUnicodeObject *substring)
3587{
3588 int count = 0;
3589
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003590 if (start < 0)
3591 start += self->length;
3592 if (start < 0)
3593 start = 0;
3594 if (end > self->length)
3595 end = self->length;
3596 if (end < 0)
3597 end += self->length;
3598 if (end < 0)
3599 end = 0;
3600
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003601 if (substring->length == 0)
3602 return (end - start + 1);
3603
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604 end -= substring->length;
3605
3606 while (start <= end)
3607 if (Py_UNICODE_MATCH(self, start, substring)) {
3608 count++;
3609 start += substring->length;
3610 } else
3611 start++;
3612
3613 return count;
3614}
3615
3616int PyUnicode_Count(PyObject *str,
3617 PyObject *substr,
3618 int start,
3619 int end)
3620{
3621 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003622
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623 str = PyUnicode_FromObject(str);
3624 if (str == NULL)
3625 return -1;
3626 substr = PyUnicode_FromObject(substr);
3627 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003628 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003629 return -1;
3630 }
Tim Petersced69f82003-09-16 20:30:58 +00003631
Guido van Rossumd57fd912000-03-10 22:53:23 +00003632 result = count((PyUnicodeObject *)str,
3633 start, end,
3634 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003635
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636 Py_DECREF(str);
3637 Py_DECREF(substr);
3638 return result;
3639}
3640
Tim Petersced69f82003-09-16 20:30:58 +00003641static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642int findstring(PyUnicodeObject *self,
3643 PyUnicodeObject *substring,
3644 int start,
3645 int end,
3646 int direction)
3647{
3648 if (start < 0)
3649 start += self->length;
3650 if (start < 0)
3651 start = 0;
3652
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 if (end > self->length)
3654 end = self->length;
3655 if (end < 0)
3656 end += self->length;
3657 if (end < 0)
3658 end = 0;
3659
Guido van Rossum76afbd92002-08-20 17:29:29 +00003660 if (substring->length == 0)
3661 return (direction > 0) ? start : end;
3662
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663 end -= substring->length;
3664
3665 if (direction < 0) {
3666 for (; end >= start; end--)
3667 if (Py_UNICODE_MATCH(self, end, substring))
3668 return end;
3669 } else {
3670 for (; start <= end; start++)
3671 if (Py_UNICODE_MATCH(self, start, substring))
3672 return start;
3673 }
3674
3675 return -1;
3676}
3677
3678int PyUnicode_Find(PyObject *str,
3679 PyObject *substr,
3680 int start,
3681 int end,
3682 int direction)
3683{
3684 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003685
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686 str = PyUnicode_FromObject(str);
3687 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003688 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689 substr = PyUnicode_FromObject(substr);
3690 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003691 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003692 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003693 }
Tim Petersced69f82003-09-16 20:30:58 +00003694
Guido van Rossumd57fd912000-03-10 22:53:23 +00003695 result = findstring((PyUnicodeObject *)str,
3696 (PyUnicodeObject *)substr,
3697 start, end, direction);
3698 Py_DECREF(str);
3699 Py_DECREF(substr);
3700 return result;
3701}
3702
Tim Petersced69f82003-09-16 20:30:58 +00003703static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704int tailmatch(PyUnicodeObject *self,
3705 PyUnicodeObject *substring,
3706 int start,
3707 int end,
3708 int direction)
3709{
3710 if (start < 0)
3711 start += self->length;
3712 if (start < 0)
3713 start = 0;
3714
3715 if (substring->length == 0)
3716 return 1;
3717
3718 if (end > self->length)
3719 end = self->length;
3720 if (end < 0)
3721 end += self->length;
3722 if (end < 0)
3723 end = 0;
3724
3725 end -= substring->length;
3726 if (end < start)
3727 return 0;
3728
3729 if (direction > 0) {
3730 if (Py_UNICODE_MATCH(self, end, substring))
3731 return 1;
3732 } else {
3733 if (Py_UNICODE_MATCH(self, start, substring))
3734 return 1;
3735 }
3736
3737 return 0;
3738}
3739
3740int PyUnicode_Tailmatch(PyObject *str,
3741 PyObject *substr,
3742 int start,
3743 int end,
3744 int direction)
3745{
3746 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003747
Guido van Rossumd57fd912000-03-10 22:53:23 +00003748 str = PyUnicode_FromObject(str);
3749 if (str == NULL)
3750 return -1;
3751 substr = PyUnicode_FromObject(substr);
3752 if (substr == NULL) {
3753 Py_DECREF(substr);
3754 return -1;
3755 }
Tim Petersced69f82003-09-16 20:30:58 +00003756
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757 result = tailmatch((PyUnicodeObject *)str,
3758 (PyUnicodeObject *)substr,
3759 start, end, direction);
3760 Py_DECREF(str);
3761 Py_DECREF(substr);
3762 return result;
3763}
3764
Tim Petersced69f82003-09-16 20:30:58 +00003765static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003766const Py_UNICODE *findchar(const Py_UNICODE *s,
3767 int size,
3768 Py_UNICODE ch)
3769{
3770 /* like wcschr, but doesn't stop at NULL characters */
3771
3772 while (size-- > 0) {
3773 if (*s == ch)
3774 return s;
3775 s++;
3776 }
3777
3778 return NULL;
3779}
3780
3781/* Apply fixfct filter to the Unicode object self and return a
3782 reference to the modified object */
3783
Tim Petersced69f82003-09-16 20:30:58 +00003784static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785PyObject *fixup(PyUnicodeObject *self,
3786 int (*fixfct)(PyUnicodeObject *s))
3787{
3788
3789 PyUnicodeObject *u;
3790
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003791 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003792 if (u == NULL)
3793 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003794
3795 Py_UNICODE_COPY(u->str, self->str, self->length);
3796
Tim Peters7a29bd52001-09-12 03:03:31 +00003797 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798 /* fixfct should return TRUE if it modified the buffer. If
3799 FALSE, return a reference to the original buffer instead
3800 (to save space, not time) */
3801 Py_INCREF(self);
3802 Py_DECREF(u);
3803 return (PyObject*) self;
3804 }
3805 return (PyObject*) u;
3806}
3807
Tim Petersced69f82003-09-16 20:30:58 +00003808static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809int fixupper(PyUnicodeObject *self)
3810{
3811 int len = self->length;
3812 Py_UNICODE *s = self->str;
3813 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003814
Guido van Rossumd57fd912000-03-10 22:53:23 +00003815 while (len-- > 0) {
3816 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003817
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 ch = Py_UNICODE_TOUPPER(*s);
3819 if (ch != *s) {
3820 status = 1;
3821 *s = ch;
3822 }
3823 s++;
3824 }
3825
3826 return status;
3827}
3828
Tim Petersced69f82003-09-16 20:30:58 +00003829static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830int fixlower(PyUnicodeObject *self)
3831{
3832 int len = self->length;
3833 Py_UNICODE *s = self->str;
3834 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003835
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836 while (len-- > 0) {
3837 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003838
Guido van Rossumd57fd912000-03-10 22:53:23 +00003839 ch = Py_UNICODE_TOLOWER(*s);
3840 if (ch != *s) {
3841 status = 1;
3842 *s = ch;
3843 }
3844 s++;
3845 }
3846
3847 return status;
3848}
3849
Tim Petersced69f82003-09-16 20:30:58 +00003850static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851int fixswapcase(PyUnicodeObject *self)
3852{
3853 int len = self->length;
3854 Py_UNICODE *s = self->str;
3855 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003856
Guido van Rossumd57fd912000-03-10 22:53:23 +00003857 while (len-- > 0) {
3858 if (Py_UNICODE_ISUPPER(*s)) {
3859 *s = Py_UNICODE_TOLOWER(*s);
3860 status = 1;
3861 } else if (Py_UNICODE_ISLOWER(*s)) {
3862 *s = Py_UNICODE_TOUPPER(*s);
3863 status = 1;
3864 }
3865 s++;
3866 }
3867
3868 return status;
3869}
3870
Tim Petersced69f82003-09-16 20:30:58 +00003871static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003872int fixcapitalize(PyUnicodeObject *self)
3873{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003874 int len = self->length;
3875 Py_UNICODE *s = self->str;
3876 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003877
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003878 if (len == 0)
3879 return 0;
3880 if (Py_UNICODE_ISLOWER(*s)) {
3881 *s = Py_UNICODE_TOUPPER(*s);
3882 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003883 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003884 s++;
3885 while (--len > 0) {
3886 if (Py_UNICODE_ISUPPER(*s)) {
3887 *s = Py_UNICODE_TOLOWER(*s);
3888 status = 1;
3889 }
3890 s++;
3891 }
3892 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003893}
3894
3895static
3896int fixtitle(PyUnicodeObject *self)
3897{
3898 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3899 register Py_UNICODE *e;
3900 int previous_is_cased;
3901
3902 /* Shortcut for single character strings */
3903 if (PyUnicode_GET_SIZE(self) == 1) {
3904 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3905 if (*p != ch) {
3906 *p = ch;
3907 return 1;
3908 }
3909 else
3910 return 0;
3911 }
Tim Petersced69f82003-09-16 20:30:58 +00003912
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913 e = p + PyUnicode_GET_SIZE(self);
3914 previous_is_cased = 0;
3915 for (; p < e; p++) {
3916 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00003917
Guido van Rossumd57fd912000-03-10 22:53:23 +00003918 if (previous_is_cased)
3919 *p = Py_UNICODE_TOLOWER(ch);
3920 else
3921 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00003922
3923 if (Py_UNICODE_ISLOWER(ch) ||
3924 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00003925 Py_UNICODE_ISTITLE(ch))
3926 previous_is_cased = 1;
3927 else
3928 previous_is_cased = 0;
3929 }
3930 return 1;
3931}
3932
3933PyObject *PyUnicode_Join(PyObject *separator,
3934 PyObject *seq)
3935{
3936 Py_UNICODE *sep;
3937 int seplen;
3938 PyUnicodeObject *res = NULL;
3939 int reslen = 0;
3940 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003941 int sz = 100;
3942 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003943 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944
Tim Peters2cfe3682001-05-05 05:36:48 +00003945 it = PyObject_GetIter(seq);
3946 if (it == NULL)
3947 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948
3949 if (separator == NULL) {
3950 Py_UNICODE blank = ' ';
3951 sep = &blank;
3952 seplen = 1;
3953 }
3954 else {
3955 separator = PyUnicode_FromObject(separator);
3956 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003957 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003958 sep = PyUnicode_AS_UNICODE(separator);
3959 seplen = PyUnicode_GET_SIZE(separator);
3960 }
Tim Petersced69f82003-09-16 20:30:58 +00003961
Guido van Rossumd57fd912000-03-10 22:53:23 +00003962 res = _PyUnicode_New(sz);
3963 if (res == NULL)
3964 goto onError;
3965 p = PyUnicode_AS_UNICODE(res);
3966 reslen = 0;
3967
Tim Peters2cfe3682001-05-05 05:36:48 +00003968 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003970 PyObject *item = PyIter_Next(it);
3971 if (item == NULL) {
3972 if (PyErr_Occurred())
3973 goto onError;
3974 break;
3975 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003976 if (!PyUnicode_Check(item)) {
3977 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003978 if (!PyString_Check(item)) {
3979 PyErr_Format(PyExc_TypeError,
3980 "sequence item %i: expected string or Unicode,"
3981 " %.80s found",
3982 i, item->ob_type->tp_name);
3983 Py_DECREF(item);
3984 goto onError;
3985 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003986 v = PyUnicode_FromObject(item);
3987 Py_DECREF(item);
3988 item = v;
3989 if (item == NULL)
3990 goto onError;
3991 }
3992 itemlen = PyUnicode_GET_SIZE(item);
3993 while (reslen + itemlen + seplen >= sz) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003994 if (_PyUnicode_Resize(&res, sz*2) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003995 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003997 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998 sz *= 2;
3999 p = PyUnicode_AS_UNICODE(res) + reslen;
4000 }
4001 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004002 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003 p += seplen;
4004 reslen += seplen;
4005 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004006 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007 p += itemlen;
4008 reslen += itemlen;
4009 Py_DECREF(item);
4010 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004011 if (_PyUnicode_Resize(&res, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004012 goto onError;
4013
4014 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004015 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004016 return (PyObject *)res;
4017
4018 onError:
4019 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004020 Py_XDECREF(res);
4021 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004022 return NULL;
4023}
4024
Tim Petersced69f82003-09-16 20:30:58 +00004025static
4026PyUnicodeObject *pad(PyUnicodeObject *self,
4027 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004028 int right,
4029 Py_UNICODE fill)
4030{
4031 PyUnicodeObject *u;
4032
4033 if (left < 0)
4034 left = 0;
4035 if (right < 0)
4036 right = 0;
4037
Tim Peters7a29bd52001-09-12 03:03:31 +00004038 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004039 Py_INCREF(self);
4040 return self;
4041 }
4042
4043 u = _PyUnicode_New(left + self->length + right);
4044 if (u) {
4045 if (left)
4046 Py_UNICODE_FILL(u->str, fill, left);
4047 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4048 if (right)
4049 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4050 }
4051
4052 return u;
4053}
4054
/* Append the slice data[left:right] to list as a new unicode object.
   The expansion relies on names supplied by the function that uses it:
   a `PyObject *str` scratch variable, the target `list`, and an
   `onError` label that is jumped to when creating or appending the
   slice fails.
   Wrapped in do { } while (0) so that the macro behaves as a single
   statement; write it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right) \
    do { \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str) \
            goto onError; \
        if (PyList_Append(list, str)) { \
            Py_DECREF(str); \
            goto onError; \
        } \
        /* the list holds its own reference now */ \
        Py_DECREF(str); \
    } while (0)
4065
4066static
4067PyObject *split_whitespace(PyUnicodeObject *self,
4068 PyObject *list,
4069 int maxcount)
4070{
4071 register int i;
4072 register int j;
4073 int len = self->length;
4074 PyObject *str;
4075
4076 for (i = j = 0; i < len; ) {
4077 /* find a token */
4078 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4079 i++;
4080 j = i;
4081 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4082 i++;
4083 if (j < i) {
4084 if (maxcount-- <= 0)
4085 break;
4086 SPLIT_APPEND(self->str, j, i);
4087 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4088 i++;
4089 j = i;
4090 }
4091 }
4092 if (j < len) {
4093 SPLIT_APPEND(self->str, j, len);
4094 }
4095 return list;
4096
4097 onError:
4098 Py_DECREF(list);
4099 return NULL;
4100}
4101
4102PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004103 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104{
4105 register int i;
4106 register int j;
4107 int len;
4108 PyObject *list;
4109 PyObject *str;
4110 Py_UNICODE *data;
4111
4112 string = PyUnicode_FromObject(string);
4113 if (string == NULL)
4114 return NULL;
4115 data = PyUnicode_AS_UNICODE(string);
4116 len = PyUnicode_GET_SIZE(string);
4117
Guido van Rossumd57fd912000-03-10 22:53:23 +00004118 list = PyList_New(0);
4119 if (!list)
4120 goto onError;
4121
4122 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004123 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004124
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 /* Find a line and append it */
4126 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4127 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128
4129 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004130 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131 if (i < len) {
4132 if (data[i] == '\r' && i + 1 < len &&
4133 data[i+1] == '\n')
4134 i += 2;
4135 else
4136 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004137 if (keepends)
4138 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139 }
Guido van Rossum86662912000-04-11 15:38:46 +00004140 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141 j = i;
4142 }
4143 if (j < len) {
4144 SPLIT_APPEND(data, j, len);
4145 }
4146
4147 Py_DECREF(string);
4148 return list;
4149
4150 onError:
4151 Py_DECREF(list);
4152 Py_DECREF(string);
4153 return NULL;
4154}
4155
Tim Petersced69f82003-09-16 20:30:58 +00004156static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004157PyObject *split_char(PyUnicodeObject *self,
4158 PyObject *list,
4159 Py_UNICODE ch,
4160 int maxcount)
4161{
4162 register int i;
4163 register int j;
4164 int len = self->length;
4165 PyObject *str;
4166
4167 for (i = j = 0; i < len; ) {
4168 if (self->str[i] == ch) {
4169 if (maxcount-- <= 0)
4170 break;
4171 SPLIT_APPEND(self->str, j, i);
4172 i = j = i + 1;
4173 } else
4174 i++;
4175 }
4176 if (j <= len) {
4177 SPLIT_APPEND(self->str, j, len);
4178 }
4179 return list;
4180
4181 onError:
4182 Py_DECREF(list);
4183 return NULL;
4184}
4185
Tim Petersced69f82003-09-16 20:30:58 +00004186static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004187PyObject *split_substring(PyUnicodeObject *self,
4188 PyObject *list,
4189 PyUnicodeObject *substring,
4190 int maxcount)
4191{
4192 register int i;
4193 register int j;
4194 int len = self->length;
4195 int sublen = substring->length;
4196 PyObject *str;
4197
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004198 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199 if (Py_UNICODE_MATCH(self, i, substring)) {
4200 if (maxcount-- <= 0)
4201 break;
4202 SPLIT_APPEND(self->str, j, i);
4203 i = j = i + sublen;
4204 } else
4205 i++;
4206 }
4207 if (j <= len) {
4208 SPLIT_APPEND(self->str, j, len);
4209 }
4210 return list;
4211
4212 onError:
4213 Py_DECREF(list);
4214 return NULL;
4215}
4216
4217#undef SPLIT_APPEND
4218
4219static
4220PyObject *split(PyUnicodeObject *self,
4221 PyUnicodeObject *substring,
4222 int maxcount)
4223{
4224 PyObject *list;
4225
4226 if (maxcount < 0)
4227 maxcount = INT_MAX;
4228
4229 list = PyList_New(0);
4230 if (!list)
4231 return NULL;
4232
4233 if (substring == NULL)
4234 return split_whitespace(self,list,maxcount);
4235
4236 else if (substring->length == 1)
4237 return split_char(self,list,substring->str[0],maxcount);
4238
4239 else if (substring->length == 0) {
4240 Py_DECREF(list);
4241 PyErr_SetString(PyExc_ValueError, "empty separator");
4242 return NULL;
4243 }
4244 else
4245 return split_substring(self,list,substring,maxcount);
4246}
4247
Tim Petersced69f82003-09-16 20:30:58 +00004248static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004249PyObject *replace(PyUnicodeObject *self,
4250 PyUnicodeObject *str1,
4251 PyUnicodeObject *str2,
4252 int maxcount)
4253{
4254 PyUnicodeObject *u;
4255
4256 if (maxcount < 0)
4257 maxcount = INT_MAX;
4258
4259 if (str1->length == 1 && str2->length == 1) {
4260 int i;
4261
4262 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004263 if (!findchar(self->str, self->length, str1->str[0]) &&
4264 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004265 /* nothing to replace, return original string */
4266 Py_INCREF(self);
4267 u = self;
4268 } else {
4269 Py_UNICODE u1 = str1->str[0];
4270 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004271
Guido van Rossumd57fd912000-03-10 22:53:23 +00004272 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004273 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004274 self->length
4275 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004276 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004277 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004278 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279 for (i = 0; i < u->length; i++)
4280 if (u->str[i] == u1) {
4281 if (--maxcount < 0)
4282 break;
4283 u->str[i] = u2;
4284 }
4285 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004286 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004287
4288 } else {
4289 int n, i;
4290 Py_UNICODE *p;
4291
4292 /* replace strings */
4293 n = count(self, 0, self->length, str1);
4294 if (n > maxcount)
4295 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004296 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004298 if (PyUnicode_CheckExact(self)) {
4299 Py_INCREF(self);
4300 u = self;
4301 }
4302 else {
4303 u = (PyUnicodeObject *)
4304 PyUnicode_FromUnicode(self->str, self->length);
4305 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004306 } else {
4307 u = _PyUnicode_New(
4308 self->length + n * (str2->length - str1->length));
4309 if (u) {
4310 i = 0;
4311 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004312 if (str1->length > 0) {
4313 while (i <= self->length - str1->length)
4314 if (Py_UNICODE_MATCH(self, i, str1)) {
4315 /* replace string segment */
4316 Py_UNICODE_COPY(p, str2->str, str2->length);
4317 p += str2->length;
4318 i += str1->length;
4319 if (--n <= 0) {
4320 /* copy remaining part */
4321 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4322 break;
4323 }
4324 } else
4325 *p++ = self->str[i++];
4326 } else {
4327 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328 Py_UNICODE_COPY(p, str2->str, str2->length);
4329 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004330 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004333 }
4334 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4335 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336 }
4337 }
4338 }
Tim Petersced69f82003-09-16 20:30:58 +00004339
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340 return (PyObject *) u;
4341}
4342
4343/* --- Unicode Object Methods --------------------------------------------- */
4344
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004345PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004346"S.title() -> unicode\n\
4347\n\
4348Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004349characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004350
4351static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004352unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004353{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 return fixup(self, fixtitle);
4355}
4356
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004357PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004358"S.capitalize() -> unicode\n\
4359\n\
4360Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004361have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004362
4363static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004364unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004366 return fixup(self, fixcapitalize);
4367}
4368
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* S.capwords() (currently compiled out): split self on whitespace,
   capitalize every word in place in the list, then join the words again.
   Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    int idx = 0;

    /* Break the string into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap each word for its capitalized version */
    while (idx < PyList_GET_SIZE(words)) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (result == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
        idx++;
    }

    /* Glue the words back together */
    result = PyUnicode_Join(NULL, words);

onError:
    /* Reached on success as well; result is NULL only after a failure. */
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
4406
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004407/* Argument converter. Coerces to a single unicode character */
4408
4409static int
4410convert_uc(PyObject *obj, void *addr)
4411{
4412 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4413 PyObject *uniobj;
4414 Py_UNICODE *unistr;
4415
4416 uniobj = PyUnicode_FromObject(obj);
4417 if (uniobj == NULL) {
4418 PyErr_SetString(PyExc_TypeError,
4419 "The fill character cannot be converted to Unicode");
4420 return 0;
4421 }
4422 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4423 PyErr_SetString(PyExc_TypeError,
4424 "The fill character must be exactly one character long");
4425 Py_DECREF(uniobj);
4426 return 0;
4427 }
4428 unistr = PyUnicode_AS_UNICODE(uniobj);
4429 *fillcharloc = unistr[0];
4430 Py_DECREF(uniobj);
4431 return 1;
4432}
4433
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004434PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004435"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004436\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004437Return S centered in a Unicode string of length width. Padding is\n\
4438done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439
4440static PyObject *
4441unicode_center(PyUnicodeObject *self, PyObject *args)
4442{
4443 int marg, left;
4444 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004445 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004446
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004447 if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004448 return NULL;
4449
Tim Peters7a29bd52001-09-12 03:03:31 +00004450 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004451 Py_INCREF(self);
4452 return (PyObject*) self;
4453 }
4454
4455 marg = width - self->length;
4456 left = marg / 2 + (marg & width & 1);
4457
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004458 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004459}
4460
Marc-André Lemburge5034372000-08-08 08:04:29 +00004461#if 0
4462
4463/* This code should go into some future Unicode collation support
4464 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004465 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004466
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004467/* speedy UTF-16 code point order comparison */
4468/* gleaned from: */
4469/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4470
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004471static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004472{
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004473 0, 0, 0, 0, 0, 0, 0, 0,
Tim Petersced69f82003-09-16 20:30:58 +00004474 0, 0, 0, 0, 0, 0, 0, 0,
4475 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004476 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004477};
4478
Guido van Rossumd57fd912000-03-10 22:53:23 +00004479static int
4480unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4481{
4482 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004483
Guido van Rossumd57fd912000-03-10 22:53:23 +00004484 Py_UNICODE *s1 = str1->str;
4485 Py_UNICODE *s2 = str2->str;
4486
4487 len1 = str1->length;
4488 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004489
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004491 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004492
4493 c1 = *s1++;
4494 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004495
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004496 if (c1 > (1<<11) * 26)
4497 c1 += utf16Fixup[c1>>11];
4498 if (c2 > (1<<11) * 26)
4499 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004500 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004501
4502 if (c1 != c2)
4503 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004504
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004505 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506 }
4507
4508 return (len1 < len2) ? -1 : (len1 != len2);
4509}
4510
Marc-André Lemburge5034372000-08-08 08:04:29 +00004511#else
4512
4513static int
4514unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4515{
4516 register int len1, len2;
4517
4518 Py_UNICODE *s1 = str1->str;
4519 Py_UNICODE *s2 = str2->str;
4520
4521 len1 = str1->length;
4522 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004523
Marc-André Lemburge5034372000-08-08 08:04:29 +00004524 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004525 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004526
Fredrik Lundh45714e92001-06-26 16:39:36 +00004527 c1 = *s1++;
4528 c2 = *s2++;
4529
4530 if (c1 != c2)
4531 return (c1 < c2) ? -1 : 1;
4532
Marc-André Lemburge5034372000-08-08 08:04:29 +00004533 len1--; len2--;
4534 }
4535
4536 return (len1 < len2) ? -1 : (len1 != len2);
4537}
4538
4539#endif
4540
Guido van Rossumd57fd912000-03-10 22:53:23 +00004541int PyUnicode_Compare(PyObject *left,
4542 PyObject *right)
4543{
4544 PyUnicodeObject *u = NULL, *v = NULL;
4545 int result;
4546
4547 /* Coerce the two arguments */
4548 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4549 if (u == NULL)
4550 goto onError;
4551 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4552 if (v == NULL)
4553 goto onError;
4554
Thomas Wouters7e474022000-07-16 12:04:32 +00004555 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004556 if (v == u) {
4557 Py_DECREF(u);
4558 Py_DECREF(v);
4559 return 0;
4560 }
4561
4562 result = unicode_compare(u, v);
4563
4564 Py_DECREF(u);
4565 Py_DECREF(v);
4566 return result;
4567
4568onError:
4569 Py_XDECREF(u);
4570 Py_XDECREF(v);
4571 return -1;
4572}
4573
Guido van Rossum403d68b2000-03-13 15:55:09 +00004574int PyUnicode_Contains(PyObject *container,
4575 PyObject *element)
4576{
4577 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004578 int result, size;
4579 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004580
4581 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004582 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004583 if (v == NULL) {
4584 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004585 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004586 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004587 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004588 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004589 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004590 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004591
Barry Warsaw817918c2002-08-06 16:58:21 +00004592 size = PyUnicode_GET_SIZE(v);
4593 rhs = PyUnicode_AS_UNICODE(v);
4594 lhs = PyUnicode_AS_UNICODE(u);
4595
Guido van Rossum403d68b2000-03-13 15:55:09 +00004596 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004597 if (size == 1) {
4598 end = lhs + PyUnicode_GET_SIZE(u);
4599 while (lhs < end) {
4600 if (*lhs++ == *rhs) {
4601 result = 1;
4602 break;
4603 }
4604 }
4605 }
4606 else {
4607 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4608 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004609 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004610 result = 1;
4611 break;
4612 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004613 }
4614 }
4615
4616 Py_DECREF(u);
4617 Py_DECREF(v);
4618 return result;
4619
4620onError:
4621 Py_XDECREF(u);
4622 Py_XDECREF(v);
4623 return -1;
4624}
4625
Guido van Rossumd57fd912000-03-10 22:53:23 +00004626/* Concat to string or Unicode object giving a new Unicode object. */
4627
4628PyObject *PyUnicode_Concat(PyObject *left,
4629 PyObject *right)
4630{
4631 PyUnicodeObject *u = NULL, *v = NULL, *w;
4632
4633 /* Coerce the two arguments */
4634 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4635 if (u == NULL)
4636 goto onError;
4637 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4638 if (v == NULL)
4639 goto onError;
4640
4641 /* Shortcuts */
4642 if (v == unicode_empty) {
4643 Py_DECREF(v);
4644 return (PyObject *)u;
4645 }
4646 if (u == unicode_empty) {
4647 Py_DECREF(u);
4648 return (PyObject *)v;
4649 }
4650
4651 /* Concat the two Unicode strings */
4652 w = _PyUnicode_New(u->length + v->length);
4653 if (w == NULL)
4654 goto onError;
4655 Py_UNICODE_COPY(w->str, u->str, u->length);
4656 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4657
4658 Py_DECREF(u);
4659 Py_DECREF(v);
4660 return (PyObject *)w;
4661
4662onError:
4663 Py_XDECREF(u);
4664 Py_XDECREF(v);
4665 return NULL;
4666}
4667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004668PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004669"S.count(sub[, start[, end]]) -> int\n\
4670\n\
4671Return the number of occurrences of substring sub in Unicode string\n\
4672S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004673interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674
4675static PyObject *
4676unicode_count(PyUnicodeObject *self, PyObject *args)
4677{
4678 PyUnicodeObject *substring;
4679 int start = 0;
4680 int end = INT_MAX;
4681 PyObject *result;
4682
Guido van Rossumb8872e62000-05-09 14:14:27 +00004683 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4684 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685 return NULL;
4686
4687 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4688 (PyObject *)substring);
4689 if (substring == NULL)
4690 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004691
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692 if (start < 0)
4693 start += self->length;
4694 if (start < 0)
4695 start = 0;
4696 if (end > self->length)
4697 end = self->length;
4698 if (end < 0)
4699 end += self->length;
4700 if (end < 0)
4701 end = 0;
4702
4703 result = PyInt_FromLong((long) count(self, start, end, substring));
4704
4705 Py_DECREF(substring);
4706 return result;
4707}
4708
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004709PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004710"S.encode([encoding[,errors]]) -> string\n\
4711\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004712Return an encoded string version of S. Default encoding is the current\n\
4713default string encoding. errors may be given to set a different error\n\
4714handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004715a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4716'xmlcharrefreplace' as well as any other name registered with\n\
4717codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718
4719static PyObject *
4720unicode_encode(PyUnicodeObject *self, PyObject *args)
4721{
4722 char *encoding = NULL;
4723 char *errors = NULL;
4724 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4725 return NULL;
4726 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4727}
4728
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004729PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730"S.expandtabs([tabsize]) -> unicode\n\
4731\n\
4732Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004733If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734
4735static PyObject*
4736unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4737{
4738 Py_UNICODE *e;
4739 Py_UNICODE *p;
4740 Py_UNICODE *q;
4741 int i, j;
4742 PyUnicodeObject *u;
4743 int tabsize = 8;
4744
4745 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4746 return NULL;
4747
Thomas Wouters7e474022000-07-16 12:04:32 +00004748 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004749 i = j = 0;
4750 e = self->str + self->length;
4751 for (p = self->str; p < e; p++)
4752 if (*p == '\t') {
4753 if (tabsize > 0)
4754 j += tabsize - (j % tabsize);
4755 }
4756 else {
4757 j++;
4758 if (*p == '\n' || *p == '\r') {
4759 i += j;
4760 j = 0;
4761 }
4762 }
4763
4764 /* Second pass: create output string and fill it */
4765 u = _PyUnicode_New(i + j);
4766 if (!u)
4767 return NULL;
4768
4769 j = 0;
4770 q = u->str;
4771
4772 for (p = self->str; p < e; p++)
4773 if (*p == '\t') {
4774 if (tabsize > 0) {
4775 i = tabsize - (j % tabsize);
4776 j += i;
4777 while (i--)
4778 *q++ = ' ';
4779 }
4780 }
4781 else {
4782 j++;
4783 *q++ = *p;
4784 if (*p == '\n' || *p == '\r')
4785 j = 0;
4786 }
4787
4788 return (PyObject*) u;
4789}
4790
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004791PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792"S.find(sub [,start [,end]]) -> int\n\
4793\n\
4794Return the lowest index in S where substring sub is found,\n\
4795such that sub is contained within s[start,end]. Optional\n\
4796arguments start and end are interpreted as in slice notation.\n\
4797\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004798Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799
4800static PyObject *
4801unicode_find(PyUnicodeObject *self, PyObject *args)
4802{
4803 PyUnicodeObject *substring;
4804 int start = 0;
4805 int end = INT_MAX;
4806 PyObject *result;
4807
Guido van Rossumb8872e62000-05-09 14:14:27 +00004808 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4809 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810 return NULL;
4811 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4812 (PyObject *)substring);
4813 if (substring == NULL)
4814 return NULL;
4815
4816 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4817
4818 Py_DECREF(substring);
4819 return result;
4820}
4821
4822static PyObject *
4823unicode_getitem(PyUnicodeObject *self, int index)
4824{
4825 if (index < 0 || index >= self->length) {
4826 PyErr_SetString(PyExc_IndexError, "string index out of range");
4827 return NULL;
4828 }
4829
4830 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4831}
4832
4833static long
4834unicode_hash(PyUnicodeObject *self)
4835{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004836 /* Since Unicode objects compare equal to their ASCII string
4837 counterparts, they should use the individual character values
4838 as basis for their hash value. This is needed to assure that
4839 strings and Unicode objects behave in the same way as
4840 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841
Fredrik Lundhdde61642000-07-10 18:27:47 +00004842 register int len;
4843 register Py_UNICODE *p;
4844 register long x;
4845
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 if (self->hash != -1)
4847 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004848 len = PyUnicode_GET_SIZE(self);
4849 p = PyUnicode_AS_UNICODE(self);
4850 x = *p << 7;
4851 while (--len >= 0)
4852 x = (1000003*x) ^ *p++;
4853 x ^= PyUnicode_GET_SIZE(self);
4854 if (x == -1)
4855 x = -2;
4856 self->hash = x;
4857 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004858}
4859
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004860PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861"S.index(sub [,start [,end]]) -> int\n\
4862\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004863Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864
4865static PyObject *
4866unicode_index(PyUnicodeObject *self, PyObject *args)
4867{
4868 int result;
4869 PyUnicodeObject *substring;
4870 int start = 0;
4871 int end = INT_MAX;
4872
Guido van Rossumb8872e62000-05-09 14:14:27 +00004873 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4874 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004876
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4878 (PyObject *)substring);
4879 if (substring == NULL)
4880 return NULL;
4881
4882 result = findstring(self, substring, start, end, 1);
4883
4884 Py_DECREF(substring);
4885 if (result < 0) {
4886 PyErr_SetString(PyExc_ValueError, "substring not found");
4887 return NULL;
4888 }
4889 return PyInt_FromLong(result);
4890}
4891
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004892PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004893"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004895Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004896at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897
4898static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004899unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900{
4901 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4902 register const Py_UNICODE *e;
4903 int cased;
4904
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905 /* Shortcut for single character strings */
4906 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004907 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004908
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004909 /* Special case for empty strings */
4910 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004911 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004912
Guido van Rossumd57fd912000-03-10 22:53:23 +00004913 e = p + PyUnicode_GET_SIZE(self);
4914 cased = 0;
4915 for (; p < e; p++) {
4916 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004917
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004919 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920 else if (!cased && Py_UNICODE_ISLOWER(ch))
4921 cased = 1;
4922 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004923 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924}
4925
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004926PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004927"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00004929Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004930at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004931
4932static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004933unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934{
4935 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4936 register const Py_UNICODE *e;
4937 int cased;
4938
Guido van Rossumd57fd912000-03-10 22:53:23 +00004939 /* Shortcut for single character strings */
4940 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004941 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004942
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004943 /* Special case for empty strings */
4944 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004945 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004946
Guido van Rossumd57fd912000-03-10 22:53:23 +00004947 e = p + PyUnicode_GET_SIZE(self);
4948 cased = 0;
4949 for (; p < e; p++) {
4950 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004951
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004953 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004954 else if (!cased && Py_UNICODE_ISUPPER(ch))
4955 cased = 1;
4956 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004957 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004958}
4959
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004960PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004961"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004962\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00004963Return True if S is a titlecased string and there is at least one\n\
4964character in S, i.e. upper- and titlecase characters may only\n\
4965follow uncased characters and lowercase characters only cased ones.\n\
4966Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004967
4968static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004969unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004970{
4971 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4972 register const Py_UNICODE *e;
4973 int cased, previous_is_cased;
4974
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975 /* Shortcut for single character strings */
4976 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004977 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4978 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004980 /* Special case for empty strings */
4981 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004982 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004983
Guido van Rossumd57fd912000-03-10 22:53:23 +00004984 e = p + PyUnicode_GET_SIZE(self);
4985 cased = 0;
4986 previous_is_cased = 0;
4987 for (; p < e; p++) {
4988 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00004989
Guido van Rossumd57fd912000-03-10 22:53:23 +00004990 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4991 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004992 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993 previous_is_cased = 1;
4994 cased = 1;
4995 }
4996 else if (Py_UNICODE_ISLOWER(ch)) {
4997 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004998 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004999 previous_is_cased = 1;
5000 cased = 1;
5001 }
5002 else
5003 previous_is_cased = 0;
5004 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005005 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006}
5007
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005008PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005009"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005010\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005011Return True if all characters in S are whitespace\n\
5012and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013
5014static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005015unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016{
5017 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5018 register const Py_UNICODE *e;
5019
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020 /* Shortcut for single character strings */
5021 if (PyUnicode_GET_SIZE(self) == 1 &&
5022 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005023 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005025 /* Special case for empty strings */
5026 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005027 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005028
Guido van Rossumd57fd912000-03-10 22:53:23 +00005029 e = p + PyUnicode_GET_SIZE(self);
5030 for (; p < e; p++) {
5031 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005032 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005033 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005034 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035}
5036
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005037PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005038"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005039\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005040Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005041and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005042
5043static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005044unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005045{
5046 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5047 register const Py_UNICODE *e;
5048
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005049 /* Shortcut for single character strings */
5050 if (PyUnicode_GET_SIZE(self) == 1 &&
5051 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005052 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005053
5054 /* Special case for empty strings */
5055 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005056 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005057
5058 e = p + PyUnicode_GET_SIZE(self);
5059 for (; p < e; p++) {
5060 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005061 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005062 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005063 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005064}
5065
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005066PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005067"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005068\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005069Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005070and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005071
5072static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005073unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005074{
5075 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5076 register const Py_UNICODE *e;
5077
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005078 /* Shortcut for single character strings */
5079 if (PyUnicode_GET_SIZE(self) == 1 &&
5080 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005081 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005082
5083 /* Special case for empty strings */
5084 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005085 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005086
5087 e = p + PyUnicode_GET_SIZE(self);
5088 for (; p < e; p++) {
5089 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005090 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005091 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005092 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005093}
5094
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005095PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005096"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005098Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005099False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005100
5101static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005102unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103{
5104 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5105 register const Py_UNICODE *e;
5106
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107 /* Shortcut for single character strings */
5108 if (PyUnicode_GET_SIZE(self) == 1 &&
5109 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005110 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005112 /* Special case for empty strings */
5113 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005114 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005115
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116 e = p + PyUnicode_GET_SIZE(self);
5117 for (; p < e; p++) {
5118 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005119 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005121 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122}
5123
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005124PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005125"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005126\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005127Return True if all characters in S are digits\n\
5128and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129
5130static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005131unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005132{
5133 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5134 register const Py_UNICODE *e;
5135
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 /* Shortcut for single character strings */
5137 if (PyUnicode_GET_SIZE(self) == 1 &&
5138 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005139 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005140
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005141 /* Special case for empty strings */
5142 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005143 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005144
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 e = p + PyUnicode_GET_SIZE(self);
5146 for (; p < e; p++) {
5147 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005148 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005150 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005151}
5152
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005153PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005154"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005156Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005157False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158
5159static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005160unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161{
5162 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5163 register const Py_UNICODE *e;
5164
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165 /* Shortcut for single character strings */
5166 if (PyUnicode_GET_SIZE(self) == 1 &&
5167 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005168 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005170 /* Special case for empty strings */
5171 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005172 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005173
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 e = p + PyUnicode_GET_SIZE(self);
5175 for (; p < e; p++) {
5176 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005177 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005179 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180}
5181
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005182PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183"S.join(sequence) -> unicode\n\
5184\n\
5185Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005186sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005187
5188static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005189unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005191 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192}
5193
5194static int
5195unicode_length(PyUnicodeObject *self)
5196{
5197 return self->length;
5198}
5199
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005200PyDoc_STRVAR(ljust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005201"S.ljust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202\n\
5203Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005204done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205
5206static PyObject *
5207unicode_ljust(PyUnicodeObject *self, PyObject *args)
5208{
5209 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005210 Py_UNICODE fillchar = ' ';
5211
5212 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213 return NULL;
5214
Tim Peters7a29bd52001-09-12 03:03:31 +00005215 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216 Py_INCREF(self);
5217 return (PyObject*) self;
5218 }
5219
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005220 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221}
5222
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005223PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224"S.lower() -> unicode\n\
5225\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005226Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227
5228static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005229unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231 return fixup(self, fixlower);
5232}
5233
/* Strip modes; the values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip mode above */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string without its
   leading three characters ("|O:"). */
#define STRIPNAME(i) (stripformat[i]+3)
5242
5243static const Py_UNICODE *
5244unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5245{
Tim Peters030a5ce2002-04-22 19:00:10 +00005246 size_t i;
5247 for (i = 0; i < n; ++i)
5248 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005249 return s+i;
5250 return NULL;
5251}
5252
5253/* externally visible for str.strip(unicode) */
5254PyObject *
5255_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5256{
5257 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5258 int len = PyUnicode_GET_SIZE(self);
5259 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5260 int seplen = PyUnicode_GET_SIZE(sepobj);
5261 int i, j;
5262
5263 i = 0;
5264 if (striptype != RIGHTSTRIP) {
5265 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5266 i++;
5267 }
5268 }
5269
5270 j = len;
5271 if (striptype != LEFTSTRIP) {
5272 do {
5273 j--;
5274 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5275 j++;
5276 }
5277
5278 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5279 Py_INCREF(self);
5280 return (PyObject*)self;
5281 }
5282 else
5283 return PyUnicode_FromUnicode(s+i, j-i);
5284}
5285
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286
5287static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005288do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005289{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005290 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5291 int len = PyUnicode_GET_SIZE(self), i, j;
5292
5293 i = 0;
5294 if (striptype != RIGHTSTRIP) {
5295 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5296 i++;
5297 }
5298 }
5299
5300 j = len;
5301 if (striptype != LEFTSTRIP) {
5302 do {
5303 j--;
5304 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5305 j++;
5306 }
5307
5308 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5309 Py_INCREF(self);
5310 return (PyObject*)self;
5311 }
5312 else
5313 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314}
5315
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005316
5317static PyObject *
5318do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5319{
5320 PyObject *sep = NULL;
5321
5322 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5323 return NULL;
5324
5325 if (sep != NULL && sep != Py_None) {
5326 if (PyUnicode_Check(sep))
5327 return _PyUnicode_XStrip(self, striptype, sep);
5328 else if (PyString_Check(sep)) {
5329 PyObject *res;
5330 sep = PyUnicode_FromObject(sep);
5331 if (sep==NULL)
5332 return NULL;
5333 res = _PyUnicode_XStrip(self, striptype, sep);
5334 Py_DECREF(sep);
5335 return res;
5336 }
5337 else {
5338 PyErr_Format(PyExc_TypeError,
5339 "%s arg must be None, unicode or str",
5340 STRIPNAME(striptype));
5341 return NULL;
5342 }
5343 }
5344
5345 return do_strip(self, striptype);
5346}
5347
5348
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005349PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005350"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005351\n\
5352Return a copy of the string S with leading and trailing\n\
5353whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005354If chars is given and not None, remove characters in chars instead.\n\
5355If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005356
5357static PyObject *
5358unicode_strip(PyUnicodeObject *self, PyObject *args)
5359{
5360 if (PyTuple_GET_SIZE(args) == 0)
5361 return do_strip(self, BOTHSTRIP); /* Common case */
5362 else
5363 return do_argstrip(self, BOTHSTRIP, args);
5364}
5365
5366
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005367PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005368"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005369\n\
5370Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005371If chars is given and not None, remove characters in chars instead.\n\
5372If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005373
5374static PyObject *
5375unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5376{
5377 if (PyTuple_GET_SIZE(args) == 0)
5378 return do_strip(self, LEFTSTRIP); /* Common case */
5379 else
5380 return do_argstrip(self, LEFTSTRIP, args);
5381}
5382
5383
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005384PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005385"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005386\n\
5387Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005388If chars is given and not None, remove characters in chars instead.\n\
5389If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005390
5391static PyObject *
5392unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5393{
5394 if (PyTuple_GET_SIZE(args) == 0)
5395 return do_strip(self, RIGHTSTRIP); /* Common case */
5396 else
5397 return do_argstrip(self, RIGHTSTRIP, args);
5398}
5399
5400
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401static PyObject*
5402unicode_repeat(PyUnicodeObject *str, int len)
5403{
5404 PyUnicodeObject *u;
5405 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005406 int nchars;
5407 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005408
5409 if (len < 0)
5410 len = 0;
5411
Tim Peters7a29bd52001-09-12 03:03:31 +00005412 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005413 /* no repeat, return original string */
5414 Py_INCREF(str);
5415 return (PyObject*) str;
5416 }
Tim Peters8f422462000-09-09 06:13:41 +00005417
5418 /* ensure # of chars needed doesn't overflow int and # of bytes
5419 * needed doesn't overflow size_t
5420 */
5421 nchars = len * str->length;
5422 if (len && nchars / len != str->length) {
5423 PyErr_SetString(PyExc_OverflowError,
5424 "repeated string is too long");
5425 return NULL;
5426 }
5427 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5428 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5429 PyErr_SetString(PyExc_OverflowError,
5430 "repeated string is too long");
5431 return NULL;
5432 }
5433 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005434 if (!u)
5435 return NULL;
5436
5437 p = u->str;
5438
5439 while (len-- > 0) {
5440 Py_UNICODE_COPY(p, str->str, str->length);
5441 p += str->length;
5442 }
5443
5444 return (PyObject*) u;
5445}
5446
5447PyObject *PyUnicode_Replace(PyObject *obj,
5448 PyObject *subobj,
5449 PyObject *replobj,
5450 int maxcount)
5451{
5452 PyObject *self;
5453 PyObject *str1;
5454 PyObject *str2;
5455 PyObject *result;
5456
5457 self = PyUnicode_FromObject(obj);
5458 if (self == NULL)
5459 return NULL;
5460 str1 = PyUnicode_FromObject(subobj);
5461 if (str1 == NULL) {
5462 Py_DECREF(self);
5463 return NULL;
5464 }
5465 str2 = PyUnicode_FromObject(replobj);
5466 if (str2 == NULL) {
5467 Py_DECREF(self);
5468 Py_DECREF(str1);
5469 return NULL;
5470 }
Tim Petersced69f82003-09-16 20:30:58 +00005471 result = replace((PyUnicodeObject *)self,
5472 (PyUnicodeObject *)str1,
5473 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 maxcount);
5475 Py_DECREF(self);
5476 Py_DECREF(str1);
5477 Py_DECREF(str2);
5478 return result;
5479}
5480
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005481PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482"S.replace (old, new[, maxsplit]) -> unicode\n\
5483\n\
5484Return a copy of S with all occurrences of substring\n\
5485old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005486given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487
5488static PyObject*
5489unicode_replace(PyUnicodeObject *self, PyObject *args)
5490{
5491 PyUnicodeObject *str1;
5492 PyUnicodeObject *str2;
5493 int maxcount = -1;
5494 PyObject *result;
5495
5496 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5497 return NULL;
5498 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5499 if (str1 == NULL)
5500 return NULL;
5501 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005502 if (str2 == NULL) {
5503 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005505 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506
5507 result = replace(self, str1, str2, maxcount);
5508
5509 Py_DECREF(str1);
5510 Py_DECREF(str2);
5511 return result;
5512}
5513
5514static
5515PyObject *unicode_repr(PyObject *unicode)
5516{
5517 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5518 PyUnicode_GET_SIZE(unicode),
5519 1);
5520}
5521
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005522PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523"S.rfind(sub [,start [,end]]) -> int\n\
5524\n\
5525Return the highest index in S where substring sub is found,\n\
5526such that sub is contained within s[start,end]. Optional\n\
5527arguments start and end are interpreted as in slice notation.\n\
5528\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005529Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530
5531static PyObject *
5532unicode_rfind(PyUnicodeObject *self, PyObject *args)
5533{
5534 PyUnicodeObject *substring;
5535 int start = 0;
5536 int end = INT_MAX;
5537 PyObject *result;
5538
Guido van Rossumb8872e62000-05-09 14:14:27 +00005539 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5540 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 return NULL;
5542 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5543 (PyObject *)substring);
5544 if (substring == NULL)
5545 return NULL;
5546
5547 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5548
5549 Py_DECREF(substring);
5550 return result;
5551}
5552
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005553PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005554"S.rindex(sub [,start [,end]]) -> int\n\
5555\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005556Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557
5558static PyObject *
5559unicode_rindex(PyUnicodeObject *self, PyObject *args)
5560{
5561 int result;
5562 PyUnicodeObject *substring;
5563 int start = 0;
5564 int end = INT_MAX;
5565
Guido van Rossumb8872e62000-05-09 14:14:27 +00005566 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5567 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568 return NULL;
5569 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5570 (PyObject *)substring);
5571 if (substring == NULL)
5572 return NULL;
5573
5574 result = findstring(self, substring, start, end, -1);
5575
5576 Py_DECREF(substring);
5577 if (result < 0) {
5578 PyErr_SetString(PyExc_ValueError, "substring not found");
5579 return NULL;
5580 }
5581 return PyInt_FromLong(result);
5582}
5583
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005584PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005585"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586\n\
5587Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005588done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589
5590static PyObject *
5591unicode_rjust(PyUnicodeObject *self, PyObject *args)
5592{
5593 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005594 Py_UNICODE fillchar = ' ';
5595
5596 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597 return NULL;
5598
Tim Peters7a29bd52001-09-12 03:03:31 +00005599 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600 Py_INCREF(self);
5601 return (PyObject*) self;
5602 }
5603
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005604 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605}
5606
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607static PyObject*
5608unicode_slice(PyUnicodeObject *self, int start, int end)
5609{
5610 /* standard clamping */
5611 if (start < 0)
5612 start = 0;
5613 if (end < 0)
5614 end = 0;
5615 if (end > self->length)
5616 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005617 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005618 /* full slice, return original string */
5619 Py_INCREF(self);
5620 return (PyObject*) self;
5621 }
5622 if (start > end)
5623 start = end;
5624 /* copy slice */
5625 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5626 end - start);
5627}
5628
5629PyObject *PyUnicode_Split(PyObject *s,
5630 PyObject *sep,
5631 int maxsplit)
5632{
5633 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005634
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635 s = PyUnicode_FromObject(s);
5636 if (s == NULL)
5637 return NULL;
5638 if (sep != NULL) {
5639 sep = PyUnicode_FromObject(sep);
5640 if (sep == NULL) {
5641 Py_DECREF(s);
5642 return NULL;
5643 }
5644 }
5645
5646 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5647
5648 Py_DECREF(s);
5649 Py_XDECREF(sep);
5650 return result;
5651}
5652
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005653PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005654"S.split([sep [,maxsplit]]) -> list of strings\n\
5655\n\
5656Return a list of the words in S, using sep as the\n\
5657delimiter string. If maxsplit is given, at most maxsplit\n\
5658splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005659is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660
5661static PyObject*
5662unicode_split(PyUnicodeObject *self, PyObject *args)
5663{
5664 PyObject *substring = Py_None;
5665 int maxcount = -1;
5666
5667 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5668 return NULL;
5669
5670 if (substring == Py_None)
5671 return split(self, NULL, maxcount);
5672 else if (PyUnicode_Check(substring))
5673 return split(self, (PyUnicodeObject *)substring, maxcount);
5674 else
5675 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5676}
5677
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005678PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00005679"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005680\n\
5681Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00005682Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005683is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005684
5685static PyObject*
5686unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5687{
Guido van Rossum86662912000-04-11 15:38:46 +00005688 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689
Guido van Rossum86662912000-04-11 15:38:46 +00005690 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691 return NULL;
5692
Guido van Rossum86662912000-04-11 15:38:46 +00005693 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694}
5695
5696static
5697PyObject *unicode_str(PyUnicodeObject *self)
5698{
Fred Drakee4315f52000-05-09 19:53:39 +00005699 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005700}
5701
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005702PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703"S.swapcase() -> unicode\n\
5704\n\
5705Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005706and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005707
5708static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005709unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711 return fixup(self, fixswapcase);
5712}
5713
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005714PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005715"S.translate(table) -> unicode\n\
5716\n\
5717Return a copy of the string S, where all characters have been mapped\n\
5718through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00005719Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5720Unmapped characters are left untouched. Characters mapped to None\n\
5721are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005722
5723static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005724unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725{
Tim Petersced69f82003-09-16 20:30:58 +00005726 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00005728 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729 "ignore");
5730}
5731
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005732PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733"S.upper() -> unicode\n\
5734\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005735Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736
5737static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005738unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 return fixup(self, fixupper);
5741}
5742
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005743PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744"S.zfill(width) -> unicode\n\
5745\n\
5746Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005747of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748
5749static PyObject *
5750unicode_zfill(PyUnicodeObject *self, PyObject *args)
5751{
5752 int fill;
5753 PyUnicodeObject *u;
5754
5755 int width;
5756 if (!PyArg_ParseTuple(args, "i:zfill", &width))
5757 return NULL;
5758
5759 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00005760 if (PyUnicode_CheckExact(self)) {
5761 Py_INCREF(self);
5762 return (PyObject*) self;
5763 }
5764 else
5765 return PyUnicode_FromUnicode(
5766 PyUnicode_AS_UNICODE(self),
5767 PyUnicode_GET_SIZE(self)
5768 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005769 }
5770
5771 fill = width - self->length;
5772
5773 u = pad(self, fill, 0, '0');
5774
Walter Dörwald068325e2002-04-15 13:36:47 +00005775 if (u == NULL)
5776 return NULL;
5777
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 if (u->str[fill] == '+' || u->str[fill] == '-') {
5779 /* move sign to beginning of string */
5780 u->str[0] = u->str[fill];
5781 u->str[fill] = '0';
5782 }
5783
5784 return (PyObject*) u;
5785}
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786
#if 0
/* Debugging helper, currently compiled out: reports the value of
   unicode_freelist_size as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self)
{
    long current_size = unicode_freelist_size;

    return PyInt_FromLong(current_size);
}
#endif
5794
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005795PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005796"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005797\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00005798Return True if S starts with the specified prefix, False otherwise.\n\
5799With optional start, test S beginning at that position.\n\
5800With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801
5802static PyObject *
5803unicode_startswith(PyUnicodeObject *self,
5804 PyObject *args)
5805{
5806 PyUnicodeObject *substring;
5807 int start = 0;
5808 int end = INT_MAX;
5809 PyObject *result;
5810
Guido van Rossumb8872e62000-05-09 14:14:27 +00005811 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5812 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 return NULL;
5814 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5815 (PyObject *)substring);
5816 if (substring == NULL)
5817 return NULL;
5818
Guido van Rossum77f6a652002-04-03 22:41:51 +00005819 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820
5821 Py_DECREF(substring);
5822 return result;
5823}
5824
5825
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005826PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005827"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005828\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00005829Return True if S ends with the specified suffix, False otherwise.\n\
5830With optional start, test S beginning at that position.\n\
5831With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832
5833static PyObject *
5834unicode_endswith(PyUnicodeObject *self,
5835 PyObject *args)
5836{
5837 PyUnicodeObject *substring;
5838 int start = 0;
5839 int end = INT_MAX;
5840 PyObject *result;
5841
Guido van Rossumb8872e62000-05-09 14:14:27 +00005842 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5843 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844 return NULL;
5845 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5846 (PyObject *)substring);
5847 if (substring == NULL)
5848 return NULL;
5849
Guido van Rossum77f6a652002-04-03 22:41:51 +00005850 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851
5852 Py_DECREF(substring);
5853 return result;
5854}
5855
5856
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005857
5858static PyObject *
5859unicode_getnewargs(PyUnicodeObject *v)
5860{
5861 return Py_BuildValue("(u#)", v->str, v->length);
5862}
5863
5864
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865static PyMethodDef unicode_methods[] = {
5866
5867 /* Order is according to common usage: often used methods should
5868 appear first, since lookup is done sequentially. */
5869
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005870 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
5871 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
5872 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
5873 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
5874 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
5875 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
5876 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
5877 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
5878 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
5879 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
5880 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
5881 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
5882 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005883 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005884/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
5885 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
5886 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
5887 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005888 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005889 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005890 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005891 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
5892 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
5893 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
5894 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
5895 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
5896 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
5897 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
5898 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
5899 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
5900 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
5901 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
5902 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
5903 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
5904 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005905 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00005906#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005907 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908#endif
5909
5910#if 0
5911 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005912 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005913#endif
5914
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005915 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 {NULL, NULL}
5917};
5918
Neil Schemenauerce30bc92002-11-18 16:10:18 +00005919static PyObject *
5920unicode_mod(PyObject *v, PyObject *w)
5921{
5922 if (!PyUnicode_Check(v)) {
5923 Py_INCREF(Py_NotImplemented);
5924 return Py_NotImplemented;
5925 }
5926 return PyUnicode_Format(v, w);
5927}
5928
5929static PyNumberMethods unicode_as_number = {
5930 0, /*nb_add*/
5931 0, /*nb_subtract*/
5932 0, /*nb_multiply*/
5933 0, /*nb_divide*/
5934 unicode_mod, /*nb_remainder*/
5935};
5936
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937static PySequenceMethods unicode_as_sequence = {
5938 (inquiry) unicode_length, /* sq_length */
5939 (binaryfunc) PyUnicode_Concat, /* sq_concat */
5940 (intargfunc) unicode_repeat, /* sq_repeat */
5941 (intargfunc) unicode_getitem, /* sq_item */
5942 (intintargfunc) unicode_slice, /* sq_slice */
5943 0, /* sq_ass_item */
5944 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00005945 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946};
5947
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005948static PyObject*
5949unicode_subscript(PyUnicodeObject* self, PyObject* item)
5950{
5951 if (PyInt_Check(item)) {
5952 long i = PyInt_AS_LONG(item);
5953 if (i < 0)
5954 i += PyString_GET_SIZE(self);
5955 return unicode_getitem(self, i);
5956 } else if (PyLong_Check(item)) {
5957 long i = PyLong_AsLong(item);
5958 if (i == -1 && PyErr_Occurred())
5959 return NULL;
5960 if (i < 0)
5961 i += PyString_GET_SIZE(self);
5962 return unicode_getitem(self, i);
5963 } else if (PySlice_Check(item)) {
5964 int start, stop, step, slicelength, cur, i;
5965 Py_UNICODE* source_buf;
5966 Py_UNICODE* result_buf;
5967 PyObject* result;
5968
5969 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5970 &start, &stop, &step, &slicelength) < 0) {
5971 return NULL;
5972 }
5973
5974 if (slicelength <= 0) {
5975 return PyUnicode_FromUnicode(NULL, 0);
5976 } else {
5977 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5978 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5979
5980 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5981 result_buf[i] = source_buf[cur];
5982 }
Tim Petersced69f82003-09-16 20:30:58 +00005983
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005984 result = PyUnicode_FromUnicode(result_buf, slicelength);
5985 PyMem_FREE(result_buf);
5986 return result;
5987 }
5988 } else {
5989 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5990 return NULL;
5991 }
5992}
5993
5994static PyMappingMethods unicode_as_mapping = {
5995 (inquiry)unicode_length, /* mp_length */
5996 (binaryfunc)unicode_subscript, /* mp_subscript */
5997 (objobjargproc)0, /* mp_ass_subscript */
5998};
5999
Guido van Rossumd57fd912000-03-10 22:53:23 +00006000static int
6001unicode_buffer_getreadbuf(PyUnicodeObject *self,
6002 int index,
6003 const void **ptr)
6004{
6005 if (index != 0) {
6006 PyErr_SetString(PyExc_SystemError,
6007 "accessing non-existent unicode segment");
6008 return -1;
6009 }
6010 *ptr = (void *) self->str;
6011 return PyUnicode_GET_DATA_SIZE(self);
6012}
6013
6014static int
6015unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6016 const void **ptr)
6017{
6018 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006019 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020 return -1;
6021}
6022
6023static int
6024unicode_buffer_getsegcount(PyUnicodeObject *self,
6025 int *lenp)
6026{
6027 if (lenp)
6028 *lenp = PyUnicode_GET_DATA_SIZE(self);
6029 return 1;
6030}
6031
6032static int
6033unicode_buffer_getcharbuf(PyUnicodeObject *self,
6034 int index,
6035 const void **ptr)
6036{
6037 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006038
Guido van Rossumd57fd912000-03-10 22:53:23 +00006039 if (index != 0) {
6040 PyErr_SetString(PyExc_SystemError,
6041 "accessing non-existent unicode segment");
6042 return -1;
6043 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006044 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045 if (str == NULL)
6046 return -1;
6047 *ptr = (void *) PyString_AS_STRING(str);
6048 return PyString_GET_SIZE(str);
6049}
6050
6051/* Helpers for PyUnicode_Format() */
6052
6053static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006054getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055{
6056 int argidx = *p_argidx;
6057 if (argidx < arglen) {
6058 (*p_argidx)++;
6059 if (arglen < 0)
6060 return args;
6061 else
6062 return PyTuple_GetItem(args, argidx);
6063 }
6064 PyErr_SetString(PyExc_TypeError,
6065 "not enough arguments for format string");
6066 return NULL;
6067}
6068
/* Bit flags collected while parsing a format specifier; the comment on
   each line gives the flag character that sets it. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
6074
6075static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077{
6078 register int i;
6079 int len;
6080 va_list va;
6081 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006082 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006083
6084 /* First, format the string as char array, then expand to Py_UNICODE
6085 array. */
6086 charbuffer = (char *)buffer;
6087 len = vsprintf(charbuffer, format, va);
6088 for (i = len - 1; i >= 0; i--)
6089 buffer[i] = (Py_UNICODE) charbuffer[i];
6090
6091 va_end(va);
6092 return len;
6093}
6094
Guido van Rossum078151d2002-08-11 04:24:12 +00006095/* XXX To save some code duplication, formatfloat/long/int could have been
6096 shared with stringobject.c, converting from 8-bit to Unicode after the
6097 formatting is done. */
6098
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099static int
6100formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006101 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006102 int flags,
6103 int prec,
6104 int type,
6105 PyObject *v)
6106{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006107 /* fmt = '%#.' + `prec` + `type`
6108 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109 char fmt[20];
6110 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006111
Guido van Rossumd57fd912000-03-10 22:53:23 +00006112 x = PyFloat_AsDouble(v);
6113 if (x == -1.0 && PyErr_Occurred())
6114 return -1;
6115 if (prec < 0)
6116 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006117 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6118 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006119 /* Worst case length calc to ensure no buffer overrun:
6120
6121 'g' formats:
6122 fmt = %#.<prec>g
6123 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6124 for any double rep.)
6125 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6126
6127 'f' formats:
6128 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6129 len = 1 + 50 + 1 + prec = 52 + prec
6130
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006131 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006132 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006133
6134 */
6135 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6136 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006137 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006138 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006139 return -1;
6140 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006141 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6142 (flags&F_ALT) ? "#" : "",
6143 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 return usprintf(buf, fmt, x);
6145}
6146
Tim Peters38fd5b62000-09-21 05:43:11 +00006147static PyObject*
6148formatlong(PyObject *val, int flags, int prec, int type)
6149{
6150 char *buf;
6151 int i, len;
6152 PyObject *str; /* temporary string object. */
6153 PyUnicodeObject *result;
6154
6155 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6156 if (!str)
6157 return NULL;
6158 result = _PyUnicode_New(len);
6159 for (i = 0; i < len; i++)
6160 result->str[i] = buf[i];
6161 result->str[len] = 0;
6162 Py_DECREF(str);
6163 return (PyObject*)result;
6164}
6165
Guido van Rossumd57fd912000-03-10 22:53:23 +00006166static int
6167formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006168 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 int flags,
6170 int prec,
6171 int type,
6172 PyObject *v)
6173{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006174 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006175 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6176 * + 1 + 1
6177 * = 24
6178 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006179 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006180 long x;
6181
6182 x = PyInt_AsLong(v);
6183 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006184 return -1;
Guido van Rossum078151d2002-08-11 04:24:12 +00006185 if (x < 0 && type != 'd' && type != 'i') {
Guido van Rossum54df53a2002-08-14 18:38:27 +00006186 if (PyErr_Warn(PyExc_FutureWarning,
Guido van Rossum078151d2002-08-11 04:24:12 +00006187 "%u/%o/%x/%X of negative int will return "
6188 "a signed string in Python 2.4 and up") < 0)
6189 return -1;
6190 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006192 prec = 1;
6193
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006194 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006195 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
6196 */
6197 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006198 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006199 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006200 return -1;
6201 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006202
6203 if ((flags & F_ALT) &&
6204 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006205 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006206 * of issues that cause pain:
6207 * - when 0 is being converted, the C standard leaves off
6208 * the '0x' or '0X', which is inconsistent with other
6209 * %#x/%#X conversions and inconsistent with Python's
6210 * hex() function
6211 * - there are platforms that violate the standard and
6212 * convert 0 with the '0x' or '0X'
6213 * (Metrowerks, Compaq Tru64)
6214 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006215 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006216 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006217 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006218 * We can achieve the desired consistency by inserting our
6219 * own '0x' or '0X' prefix, and substituting %x/%X in place
6220 * of %#x/%#X.
6221 *
6222 * Note that this is the same approach as used in
6223 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006224 */
Tim Petersced69f82003-09-16 20:30:58 +00006225 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006226 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006227 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006228 else {
6229 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
Tim Petersced69f82003-09-16 20:30:58 +00006230 (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006231 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006233 return usprintf(buf, fmt, x);
6234}
6235
6236static int
6237formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006238 size_t buflen,
6239 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006241 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006242 if (PyUnicode_Check(v)) {
6243 if (PyUnicode_GET_SIZE(v) != 1)
6244 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006246 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006248 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006249 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006250 goto onError;
6251 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6252 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253
6254 else {
6255 /* Integer input truncated to a character */
6256 long x;
6257 x = PyInt_AsLong(v);
6258 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006259 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006260#ifdef Py_UNICODE_WIDE
6261 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006262 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006263 "%c arg not in range(0x110000) "
6264 "(wide Python build)");
6265 return -1;
6266 }
6267#else
6268 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006269 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006270 "%c arg not in range(0x10000) "
6271 "(narrow Python build)");
6272 return -1;
6273 }
6274#endif
6275 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006276 }
6277 buf[1] = '\0';
6278 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006279
6280 onError:
6281 PyErr_SetString(PyExc_TypeError,
6282 "%c requires int or char");
6283 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006284}
6285
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the scratch buffer in which floats,
   ints and chars are formatted.  XXX The value is a magic number.  Each
   formatting routine checks its bounds so that the buffer cannot
   overflow, but allocating a buffer of the appropriate size for each
   format may be a better solution.  For now, the current solution is
   sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
6295
Guido van Rossumd57fd912000-03-10 22:53:23 +00006296PyObject *PyUnicode_Format(PyObject *format,
6297 PyObject *args)
6298{
6299 Py_UNICODE *fmt, *res;
6300 int fmtcnt, rescnt, reslen, arglen, argidx;
6301 int args_owned = 0;
6302 PyUnicodeObject *result = NULL;
6303 PyObject *dict = NULL;
6304 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006305
Guido van Rossumd57fd912000-03-10 22:53:23 +00006306 if (format == NULL || args == NULL) {
6307 PyErr_BadInternalCall();
6308 return NULL;
6309 }
6310 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006311 if (uformat == NULL)
6312 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006313 fmt = PyUnicode_AS_UNICODE(uformat);
6314 fmtcnt = PyUnicode_GET_SIZE(uformat);
6315
6316 reslen = rescnt = fmtcnt + 100;
6317 result = _PyUnicode_New(reslen);
6318 if (result == NULL)
6319 goto onError;
6320 res = PyUnicode_AS_UNICODE(result);
6321
6322 if (PyTuple_Check(args)) {
6323 arglen = PyTuple_Size(args);
6324 argidx = 0;
6325 }
6326 else {
6327 arglen = -1;
6328 argidx = -2;
6329 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006330 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6331 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332 dict = args;
6333
6334 while (--fmtcnt >= 0) {
6335 if (*fmt != '%') {
6336 if (--rescnt < 0) {
6337 rescnt = fmtcnt + 100;
6338 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006339 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006340 return NULL;
6341 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6342 --rescnt;
6343 }
6344 *res++ = *fmt++;
6345 }
6346 else {
6347 /* Got a format specifier */
6348 int flags = 0;
6349 int width = -1;
6350 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006351 Py_UNICODE c = '\0';
6352 Py_UNICODE fill;
6353 PyObject *v = NULL;
6354 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006355 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006356 Py_UNICODE sign;
6357 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006358 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359
6360 fmt++;
6361 if (*fmt == '(') {
6362 Py_UNICODE *keystart;
6363 int keylen;
6364 PyObject *key;
6365 int pcount = 1;
6366
6367 if (dict == NULL) {
6368 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006369 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006370 goto onError;
6371 }
6372 ++fmt;
6373 --fmtcnt;
6374 keystart = fmt;
6375 /* Skip over balanced parentheses */
6376 while (pcount > 0 && --fmtcnt >= 0) {
6377 if (*fmt == ')')
6378 --pcount;
6379 else if (*fmt == '(')
6380 ++pcount;
6381 fmt++;
6382 }
6383 keylen = fmt - keystart - 1;
6384 if (fmtcnt < 0 || pcount > 0) {
6385 PyErr_SetString(PyExc_ValueError,
6386 "incomplete format key");
6387 goto onError;
6388 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006389#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006390 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 then looked up since Python uses strings to hold
6392 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006393 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006394 key = PyUnicode_EncodeUTF8(keystart,
6395 keylen,
6396 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006397#else
6398 key = PyUnicode_FromUnicode(keystart, keylen);
6399#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006400 if (key == NULL)
6401 goto onError;
6402 if (args_owned) {
6403 Py_DECREF(args);
6404 args_owned = 0;
6405 }
6406 args = PyObject_GetItem(dict, key);
6407 Py_DECREF(key);
6408 if (args == NULL) {
6409 goto onError;
6410 }
6411 args_owned = 1;
6412 arglen = -1;
6413 argidx = -2;
6414 }
6415 while (--fmtcnt >= 0) {
6416 switch (c = *fmt++) {
6417 case '-': flags |= F_LJUST; continue;
6418 case '+': flags |= F_SIGN; continue;
6419 case ' ': flags |= F_BLANK; continue;
6420 case '#': flags |= F_ALT; continue;
6421 case '0': flags |= F_ZERO; continue;
6422 }
6423 break;
6424 }
6425 if (c == '*') {
6426 v = getnextarg(args, arglen, &argidx);
6427 if (v == NULL)
6428 goto onError;
6429 if (!PyInt_Check(v)) {
6430 PyErr_SetString(PyExc_TypeError,
6431 "* wants int");
6432 goto onError;
6433 }
6434 width = PyInt_AsLong(v);
6435 if (width < 0) {
6436 flags |= F_LJUST;
6437 width = -width;
6438 }
6439 if (--fmtcnt >= 0)
6440 c = *fmt++;
6441 }
6442 else if (c >= '0' && c <= '9') {
6443 width = c - '0';
6444 while (--fmtcnt >= 0) {
6445 c = *fmt++;
6446 if (c < '0' || c > '9')
6447 break;
6448 if ((width*10) / 10 != width) {
6449 PyErr_SetString(PyExc_ValueError,
6450 "width too big");
6451 goto onError;
6452 }
6453 width = width*10 + (c - '0');
6454 }
6455 }
6456 if (c == '.') {
6457 prec = 0;
6458 if (--fmtcnt >= 0)
6459 c = *fmt++;
6460 if (c == '*') {
6461 v = getnextarg(args, arglen, &argidx);
6462 if (v == NULL)
6463 goto onError;
6464 if (!PyInt_Check(v)) {
6465 PyErr_SetString(PyExc_TypeError,
6466 "* wants int");
6467 goto onError;
6468 }
6469 prec = PyInt_AsLong(v);
6470 if (prec < 0)
6471 prec = 0;
6472 if (--fmtcnt >= 0)
6473 c = *fmt++;
6474 }
6475 else if (c >= '0' && c <= '9') {
6476 prec = c - '0';
6477 while (--fmtcnt >= 0) {
6478 c = Py_CHARMASK(*fmt++);
6479 if (c < '0' || c > '9')
6480 break;
6481 if ((prec*10) / 10 != prec) {
6482 PyErr_SetString(PyExc_ValueError,
6483 "prec too big");
6484 goto onError;
6485 }
6486 prec = prec*10 + (c - '0');
6487 }
6488 }
6489 } /* prec */
6490 if (fmtcnt >= 0) {
6491 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006492 if (--fmtcnt >= 0)
6493 c = *fmt++;
6494 }
6495 }
6496 if (fmtcnt < 0) {
6497 PyErr_SetString(PyExc_ValueError,
6498 "incomplete format");
6499 goto onError;
6500 }
6501 if (c != '%') {
6502 v = getnextarg(args, arglen, &argidx);
6503 if (v == NULL)
6504 goto onError;
6505 }
6506 sign = 0;
6507 fill = ' ';
6508 switch (c) {
6509
6510 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006511 pbuf = formatbuf;
6512 /* presume that buffer length is at least 1 */
6513 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006514 len = 1;
6515 break;
6516
6517 case 's':
6518 case 'r':
6519 if (PyUnicode_Check(v) && c == 's') {
6520 temp = v;
6521 Py_INCREF(temp);
6522 }
6523 else {
6524 PyObject *unicode;
6525 if (c == 's')
6526 temp = PyObject_Str(v);
6527 else
6528 temp = PyObject_Repr(v);
6529 if (temp == NULL)
6530 goto onError;
6531 if (!PyString_Check(temp)) {
6532 /* XXX Note: this should never happen, since
6533 PyObject_Repr() and PyObject_Str() assure
6534 this */
6535 Py_DECREF(temp);
6536 PyErr_SetString(PyExc_TypeError,
6537 "%s argument has non-string str()");
6538 goto onError;
6539 }
Fred Drakee4315f52000-05-09 19:53:39 +00006540 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006541 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006542 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 "strict");
6544 Py_DECREF(temp);
6545 temp = unicode;
6546 if (temp == NULL)
6547 goto onError;
6548 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006549 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550 len = PyUnicode_GET_SIZE(temp);
6551 if (prec >= 0 && len > prec)
6552 len = prec;
6553 break;
6554
6555 case 'i':
6556 case 'd':
6557 case 'u':
6558 case 'o':
6559 case 'x':
6560 case 'X':
6561 if (c == 'i')
6562 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006563 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006564 temp = formatlong(v, flags, prec, c);
6565 if (!temp)
6566 goto onError;
6567 pbuf = PyUnicode_AS_UNICODE(temp);
6568 len = PyUnicode_GET_SIZE(temp);
6569 /* unbounded ints can always produce
6570 a sign character! */
6571 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006573 else {
6574 pbuf = formatbuf;
6575 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6576 flags, prec, c, v);
6577 if (len < 0)
6578 goto onError;
6579 /* only d conversion is signed */
6580 sign = c == 'd';
6581 }
6582 if (flags & F_ZERO)
6583 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006584 break;
6585
6586 case 'e':
6587 case 'E':
6588 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006589 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00006590 case 'g':
6591 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006592 if (c == 'F')
6593 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006594 pbuf = formatbuf;
6595 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6596 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006597 if (len < 0)
6598 goto onError;
6599 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006600 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006601 fill = '0';
6602 break;
6603
6604 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006605 pbuf = formatbuf;
6606 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006607 if (len < 0)
6608 goto onError;
6609 break;
6610
6611 default:
6612 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006613 "unsupported format character '%c' (0x%x) "
6614 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00006615 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006616 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006617 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618 goto onError;
6619 }
6620 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006621 if (*pbuf == '-' || *pbuf == '+') {
6622 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623 len--;
6624 }
6625 else if (flags & F_SIGN)
6626 sign = '+';
6627 else if (flags & F_BLANK)
6628 sign = ' ';
6629 else
6630 sign = 0;
6631 }
6632 if (width < len)
6633 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006634 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635 reslen -= rescnt;
6636 rescnt = width + fmtcnt + 100;
6637 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006638 if (reslen < 0) {
6639 Py_DECREF(result);
6640 return PyErr_NoMemory();
6641 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006642 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643 return NULL;
6644 res = PyUnicode_AS_UNICODE(result)
6645 + reslen - rescnt;
6646 }
6647 if (sign) {
6648 if (fill != ' ')
6649 *res++ = sign;
6650 rescnt--;
6651 if (width > len)
6652 width--;
6653 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006654 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6655 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006656 assert(pbuf[1] == c);
6657 if (fill != ' ') {
6658 *res++ = *pbuf++;
6659 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00006660 }
Tim Petersfff53252001-04-12 18:38:48 +00006661 rescnt -= 2;
6662 width -= 2;
6663 if (width < 0)
6664 width = 0;
6665 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00006666 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006667 if (width > len && !(flags & F_LJUST)) {
6668 do {
6669 --rescnt;
6670 *res++ = fill;
6671 } while (--width > len);
6672 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006673 if (fill == ' ') {
6674 if (sign)
6675 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00006676 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006677 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006678 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00006679 *res++ = *pbuf++;
6680 *res++ = *pbuf++;
6681 }
6682 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006683 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684 res += len;
6685 rescnt -= len;
6686 while (--width >= len) {
6687 --rescnt;
6688 *res++ = ' ';
6689 }
6690 if (dict && (argidx < arglen) && c != '%') {
6691 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006692 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693 goto onError;
6694 }
6695 Py_XDECREF(temp);
6696 } /* '%' */
6697 } /* until end */
6698 if (argidx < arglen && !dict) {
6699 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006700 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 goto onError;
6702 }
6703
6704 if (args_owned) {
6705 Py_DECREF(args);
6706 }
6707 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00006708 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006709 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006710 return (PyObject *)result;
6711
6712 onError:
6713 Py_XDECREF(result);
6714 Py_DECREF(uformat);
6715 if (args_owned) {
6716 Py_DECREF(args);
6717 }
6718 return NULL;
6719}
6720
6721static PyBufferProcs unicode_as_buffer = {
6722 (getreadbufferproc) unicode_buffer_getreadbuf,
6723 (getwritebufferproc) unicode_buffer_getwritebuf,
6724 (getsegcountproc) unicode_buffer_getsegcount,
6725 (getcharbufferproc) unicode_buffer_getcharbuf,
6726};
6727
Jeremy Hylton938ace62002-07-17 16:30:39 +00006728static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00006729unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6730
Tim Peters6d6c1a32001-08-02 04:15:00 +00006731static PyObject *
6732unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6733{
6734 PyObject *x = NULL;
6735 static char *kwlist[] = {"string", "encoding", "errors", 0};
6736 char *encoding = NULL;
6737 char *errors = NULL;
6738
Guido van Rossume023fe02001-08-30 03:12:59 +00006739 if (type != &PyUnicode_Type)
6740 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00006741 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6742 kwlist, &x, &encoding, &errors))
6743 return NULL;
6744 if (x == NULL)
6745 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00006746 if (encoding == NULL && errors == NULL)
6747 return PyObject_Unicode(x);
6748 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00006749 return PyUnicode_FromEncodedObject(x, encoding, errors);
6750}
6751
Guido van Rossume023fe02001-08-30 03:12:59 +00006752static PyObject *
6753unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6754{
Tim Petersaf90b3e2001-09-12 05:18:58 +00006755 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006756 int n;
6757
6758 assert(PyType_IsSubtype(type, &PyUnicode_Type));
6759 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
6760 if (tmp == NULL)
6761 return NULL;
6762 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00006763 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006764 if (pnew == NULL) {
6765 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00006766 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00006767 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006768 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
6769 if (pnew->str == NULL) {
6770 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006771 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00006772 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00006773 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00006774 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006775 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
6776 pnew->length = n;
6777 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00006778 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00006779 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006780}
6781
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006782PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00006783"unicode(string [, encoding[, errors]]) -> object\n\
6784\n\
6785Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00006786encoding defaults to the current default string encoding.\n\
6787errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00006788
Guido van Rossumd57fd912000-03-10 22:53:23 +00006789PyTypeObject PyUnicode_Type = {
6790 PyObject_HEAD_INIT(&PyType_Type)
6791 0, /* ob_size */
6792 "unicode", /* tp_name */
6793 sizeof(PyUnicodeObject), /* tp_size */
6794 0, /* tp_itemsize */
6795 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00006796 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006797 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006798 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006799 0, /* tp_setattr */
6800 (cmpfunc) unicode_compare, /* tp_compare */
6801 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006802 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006803 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006804 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006805 (hashfunc) unicode_hash, /* tp_hash*/
6806 0, /* tp_call*/
6807 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006808 PyObject_GenericGetAttr, /* tp_getattro */
6809 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006810 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006811 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
6812 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006813 unicode_doc, /* tp_doc */
6814 0, /* tp_traverse */
6815 0, /* tp_clear */
6816 0, /* tp_richcompare */
6817 0, /* tp_weaklistoffset */
6818 0, /* tp_iter */
6819 0, /* tp_iternext */
6820 unicode_methods, /* tp_methods */
6821 0, /* tp_members */
6822 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00006823 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006824 0, /* tp_dict */
6825 0, /* tp_descr_get */
6826 0, /* tp_descr_set */
6827 0, /* tp_dictoffset */
6828 0, /* tp_init */
6829 0, /* tp_alloc */
6830 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006831 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006832};
6833
6834/* Initialize the Unicode implementation */
6835
Thomas Wouters78890102000-07-22 19:25:51 +00006836void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006837{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006838 int i;
6839
Fred Drakee4315f52000-05-09 19:53:39 +00006840 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006841 unicode_freelist = NULL;
6842 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006843 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00006844 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006845 for (i = 0; i < 256; i++)
6846 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00006847 if (PyType_Ready(&PyUnicode_Type) < 0)
6848 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006849}
6850
6851/* Finalize the Unicode implementation */
6852
6853void
Thomas Wouters78890102000-07-22 19:25:51 +00006854_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006855{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006856 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006857 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006858
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00006859 Py_XDECREF(unicode_empty);
6860 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006861
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006862 for (i = 0; i < 256; i++) {
6863 if (unicode_latin1[i]) {
6864 Py_DECREF(unicode_latin1[i]);
6865 unicode_latin1[i] = NULL;
6866 }
6867 }
6868
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006869 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006870 PyUnicodeObject *v = u;
6871 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006872 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00006873 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006874 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006875 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006876 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006877 unicode_freelist = NULL;
6878 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006879}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006880
6881/*
6882Local variables:
6883c-basic-offset: 4
6884indent-tabs-mode: nil
6885End:
6886*/