blob: 00ccea8d4c5110257edaed4ace73ac22c704097e [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Nicholas Bastin1ce9e4c2004-06-17 18:27:18 +000044#ifdef __SUNPRO_C
45#pragma error_messages (off,E_END_OF_LOOP_CODE_NOT_REACHED)
46#endif
47
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000048#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000049#include <windows.h>
50#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000051
Guido van Rossumd57fd912000-03-10 22:53:23 +000052/* Limit for the Unicode object free list */
53
54#define MAX_UNICODE_FREELIST_SIZE 1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
Tim Petersced69f82003-09-16 20:30:58 +000060 limit. This reduces malloc() overhead for small Unicode objects.
Guido van Rossumd57fd912000-03-10 22:53:23 +000061
Barry Warsaw51ac5802000-03-20 16:36:48 +000062 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000063 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000064 malloc()-overhead) bytes of unused garbage.
65
66 Setting the limit to 0 effectively turns the feature off.
67
Guido van Rossumfd4b9572000-04-10 13:51:10 +000068 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71*/
72
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000074
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000083/* --- Globals ------------------------------------------------------------
84
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
87
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Guido van Rossumd57fd912000-03-10 22:53:23 +000090/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000091static PyUnicodeObject *unicode_freelist;
92static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000093
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000094/* The empty Unicode object is shared to improve performance. */
95static PyUnicodeObject *unicode_empty;
96
97/* Single character Unicode strings in the Latin-1 range are being
98 shared as well. */
99static PyUnicodeObject *unicode_latin1[256];
100
Fred Drakee4315f52000-05-09 19:53:39 +0000101/* Default encoding to use and assume when NULL is passed as encoding
102 parameter; it is initialized by _PyUnicode_Init().
103
104 Always use the PyUnicode_SetDefaultEncoding() and
Tim Petersced69f82003-09-16 20:30:58 +0000105 PyUnicode_GetDefaultEncoding() APIs to access this global.
Fred Drakee4315f52000-05-09 19:53:39 +0000106
107*/
Fred Drakee4315f52000-05-09 19:53:39 +0000108static char unicode_default_encoding[100];
109
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000111PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000112{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000113#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000114 return 0x10FFFF;
115#else
116 /* This is actually an illegal character, so it should
117 not be passed to unichr. */
118 return 0xFFFF;
119#endif
120}
121
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122/* --- Unicode Object ----------------------------------------------------- */
123
124static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000125int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000126 int length)
127{
128 void *oldstr;
Tim Petersced69f82003-09-16 20:30:58 +0000129
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000130 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000131 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000132 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000133
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000134 /* Resizing shared object (unicode_empty or single character
135 objects) in-place is not allowed. Use PyUnicode_Resize()
136 instead ! */
Tim Petersced69f82003-09-16 20:30:58 +0000137 if (unicode == unicode_empty ||
138 (unicode->length == 1 &&
139 /* MvL said unicode->str[] may be signed. Python generally assumes
140 * an int contains at least 32 bits, and we don't use more than
141 * 32 bits even in a UCS4 build, so casting to unsigned int should
142 * be correct.
143 */
144 (unsigned int)unicode->str[0] < 256U &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000145 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000146 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000147 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000148 return -1;
149 }
150
151 /* We allocate one more byte to make sure the string is
152 Ux0000 terminated -- XXX is this needed ? */
153 oldstr = unicode->str;
154 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
155 if (!unicode->str) {
156 unicode->str = oldstr;
157 PyErr_NoMemory();
158 return -1;
159 }
160 unicode->str[length] = 0;
161 unicode->length = length;
162
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000163 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000164 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000165 if (unicode->defenc) {
166 Py_DECREF(unicode->defenc);
167 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000168 }
169 unicode->hash = -1;
Tim Petersced69f82003-09-16 20:30:58 +0000170
Guido van Rossumd57fd912000-03-10 22:53:23 +0000171 return 0;
172}
173
174/* We allocate one more byte to make sure the string is
Tim Petersced69f82003-09-16 20:30:58 +0000175 Ux0000 terminated -- XXX is this needed ?
Guido van Rossumd57fd912000-03-10 22:53:23 +0000176
177 XXX This allocator could further be enhanced by assuring that the
178 free list never reduces its size below 1.
179
180*/
181
182static
183PyUnicodeObject *_PyUnicode_New(int length)
184{
185 register PyUnicodeObject *unicode;
186
Tim Petersced69f82003-09-16 20:30:58 +0000187 /* Optimization fo empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 if (length == 0 && unicode_empty != NULL) {
189 Py_INCREF(unicode_empty);
190 return unicode_empty;
191 }
192
193 /* Unicode freelist & memory allocation */
194 if (unicode_freelist) {
195 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000196 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000197 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000198 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000199 /* Keep-Alive optimization: we only upsize the buffer,
200 never downsize it. */
Tim Petersced69f82003-09-16 20:30:58 +0000201 if ((unicode->length < length) &&
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000202 unicode_resize(unicode, length) < 0) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000203 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000204 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 }
206 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000207 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000209 }
210 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 }
212 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000213 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 if (unicode == NULL)
215 return NULL;
216 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
217 }
218
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000219 if (!unicode->str) {
220 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000221 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000222 }
Jeremy Hyltond8082792003-09-16 19:41:39 +0000223 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +0000224 * the caller fails before initializing str -- unicode_resize()
225 * reads str[0], and the Keep-Alive optimization can keep memory
226 * allocated for str alive across a call to unicode_dealloc(unicode).
227 * We don't want unicode_resize to read uninitialized memory in
228 * that case.
229 */
Jeremy Hyltond8082792003-09-16 19:41:39 +0000230 unicode->str[0] = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 unicode->str[length] = 0;
232 unicode->length = length;
233 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000234 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000235 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236
237 onError:
238 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000239 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000240 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241}
242
243static
Guido van Rossum9475a232001-10-05 20:51:39 +0000244void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000246 if (PyUnicode_CheckExact(unicode) &&
247 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000248 /* Keep-Alive optimization */
249 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000250 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000251 unicode->str = NULL;
252 unicode->length = 0;
253 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000254 if (unicode->defenc) {
255 Py_DECREF(unicode->defenc);
256 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000257 }
258 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000259 *(PyUnicodeObject **)unicode = unicode_freelist;
260 unicode_freelist = unicode;
261 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 }
263 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000264 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000265 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000266 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 }
268}
269
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000270int PyUnicode_Resize(PyObject **unicode, int length)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000271{
272 register PyUnicodeObject *v;
273
274 /* Argument checks */
275 if (unicode == NULL) {
276 PyErr_BadInternalCall();
277 return -1;
278 }
279 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000280 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000281 PyErr_BadInternalCall();
282 return -1;
283 }
284
285 /* Resizing unicode_empty and single character objects is not
286 possible since these are being shared. We simply return a fresh
287 copy with the same Unicode content. */
Tim Petersced69f82003-09-16 20:30:58 +0000288 if (v->length != length &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000289 (v == unicode_empty || v->length == 1)) {
290 PyUnicodeObject *w = _PyUnicode_New(length);
291 if (w == NULL)
292 return -1;
293 Py_UNICODE_COPY(w->str, v->str,
294 length < v->length ? length : v->length);
Raymond Hettingerc8df5782003-03-09 07:30:43 +0000295 Py_DECREF(*unicode);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000296 *unicode = (PyObject *)w;
297 return 0;
298 }
299
300 /* Note that we don't have to modify *unicode for unshared Unicode
301 objects, since we can modify them in-place. */
302 return unicode_resize(v, length);
303}
304
/* Internal API for use in unicodeobject.c only !
   Forwards to PyUnicode_Resize() after casting the address of the
   caller's object variable to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
308
Guido van Rossumd57fd912000-03-10 22:53:23 +0000309PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
310 int size)
311{
312 PyUnicodeObject *unicode;
313
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000314 /* If the Unicode data is known at construction time, we can apply
315 some optimizations which share commonly used objects. */
316 if (u != NULL) {
317
318 /* Optimization for empty strings */
319 if (size == 0 && unicode_empty != NULL) {
320 Py_INCREF(unicode_empty);
321 return (PyObject *)unicode_empty;
322 }
323
324 /* Single character Unicode objects in the Latin-1 range are
325 shared when using this constructor */
326 if (size == 1 && *u < 256) {
327 unicode = unicode_latin1[*u];
328 if (!unicode) {
329 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000330 if (!unicode)
331 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000332 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000333 unicode_latin1[*u] = unicode;
334 }
335 Py_INCREF(unicode);
336 return (PyObject *)unicode;
337 }
338 }
Tim Petersced69f82003-09-16 20:30:58 +0000339
Guido van Rossumd57fd912000-03-10 22:53:23 +0000340 unicode = _PyUnicode_New(size);
341 if (!unicode)
342 return NULL;
343
344 /* Copy the Unicode data into the new object */
345 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000346 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000347
348 return (PyObject *)unicode;
349}
350
351#ifdef HAVE_WCHAR_H
352
353PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
354 int size)
355{
356 PyUnicodeObject *unicode;
357
358 if (w == NULL) {
359 PyErr_BadInternalCall();
360 return NULL;
361 }
362
363 unicode = _PyUnicode_New(size);
364 if (!unicode)
365 return NULL;
366
367 /* Copy the wchar_t data into the new object */
368#ifdef HAVE_USABLE_WCHAR_T
369 memcpy(unicode->str, w, size * sizeof(wchar_t));
Tim Petersced69f82003-09-16 20:30:58 +0000370#else
Guido van Rossumd57fd912000-03-10 22:53:23 +0000371 {
372 register Py_UNICODE *u;
373 register int i;
374 u = PyUnicode_AS_UNICODE(unicode);
375 for (i = size; i >= 0; i--)
376 *u++ = *w++;
377 }
378#endif
379
380 return (PyObject *)unicode;
381}
382
383int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
384 register wchar_t *w,
385 int size)
386{
387 if (unicode == NULL) {
388 PyErr_BadInternalCall();
389 return -1;
390 }
391 if (size > PyUnicode_GET_SIZE(unicode))
392 size = PyUnicode_GET_SIZE(unicode);
393#ifdef HAVE_USABLE_WCHAR_T
394 memcpy(w, unicode->str, size * sizeof(wchar_t));
395#else
396 {
397 register Py_UNICODE *u;
398 register int i;
399 u = PyUnicode_AS_UNICODE(unicode);
400 for (i = size; i >= 0; i--)
401 *w++ = *u++;
402 }
403#endif
404
405 return size;
406}
407
408#endif
409
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000410PyObject *PyUnicode_FromOrdinal(int ordinal)
411{
Hye-Shik Chang40574832004-04-06 07:24:51 +0000412 Py_UNICODE s[1];
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000413
414#ifdef Py_UNICODE_WIDE
415 if (ordinal < 0 || ordinal > 0x10ffff) {
416 PyErr_SetString(PyExc_ValueError,
417 "unichr() arg not in range(0x110000) "
418 "(wide Python build)");
419 return NULL;
420 }
421#else
422 if (ordinal < 0 || ordinal > 0xffff) {
423 PyErr_SetString(PyExc_ValueError,
424 "unichr() arg not in range(0x10000) "
425 "(narrow Python build)");
426 return NULL;
427 }
428#endif
429
Hye-Shik Chang40574832004-04-06 07:24:51 +0000430 s[0] = (Py_UNICODE)ordinal;
431 return PyUnicode_FromUnicode(s, 1);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000432}
433
Guido van Rossumd57fd912000-03-10 22:53:23 +0000434PyObject *PyUnicode_FromObject(register PyObject *obj)
435{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000436 /* XXX Perhaps we should make this API an alias of
437 PyObject_Unicode() instead ?! */
438 if (PyUnicode_CheckExact(obj)) {
439 Py_INCREF(obj);
440 return obj;
441 }
442 if (PyUnicode_Check(obj)) {
443 /* For a Unicode subtype that's not a Unicode object,
444 return a true Unicode object with the same data. */
445 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
446 PyUnicode_GET_SIZE(obj));
447 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000448 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
449}
450
451PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
452 const char *encoding,
453 const char *errors)
454{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000455 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000457 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000458
Guido van Rossumd57fd912000-03-10 22:53:23 +0000459 if (obj == NULL) {
460 PyErr_BadInternalCall();
461 return NULL;
462 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000463
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000464#if 0
465 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000466 that no encodings is given and then redirect to
467 PyObject_Unicode() which then applies the additional logic for
468 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000469
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000470 NOTE: This API should really only be used for object which
471 represent *encoded* Unicode !
472
473 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000474 if (PyUnicode_Check(obj)) {
475 if (encoding) {
476 PyErr_SetString(PyExc_TypeError,
477 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000478 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000479 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000480 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000481 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000482#else
483 if (PyUnicode_Check(obj)) {
484 PyErr_SetString(PyExc_TypeError,
485 "decoding Unicode is not supported");
486 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000487 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000488#endif
489
490 /* Coerce object */
491 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000492 s = PyString_AS_STRING(obj);
493 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000494 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000495 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
496 /* Overwrite the error message with something more useful in
497 case of a TypeError. */
498 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000499 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000500 "coercing to Unicode: need string or buffer, "
501 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000502 obj->ob_type->tp_name);
503 goto onError;
504 }
Tim Petersced69f82003-09-16 20:30:58 +0000505
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000506 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000507 if (len == 0) {
508 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000509 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510 }
Tim Petersced69f82003-09-16 20:30:58 +0000511 else
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000513
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000514 return v;
515
516 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000517 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000518}
519
520PyObject *PyUnicode_Decode(const char *s,
521 int size,
522 const char *encoding,
523 const char *errors)
524{
525 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000526
527 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000528 encoding = PyUnicode_GetDefaultEncoding();
529
530 /* Shortcuts for common default encodings */
531 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000532 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000533 else if (strcmp(encoding, "latin-1") == 0)
534 return PyUnicode_DecodeLatin1(s, size, errors);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000535#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
536 else if (strcmp(encoding, "mbcs") == 0)
537 return PyUnicode_DecodeMBCS(s, size, errors);
538#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000539 else if (strcmp(encoding, "ascii") == 0)
540 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000541
542 /* Decode via the codec registry */
543 buffer = PyBuffer_FromMemory((void *)s, size);
544 if (buffer == NULL)
545 goto onError;
546 unicode = PyCodec_Decode(buffer, encoding, errors);
547 if (unicode == NULL)
548 goto onError;
549 if (!PyUnicode_Check(unicode)) {
550 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000551 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000552 unicode->ob_type->tp_name);
553 Py_DECREF(unicode);
554 goto onError;
555 }
556 Py_DECREF(buffer);
557 return unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000558
Guido van Rossumd57fd912000-03-10 22:53:23 +0000559 onError:
560 Py_XDECREF(buffer);
561 return NULL;
562}
563
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000564PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
565 const char *encoding,
566 const char *errors)
567{
568 PyObject *v;
569
570 if (!PyUnicode_Check(unicode)) {
571 PyErr_BadArgument();
572 goto onError;
573 }
574
575 if (encoding == NULL)
576 encoding = PyUnicode_GetDefaultEncoding();
577
578 /* Decode via the codec registry */
579 v = PyCodec_Decode(unicode, encoding, errors);
580 if (v == NULL)
581 goto onError;
582 return v;
583
584 onError:
585 return NULL;
586}
587
Guido van Rossumd57fd912000-03-10 22:53:23 +0000588PyObject *PyUnicode_Encode(const Py_UNICODE *s,
589 int size,
590 const char *encoding,
591 const char *errors)
592{
593 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +0000594
Guido van Rossumd57fd912000-03-10 22:53:23 +0000595 unicode = PyUnicode_FromUnicode(s, size);
596 if (unicode == NULL)
597 return NULL;
598 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
599 Py_DECREF(unicode);
600 return v;
601}
602
Marc-André Lemburgd2d45982004-07-08 17:57:32 +0000603PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
604 const char *encoding,
605 const char *errors)
606{
607 PyObject *v;
608
609 if (!PyUnicode_Check(unicode)) {
610 PyErr_BadArgument();
611 goto onError;
612 }
613
614 if (encoding == NULL)
615 encoding = PyUnicode_GetDefaultEncoding();
616
617 /* Encode via the codec registry */
618 v = PyCodec_Encode(unicode, encoding, errors);
619 if (v == NULL)
620 goto onError;
621 return v;
622
623 onError:
624 return NULL;
625}
626
Guido van Rossumd57fd912000-03-10 22:53:23 +0000627PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
628 const char *encoding,
629 const char *errors)
630{
631 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000632
Guido van Rossumd57fd912000-03-10 22:53:23 +0000633 if (!PyUnicode_Check(unicode)) {
634 PyErr_BadArgument();
635 goto onError;
636 }
Fred Drakee4315f52000-05-09 19:53:39 +0000637
Tim Petersced69f82003-09-16 20:30:58 +0000638 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000639 encoding = PyUnicode_GetDefaultEncoding();
640
641 /* Shortcuts for common default encodings */
642 if (errors == NULL) {
643 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000644 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000645 else if (strcmp(encoding, "latin-1") == 0)
646 return PyUnicode_AsLatin1String(unicode);
Mark Hammond0ccda1e2003-07-01 00:13:27 +0000647#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
648 else if (strcmp(encoding, "mbcs") == 0)
649 return PyUnicode_AsMBCSString(unicode);
650#endif
Fred Drakee4315f52000-05-09 19:53:39 +0000651 else if (strcmp(encoding, "ascii") == 0)
652 return PyUnicode_AsASCIIString(unicode);
653 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000654
655 /* Encode via the codec registry */
656 v = PyCodec_Encode(unicode, encoding, errors);
657 if (v == NULL)
658 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000659 if (!PyString_Check(v)) {
660 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000661 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000662 v->ob_type->tp_name);
663 Py_DECREF(v);
664 goto onError;
665 }
666 return v;
Tim Petersced69f82003-09-16 20:30:58 +0000667
Guido van Rossumd57fd912000-03-10 22:53:23 +0000668 onError:
669 return NULL;
670}
671
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000672PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
673 const char *errors)
674{
675 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
676
677 if (v)
678 return v;
679 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
680 if (v && errors == NULL)
681 ((PyUnicodeObject *)unicode)->defenc = v;
682 return v;
683}
684
Guido van Rossumd57fd912000-03-10 22:53:23 +0000685Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
686{
687 if (!PyUnicode_Check(unicode)) {
688 PyErr_BadArgument();
689 goto onError;
690 }
691 return PyUnicode_AS_UNICODE(unicode);
692
693 onError:
694 return NULL;
695}
696
697int PyUnicode_GetSize(PyObject *unicode)
698{
699 if (!PyUnicode_Check(unicode)) {
700 PyErr_BadArgument();
701 goto onError;
702 }
703 return PyUnicode_GET_SIZE(unicode);
704
705 onError:
706 return -1;
707}
708
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +0000709int PyUnicode_GetWidth(PyObject *unicode)
710{
711 const Py_UNICODE *p, *e;
712 int width;
713
714 if (!PyUnicode_Check(unicode)) {
715 PyErr_BadArgument();
716 return -1;
717 }
718
719 p = PyUnicode_AS_UNICODE(unicode);
720 e = p + PyUnicode_GET_SIZE(unicode);
721 for (width = 0; p < e; p++)
722 if (Py_UNICODE_ISWIDE(*p))
723 width += 2;
724 else
725 width++;
726
727 return width;
728}
729
Thomas Wouters78890102000-07-22 19:25:51 +0000730const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000731{
732 return unicode_default_encoding;
733}
734
735int PyUnicode_SetDefaultEncoding(const char *encoding)
736{
737 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +0000738
Fred Drakee4315f52000-05-09 19:53:39 +0000739 /* Make sure the encoding is valid. As side effect, this also
740 loads the encoding into the codec registry cache. */
741 v = _PyCodec_Lookup(encoding);
742 if (v == NULL)
743 goto onError;
744 Py_DECREF(v);
745 strncpy(unicode_default_encoding,
Tim Petersced69f82003-09-16 20:30:58 +0000746 encoding,
Fred Drakee4315f52000-05-09 19:53:39 +0000747 sizeof(unicode_default_encoding));
748 return 0;
749
750 onError:
751 return -1;
752}
753
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000754/* error handling callback helper:
755 build arguments, call the callback and check the arguments,
757 if no exception occurred, copy the replacement to the output
757 and adjust various state variables.
758 return 0 on success, -1 on error
759*/
760
761static
762int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
763 const char *encoding, const char *reason,
764 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
765 PyObject **output, int *outpos, Py_UNICODE **outptr)
766{
767 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
768
769 PyObject *restuple = NULL;
770 PyObject *repunicode = NULL;
771 int outsize = PyUnicode_GET_SIZE(*output);
772 int requiredsize;
773 int newpos;
774 Py_UNICODE *repptr;
775 int repsize;
776 int res = -1;
777
778 if (*errorHandler == NULL) {
779 *errorHandler = PyCodec_LookupError(errors);
780 if (*errorHandler == NULL)
781 goto onError;
782 }
783
784 if (*exceptionObject == NULL) {
785 *exceptionObject = PyUnicodeDecodeError_Create(
786 encoding, input, insize, *startinpos, *endinpos, reason);
787 if (*exceptionObject == NULL)
788 goto onError;
789 }
790 else {
791 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
792 goto onError;
793 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
794 goto onError;
795 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
796 goto onError;
797 }
798
799 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
800 if (restuple == NULL)
801 goto onError;
802 if (!PyTuple_Check(restuple)) {
803 PyErr_Format(PyExc_TypeError, &argparse[4]);
804 goto onError;
805 }
806 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
807 goto onError;
808 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000809 newpos = insize+newpos;
810 if (newpos<0 || newpos>insize) {
811 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
812 goto onError;
813 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000814
815 /* need more space? (at least enough for what we
816 have+the replacement+the rest of the string (starting
817 at the new input position), so we won't have to check space
818 when there are no errors in the rest of the string) */
819 repptr = PyUnicode_AS_UNICODE(repunicode);
820 repsize = PyUnicode_GET_SIZE(repunicode);
821 requiredsize = *outpos + repsize + insize-newpos;
822 if (requiredsize > outsize) {
823 if (requiredsize<2*outsize)
824 requiredsize = 2*outsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +0000825 if (PyUnicode_Resize(output, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000826 goto onError;
827 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
828 }
829 *endinpos = newpos;
830 *inptr = input + newpos;
831 Py_UNICODE_COPY(*outptr, repptr, repsize);
832 *outptr += repsize;
833 *outpos += repsize;
834 /* we made it! */
835 res = 0;
836
837 onError:
838 Py_XDECREF(restuple);
839 return res;
840}
841
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000842/* --- UTF-7 Codec -------------------------------------------------------- */
843
844/* see RFC2152 for details */
845
/* Classification of the 128 ASCII code points for the UTF-7 codec,
   i.e. whether a character can be written directly or must be encoded:

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
*/
static char utf7_special[128] = {
    /* 0x00 - 0x0f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1f */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2f */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6f */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7f */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
864
/* True if character c has to be base64-encoded: always when it is
   non-ASCII or classified 1 in utf7_special, and additionally when it is
   whitespace (class 2) and encodeWS is set, or in Set O (class 3) and
   encodeO is set.  The table is only indexed when c <= 127. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c) > 127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Base64 digit for the low six bits of n. */
#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 digits.  Plain ASCII range tests are
   used instead of isalnum(): the result must not depend on the current
   locale, and isalnum() is undefined for arguments that are neither EOF
   nor representable as unsigned char (c may be a wide character here). */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* Numeric value (0..63) of base64 digit c; c must satisfy B64CHAR(c). */
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 digits through `out` for as long as at least six bits are
   pending in `ch`; `bits` is the number of pending bits and is updated. */
#define ENCODE(out, ch, bits) \
    while ((bits) >= 6) { \
        *(out)++ = B64((ch) >> ((bits)-6)); \
        (bits) -= 6; \
    }

/* Emit 16-bit characters through `out` for as long as at least sixteen
   bits are pending in `ch`.  Expects `errmsg` and a `utf7Error` label to
   be in scope at the point of use.
   NOTE(review): the range tested below, 0xDC00-0xDFFF, is the low
   surrogate range although the comments speak of the high surrogate --
   confirm that this is intended. */
#define DECODE(out, ch, bits, surrogate) \
    while ((bits) >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
        (bits) -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            (surrogate) = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            (surrogate) = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *(out)++ = outCh; \
        } \
    }
899
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000900PyObject *PyUnicode_DecodeUTF7(const char *s,
901 int size,
902 const char *errors)
903{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000904 const char *starts = s;
905 int startinpos;
906 int endinpos;
907 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000908 const char *e;
909 PyUnicodeObject *unicode;
910 Py_UNICODE *p;
911 const char *errmsg = "";
912 int inShift = 0;
913 unsigned int bitsleft = 0;
914 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000915 int surrogate = 0;
916 PyObject *errorHandler = NULL;
917 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000918
919 unicode = _PyUnicode_New(size);
920 if (!unicode)
921 return NULL;
922 if (size == 0)
923 return (PyObject *)unicode;
924
925 p = unicode->str;
926 e = s + size;
927
928 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000929 Py_UNICODE ch;
930 restart:
931 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000932
933 if (inShift) {
934 if ((ch == '-') || !B64CHAR(ch)) {
935 inShift = 0;
936 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000937
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000938 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
939 if (bitsleft >= 6) {
940 /* The shift sequence has a partial character in it. If
941 bitsleft < 6 then we could just classify it as padding
942 but that is not the case here */
943
944 errmsg = "partial character in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000945 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000946 }
947 /* According to RFC2152 the remaining bits should be zero. We
Tim Petersced69f82003-09-16 20:30:58 +0000948 choose to signal an error/insert a replacement character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000949 here so indicate the potential of a misencoded character. */
950
951 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
952 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
953 errmsg = "non-zero padding bits in shift sequence";
Tim Petersced69f82003-09-16 20:30:58 +0000954 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000955 }
956
957 if (ch == '-') {
958 if ((s < e) && (*(s) == '-')) {
Tim Petersced69f82003-09-16 20:30:58 +0000959 *p++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000960 inShift = 1;
961 }
962 } else if (SPECIAL(ch,0,0)) {
963 errmsg = "unexpected special character";
Tim Petersced69f82003-09-16 20:30:58 +0000964 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000965 } else {
966 *p++ = ch;
967 }
968 } else {
969 charsleft = (charsleft << 6) | UB64(ch);
970 bitsleft += 6;
971 s++;
972 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
973 }
974 }
975 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000976 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000977 s++;
978 if (s < e && *s == '-') {
979 s++;
980 *p++ = '+';
981 } else
982 {
983 inShift = 1;
984 bitsleft = 0;
985 }
986 }
987 else if (SPECIAL(ch,0,0)) {
988 errmsg = "unexpected special character";
989 s++;
Tim Petersced69f82003-09-16 20:30:58 +0000990 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000991 }
992 else {
993 *p++ = ch;
994 s++;
995 }
996 continue;
997 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000998 outpos = p-PyUnicode_AS_UNICODE(unicode);
999 endinpos = s-starts;
1000 if (unicode_decode_call_errorhandler(
1001 errors, &errorHandler,
1002 "utf7", errmsg,
1003 starts, size, &startinpos, &endinpos, &exc, &s,
1004 (PyObject **)&unicode, &outpos, &p))
1005 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001006 }
1007
1008 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001009 outpos = p-PyUnicode_AS_UNICODE(unicode);
1010 endinpos = size;
1011 if (unicode_decode_call_errorhandler(
1012 errors, &errorHandler,
1013 "utf7", "unterminated shift sequence",
1014 starts, size, &startinpos, &endinpos, &exc, &s,
1015 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001016 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001017 if (s < e)
1018 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001019 }
1020
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001021 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001022 goto onError;
1023
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001024 Py_XDECREF(errorHandler);
1025 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001026 return (PyObject *)unicode;
1027
1028onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001029 Py_XDECREF(errorHandler);
1030 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001031 Py_DECREF(unicode);
1032 return NULL;
1033}
1034
1035
1036PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1037 int size,
1038 int encodeSetO,
1039 int encodeWhiteSpace,
1040 const char *errors)
1041{
1042 PyObject *v;
1043 /* It might be possible to tighten this worst case */
1044 unsigned int cbAllocated = 5 * size;
1045 int inShift = 0;
1046 int i = 0;
1047 unsigned int bitsleft = 0;
1048 unsigned long charsleft = 0;
1049 char * out;
1050 char * start;
1051
1052 if (size == 0)
1053 return PyString_FromStringAndSize(NULL, 0);
1054
1055 v = PyString_FromStringAndSize(NULL, cbAllocated);
1056 if (v == NULL)
1057 return NULL;
1058
1059 start = out = PyString_AS_STRING(v);
1060 for (;i < size; ++i) {
1061 Py_UNICODE ch = s[i];
1062
1063 if (!inShift) {
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001064 if (ch == '+') {
1065 *out++ = '+';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001066 *out++ = '-';
1067 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1068 charsleft = ch;
1069 bitsleft = 16;
1070 *out++ = '+';
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001071 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001072 inShift = bitsleft > 0;
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001073 } else {
1074 *out++ = (char) ch;
1075 }
1076 } else {
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001077 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1078 *out++ = B64(charsleft << (6-bitsleft));
1079 charsleft = 0;
1080 bitsleft = 0;
1081 /* Characters not in the BASE64 set implicitly unshift the sequence
1082 so no '-' is required, except if the character is itself a '-' */
1083 if (B64CHAR(ch) || ch == '-') {
1084 *out++ = '-';
1085 }
1086 inShift = 0;
1087 *out++ = (char) ch;
1088 } else {
1089 bitsleft += 16;
1090 charsleft = (charsleft << 16) | ch;
1091 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1092
1093 /* If the next character is special then we dont' need to terminate
Tim Petersced69f82003-09-16 20:30:58 +00001094 the shift sequence. If the next character is not a BASE64 character
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001095 or '-' then the shift sequence will be terminated implicitly and we
1096 don't have to insert a '-'. */
1097
1098 if (bitsleft == 0) {
1099 if (i + 1 < size) {
1100 Py_UNICODE ch2 = s[i+1];
1101
1102 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
Tim Petersced69f82003-09-16 20:30:58 +00001103
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001104 } else if (B64CHAR(ch2) || ch2 == '-') {
1105 *out++ = '-';
1106 inShift = 0;
1107 } else {
1108 inShift = 0;
1109 }
1110
1111 }
1112 else {
1113 *out++ = '-';
1114 inShift = 0;
1115 }
1116 }
Tim Petersced69f82003-09-16 20:30:58 +00001117 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001118 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00001119 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001120 if (bitsleft) {
1121 *out++= B64(charsleft << (6-bitsleft) );
1122 *out++ = '-';
1123 }
1124
Tim Peters5de98422002-04-27 18:44:32 +00001125 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001126 return v;
1127}
1128
1129#undef SPECIAL
1130#undef B64
1131#undef B64CHAR
1132#undef UB64
1133#undef ENCODE
1134#undef DECODE
1135
Guido van Rossumd57fd912000-03-10 22:53:23 +00001136/* --- UTF-8 Codec -------------------------------------------------------- */
1137
/* Map a UTF-8 encoded prefix (lead) byte to the length of the sequence
   it introduces.  Zero means illegal prefix.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: not valid as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four, five and six bytes; 0xfe and 0xff are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
1159
Guido van Rossumd57fd912000-03-10 22:53:23 +00001160PyObject *PyUnicode_DecodeUTF8(const char *s,
1161 int size,
1162 const char *errors)
1163{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001164 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001166 int startinpos;
1167 int endinpos;
1168 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169 const char *e;
1170 PyUnicodeObject *unicode;
1171 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001172 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001173 PyObject *errorHandler = NULL;
1174 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001175
1176 /* Note: size will always be longer than the resulting Unicode
1177 character count */
1178 unicode = _PyUnicode_New(size);
1179 if (!unicode)
1180 return NULL;
1181 if (size == 0)
1182 return (PyObject *)unicode;
1183
1184 /* Unpack UTF-8 encoded data */
1185 p = unicode->str;
1186 e = s + size;
1187
1188 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001189 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190
1191 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001192 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001193 s++;
1194 continue;
1195 }
1196
1197 n = utf8_code_length[ch];
1198
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001199 if (s + n > e) {
1200 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001201 startinpos = s-starts;
1202 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001203 goto utf8Error;
1204 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205
1206 switch (n) {
1207
1208 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001209 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001210 startinpos = s-starts;
1211 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001212 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213
1214 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001215 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001216 startinpos = s-starts;
1217 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001218 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219
1220 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001221 if ((s[1] & 0xc0) != 0x80) {
1222 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001223 startinpos = s-starts;
1224 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001225 goto utf8Error;
1226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001227 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001228 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001229 startinpos = s-starts;
1230 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001231 errmsg = "illegal encoding";
1232 goto utf8Error;
1233 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001235 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236 break;
1237
1238 case 3:
Tim Petersced69f82003-09-16 20:30:58 +00001239 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001240 (s[2] & 0xc0) != 0x80) {
1241 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001242 startinpos = s-starts;
1243 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001244 goto utf8Error;
1245 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001247 if (ch < 0x0800) {
1248 /* Note: UTF-8 encodings of surrogates are considered
Tim Petersced69f82003-09-16 20:30:58 +00001249 legal UTF-8 sequences;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001250
1251 XXX For wide builds (UCS-4) we should probably try
1252 to recombine the surrogates into a single code
1253 unit.
1254 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001255 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001256 startinpos = s-starts;
1257 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001258 goto utf8Error;
1259 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001260 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001261 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001262 break;
1263
1264 case 4:
1265 if ((s[1] & 0xc0) != 0x80 ||
1266 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001267 (s[3] & 0xc0) != 0x80) {
1268 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001269 startinpos = s-starts;
1270 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001271 goto utf8Error;
1272 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001273 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1274 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1275 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001276 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001277 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001278 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001279 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001280 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001281 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001282 startinpos = s-starts;
1283 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001284 goto utf8Error;
1285 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001286#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001287 *p++ = (Py_UNICODE)ch;
1288#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001289 /* compute and append the two surrogates: */
Tim Petersced69f82003-09-16 20:30:58 +00001290
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001291 /* translate from 10000..10FFFF to 0..FFFF */
1292 ch -= 0x10000;
Tim Petersced69f82003-09-16 20:30:58 +00001293
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001294 /* high surrogate = top 10 bits added to D800 */
1295 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
Tim Petersced69f82003-09-16 20:30:58 +00001296
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001297 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001298 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001299#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001300 break;
1301
1302 default:
1303 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001304 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001305 startinpos = s-starts;
1306 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001307 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 }
1309 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001310 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001311
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001312 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001313 outpos = p-PyUnicode_AS_UNICODE(unicode);
1314 if (unicode_decode_call_errorhandler(
1315 errors, &errorHandler,
1316 "utf8", errmsg,
1317 starts, size, &startinpos, &endinpos, &exc, &s,
1318 (PyObject **)&unicode, &outpos, &p))
1319 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001320 }
1321
1322 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001323 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001324 goto onError;
1325
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001326 Py_XDECREF(errorHandler);
1327 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001328 return (PyObject *)unicode;
1329
1330onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001331 Py_XDECREF(errorHandler);
1332 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001333 Py_DECREF(unicode);
1334 return NULL;
1335}
1336
Tim Peters602f7402002-04-27 18:03:26 +00001337/* Allocation strategy: if the string is short, convert into a stack buffer
1338 and allocate exactly as much space needed at the end. Else allocate the
1339 maximum possible needed (4 result bytes per Unicode character), and return
1340 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001341*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001342PyObject *
1343PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1344 int size,
1345 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346{
Tim Peters602f7402002-04-27 18:03:26 +00001347#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001348
Tim Peters602f7402002-04-27 18:03:26 +00001349 int i; /* index into s of next input byte */
1350 PyObject *v; /* result string object */
1351 char *p; /* next free byte in output buffer */
1352 int nallocated; /* number of result bytes allocated */
1353 int nneeded; /* number of result bytes needed */
1354 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001355
Tim Peters602f7402002-04-27 18:03:26 +00001356 assert(s != NULL);
1357 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001358
Tim Peters602f7402002-04-27 18:03:26 +00001359 if (size <= MAX_SHORT_UNICHARS) {
1360 /* Write into the stack buffer; nallocated can't overflow.
1361 * At the end, we'll allocate exactly as much heap space as it
1362 * turns out we need.
1363 */
1364 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1365 v = NULL; /* will allocate after we're done */
1366 p = stackbuf;
1367 }
1368 else {
1369 /* Overallocate on the heap, and give the excess back at the end. */
1370 nallocated = size * 4;
1371 if (nallocated / 4 != size) /* overflow! */
1372 return PyErr_NoMemory();
1373 v = PyString_FromStringAndSize(NULL, nallocated);
1374 if (v == NULL)
1375 return NULL;
1376 p = PyString_AS_STRING(v);
1377 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001378
Tim Peters602f7402002-04-27 18:03:26 +00001379 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001380 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001381
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001382 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001383 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001384 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001385
Guido van Rossumd57fd912000-03-10 22:53:23 +00001386 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001387 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001388 *p++ = (char)(0xc0 | (ch >> 6));
1389 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001390 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001391 else {
Tim Peters602f7402002-04-27 18:03:26 +00001392 /* Encode UCS2 Unicode ordinals */
1393 if (ch < 0x10000) {
1394 /* Special case: check for high surrogate */
1395 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1396 Py_UCS4 ch2 = s[i];
1397 /* Check for low surrogate and combine the two to
1398 form a UCS4 value */
1399 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001400 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001401 i++;
1402 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001403 }
Tim Peters602f7402002-04-27 18:03:26 +00001404 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001405 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001406 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001407 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1408 *p++ = (char)(0x80 | (ch & 0x3f));
1409 continue;
1410 }
1411encodeUCS4:
1412 /* Encode UCS4 Unicode ordinals */
1413 *p++ = (char)(0xf0 | (ch >> 18));
1414 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1415 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1416 *p++ = (char)(0x80 | (ch & 0x3f));
1417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001418 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001419
Tim Peters602f7402002-04-27 18:03:26 +00001420 if (v == NULL) {
1421 /* This was stack allocated. */
1422 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1423 assert(nneeded <= nallocated);
1424 v = PyString_FromStringAndSize(stackbuf, nneeded);
1425 }
1426 else {
1427 /* Cut back to size actually needed. */
1428 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1429 assert(nneeded <= nallocated);
1430 _PyString_Resize(&v, nneeded);
1431 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001433
Tim Peters602f7402002-04-27 18:03:26 +00001434#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001435}
1436
Guido van Rossumd57fd912000-03-10 22:53:23 +00001437PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1438{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001439 if (!PyUnicode_Check(unicode)) {
1440 PyErr_BadArgument();
1441 return NULL;
1442 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001443 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1444 PyUnicode_GET_SIZE(unicode),
1445 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001446}
1447
1448/* --- UTF-16 Codec ------------------------------------------------------- */
1449
Tim Peters772747b2001-08-09 22:21:55 +00001450PyObject *
1451PyUnicode_DecodeUTF16(const char *s,
1452 int size,
1453 const char *errors,
1454 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001455{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001456 const char *starts = s;
1457 int startinpos;
1458 int endinpos;
1459 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001460 PyUnicodeObject *unicode;
1461 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001462 const unsigned char *q, *e;
1463 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001464 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001465 /* Offsets from q for retrieving byte pairs in the right order. */
1466#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1467 int ihi = 1, ilo = 0;
1468#else
1469 int ihi = 0, ilo = 1;
1470#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001471 PyObject *errorHandler = NULL;
1472 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001473
1474 /* Note: size will always be longer than the resulting Unicode
1475 character count */
1476 unicode = _PyUnicode_New(size);
1477 if (!unicode)
1478 return NULL;
1479 if (size == 0)
1480 return (PyObject *)unicode;
1481
1482 /* Unpack UTF-16 encoded data */
1483 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001484 q = (unsigned char *)s;
1485 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001486
1487 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001488 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001489
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001490 /* Check for BOM marks (U+FEFF) in the input and adjust current
1491 byte order setting accordingly. In native mode, the leading BOM
1492 mark is skipped, in all other modes, it is copied to the output
1493 stream as-is (giving a ZWNBSP character). */
1494 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001495 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001496#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001497 if (bom == 0xFEFF) {
1498 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001499 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001500 }
1501 else if (bom == 0xFFFE) {
1502 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001503 bo = 1;
1504 }
Tim Petersced69f82003-09-16 20:30:58 +00001505#else
Tim Peters772747b2001-08-09 22:21:55 +00001506 if (bom == 0xFEFF) {
1507 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001508 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001509 }
1510 else if (bom == 0xFFFE) {
1511 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001512 bo = -1;
1513 }
1514#endif
1515 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001516
Tim Peters772747b2001-08-09 22:21:55 +00001517 if (bo == -1) {
1518 /* force LE */
1519 ihi = 1;
1520 ilo = 0;
1521 }
1522 else if (bo == 1) {
1523 /* force BE */
1524 ihi = 0;
1525 ilo = 1;
1526 }
1527
1528 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001529 Py_UNICODE ch;
1530 /* remaing bytes at the end? (size should be even) */
1531 if (e-q<2) {
1532 errmsg = "truncated data";
1533 startinpos = ((const char *)q)-starts;
1534 endinpos = ((const char *)e)-starts;
1535 goto utf16Error;
1536 /* The remaining input chars are ignored if the callback
1537 chooses to skip the input */
1538 }
1539 ch = (q[ihi] << 8) | q[ilo];
1540
Tim Peters772747b2001-08-09 22:21:55 +00001541 q += 2;
1542
Guido van Rossumd57fd912000-03-10 22:53:23 +00001543 if (ch < 0xD800 || ch > 0xDFFF) {
1544 *p++ = ch;
1545 continue;
1546 }
1547
1548 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001549 if (q >= e) {
1550 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001551 startinpos = (((const char *)q)-2)-starts;
1552 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001553 goto utf16Error;
1554 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001555 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001556 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1557 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001558 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001559#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001560 *p++ = ch;
1561 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001562#else
1563 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001564#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001565 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001566 }
1567 else {
1568 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001569 startinpos = (((const char *)q)-4)-starts;
1570 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001571 goto utf16Error;
1572 }
1573
Guido van Rossumd57fd912000-03-10 22:53:23 +00001574 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001575 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001576 startinpos = (((const char *)q)-2)-starts;
1577 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001578 /* Fall through to report the error */
1579
1580 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001581 outpos = p-PyUnicode_AS_UNICODE(unicode);
1582 if (unicode_decode_call_errorhandler(
1583 errors, &errorHandler,
1584 "utf16", errmsg,
1585 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1586 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001587 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001588 }
1589
1590 if (byteorder)
1591 *byteorder = bo;
1592
1593 /* Adjust length */
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001594 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001595 goto onError;
1596
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001597 Py_XDECREF(errorHandler);
1598 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001599 return (PyObject *)unicode;
1600
1601onError:
1602 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001603 Py_XDECREF(errorHandler);
1604 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001605 return NULL;
1606}
1607
Tim Peters772747b2001-08-09 22:21:55 +00001608PyObject *
1609PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1610 int size,
1611 const char *errors,
1612 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001613{
1614 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001615 unsigned char *p;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001616#ifdef Py_UNICODE_WIDE
Tim Peters772747b2001-08-09 22:21:55 +00001617 int i, pairs;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001618#else
1619 const int pairs = 0;
1620#endif
Tim Peters772747b2001-08-09 22:21:55 +00001621 /* Offsets from p for storing byte pairs in the right order. */
1622#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1623 int ihi = 1, ilo = 0;
1624#else
1625 int ihi = 0, ilo = 1;
1626#endif
1627
1628#define STORECHAR(CH) \
1629 do { \
1630 p[ihi] = ((CH) >> 8) & 0xff; \
1631 p[ilo] = (CH) & 0xff; \
1632 p += 2; \
1633 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001634
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001635#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001636 for (i = pairs = 0; i < size; i++)
1637 if (s[i] >= 0x10000)
1638 pairs++;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001639#endif
Tim Petersced69f82003-09-16 20:30:58 +00001640 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001641 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001642 if (v == NULL)
1643 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001644
Tim Peters772747b2001-08-09 22:21:55 +00001645 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001646 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001647 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001648 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001649 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001650
1651 if (byteorder == -1) {
1652 /* force LE */
1653 ihi = 1;
1654 ilo = 0;
1655 }
1656 else if (byteorder == 1) {
1657 /* force BE */
1658 ihi = 0;
1659 ilo = 1;
1660 }
1661
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001662 while (size-- > 0) {
1663 Py_UNICODE ch = *s++;
1664 Py_UNICODE ch2 = 0;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001665#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001666 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001667 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1668 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 }
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00001670#endif
Tim Peters772747b2001-08-09 22:21:55 +00001671 STORECHAR(ch);
1672 if (ch2)
1673 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001674 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001675 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001676#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001677}
1678
1679PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1680{
1681 if (!PyUnicode_Check(unicode)) {
1682 PyErr_BadArgument();
1683 return NULL;
1684 }
1685 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1686 PyUnicode_GET_SIZE(unicode),
1687 NULL,
1688 0);
1689}
1690
1691/* --- Unicode Escape Codec ----------------------------------------------- */
1692
Fredrik Lundh06d12682001-01-24 07:59:11 +00001693static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001694
Guido van Rossumd57fd912000-03-10 22:53:23 +00001695PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1696 int size,
1697 const char *errors)
1698{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001699 const char *starts = s;
1700 int startinpos;
1701 int endinpos;
1702 int outpos;
1703 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001704 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001705 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001706 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001707 char* message;
1708 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001709 PyObject *errorHandler = NULL;
1710 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001711
Guido van Rossumd57fd912000-03-10 22:53:23 +00001712 /* Escaped strings will always be longer than the resulting
1713 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001714 length after conversion to the true value.
1715 (but if the error callback returns a long replacement string
1716 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001717 v = _PyUnicode_New(size);
1718 if (v == NULL)
1719 goto onError;
1720 if (size == 0)
1721 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001722
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001723 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001725
Guido van Rossumd57fd912000-03-10 22:53:23 +00001726 while (s < end) {
1727 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001728 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001729 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730
1731 /* Non-escape characters are interpreted as Unicode ordinals */
1732 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001733 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734 continue;
1735 }
1736
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001737 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001738 /* \ - Escapes */
1739 s++;
1740 switch (*s++) {
1741
1742 /* \x escapes */
1743 case '\n': break;
1744 case '\\': *p++ = '\\'; break;
1745 case '\'': *p++ = '\''; break;
1746 case '\"': *p++ = '\"'; break;
1747 case 'b': *p++ = '\b'; break;
1748 case 'f': *p++ = '\014'; break; /* FF */
1749 case 't': *p++ = '\t'; break;
1750 case 'n': *p++ = '\n'; break;
1751 case 'r': *p++ = '\r'; break;
1752 case 'v': *p++ = '\013'; break; /* VT */
1753 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1754
1755 /* \OOO (octal) escapes */
1756 case '0': case '1': case '2': case '3':
1757 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001758 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001760 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001761 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001762 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001763 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001764 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001765 break;
1766
Fredrik Lundhccc74732001-02-18 22:13:49 +00001767 /* hex escapes */
1768 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001769 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001770 digits = 2;
1771 message = "truncated \\xXX escape";
1772 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773
Fredrik Lundhccc74732001-02-18 22:13:49 +00001774 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001775 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001776 digits = 4;
1777 message = "truncated \\uXXXX escape";
1778 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001779
Fredrik Lundhccc74732001-02-18 22:13:49 +00001780 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001781 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001782 digits = 8;
1783 message = "truncated \\UXXXXXXXX escape";
1784 hexescape:
1785 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001786 outpos = p-PyUnicode_AS_UNICODE(v);
1787 if (s+digits>end) {
1788 endinpos = size;
1789 if (unicode_decode_call_errorhandler(
1790 errors, &errorHandler,
1791 "unicodeescape", "end of string in escape sequence",
1792 starts, size, &startinpos, &endinpos, &exc, &s,
1793 (PyObject **)&v, &outpos, &p))
1794 goto onError;
1795 goto nextByte;
1796 }
1797 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001798 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001799 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001800 endinpos = (s+i+1)-starts;
1801 if (unicode_decode_call_errorhandler(
1802 errors, &errorHandler,
1803 "unicodeescape", message,
1804 starts, size, &startinpos, &endinpos, &exc, &s,
1805 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001806 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001807 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001808 }
1809 chr = (chr<<4) & ~0xF;
1810 if (c >= '0' && c <= '9')
1811 chr += c - '0';
1812 else if (c >= 'a' && c <= 'f')
1813 chr += 10 + c - 'a';
1814 else
1815 chr += 10 + c - 'A';
1816 }
1817 s += i;
Jeremy Hylton504de6b2003-10-06 05:08:26 +00001818 if (chr == 0xffffffff && PyErr_Occurred())
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001819 /* _decoding_error will have already written into the
1820 target buffer. */
1821 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001822 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001823 /* when we get here, chr is a 32-bit unicode character */
1824 if (chr <= 0xffff)
1825 /* UCS-2 character */
1826 *p++ = (Py_UNICODE) chr;
1827 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001828 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001829 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001830#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001831 *p++ = chr;
1832#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001833 chr -= 0x10000L;
1834 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001835 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001836#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001837 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001838 endinpos = s-starts;
1839 outpos = p-PyUnicode_AS_UNICODE(v);
1840 if (unicode_decode_call_errorhandler(
1841 errors, &errorHandler,
1842 "unicodeescape", "illegal Unicode character",
1843 starts, size, &startinpos, &endinpos, &exc, &s,
1844 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001845 goto onError;
1846 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001847 break;
1848
1849 /* \N{name} */
1850 case 'N':
1851 message = "malformed \\N character escape";
1852 if (ucnhash_CAPI == NULL) {
1853 /* load the unicode data module */
1854 PyObject *m, *v;
1855 m = PyImport_ImportModule("unicodedata");
1856 if (m == NULL)
1857 goto ucnhashError;
1858 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1859 Py_DECREF(m);
1860 if (v == NULL)
1861 goto ucnhashError;
1862 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1863 Py_DECREF(v);
1864 if (ucnhash_CAPI == NULL)
1865 goto ucnhashError;
1866 }
1867 if (*s == '{') {
1868 const char *start = s+1;
1869 /* look for the closing brace */
1870 while (*s != '}' && s < end)
1871 s++;
1872 if (s > start && s < end && *s == '}') {
1873 /* found a name. look it up in the unicode database */
1874 message = "unknown Unicode character name";
1875 s++;
1876 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1877 goto store;
1878 }
1879 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001880 endinpos = s-starts;
1881 outpos = p-PyUnicode_AS_UNICODE(v);
1882 if (unicode_decode_call_errorhandler(
1883 errors, &errorHandler,
1884 "unicodeescape", message,
1885 starts, size, &startinpos, &endinpos, &exc, &s,
1886 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001887 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001888 break;
1889
1890 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001891 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001892 message = "\\ at end of string";
1893 s--;
1894 endinpos = s-starts;
1895 outpos = p-PyUnicode_AS_UNICODE(v);
1896 if (unicode_decode_call_errorhandler(
1897 errors, &errorHandler,
1898 "unicodeescape", message,
1899 starts, size, &startinpos, &endinpos, &exc, &s,
1900 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001901 goto onError;
1902 }
1903 else {
1904 *p++ = '\\';
1905 *p++ = (unsigned char)s[-1];
1906 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001907 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001908 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001909 nextByte:
1910 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001911 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00001912 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001913 goto onError;
Walter Dörwaldd4ade082003-08-15 15:00:26 +00001914 Py_XDECREF(errorHandler);
1915 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001916 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001917
Fredrik Lundhccc74732001-02-18 22:13:49 +00001918ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001919 PyErr_SetString(
1920 PyExc_UnicodeError,
1921 "\\N escapes not supported (can't load unicodedata module)"
1922 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001923 Py_XDECREF(errorHandler);
1924 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001925 return NULL;
1926
Fredrik Lundhccc74732001-02-18 22:13:49 +00001927onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001928 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001929 Py_XDECREF(errorHandler);
1930 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001931 return NULL;
1932}
1933
1934/* Return a Unicode-Escape string version of the Unicode object.
1935
1936 If quotes is true, the string is enclosed in u"" or u'' quotes as
1937 appropriate.
1938
1939*/
1940
Barry Warsaw51ac5802000-03-20 16:36:48 +00001941static const Py_UNICODE *findchar(const Py_UNICODE *s,
1942 int size,
1943 Py_UNICODE ch);
1944
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945static
1946PyObject *unicodeescape_string(const Py_UNICODE *s,
1947 int size,
1948 int quotes)
1949{
1950 PyObject *repr;
1951 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001952
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001953 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954
1955 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1956 if (repr == NULL)
1957 return NULL;
1958
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001959 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001960
1961 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001962 *p++ = 'u';
Tim Petersced69f82003-09-16 20:30:58 +00001963 *p++ = (findchar(s, size, '\'') &&
Guido van Rossumd57fd912000-03-10 22:53:23 +00001964 !findchar(s, size, '"')) ? '"' : '\'';
1965 }
1966 while (size-- > 0) {
1967 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001968
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969 /* Escape quotes */
Tim Petersced69f82003-09-16 20:30:58 +00001970 if (quotes &&
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001971 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972 *p++ = '\\';
1973 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001974 continue;
Tim Petersced69f82003-09-16 20:30:58 +00001975 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001976
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001977#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001978 /* Map 21-bit characters to '\U00xxxxxx' */
1979 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001980 int offset = p - PyString_AS_STRING(repr);
Tim Petersced69f82003-09-16 20:30:58 +00001981
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001982 /* Resize the string if necessary */
1983 if (offset + 12 > PyString_GET_SIZE(repr)) {
1984 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001985 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001986 p = PyString_AS_STRING(repr) + offset;
1987 }
1988
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001989 *p++ = '\\';
1990 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001991 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1992 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1993 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1994 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1995 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1996 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1997 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001998 *p++ = hexdigit[ch & 0x0000000F];
1999 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00002000 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00002001#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002002 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
2003 else if (ch >= 0xD800 && ch < 0xDC00) {
2004 Py_UNICODE ch2;
2005 Py_UCS4 ucs;
Tim Petersced69f82003-09-16 20:30:58 +00002006
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002007 ch2 = *s++;
2008 size--;
2009 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
2010 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
2011 *p++ = '\\';
2012 *p++ = 'U';
2013 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
2014 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
2015 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
2016 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
2017 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
2018 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
2019 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
2020 *p++ = hexdigit[ucs & 0x0000000F];
2021 continue;
2022 }
2023 /* Fall through: isolated surrogates are copied as-is */
2024 s--;
2025 size++;
2026 }
2027
Guido van Rossumd57fd912000-03-10 22:53:23 +00002028 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002029 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002030 *p++ = '\\';
2031 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002032 *p++ = hexdigit[(ch >> 12) & 0x000F];
2033 *p++ = hexdigit[(ch >> 8) & 0x000F];
2034 *p++ = hexdigit[(ch >> 4) & 0x000F];
2035 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002036 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002037
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002038 /* Map special whitespace to '\t', \n', '\r' */
2039 else if (ch == '\t') {
2040 *p++ = '\\';
2041 *p++ = 't';
2042 }
2043 else if (ch == '\n') {
2044 *p++ = '\\';
2045 *p++ = 'n';
2046 }
2047 else if (ch == '\r') {
2048 *p++ = '\\';
2049 *p++ = 'r';
2050 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002051
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002052 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00002053 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002055 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00002056 *p++ = hexdigit[(ch >> 4) & 0x000F];
2057 *p++ = hexdigit[ch & 0x000F];
Tim Petersced69f82003-09-16 20:30:58 +00002058 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002059
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 /* Copy everything else as-is */
2061 else
2062 *p++ = (char) ch;
2063 }
2064 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00002065 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00002066
2067 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002068 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 return repr;
2070}
2071
2072PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
2073 int size)
2074{
2075 return unicodeescape_string(s, size, 0);
2076}
2077
2078PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
2079{
2080 if (!PyUnicode_Check(unicode)) {
2081 PyErr_BadArgument();
2082 return NULL;
2083 }
2084 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2085 PyUnicode_GET_SIZE(unicode));
2086}
2087
2088/* --- Raw Unicode Escape Codec ------------------------------------------- */
2089
2090PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2091 int size,
2092 const char *errors)
2093{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002094 const char *starts = s;
2095 int startinpos;
2096 int endinpos;
2097 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002098 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002099 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100 const char *end;
2101 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002102 PyObject *errorHandler = NULL;
2103 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002104
Guido van Rossumd57fd912000-03-10 22:53:23 +00002105 /* Escaped strings will always be longer than the resulting
2106 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002107 length after conversion to the true value. (But decoding error
2108 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109 v = _PyUnicode_New(size);
2110 if (v == NULL)
2111 goto onError;
2112 if (size == 0)
2113 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002114 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002115 end = s + size;
2116 while (s < end) {
2117 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002118 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002119 int i;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002120 int count;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121
2122 /* Non-escape characters are interpreted as Unicode ordinals */
2123 if (*s != '\\') {
2124 *p++ = (unsigned char)*s++;
2125 continue;
2126 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002127 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002128
2129 /* \u-escapes are only interpreted iff the number of leading
2130 backslashes if odd */
2131 bs = s;
2132 for (;s < end;) {
2133 if (*s != '\\')
2134 break;
2135 *p++ = (unsigned char)*s++;
2136 }
2137 if (((s - bs) & 1) == 0 ||
2138 s >= end ||
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002139 (*s != 'u' && *s != 'U')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140 continue;
2141 }
2142 p--;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002143 count = *s=='u' ? 4 : 8;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 s++;
2145
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002146 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002147 outpos = p-PyUnicode_AS_UNICODE(v);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002148 for (x = 0, i = 0; i < count; ++i, ++s) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002149 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002150 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002151 endinpos = s-starts;
2152 if (unicode_decode_call_errorhandler(
2153 errors, &errorHandler,
2154 "rawunicodeescape", "truncated \\uXXXX",
2155 starts, size, &startinpos, &endinpos, &exc, &s,
2156 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002157 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002158 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159 }
2160 x = (x<<4) & ~0xF;
2161 if (c >= '0' && c <= '9')
2162 x += c - '0';
2163 else if (c >= 'a' && c <= 'f')
2164 x += 10 + c - 'a';
2165 else
2166 x += 10 + c - 'A';
2167 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002168#ifndef Py_UNICODE_WIDE
2169 if (x > 0x10000) {
2170 if (unicode_decode_call_errorhandler(
2171 errors, &errorHandler,
2172 "rawunicodeescape", "\\Uxxxxxxxx out of range",
2173 starts, size, &startinpos, &endinpos, &exc, &s,
2174 (PyObject **)&v, &outpos, &p))
2175 goto onError;
2176 }
2177#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002178 *p++ = x;
2179 nextByte:
2180 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002182 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002183 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002184 Py_XDECREF(errorHandler);
2185 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002187
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188 onError:
2189 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002190 Py_XDECREF(errorHandler);
2191 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002192 return NULL;
2193}
2194
2195PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2196 int size)
2197{
2198 PyObject *repr;
2199 char *p;
2200 char *q;
2201
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002202 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002204#ifdef Py_UNICODE_WIDE
2205 repr = PyString_FromStringAndSize(NULL, 10 * size);
2206#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 repr = PyString_FromStringAndSize(NULL, 6 * size);
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002208#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209 if (repr == NULL)
2210 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002211 if (size == 0)
2212 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213
2214 p = q = PyString_AS_STRING(repr);
2215 while (size-- > 0) {
2216 Py_UNICODE ch = *s++;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002217#ifdef Py_UNICODE_WIDE
2218 /* Map 32-bit characters to '\Uxxxxxxxx' */
2219 if (ch >= 0x10000) {
2220 *p++ = '\\';
2221 *p++ = 'U';
2222 *p++ = hexdigit[(ch >> 28) & 0xf];
2223 *p++ = hexdigit[(ch >> 24) & 0xf];
2224 *p++ = hexdigit[(ch >> 20) & 0xf];
2225 *p++ = hexdigit[(ch >> 16) & 0xf];
2226 *p++ = hexdigit[(ch >> 12) & 0xf];
2227 *p++ = hexdigit[(ch >> 8) & 0xf];
2228 *p++ = hexdigit[(ch >> 4) & 0xf];
2229 *p++ = hexdigit[ch & 15];
Tim Petersced69f82003-09-16 20:30:58 +00002230 }
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00002231 else
2232#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233 /* Map 16-bit characters to '\uxxxx' */
2234 if (ch >= 256) {
2235 *p++ = '\\';
2236 *p++ = 'u';
2237 *p++ = hexdigit[(ch >> 12) & 0xf];
2238 *p++ = hexdigit[(ch >> 8) & 0xf];
2239 *p++ = hexdigit[(ch >> 4) & 0xf];
2240 *p++ = hexdigit[ch & 15];
2241 }
2242 /* Copy everything else as-is */
2243 else
2244 *p++ = (char) ch;
2245 }
2246 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002247 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248 return repr;
2249}
2250
2251PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2252{
2253 if (!PyUnicode_Check(unicode)) {
2254 PyErr_BadArgument();
2255 return NULL;
2256 }
2257 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2258 PyUnicode_GET_SIZE(unicode));
2259}
2260
2261/* --- Latin-1 Codec ------------------------------------------------------ */
2262
2263PyObject *PyUnicode_DecodeLatin1(const char *s,
2264 int size,
2265 const char *errors)
2266{
2267 PyUnicodeObject *v;
2268 Py_UNICODE *p;
Tim Petersced69f82003-09-16 20:30:58 +00002269
Guido van Rossumd57fd912000-03-10 22:53:23 +00002270 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002271 if (size == 1) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002272 Py_UNICODE r = *(unsigned char*)s;
2273 return PyUnicode_FromUnicode(&r, 1);
2274 }
2275
Guido van Rossumd57fd912000-03-10 22:53:23 +00002276 v = _PyUnicode_New(size);
2277 if (v == NULL)
2278 goto onError;
2279 if (size == 0)
2280 return (PyObject *)v;
2281 p = PyUnicode_AS_UNICODE(v);
2282 while (size-- > 0)
2283 *p++ = (unsigned char)*s++;
2284 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002285
Guido van Rossumd57fd912000-03-10 22:53:23 +00002286 onError:
2287 Py_XDECREF(v);
2288 return NULL;
2289}
2290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002291/* create or adjust a UnicodeEncodeError */
2292static void make_encode_exception(PyObject **exceptionObject,
2293 const char *encoding,
2294 const Py_UNICODE *unicode, int size,
2295 int startpos, int endpos,
2296 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002297{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002298 if (*exceptionObject == NULL) {
2299 *exceptionObject = PyUnicodeEncodeError_Create(
2300 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301 }
2302 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002303 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2304 goto onError;
2305 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2306 goto onError;
2307 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2308 goto onError;
2309 return;
2310 onError:
2311 Py_DECREF(*exceptionObject);
2312 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002313 }
2314}
2315
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002316/* raises a UnicodeEncodeError */
2317static void raise_encode_exception(PyObject **exceptionObject,
2318 const char *encoding,
2319 const Py_UNICODE *unicode, int size,
2320 int startpos, int endpos,
2321 const char *reason)
2322{
2323 make_encode_exception(exceptionObject,
2324 encoding, unicode, size, startpos, endpos, reason);
2325 if (*exceptionObject != NULL)
2326 PyCodec_StrictErrors(*exceptionObject);
2327}
2328
2329/* error handling callback helper:
2330 build arguments, call the callback and check the arguments,
2331 put the result into newpos and return the replacement string, which
2332 has to be freed by the caller */
2333static PyObject *unicode_encode_call_errorhandler(const char *errors,
2334 PyObject **errorHandler,
2335 const char *encoding, const char *reason,
2336 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2337 int startpos, int endpos,
2338 int *newpos)
2339{
2340 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2341
2342 PyObject *restuple;
2343 PyObject *resunicode;
2344
2345 if (*errorHandler == NULL) {
2346 *errorHandler = PyCodec_LookupError(errors);
2347 if (*errorHandler == NULL)
2348 return NULL;
2349 }
2350
2351 make_encode_exception(exceptionObject,
2352 encoding, unicode, size, startpos, endpos, reason);
2353 if (*exceptionObject == NULL)
2354 return NULL;
2355
2356 restuple = PyObject_CallFunctionObjArgs(
2357 *errorHandler, *exceptionObject, NULL);
2358 if (restuple == NULL)
2359 return NULL;
2360 if (!PyTuple_Check(restuple)) {
2361 PyErr_Format(PyExc_TypeError, &argparse[4]);
2362 Py_DECREF(restuple);
2363 return NULL;
2364 }
2365 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2366 &resunicode, newpos)) {
2367 Py_DECREF(restuple);
2368 return NULL;
2369 }
2370 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002371 *newpos = size+*newpos;
2372 if (*newpos<0 || *newpos>size) {
2373 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2374 Py_DECREF(restuple);
2375 return NULL;
2376 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002377 Py_INCREF(resunicode);
2378 Py_DECREF(restuple);
2379 return resunicode;
2380}
2381
2382static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2383 int size,
2384 const char *errors,
2385 int limit)
2386{
2387 /* output object */
2388 PyObject *res;
2389 /* pointers to the beginning and end+1 of input */
2390 const Py_UNICODE *startp = p;
2391 const Py_UNICODE *endp = p + size;
2392 /* pointer to the beginning of the unencodable characters */
2393 /* const Py_UNICODE *badp = NULL; */
2394 /* pointer into the output */
2395 char *str;
2396 /* current output position */
2397 int respos = 0;
2398 int ressize;
2399 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2400 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2401 PyObject *errorHandler = NULL;
2402 PyObject *exc = NULL;
2403 /* the following variable is used for caching string comparisons
2404 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2405 int known_errorHandler = -1;
2406
2407 /* allocate enough for a simple encoding without
2408 replacements, if we need more, we'll resize */
2409 res = PyString_FromStringAndSize(NULL, size);
2410 if (res == NULL)
2411 goto onError;
2412 if (size == 0)
2413 return res;
2414 str = PyString_AS_STRING(res);
2415 ressize = size;
2416
2417 while (p<endp) {
2418 Py_UNICODE c = *p;
2419
2420 /* can we encode this? */
2421 if (c<limit) {
2422 /* no overflow check, because we know that the space is enough */
2423 *str++ = (char)c;
2424 ++p;
2425 }
2426 else {
2427 int unicodepos = p-startp;
2428 int requiredsize;
2429 PyObject *repunicode;
2430 int repsize;
2431 int newpos;
2432 int respos;
2433 Py_UNICODE *uni2;
2434 /* startpos for collecting unencodable chars */
2435 const Py_UNICODE *collstart = p;
2436 const Py_UNICODE *collend = p;
2437 /* find all unecodable characters */
2438 while ((collend < endp) && ((*collend)>=limit))
2439 ++collend;
2440 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2441 if (known_errorHandler==-1) {
2442 if ((errors==NULL) || (!strcmp(errors, "strict")))
2443 known_errorHandler = 1;
2444 else if (!strcmp(errors, "replace"))
2445 known_errorHandler = 2;
2446 else if (!strcmp(errors, "ignore"))
2447 known_errorHandler = 3;
2448 else if (!strcmp(errors, "xmlcharrefreplace"))
2449 known_errorHandler = 4;
2450 else
2451 known_errorHandler = 0;
2452 }
2453 switch (known_errorHandler) {
2454 case 1: /* strict */
2455 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2456 goto onError;
2457 case 2: /* replace */
2458 while (collstart++<collend)
2459 *str++ = '?'; /* fall through */
2460 case 3: /* ignore */
2461 p = collend;
2462 break;
2463 case 4: /* xmlcharrefreplace */
2464 respos = str-PyString_AS_STRING(res);
2465 /* determine replacement size (temporarily (mis)uses p) */
2466 for (p = collstart, repsize = 0; p < collend; ++p) {
2467 if (*p<10)
2468 repsize += 2+1+1;
2469 else if (*p<100)
2470 repsize += 2+2+1;
2471 else if (*p<1000)
2472 repsize += 2+3+1;
2473 else if (*p<10000)
2474 repsize += 2+4+1;
Hye-Shik Chang40e95092003-12-22 01:31:13 +00002475#ifndef Py_UNICODE_WIDE
2476 else
2477 repsize += 2+5+1;
2478#else
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002479 else if (*p<100000)
2480 repsize += 2+5+1;
2481 else if (*p<1000000)
2482 repsize += 2+6+1;
2483 else
2484 repsize += 2+7+1;
Hye-Shik Chang4a264fb2003-12-19 01:59:56 +00002485#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002486 }
2487 requiredsize = respos+repsize+(endp-collend);
2488 if (requiredsize > ressize) {
2489 if (requiredsize<2*ressize)
2490 requiredsize = 2*ressize;
2491 if (_PyString_Resize(&res, requiredsize))
2492 goto onError;
2493 str = PyString_AS_STRING(res) + respos;
2494 ressize = requiredsize;
2495 }
2496 /* generate replacement (temporarily (mis)uses p) */
2497 for (p = collstart; p < collend; ++p) {
2498 str += sprintf(str, "&#%d;", (int)*p);
2499 }
2500 p = collend;
2501 break;
2502 default:
2503 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2504 encoding, reason, startp, size, &exc,
2505 collstart-startp, collend-startp, &newpos);
2506 if (repunicode == NULL)
2507 goto onError;
2508 /* need more space? (at least enough for what we
2509 have+the replacement+the rest of the string, so
2510 we won't have to check space for encodable characters) */
2511 respos = str-PyString_AS_STRING(res);
2512 repsize = PyUnicode_GET_SIZE(repunicode);
2513 requiredsize = respos+repsize+(endp-collend);
2514 if (requiredsize > ressize) {
2515 if (requiredsize<2*ressize)
2516 requiredsize = 2*ressize;
2517 if (_PyString_Resize(&res, requiredsize)) {
2518 Py_DECREF(repunicode);
2519 goto onError;
2520 }
2521 str = PyString_AS_STRING(res) + respos;
2522 ressize = requiredsize;
2523 }
2524 /* check if there is anything unencodable in the replacement
2525 and copy it to the output */
2526 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2527 c = *uni2;
2528 if (c >= limit) {
2529 raise_encode_exception(&exc, encoding, startp, size,
2530 unicodepos, unicodepos+1, reason);
2531 Py_DECREF(repunicode);
2532 goto onError;
2533 }
2534 *str = (char)c;
2535 }
2536 p = startp + newpos;
2537 Py_DECREF(repunicode);
2538 }
2539 }
2540 }
2541 /* Resize if we allocated to much */
2542 respos = str-PyString_AS_STRING(res);
2543 if (respos<ressize)
2544 /* If this falls res will be NULL */
2545 _PyString_Resize(&res, respos);
2546 Py_XDECREF(errorHandler);
2547 Py_XDECREF(exc);
2548 return res;
2549
2550 onError:
2551 Py_XDECREF(res);
2552 Py_XDECREF(errorHandler);
2553 Py_XDECREF(exc);
2554 return NULL;
2555}
2556
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2558 int size,
2559 const char *errors)
2560{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002561 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002562}
2563
2564PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2565{
2566 if (!PyUnicode_Check(unicode)) {
2567 PyErr_BadArgument();
2568 return NULL;
2569 }
2570 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2571 PyUnicode_GET_SIZE(unicode),
2572 NULL);
2573}
2574
2575/* --- 7-bit ASCII Codec -------------------------------------------------- */
2576
Guido van Rossumd57fd912000-03-10 22:53:23 +00002577PyObject *PyUnicode_DecodeASCII(const char *s,
2578 int size,
2579 const char *errors)
2580{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002581 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582 PyUnicodeObject *v;
2583 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002584 int startinpos;
2585 int endinpos;
2586 int outpos;
2587 const char *e;
2588 PyObject *errorHandler = NULL;
2589 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002590
Guido van Rossumd57fd912000-03-10 22:53:23 +00002591 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002592 if (size == 1 && *(unsigned char*)s < 128) {
2593 Py_UNICODE r = *(unsigned char*)s;
2594 return PyUnicode_FromUnicode(&r, 1);
2595 }
Tim Petersced69f82003-09-16 20:30:58 +00002596
Guido van Rossumd57fd912000-03-10 22:53:23 +00002597 v = _PyUnicode_New(size);
2598 if (v == NULL)
2599 goto onError;
2600 if (size == 0)
2601 return (PyObject *)v;
2602 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002603 e = s + size;
2604 while (s < e) {
2605 register unsigned char c = (unsigned char)*s;
2606 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002607 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002608 ++s;
2609 }
2610 else {
2611 startinpos = s-starts;
2612 endinpos = startinpos + 1;
Jeremy Hyltond8082792003-09-16 19:41:39 +00002613 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002614 if (unicode_decode_call_errorhandler(
2615 errors, &errorHandler,
2616 "ascii", "ordinal not in range(128)",
2617 starts, size, &startinpos, &endinpos, &exc, &s,
2618 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002619 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002620 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002621 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002622 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002623 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002624 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002625 Py_XDECREF(errorHandler);
2626 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002627 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002628
Guido van Rossumd57fd912000-03-10 22:53:23 +00002629 onError:
2630 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002631 Py_XDECREF(errorHandler);
2632 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002633 return NULL;
2634}
2635
Guido van Rossumd57fd912000-03-10 22:53:23 +00002636PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2637 int size,
2638 const char *errors)
2639{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002640 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641}
2642
2643PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2644{
2645 if (!PyUnicode_Check(unicode)) {
2646 PyErr_BadArgument();
2647 return NULL;
2648 }
2649 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2650 PyUnicode_GET_SIZE(unicode),
2651 NULL);
2652}
2653
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002654#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002655
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002656/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002657
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002658PyObject *PyUnicode_DecodeMBCS(const char *s,
2659 int size,
2660 const char *errors)
2661{
2662 PyUnicodeObject *v;
2663 Py_UNICODE *p;
2664
2665 /* First get the size of the result */
2666 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002667 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002668 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2669
2670 v = _PyUnicode_New(usize);
2671 if (v == NULL)
2672 return NULL;
2673 if (usize == 0)
2674 return (PyObject *)v;
2675 p = PyUnicode_AS_UNICODE(v);
2676 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2677 Py_DECREF(v);
2678 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2679 }
2680
2681 return (PyObject *)v;
2682}
2683
2684PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2685 int size,
2686 const char *errors)
2687{
2688 PyObject *repr;
2689 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002690 DWORD mbcssize;
2691
2692 /* If there are no characters, bail now! */
2693 if (size==0)
2694 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002695
2696 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002697 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002698 if (mbcssize==0)
2699 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2700
2701 repr = PyString_FromStringAndSize(NULL, mbcssize);
2702 if (repr == NULL)
2703 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002704 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002705 return repr;
2706
2707 /* Do the conversion */
2708 s = PyString_AS_STRING(repr);
2709 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2710 Py_DECREF(repr);
2711 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2712 }
2713 return repr;
2714}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002715
Mark Hammond0ccda1e2003-07-01 00:13:27 +00002716PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
2717{
2718 if (!PyUnicode_Check(unicode)) {
2719 PyErr_BadArgument();
2720 return NULL;
2721 }
2722 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2723 PyUnicode_GET_SIZE(unicode),
2724 NULL);
2725}
2726
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002727#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002728
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729/* --- Character Mapping Codec -------------------------------------------- */
2730
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731PyObject *PyUnicode_DecodeCharmap(const char *s,
2732 int size,
2733 PyObject *mapping,
2734 const char *errors)
2735{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002736 const char *starts = s;
2737 int startinpos;
2738 int endinpos;
2739 int outpos;
2740 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 PyUnicodeObject *v;
2742 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002743 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002744 PyObject *errorHandler = NULL;
2745 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00002746
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747 /* Default to Latin-1 */
2748 if (mapping == NULL)
2749 return PyUnicode_DecodeLatin1(s, size, errors);
2750
2751 v = _PyUnicode_New(size);
2752 if (v == NULL)
2753 goto onError;
2754 if (size == 0)
2755 return (PyObject *)v;
2756 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002757 e = s + size;
2758 while (s < e) {
2759 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002760 PyObject *w, *x;
2761
2762 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2763 w = PyInt_FromLong((long)ch);
2764 if (w == NULL)
2765 goto onError;
2766 x = PyObject_GetItem(mapping, w);
2767 Py_DECREF(w);
2768 if (x == NULL) {
2769 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002770 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002771 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002772 x = Py_None;
2773 Py_INCREF(x);
2774 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002775 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776 }
2777
2778 /* Apply mapping */
2779 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002780 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781 if (value < 0 || value > 65535) {
2782 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002783 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 Py_DECREF(x);
2785 goto onError;
2786 }
2787 *p++ = (Py_UNICODE)value;
2788 }
2789 else if (x == Py_None) {
2790 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002791 outpos = p-PyUnicode_AS_UNICODE(v);
2792 startinpos = s-starts;
2793 endinpos = startinpos+1;
2794 if (unicode_decode_call_errorhandler(
2795 errors, &errorHandler,
2796 "charmap", "character maps to <undefined>",
2797 starts, size, &startinpos, &endinpos, &exc, &s,
2798 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002799 Py_DECREF(x);
2800 goto onError;
2801 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002802 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 }
2804 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002805 int targetsize = PyUnicode_GET_SIZE(x);
2806
2807 if (targetsize == 1)
2808 /* 1-1 mapping */
2809 *p++ = *PyUnicode_AS_UNICODE(x);
2810
2811 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002813 if (targetsize > extrachars) {
2814 /* resize first */
2815 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2816 int needed = (targetsize - extrachars) + \
2817 (targetsize << 2);
2818 extrachars += needed;
Tim Petersced69f82003-09-16 20:30:58 +00002819 if (_PyUnicode_Resize(&v,
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002820 PyUnicode_GET_SIZE(v) + needed) < 0) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002821 Py_DECREF(x);
2822 goto onError;
2823 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002824 p = PyUnicode_AS_UNICODE(v) + oldpos;
2825 }
2826 Py_UNICODE_COPY(p,
2827 PyUnicode_AS_UNICODE(x),
2828 targetsize);
2829 p += targetsize;
2830 extrachars -= targetsize;
2831 }
2832 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002833 }
2834 else {
2835 /* wrong return value */
2836 PyErr_SetString(PyExc_TypeError,
2837 "character mapping must return integer, None or unicode");
2838 Py_DECREF(x);
2839 goto onError;
2840 }
2841 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002842 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 }
2844 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00002845 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002846 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002847 Py_XDECREF(errorHandler);
2848 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849 return (PyObject *)v;
Tim Petersced69f82003-09-16 20:30:58 +00002850
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002852 Py_XDECREF(errorHandler);
2853 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 Py_XDECREF(v);
2855 return NULL;
2856}
2857
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002858/* Lookup the character ch in the mapping. If the character
2859 can't be found, Py_None is returned (or NULL, if another
2860 error occured). */
2861static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002862{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002863 PyObject *w = PyInt_FromLong((long)c);
2864 PyObject *x;
2865
2866 if (w == NULL)
2867 return NULL;
2868 x = PyObject_GetItem(mapping, w);
2869 Py_DECREF(w);
2870 if (x == NULL) {
2871 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2872 /* No mapping found means: mapping is undefined. */
2873 PyErr_Clear();
2874 x = Py_None;
2875 Py_INCREF(x);
2876 return x;
2877 } else
2878 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002879 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002880 else if (x == Py_None)
2881 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002882 else if (PyInt_Check(x)) {
2883 long value = PyInt_AS_LONG(x);
2884 if (value < 0 || value > 255) {
2885 PyErr_SetString(PyExc_TypeError,
2886 "character mapping must be in range(256)");
2887 Py_DECREF(x);
2888 return NULL;
2889 }
2890 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002892 else if (PyString_Check(x))
2893 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002894 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002895 /* wrong return value */
2896 PyErr_SetString(PyExc_TypeError,
2897 "character mapping must return integer, None or str");
2898 Py_DECREF(x);
2899 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002900 }
2901}
2902
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002903/* lookup the character, put the result in the output string and adjust
2904 various state variables. Reallocate the output string if not enough
2905 space is available. Return a new reference to the object that
2906 was put in the output buffer, or Py_None, if the mapping was undefined
2907 (in which case no character was written) or NULL, if a
2908 reallocation error ocurred. The called must decref the result */
2909static
2910PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2911 PyObject **outobj, int *outpos)
2912{
2913 PyObject *rep = charmapencode_lookup(c, mapping);
2914
2915 if (rep==NULL)
2916 return NULL;
2917 else if (rep==Py_None)
2918 return rep;
2919 else {
2920 char *outstart = PyString_AS_STRING(*outobj);
2921 int outsize = PyString_GET_SIZE(*outobj);
2922 if (PyInt_Check(rep)) {
2923 int requiredsize = *outpos+1;
2924 if (outsize<requiredsize) {
2925 /* exponentially overallocate to minimize reallocations */
2926 if (requiredsize < 2*outsize)
2927 requiredsize = 2*outsize;
2928 if (_PyString_Resize(outobj, requiredsize)) {
2929 Py_DECREF(rep);
2930 return NULL;
2931 }
2932 outstart = PyString_AS_STRING(*outobj);
2933 }
2934 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2935 }
2936 else {
2937 const char *repchars = PyString_AS_STRING(rep);
2938 int repsize = PyString_GET_SIZE(rep);
2939 int requiredsize = *outpos+repsize;
2940 if (outsize<requiredsize) {
2941 /* exponentially overallocate to minimize reallocations */
2942 if (requiredsize < 2*outsize)
2943 requiredsize = 2*outsize;
2944 if (_PyString_Resize(outobj, requiredsize)) {
2945 Py_DECREF(rep);
2946 return NULL;
2947 }
2948 outstart = PyString_AS_STRING(*outobj);
2949 }
2950 memcpy(outstart + *outpos, repchars, repsize);
2951 *outpos += repsize;
2952 }
2953 }
2954 return rep;
2955}
2956
2957/* handle an error in PyUnicode_EncodeCharmap
2958 Return 0 on success, -1 on error */
2959static
2960int charmap_encoding_error(
2961 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2962 PyObject **exceptionObject,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00002963 int *known_errorHandler, PyObject **errorHandler, const char *errors,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002964 PyObject **res, int *respos)
2965{
2966 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2967 int repsize;
2968 int newpos;
2969 Py_UNICODE *uni2;
2970 /* startpos for collecting unencodable chars */
2971 int collstartpos = *inpos;
2972 int collendpos = *inpos+1;
2973 int collpos;
2974 char *encoding = "charmap";
2975 char *reason = "character maps to <undefined>";
2976
2977 PyObject *x;
2978 /* find all unencodable characters */
2979 while (collendpos < size) {
2980 x = charmapencode_lookup(p[collendpos], mapping);
2981 if (x==NULL)
2982 return -1;
2983 else if (x!=Py_None) {
2984 Py_DECREF(x);
2985 break;
2986 }
2987 Py_DECREF(x);
2988 ++collendpos;
2989 }
2990 /* cache callback name lookup
2991 * (if not done yet, i.e. it's the first error) */
2992 if (*known_errorHandler==-1) {
2993 if ((errors==NULL) || (!strcmp(errors, "strict")))
2994 *known_errorHandler = 1;
2995 else if (!strcmp(errors, "replace"))
2996 *known_errorHandler = 2;
2997 else if (!strcmp(errors, "ignore"))
2998 *known_errorHandler = 3;
2999 else if (!strcmp(errors, "xmlcharrefreplace"))
3000 *known_errorHandler = 4;
3001 else
3002 *known_errorHandler = 0;
3003 }
3004 switch (*known_errorHandler) {
3005 case 1: /* strict */
3006 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3007 return -1;
3008 case 2: /* replace */
3009 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
3010 x = charmapencode_output('?', mapping, res, respos);
3011 if (x==NULL) {
3012 return -1;
3013 }
3014 else if (x==Py_None) {
3015 Py_DECREF(x);
3016 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3017 return -1;
3018 }
3019 Py_DECREF(x);
3020 }
3021 /* fall through */
3022 case 3: /* ignore */
3023 *inpos = collendpos;
3024 break;
3025 case 4: /* xmlcharrefreplace */
3026 /* generate replacement (temporarily (mis)uses p) */
3027 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
3028 char buffer[2+29+1+1];
3029 char *cp;
3030 sprintf(buffer, "&#%d;", (int)p[collpos]);
3031 for (cp = buffer; *cp; ++cp) {
3032 x = charmapencode_output(*cp, mapping, res, respos);
3033 if (x==NULL)
3034 return -1;
3035 else if (x==Py_None) {
3036 Py_DECREF(x);
3037 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3038 return -1;
3039 }
3040 Py_DECREF(x);
3041 }
3042 }
3043 *inpos = collendpos;
3044 break;
3045 default:
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003046 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003047 encoding, reason, p, size, exceptionObject,
3048 collstartpos, collendpos, &newpos);
3049 if (repunicode == NULL)
3050 return -1;
3051 /* generate replacement */
3052 repsize = PyUnicode_GET_SIZE(repunicode);
3053 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3054 x = charmapencode_output(*uni2, mapping, res, respos);
3055 if (x==NULL) {
3056 Py_DECREF(repunicode);
3057 return -1;
3058 }
3059 else if (x==Py_None) {
3060 Py_DECREF(repunicode);
3061 Py_DECREF(x);
3062 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
3063 return -1;
3064 }
3065 Py_DECREF(x);
3066 }
3067 *inpos = newpos;
3068 Py_DECREF(repunicode);
3069 }
3070 return 0;
3071}
3072
Guido van Rossumd57fd912000-03-10 22:53:23 +00003073PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
3074 int size,
3075 PyObject *mapping,
3076 const char *errors)
3077{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003078 /* output object */
3079 PyObject *res = NULL;
3080 /* current input position */
3081 int inpos = 0;
3082 /* current output position */
3083 int respos = 0;
3084 PyObject *errorHandler = NULL;
3085 PyObject *exc = NULL;
3086 /* the following variable is used for caching string comparisons
3087 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3088 * 3=ignore, 4=xmlcharrefreplace */
3089 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003090
3091 /* Default to Latin-1 */
3092 if (mapping == NULL)
3093 return PyUnicode_EncodeLatin1(p, size, errors);
3094
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003095 /* allocate enough for a simple encoding without
3096 replacements, if we need more, we'll resize */
3097 res = PyString_FromStringAndSize(NULL, size);
3098 if (res == NULL)
3099 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00003100 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003101 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003103 while (inpos<size) {
3104 /* try to encode it */
3105 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
3106 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003107 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003108 if (x==Py_None) { /* unencodable character */
3109 if (charmap_encoding_error(p, size, &inpos, mapping,
3110 &exc,
Walter Dörwalde5402fb2003-08-14 20:25:29 +00003111 &known_errorHandler, &errorHandler, errors,
Walter Dörwald9b30f202003-08-15 16:26:34 +00003112 &res, &respos)) {
3113 Py_DECREF(x);
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003114 goto onError;
Walter Dörwald9b30f202003-08-15 16:26:34 +00003115 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003117 else
3118 /* done with this character => adjust input position */
3119 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120 Py_DECREF(x);
3121 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003122
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003123 /* Resize if we allocated to much */
3124 if (respos<PyString_GET_SIZE(res)) {
3125 if (_PyString_Resize(&res, respos))
3126 goto onError;
3127 }
3128 Py_XDECREF(exc);
3129 Py_XDECREF(errorHandler);
3130 return res;
3131
3132 onError:
3133 Py_XDECREF(res);
3134 Py_XDECREF(exc);
3135 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003136 return NULL;
3137}
3138
3139PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3140 PyObject *mapping)
3141{
3142 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3143 PyErr_BadArgument();
3144 return NULL;
3145 }
3146 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3147 PyUnicode_GET_SIZE(unicode),
3148 mapping,
3149 NULL);
3150}
3151
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003152/* create or adjust a UnicodeTranslateError */
3153static void make_translate_exception(PyObject **exceptionObject,
3154 const Py_UNICODE *unicode, int size,
3155 int startpos, int endpos,
3156 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003158 if (*exceptionObject == NULL) {
3159 *exceptionObject = PyUnicodeTranslateError_Create(
3160 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 }
3162 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003163 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3164 goto onError;
3165 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3166 goto onError;
3167 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3168 goto onError;
3169 return;
3170 onError:
3171 Py_DECREF(*exceptionObject);
3172 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003173 }
3174}
3175
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003176/* raises a UnicodeTranslateError */
3177static void raise_translate_exception(PyObject **exceptionObject,
3178 const Py_UNICODE *unicode, int size,
3179 int startpos, int endpos,
3180 const char *reason)
3181{
3182 make_translate_exception(exceptionObject,
3183 unicode, size, startpos, endpos, reason);
3184 if (*exceptionObject != NULL)
3185 PyCodec_StrictErrors(*exceptionObject);
3186}
3187
3188/* error handling callback helper:
3189 build arguments, call the callback and check the arguments,
3190 put the result into newpos and return the replacement string, which
3191 has to be freed by the caller */
3192static PyObject *unicode_translate_call_errorhandler(const char *errors,
3193 PyObject **errorHandler,
3194 const char *reason,
3195 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3196 int startpos, int endpos,
3197 int *newpos)
3198{
3199 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3200
3201 PyObject *restuple;
3202 PyObject *resunicode;
3203
3204 if (*errorHandler == NULL) {
3205 *errorHandler = PyCodec_LookupError(errors);
3206 if (*errorHandler == NULL)
3207 return NULL;
3208 }
3209
3210 make_translate_exception(exceptionObject,
3211 unicode, size, startpos, endpos, reason);
3212 if (*exceptionObject == NULL)
3213 return NULL;
3214
3215 restuple = PyObject_CallFunctionObjArgs(
3216 *errorHandler, *exceptionObject, NULL);
3217 if (restuple == NULL)
3218 return NULL;
3219 if (!PyTuple_Check(restuple)) {
3220 PyErr_Format(PyExc_TypeError, &argparse[4]);
3221 Py_DECREF(restuple);
3222 return NULL;
3223 }
3224 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3225 &resunicode, newpos)) {
3226 Py_DECREF(restuple);
3227 return NULL;
3228 }
3229 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003230 *newpos = size+*newpos;
3231 if (*newpos<0 || *newpos>size) {
3232 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3233 Py_DECREF(restuple);
3234 return NULL;
3235 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003236 Py_INCREF(resunicode);
3237 Py_DECREF(restuple);
3238 return resunicode;
3239}
3240
3241/* Lookup the character ch in the mapping and put the result in result,
3242 which must be decrefed by the caller.
3243 Return 0 on success, -1 on error */
3244static
3245int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3246{
3247 PyObject *w = PyInt_FromLong((long)c);
3248 PyObject *x;
3249
3250 if (w == NULL)
3251 return -1;
3252 x = PyObject_GetItem(mapping, w);
3253 Py_DECREF(w);
3254 if (x == NULL) {
3255 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3256 /* No mapping found means: use 1:1 mapping. */
3257 PyErr_Clear();
3258 *result = NULL;
3259 return 0;
3260 } else
3261 return -1;
3262 }
3263 else if (x == Py_None) {
3264 *result = x;
3265 return 0;
3266 }
3267 else if (PyInt_Check(x)) {
3268 long value = PyInt_AS_LONG(x);
3269 long max = PyUnicode_GetMax();
3270 if (value < 0 || value > max) {
3271 PyErr_Format(PyExc_TypeError,
3272 "character mapping must be in range(0x%lx)", max+1);
3273 Py_DECREF(x);
3274 return -1;
3275 }
3276 *result = x;
3277 return 0;
3278 }
3279 else if (PyUnicode_Check(x)) {
3280 *result = x;
3281 return 0;
3282 }
3283 else {
3284 /* wrong return value */
3285 PyErr_SetString(PyExc_TypeError,
3286 "character mapping must return integer, None or unicode");
Walter Dörwald150523e2003-08-15 16:52:19 +00003287 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003288 return -1;
3289 }
3290}
3291/* ensure that *outobj is at least requiredsize characters long,
3292if not reallocate and adjust various state variables.
3293Return 0 on success, -1 on error */
3294static
Walter Dörwald4894c302003-10-24 14:25:28 +00003295int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003296 int requiredsize)
3297{
Walter Dörwald4894c302003-10-24 14:25:28 +00003298 int oldsize = PyUnicode_GET_SIZE(*outobj);
3299 if (requiredsize > oldsize) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003300 /* remember old output position */
3301 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3302 /* exponentially overallocate to minimize reallocations */
Walter Dörwald4894c302003-10-24 14:25:28 +00003303 if (requiredsize < 2 * oldsize)
3304 requiredsize = 2 * oldsize;
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003305 if (_PyUnicode_Resize(outobj, requiredsize) < 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003306 return -1;
3307 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003308 }
3309 return 0;
3310}
3311/* lookup the character, put the result in the output string and adjust
3312 various state variables. Return a new reference to the object that
3313 was put in the output buffer in *result, or Py_None, if the mapping was
3314 undefined (in which case no character was written).
3315 The called must decref result.
3316 Return 0 on success, -1 on error. */
3317static
Walter Dörwald4894c302003-10-24 14:25:28 +00003318int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
3319 int insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
3320 PyObject **res)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003321{
Walter Dörwald4894c302003-10-24 14:25:28 +00003322 if (charmaptranslate_lookup(*curinp, mapping, res))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003323 return -1;
3324 if (*res==NULL) {
3325 /* not found => default to 1:1 mapping */
Walter Dörwald4894c302003-10-24 14:25:28 +00003326 *(*outp)++ = *curinp;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003327 }
3328 else if (*res==Py_None)
3329 ;
3330 else if (PyInt_Check(*res)) {
3331 /* no overflow check, because we know that the space is enough */
3332 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3333 }
3334 else if (PyUnicode_Check(*res)) {
3335 int repsize = PyUnicode_GET_SIZE(*res);
3336 if (repsize==1) {
3337 /* no overflow check, because we know that the space is enough */
3338 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3339 }
3340 else if (repsize!=0) {
3341 /* more than one character */
Walter Dörwald4894c302003-10-24 14:25:28 +00003342 int requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
Walter Dörwaldcd736e72004-02-05 17:36:00 +00003343 (insize - (curinp-startinp)) +
Walter Dörwald4894c302003-10-24 14:25:28 +00003344 repsize - 1;
3345 if (charmaptranslate_makespace(outobj, outp, requiredsize))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003346 return -1;
3347 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3348 *outp += repsize;
3349 }
3350 }
3351 else
3352 return -1;
3353 return 0;
3354}
3355
3356PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003357 int size,
3358 PyObject *mapping,
3359 const char *errors)
3360{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003361 /* output object */
3362 PyObject *res = NULL;
3363 /* pointers to the beginning and end+1 of input */
3364 const Py_UNICODE *startp = p;
3365 const Py_UNICODE *endp = p + size;
3366 /* pointer into the output */
3367 Py_UNICODE *str;
3368 /* current output position */
3369 int respos = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003370 char *reason = "character maps to <undefined>";
3371 PyObject *errorHandler = NULL;
3372 PyObject *exc = NULL;
3373 /* the following variable is used for caching string comparisons
3374 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3375 * 3=ignore, 4=xmlcharrefreplace */
3376 int known_errorHandler = -1;
3377
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378 if (mapping == NULL) {
3379 PyErr_BadArgument();
3380 return NULL;
3381 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003382
3383 /* allocate enough for a simple 1:1 translation without
3384 replacements, if we need more, we'll resize */
3385 res = PyUnicode_FromUnicode(NULL, size);
3386 if (res == NULL)
Walter Dörwald4894c302003-10-24 14:25:28 +00003387 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003388 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003389 return res;
3390 str = PyUnicode_AS_UNICODE(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003391
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003392 while (p<endp) {
3393 /* try to encode it */
3394 PyObject *x = NULL;
Walter Dörwald4894c302003-10-24 14:25:28 +00003395 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003396 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003397 goto onError;
3398 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003399 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003400 if (x!=Py_None) /* it worked => adjust input pointer */
3401 ++p;
3402 else { /* untranslatable character */
3403 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3404 int repsize;
3405 int newpos;
3406 Py_UNICODE *uni2;
3407 /* startpos for collecting untranslatable chars */
3408 const Py_UNICODE *collstart = p;
3409 const Py_UNICODE *collend = p+1;
3410 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003411
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003412 /* find all untranslatable characters */
3413 while (collend < endp) {
Walter Dörwald4894c302003-10-24 14:25:28 +00003414 if (charmaptranslate_lookup(*collend, mapping, &x))
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003415 goto onError;
3416 Py_XDECREF(x);
3417 if (x!=Py_None)
3418 break;
3419 ++collend;
3420 }
3421 /* cache callback name lookup
3422 * (if not done yet, i.e. it's the first error) */
3423 if (known_errorHandler==-1) {
3424 if ((errors==NULL) || (!strcmp(errors, "strict")))
3425 known_errorHandler = 1;
3426 else if (!strcmp(errors, "replace"))
3427 known_errorHandler = 2;
3428 else if (!strcmp(errors, "ignore"))
3429 known_errorHandler = 3;
3430 else if (!strcmp(errors, "xmlcharrefreplace"))
3431 known_errorHandler = 4;
3432 else
3433 known_errorHandler = 0;
3434 }
3435 switch (known_errorHandler) {
3436 case 1: /* strict */
3437 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3438 goto onError;
3439 case 2: /* replace */
3440 /* No need to check for space, this is a 1:1 replacement */
3441 for (coll = collstart; coll<collend; ++coll)
3442 *str++ = '?';
3443 /* fall through */
3444 case 3: /* ignore */
3445 p = collend;
3446 break;
3447 case 4: /* xmlcharrefreplace */
3448 /* generate replacement (temporarily (mis)uses p) */
3449 for (p = collstart; p < collend; ++p) {
3450 char buffer[2+29+1+1];
3451 char *cp;
3452 sprintf(buffer, "&#%d;", (int)*p);
Walter Dörwald4894c302003-10-24 14:25:28 +00003453 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003454 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3455 goto onError;
3456 for (cp = buffer; *cp; ++cp)
3457 *str++ = *cp;
3458 }
3459 p = collend;
3460 break;
3461 default:
3462 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3463 reason, startp, size, &exc,
3464 collstart-startp, collend-startp, &newpos);
3465 if (repunicode == NULL)
3466 goto onError;
3467 /* generate replacement */
3468 repsize = PyUnicode_GET_SIZE(repunicode);
Walter Dörwald4894c302003-10-24 14:25:28 +00003469 if (charmaptranslate_makespace(&res, &str,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003470 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3471 Py_DECREF(repunicode);
3472 goto onError;
3473 }
3474 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3475 *str++ = *uni2;
3476 p = startp + newpos;
3477 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478 }
3479 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003480 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003481 /* Resize if we allocated to much */
3482 respos = str-PyUnicode_AS_UNICODE(res);
Walter Dörwald4894c302003-10-24 14:25:28 +00003483 if (respos<PyUnicode_GET_SIZE(res)) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00003484 if (_PyUnicode_Resize(&res, respos) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003485 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003486 }
3487 Py_XDECREF(exc);
3488 Py_XDECREF(errorHandler);
3489 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003490
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003491 onError:
3492 Py_XDECREF(res);
3493 Py_XDECREF(exc);
3494 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003495 return NULL;
3496}
3497
3498PyObject *PyUnicode_Translate(PyObject *str,
3499 PyObject *mapping,
3500 const char *errors)
3501{
3502 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00003503
Guido van Rossumd57fd912000-03-10 22:53:23 +00003504 str = PyUnicode_FromObject(str);
3505 if (str == NULL)
3506 goto onError;
3507 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3508 PyUnicode_GET_SIZE(str),
3509 mapping,
3510 errors);
3511 Py_DECREF(str);
3512 return result;
Tim Petersced69f82003-09-16 20:30:58 +00003513
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514 onError:
3515 Py_XDECREF(str);
3516 return NULL;
3517}
Tim Petersced69f82003-09-16 20:30:58 +00003518
Guido van Rossum9e896b32000-04-05 20:11:21 +00003519/* --- Decimal Encoder ---------------------------------------------------- */
3520
3521int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3522 int length,
3523 char *output,
3524 const char *errors)
3525{
3526 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003527 PyObject *errorHandler = NULL;
3528 PyObject *exc = NULL;
3529 const char *encoding = "decimal";
3530 const char *reason = "invalid decimal Unicode string";
3531 /* the following variable is used for caching string comparisons
3532 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3533 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003534
3535 if (output == NULL) {
3536 PyErr_BadArgument();
3537 return -1;
3538 }
3539
3540 p = s;
3541 end = s + length;
3542 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003543 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003544 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003545 PyObject *repunicode;
3546 int repsize;
3547 int newpos;
3548 Py_UNICODE *uni2;
3549 Py_UNICODE *collstart;
3550 Py_UNICODE *collend;
Tim Petersced69f82003-09-16 20:30:58 +00003551
Guido van Rossum9e896b32000-04-05 20:11:21 +00003552 if (Py_UNICODE_ISSPACE(ch)) {
3553 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003554 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003555 continue;
3556 }
3557 decimal = Py_UNICODE_TODECIMAL(ch);
3558 if (decimal >= 0) {
3559 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003560 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003561 continue;
3562 }
Guido van Rossumba477042000-04-06 18:18:10 +00003563 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003564 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003565 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003566 continue;
3567 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003568 /* All other characters are considered unencodable */
3569 collstart = p;
3570 collend = p+1;
3571 while (collend < end) {
3572 if ((0 < *collend && *collend < 256) ||
3573 !Py_UNICODE_ISSPACE(*collend) ||
3574 Py_UNICODE_TODECIMAL(*collend))
3575 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003576 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003577 /* cache callback name lookup
3578 * (if not done yet, i.e. it's the first error) */
3579 if (known_errorHandler==-1) {
3580 if ((errors==NULL) || (!strcmp(errors, "strict")))
3581 known_errorHandler = 1;
3582 else if (!strcmp(errors, "replace"))
3583 known_errorHandler = 2;
3584 else if (!strcmp(errors, "ignore"))
3585 known_errorHandler = 3;
3586 else if (!strcmp(errors, "xmlcharrefreplace"))
3587 known_errorHandler = 4;
3588 else
3589 known_errorHandler = 0;
3590 }
3591 switch (known_errorHandler) {
3592 case 1: /* strict */
3593 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3594 goto onError;
3595 case 2: /* replace */
3596 for (p = collstart; p < collend; ++p)
3597 *output++ = '?';
3598 /* fall through */
3599 case 3: /* ignore */
3600 p = collend;
3601 break;
3602 case 4: /* xmlcharrefreplace */
3603 /* generate replacement (temporarily (mis)uses p) */
3604 for (p = collstart; p < collend; ++p)
3605 output += sprintf(output, "&#%d;", (int)*p);
3606 p = collend;
3607 break;
3608 default:
3609 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3610 encoding, reason, s, length, &exc,
3611 collstart-s, collend-s, &newpos);
3612 if (repunicode == NULL)
3613 goto onError;
3614 /* generate replacement */
3615 repsize = PyUnicode_GET_SIZE(repunicode);
3616 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3617 Py_UNICODE ch = *uni2;
3618 if (Py_UNICODE_ISSPACE(ch))
3619 *output++ = ' ';
3620 else {
3621 decimal = Py_UNICODE_TODECIMAL(ch);
3622 if (decimal >= 0)
3623 *output++ = '0' + decimal;
3624 else if (0 < ch && ch < 256)
3625 *output++ = (char)ch;
3626 else {
3627 Py_DECREF(repunicode);
3628 raise_encode_exception(&exc, encoding,
3629 s, length, collstart-s, collend-s, reason);
3630 goto onError;
3631 }
3632 }
3633 }
3634 p = s + newpos;
3635 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003636 }
3637 }
3638 /* 0-terminate the output string */
3639 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003640 Py_XDECREF(exc);
3641 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003642 return 0;
3643
3644 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003645 Py_XDECREF(exc);
3646 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003647 return -1;
3648}
3649
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650/* --- Helpers ------------------------------------------------------------ */
3651
Tim Petersced69f82003-09-16 20:30:58 +00003652static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653int count(PyUnicodeObject *self,
3654 int start,
3655 int end,
3656 PyUnicodeObject *substring)
3657{
3658 int count = 0;
3659
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003660 if (start < 0)
3661 start += self->length;
3662 if (start < 0)
3663 start = 0;
3664 if (end > self->length)
3665 end = self->length;
3666 if (end < 0)
3667 end += self->length;
3668 if (end < 0)
3669 end = 0;
3670
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003671 if (substring->length == 0)
3672 return (end - start + 1);
3673
Guido van Rossumd57fd912000-03-10 22:53:23 +00003674 end -= substring->length;
3675
3676 while (start <= end)
3677 if (Py_UNICODE_MATCH(self, start, substring)) {
3678 count++;
3679 start += substring->length;
3680 } else
3681 start++;
3682
3683 return count;
3684}
3685
3686int PyUnicode_Count(PyObject *str,
3687 PyObject *substr,
3688 int start,
3689 int end)
3690{
3691 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003692
Guido van Rossumd57fd912000-03-10 22:53:23 +00003693 str = PyUnicode_FromObject(str);
3694 if (str == NULL)
3695 return -1;
3696 substr = PyUnicode_FromObject(substr);
3697 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003698 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699 return -1;
3700 }
Tim Petersced69f82003-09-16 20:30:58 +00003701
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702 result = count((PyUnicodeObject *)str,
3703 start, end,
3704 (PyUnicodeObject *)substr);
Tim Petersced69f82003-09-16 20:30:58 +00003705
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706 Py_DECREF(str);
3707 Py_DECREF(substr);
3708 return result;
3709}
3710
Tim Petersced69f82003-09-16 20:30:58 +00003711static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003712int findstring(PyUnicodeObject *self,
3713 PyUnicodeObject *substring,
3714 int start,
3715 int end,
3716 int direction)
3717{
3718 if (start < 0)
3719 start += self->length;
3720 if (start < 0)
3721 start = 0;
3722
Guido van Rossumd57fd912000-03-10 22:53:23 +00003723 if (end > self->length)
3724 end = self->length;
3725 if (end < 0)
3726 end += self->length;
3727 if (end < 0)
3728 end = 0;
3729
Guido van Rossum76afbd92002-08-20 17:29:29 +00003730 if (substring->length == 0)
3731 return (direction > 0) ? start : end;
3732
Guido van Rossumd57fd912000-03-10 22:53:23 +00003733 end -= substring->length;
3734
3735 if (direction < 0) {
3736 for (; end >= start; end--)
3737 if (Py_UNICODE_MATCH(self, end, substring))
3738 return end;
3739 } else {
3740 for (; start <= end; start++)
3741 if (Py_UNICODE_MATCH(self, start, substring))
3742 return start;
3743 }
3744
3745 return -1;
3746}
3747
3748int PyUnicode_Find(PyObject *str,
3749 PyObject *substr,
3750 int start,
3751 int end,
3752 int direction)
3753{
3754 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003755
Guido van Rossumd57fd912000-03-10 22:53:23 +00003756 str = PyUnicode_FromObject(str);
3757 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003758 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003759 substr = PyUnicode_FromObject(substr);
3760 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003761 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003762 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003763 }
Tim Petersced69f82003-09-16 20:30:58 +00003764
Guido van Rossumd57fd912000-03-10 22:53:23 +00003765 result = findstring((PyUnicodeObject *)str,
3766 (PyUnicodeObject *)substr,
3767 start, end, direction);
3768 Py_DECREF(str);
3769 Py_DECREF(substr);
3770 return result;
3771}
3772
Tim Petersced69f82003-09-16 20:30:58 +00003773static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003774int tailmatch(PyUnicodeObject *self,
3775 PyUnicodeObject *substring,
3776 int start,
3777 int end,
3778 int direction)
3779{
3780 if (start < 0)
3781 start += self->length;
3782 if (start < 0)
3783 start = 0;
3784
3785 if (substring->length == 0)
3786 return 1;
3787
3788 if (end > self->length)
3789 end = self->length;
3790 if (end < 0)
3791 end += self->length;
3792 if (end < 0)
3793 end = 0;
3794
3795 end -= substring->length;
3796 if (end < start)
3797 return 0;
3798
3799 if (direction > 0) {
3800 if (Py_UNICODE_MATCH(self, end, substring))
3801 return 1;
3802 } else {
3803 if (Py_UNICODE_MATCH(self, start, substring))
3804 return 1;
3805 }
3806
3807 return 0;
3808}
3809
3810int PyUnicode_Tailmatch(PyObject *str,
3811 PyObject *substr,
3812 int start,
3813 int end,
3814 int direction)
3815{
3816 int result;
Tim Petersced69f82003-09-16 20:30:58 +00003817
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818 str = PyUnicode_FromObject(str);
3819 if (str == NULL)
3820 return -1;
3821 substr = PyUnicode_FromObject(substr);
3822 if (substr == NULL) {
3823 Py_DECREF(substr);
3824 return -1;
3825 }
Tim Petersced69f82003-09-16 20:30:58 +00003826
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827 result = tailmatch((PyUnicodeObject *)str,
3828 (PyUnicodeObject *)substr,
3829 start, end, direction);
3830 Py_DECREF(str);
3831 Py_DECREF(substr);
3832 return result;
3833}
3834
Tim Petersced69f82003-09-16 20:30:58 +00003835static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003836const Py_UNICODE *findchar(const Py_UNICODE *s,
3837 int size,
3838 Py_UNICODE ch)
3839{
3840 /* like wcschr, but doesn't stop at NULL characters */
3841
3842 while (size-- > 0) {
3843 if (*s == ch)
3844 return s;
3845 s++;
3846 }
3847
3848 return NULL;
3849}
3850
3851/* Apply fixfct filter to the Unicode object self and return a
3852 reference to the modified object */
3853
Tim Petersced69f82003-09-16 20:30:58 +00003854static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003855PyObject *fixup(PyUnicodeObject *self,
3856 int (*fixfct)(PyUnicodeObject *s))
3857{
3858
3859 PyUnicodeObject *u;
3860
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003861 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003862 if (u == NULL)
3863 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003864
3865 Py_UNICODE_COPY(u->str, self->str, self->length);
3866
Tim Peters7a29bd52001-09-12 03:03:31 +00003867 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003868 /* fixfct should return TRUE if it modified the buffer. If
3869 FALSE, return a reference to the original buffer instead
3870 (to save space, not time) */
3871 Py_INCREF(self);
3872 Py_DECREF(u);
3873 return (PyObject*) self;
3874 }
3875 return (PyObject*) u;
3876}
3877
Tim Petersced69f82003-09-16 20:30:58 +00003878static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003879int fixupper(PyUnicodeObject *self)
3880{
3881 int len = self->length;
3882 Py_UNICODE *s = self->str;
3883 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003884
Guido van Rossumd57fd912000-03-10 22:53:23 +00003885 while (len-- > 0) {
3886 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003887
Guido van Rossumd57fd912000-03-10 22:53:23 +00003888 ch = Py_UNICODE_TOUPPER(*s);
3889 if (ch != *s) {
3890 status = 1;
3891 *s = ch;
3892 }
3893 s++;
3894 }
3895
3896 return status;
3897}
3898
Tim Petersced69f82003-09-16 20:30:58 +00003899static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003900int fixlower(PyUnicodeObject *self)
3901{
3902 int len = self->length;
3903 Py_UNICODE *s = self->str;
3904 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003905
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906 while (len-- > 0) {
3907 register Py_UNICODE ch;
Tim Petersced69f82003-09-16 20:30:58 +00003908
Guido van Rossumd57fd912000-03-10 22:53:23 +00003909 ch = Py_UNICODE_TOLOWER(*s);
3910 if (ch != *s) {
3911 status = 1;
3912 *s = ch;
3913 }
3914 s++;
3915 }
3916
3917 return status;
3918}
3919
Tim Petersced69f82003-09-16 20:30:58 +00003920static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003921int fixswapcase(PyUnicodeObject *self)
3922{
3923 int len = self->length;
3924 Py_UNICODE *s = self->str;
3925 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003926
Guido van Rossumd57fd912000-03-10 22:53:23 +00003927 while (len-- > 0) {
3928 if (Py_UNICODE_ISUPPER(*s)) {
3929 *s = Py_UNICODE_TOLOWER(*s);
3930 status = 1;
3931 } else if (Py_UNICODE_ISLOWER(*s)) {
3932 *s = Py_UNICODE_TOUPPER(*s);
3933 status = 1;
3934 }
3935 s++;
3936 }
3937
3938 return status;
3939}
3940
Tim Petersced69f82003-09-16 20:30:58 +00003941static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942int fixcapitalize(PyUnicodeObject *self)
3943{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003944 int len = self->length;
3945 Py_UNICODE *s = self->str;
3946 int status = 0;
Tim Petersced69f82003-09-16 20:30:58 +00003947
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003948 if (len == 0)
3949 return 0;
3950 if (Py_UNICODE_ISLOWER(*s)) {
3951 *s = Py_UNICODE_TOUPPER(*s);
3952 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003953 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003954 s++;
3955 while (--len > 0) {
3956 if (Py_UNICODE_ISUPPER(*s)) {
3957 *s = Py_UNICODE_TOLOWER(*s);
3958 status = 1;
3959 }
3960 s++;
3961 }
3962 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003963}
3964
3965static
3966int fixtitle(PyUnicodeObject *self)
3967{
3968 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3969 register Py_UNICODE *e;
3970 int previous_is_cased;
3971
3972 /* Shortcut for single character strings */
3973 if (PyUnicode_GET_SIZE(self) == 1) {
3974 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3975 if (*p != ch) {
3976 *p = ch;
3977 return 1;
3978 }
3979 else
3980 return 0;
3981 }
Tim Petersced69f82003-09-16 20:30:58 +00003982
Guido van Rossumd57fd912000-03-10 22:53:23 +00003983 e = p + PyUnicode_GET_SIZE(self);
3984 previous_is_cased = 0;
3985 for (; p < e; p++) {
3986 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00003987
Guido van Rossumd57fd912000-03-10 22:53:23 +00003988 if (previous_is_cased)
3989 *p = Py_UNICODE_TOLOWER(ch);
3990 else
3991 *p = Py_UNICODE_TOTITLE(ch);
Tim Petersced69f82003-09-16 20:30:58 +00003992
3993 if (Py_UNICODE_ISLOWER(ch) ||
3994 Py_UNICODE_ISUPPER(ch) ||
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995 Py_UNICODE_ISTITLE(ch))
3996 previous_is_cased = 1;
3997 else
3998 previous_is_cased = 0;
3999 }
4000 return 1;
4001}
4002
4003PyObject *PyUnicode_Join(PyObject *separator,
4004 PyObject *seq)
4005{
4006 Py_UNICODE *sep;
4007 int seplen;
4008 PyUnicodeObject *res = NULL;
4009 int reslen = 0;
4010 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004011 int sz = 100;
4012 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00004013 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004014
Tim Peters2cfe3682001-05-05 05:36:48 +00004015 it = PyObject_GetIter(seq);
4016 if (it == NULL)
4017 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004018
4019 if (separator == NULL) {
4020 Py_UNICODE blank = ' ';
4021 sep = &blank;
4022 seplen = 1;
4023 }
4024 else {
4025 separator = PyUnicode_FromObject(separator);
4026 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00004027 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004028 sep = PyUnicode_AS_UNICODE(separator);
4029 seplen = PyUnicode_GET_SIZE(separator);
4030 }
Tim Petersced69f82003-09-16 20:30:58 +00004031
Guido van Rossumd57fd912000-03-10 22:53:23 +00004032 res = _PyUnicode_New(sz);
4033 if (res == NULL)
4034 goto onError;
4035 p = PyUnicode_AS_UNICODE(res);
4036 reslen = 0;
4037
Tim Peters2cfe3682001-05-05 05:36:48 +00004038 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004039 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00004040 PyObject *item = PyIter_Next(it);
4041 if (item == NULL) {
4042 if (PyErr_Occurred())
4043 goto onError;
4044 break;
4045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004046 if (!PyUnicode_Check(item)) {
4047 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004048 if (!PyString_Check(item)) {
4049 PyErr_Format(PyExc_TypeError,
4050 "sequence item %i: expected string or Unicode,"
4051 " %.80s found",
4052 i, item->ob_type->tp_name);
4053 Py_DECREF(item);
4054 goto onError;
4055 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056 v = PyUnicode_FromObject(item);
4057 Py_DECREF(item);
4058 item = v;
4059 if (item == NULL)
4060 goto onError;
4061 }
4062 itemlen = PyUnicode_GET_SIZE(item);
4063 while (reslen + itemlen + seplen >= sz) {
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004064 if (_PyUnicode_Resize(&res, sz*2) < 0) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004065 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004066 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00004067 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004068 sz *= 2;
4069 p = PyUnicode_AS_UNICODE(res) + reslen;
4070 }
4071 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004072 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073 p += seplen;
4074 reslen += seplen;
4075 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004076 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 p += itemlen;
4078 reslen += itemlen;
4079 Py_DECREF(item);
4080 }
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00004081 if (_PyUnicode_Resize(&res, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 goto onError;
4083
4084 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004085 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004086 return (PyObject *)res;
4087
4088 onError:
4089 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00004090 Py_XDECREF(res);
4091 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004092 return NULL;
4093}
4094
Tim Petersced69f82003-09-16 20:30:58 +00004095static
4096PyUnicodeObject *pad(PyUnicodeObject *self,
4097 int left,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004098 int right,
4099 Py_UNICODE fill)
4100{
4101 PyUnicodeObject *u;
4102
4103 if (left < 0)
4104 left = 0;
4105 if (right < 0)
4106 right = 0;
4107
Tim Peters7a29bd52001-09-12 03:03:31 +00004108 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004109 Py_INCREF(self);
4110 return self;
4111 }
4112
4113 u = _PyUnicode_New(left + self->length + right);
4114 if (u) {
4115 if (left)
4116 Py_UNICODE_FILL(u->str, fill, left);
4117 Py_UNICODE_COPY(u->str + left, self->str, self->length);
4118 if (right)
4119 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
4120 }
4121
4122 return u;
4123}
4124
/* Append the slice data[left:right], as a new Unicode object, to the end
   of `list`.  This expands to several statements (not a single
   expression) and relies on the expansion site providing an assignable
   `str` variable, the `list` object and an `onError` label, which is
   jumped to when the allocation or the append fails. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Append(list, str) != 0) {                                \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    else                                                                \
        Py_DECREF(str);
4135
/* Insert the slice data[left:right], as a new Unicode object, at the
   front (index 0) of `list`.  This expands to several statements (not a
   single expression) and relies on the expansion site providing an
   assignable `str` variable, the `list` object and an `onError` label,
   which is jumped to when the allocation or the insert fails. */
#define SPLIT_INSERT(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (str == NULL)                                                    \
        goto onError;                                                   \
    if (PyList_Insert(list, 0, str) != 0) {                             \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    else                                                                \
        Py_DECREF(str);
4146
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147static
4148PyObject *split_whitespace(PyUnicodeObject *self,
4149 PyObject *list,
4150 int maxcount)
4151{
4152 register int i;
4153 register int j;
4154 int len = self->length;
4155 PyObject *str;
4156
4157 for (i = j = 0; i < len; ) {
4158 /* find a token */
4159 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4160 i++;
4161 j = i;
4162 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4163 i++;
4164 if (j < i) {
4165 if (maxcount-- <= 0)
4166 break;
4167 SPLIT_APPEND(self->str, j, i);
4168 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4169 i++;
4170 j = i;
4171 }
4172 }
4173 if (j < len) {
4174 SPLIT_APPEND(self->str, j, len);
4175 }
4176 return list;
4177
4178 onError:
4179 Py_DECREF(list);
4180 return NULL;
4181}
4182
4183PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004184 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185{
4186 register int i;
4187 register int j;
4188 int len;
4189 PyObject *list;
4190 PyObject *str;
4191 Py_UNICODE *data;
4192
4193 string = PyUnicode_FromObject(string);
4194 if (string == NULL)
4195 return NULL;
4196 data = PyUnicode_AS_UNICODE(string);
4197 len = PyUnicode_GET_SIZE(string);
4198
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199 list = PyList_New(0);
4200 if (!list)
4201 goto onError;
4202
4203 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004204 int eol;
Tim Petersced69f82003-09-16 20:30:58 +00004205
Guido van Rossumd57fd912000-03-10 22:53:23 +00004206 /* Find a line and append it */
4207 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4208 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004209
4210 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004211 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004212 if (i < len) {
4213 if (data[i] == '\r' && i + 1 < len &&
4214 data[i+1] == '\n')
4215 i += 2;
4216 else
4217 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004218 if (keepends)
4219 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004220 }
Guido van Rossum86662912000-04-11 15:38:46 +00004221 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004222 j = i;
4223 }
4224 if (j < len) {
4225 SPLIT_APPEND(data, j, len);
4226 }
4227
4228 Py_DECREF(string);
4229 return list;
4230
4231 onError:
4232 Py_DECREF(list);
4233 Py_DECREF(string);
4234 return NULL;
4235}
4236
Tim Petersced69f82003-09-16 20:30:58 +00004237static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004238PyObject *split_char(PyUnicodeObject *self,
4239 PyObject *list,
4240 Py_UNICODE ch,
4241 int maxcount)
4242{
4243 register int i;
4244 register int j;
4245 int len = self->length;
4246 PyObject *str;
4247
4248 for (i = j = 0; i < len; ) {
4249 if (self->str[i] == ch) {
4250 if (maxcount-- <= 0)
4251 break;
4252 SPLIT_APPEND(self->str, j, i);
4253 i = j = i + 1;
4254 } else
4255 i++;
4256 }
4257 if (j <= len) {
4258 SPLIT_APPEND(self->str, j, len);
4259 }
4260 return list;
4261
4262 onError:
4263 Py_DECREF(list);
4264 return NULL;
4265}
4266
Tim Petersced69f82003-09-16 20:30:58 +00004267static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004268PyObject *split_substring(PyUnicodeObject *self,
4269 PyObject *list,
4270 PyUnicodeObject *substring,
4271 int maxcount)
4272{
4273 register int i;
4274 register int j;
4275 int len = self->length;
4276 int sublen = substring->length;
4277 PyObject *str;
4278
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004279 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280 if (Py_UNICODE_MATCH(self, i, substring)) {
4281 if (maxcount-- <= 0)
4282 break;
4283 SPLIT_APPEND(self->str, j, i);
4284 i = j = i + sublen;
4285 } else
4286 i++;
4287 }
4288 if (j <= len) {
4289 SPLIT_APPEND(self->str, j, len);
4290 }
4291 return list;
4292
4293 onError:
4294 Py_DECREF(list);
4295 return NULL;
4296}
4297
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004298static
4299PyObject *rsplit_whitespace(PyUnicodeObject *self,
4300 PyObject *list,
4301 int maxcount)
4302{
4303 register int i;
4304 register int j;
4305 int len = self->length;
4306 PyObject *str;
4307
4308 for (i = j = len - 1; i >= 0; ) {
4309 /* find a token */
4310 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4311 i--;
4312 j = i;
4313 while (i >= 0 && !Py_UNICODE_ISSPACE(self->str[i]))
4314 i--;
4315 if (j > i) {
4316 if (maxcount-- <= 0)
4317 break;
4318 SPLIT_INSERT(self->str, i + 1, j + 1);
4319 while (i >= 0 && Py_UNICODE_ISSPACE(self->str[i]))
4320 i--;
4321 j = i;
4322 }
4323 }
4324 if (j >= 0) {
4325 SPLIT_INSERT(self->str, 0, j + 1);
4326 }
4327 return list;
4328
4329 onError:
4330 Py_DECREF(list);
4331 return NULL;
4332}
4333
4334static
4335PyObject *rsplit_char(PyUnicodeObject *self,
4336 PyObject *list,
4337 Py_UNICODE ch,
4338 int maxcount)
4339{
4340 register int i;
4341 register int j;
4342 int len = self->length;
4343 PyObject *str;
4344
4345 for (i = j = len - 1; i >= 0; ) {
4346 if (self->str[i] == ch) {
4347 if (maxcount-- <= 0)
4348 break;
4349 SPLIT_INSERT(self->str, i + 1, j + 1);
4350 j = i = i - 1;
4351 } else
4352 i--;
4353 }
Hye-Shik Chang7fc4cf52003-12-23 09:10:16 +00004354 if (j >= -1) {
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004355 SPLIT_INSERT(self->str, 0, j + 1);
4356 }
4357 return list;
4358
4359 onError:
4360 Py_DECREF(list);
4361 return NULL;
4362}
4363
4364static
4365PyObject *rsplit_substring(PyUnicodeObject *self,
4366 PyObject *list,
4367 PyUnicodeObject *substring,
4368 int maxcount)
4369{
4370 register int i;
4371 register int j;
4372 int len = self->length;
4373 int sublen = substring->length;
4374 PyObject *str;
4375
4376 for (i = len - sublen, j = len; i >= 0; ) {
4377 if (Py_UNICODE_MATCH(self, i, substring)) {
4378 if (maxcount-- <= 0)
4379 break;
4380 SPLIT_INSERT(self->str, i + sublen, j);
4381 j = i;
4382 i -= sublen;
4383 } else
4384 i--;
4385 }
4386 if (j >= 0) {
4387 SPLIT_INSERT(self->str, 0, j);
4388 }
4389 return list;
4390
4391 onError:
4392 Py_DECREF(list);
4393 return NULL;
4394}
4395
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396#undef SPLIT_APPEND
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004397#undef SPLIT_INSERT
Guido van Rossumd57fd912000-03-10 22:53:23 +00004398
4399static
4400PyObject *split(PyUnicodeObject *self,
4401 PyUnicodeObject *substring,
4402 int maxcount)
4403{
4404 PyObject *list;
4405
4406 if (maxcount < 0)
4407 maxcount = INT_MAX;
4408
4409 list = PyList_New(0);
4410 if (!list)
4411 return NULL;
4412
4413 if (substring == NULL)
4414 return split_whitespace(self,list,maxcount);
4415
4416 else if (substring->length == 1)
4417 return split_char(self,list,substring->str[0],maxcount);
4418
4419 else if (substring->length == 0) {
4420 Py_DECREF(list);
4421 PyErr_SetString(PyExc_ValueError, "empty separator");
4422 return NULL;
4423 }
4424 else
4425 return split_substring(self,list,substring,maxcount);
4426}
4427
Tim Petersced69f82003-09-16 20:30:58 +00004428static
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00004429PyObject *rsplit(PyUnicodeObject *self,
4430 PyUnicodeObject *substring,
4431 int maxcount)
4432{
4433 PyObject *list;
4434
4435 if (maxcount < 0)
4436 maxcount = INT_MAX;
4437
4438 list = PyList_New(0);
4439 if (!list)
4440 return NULL;
4441
4442 if (substring == NULL)
4443 return rsplit_whitespace(self,list,maxcount);
4444
4445 else if (substring->length == 1)
4446 return rsplit_char(self,list,substring->str[0],maxcount);
4447
4448 else if (substring->length == 0) {
4449 Py_DECREF(list);
4450 PyErr_SetString(PyExc_ValueError, "empty separator");
4451 return NULL;
4452 }
4453 else
4454 return rsplit_substring(self,list,substring,maxcount);
4455}
4456
4457static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458PyObject *replace(PyUnicodeObject *self,
4459 PyUnicodeObject *str1,
4460 PyUnicodeObject *str2,
4461 int maxcount)
4462{
4463 PyUnicodeObject *u;
4464
4465 if (maxcount < 0)
4466 maxcount = INT_MAX;
4467
4468 if (str1->length == 1 && str2->length == 1) {
4469 int i;
4470
4471 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004472 if (!findchar(self->str, self->length, str1->str[0]) &&
4473 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004474 /* nothing to replace, return original string */
4475 Py_INCREF(self);
4476 u = self;
4477 } else {
4478 Py_UNICODE u1 = str1->str[0];
4479 Py_UNICODE u2 = str2->str[0];
Tim Petersced69f82003-09-16 20:30:58 +00004480
Guido van Rossumd57fd912000-03-10 22:53:23 +00004481 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004482 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004483 self->length
4484 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004485 if (u != NULL) {
Tim Petersced69f82003-09-16 20:30:58 +00004486 Py_UNICODE_COPY(u->str, self->str,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004487 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004488 for (i = 0; i < u->length; i++)
4489 if (u->str[i] == u1) {
4490 if (--maxcount < 0)
4491 break;
4492 u->str[i] = u2;
4493 }
4494 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004496
4497 } else {
4498 int n, i;
4499 Py_UNICODE *p;
4500
4501 /* replace strings */
4502 n = count(self, 0, self->length, str1);
4503 if (n > maxcount)
4504 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004505 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004506 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004507 if (PyUnicode_CheckExact(self)) {
4508 Py_INCREF(self);
4509 u = self;
4510 }
4511 else {
4512 u = (PyUnicodeObject *)
4513 PyUnicode_FromUnicode(self->str, self->length);
4514 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004515 } else {
4516 u = _PyUnicode_New(
4517 self->length + n * (str2->length - str1->length));
4518 if (u) {
4519 i = 0;
4520 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004521 if (str1->length > 0) {
4522 while (i <= self->length - str1->length)
4523 if (Py_UNICODE_MATCH(self, i, str1)) {
4524 /* replace string segment */
4525 Py_UNICODE_COPY(p, str2->str, str2->length);
4526 p += str2->length;
4527 i += str1->length;
4528 if (--n <= 0) {
4529 /* copy remaining part */
4530 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4531 break;
4532 }
4533 } else
4534 *p++ = self->str[i++];
4535 } else {
4536 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004537 Py_UNICODE_COPY(p, str2->str, str2->length);
4538 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004539 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004541 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004542 }
4543 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4544 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004545 }
4546 }
4547 }
Tim Petersced69f82003-09-16 20:30:58 +00004548
Guido van Rossumd57fd912000-03-10 22:53:23 +00004549 return (PyObject *) u;
4550}
4551
4552/* --- Unicode Object Methods --------------------------------------------- */
4553
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004554PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555"S.title() -> unicode\n\
4556\n\
4557Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004558characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004559
4560static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004561unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004562{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004563 return fixup(self, fixtitle);
4564}
4565
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004566PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567"S.capitalize() -> unicode\n\
4568\n\
4569Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004570have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004571
4572static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004573unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004574{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004575 return fixup(self, fixcapitalize);
4576}
4577
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled implementation of S.capwords(): split on whitespace,
   capitalize every word in place in the list, then join the list back
   into one string.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized =
            fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                  fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
4615
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004616/* Argument converter. Coerces to a single unicode character */
4617
4618static int
4619convert_uc(PyObject *obj, void *addr)
4620{
4621 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
4622 PyObject *uniobj;
4623 Py_UNICODE *unistr;
4624
4625 uniobj = PyUnicode_FromObject(obj);
4626 if (uniobj == NULL) {
4627 PyErr_SetString(PyExc_TypeError,
4628 "The fill character cannot be converted to Unicode");
4629 return 0;
4630 }
4631 if (PyUnicode_GET_SIZE(uniobj) != 1) {
4632 PyErr_SetString(PyExc_TypeError,
4633 "The fill character must be exactly one character long");
4634 Py_DECREF(uniobj);
4635 return 0;
4636 }
4637 unistr = PyUnicode_AS_UNICODE(uniobj);
4638 *fillcharloc = unistr[0];
4639 Py_DECREF(uniobj);
4640 return 1;
4641}
4642
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004643PyDoc_STRVAR(center__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004644"S.center(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004645\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004646Return S centered in a Unicode string of length width. Padding is\n\
4647done using the specified fill character (default is a space)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004648
4649static PyObject *
4650unicode_center(PyUnicodeObject *self, PyObject *args)
4651{
4652 int marg, left;
4653 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004654 Py_UNICODE fillchar = ' ';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004655
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004656 if (!PyArg_ParseTuple(args, "i|O&:center", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657 return NULL;
4658
Tim Peters7a29bd52001-09-12 03:03:31 +00004659 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004660 Py_INCREF(self);
4661 return (PyObject*) self;
4662 }
4663
4664 marg = width - self->length;
4665 left = marg / 2 + (marg & width & 1);
4666
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00004667 return (PyObject*) pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004668}
4669
Marc-André Lemburge5034372000-08-08 08:04:29 +00004670#if 0
4671
4672/* This code should go into some future Unicode collation support
4673 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004674 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004675
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004676/* speedy UTF-16 code point order comparison */
4677/* gleaned from: */
4678/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4679
/* Per-block adjustment table, indexed by (code unit >> 11): entries
   0..26 are zero, entry 27 adds 0x2000 and entries 28..31 subtract
   0x800. */
static short utf16Fixup[32] =
{
    /*  0.. 7 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /*  8..15 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 16..23 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 24..31 */ 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};
4687
Guido van Rossumd57fd912000-03-10 22:53:23 +00004688static int
4689unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4690{
4691 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004692
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693 Py_UNICODE *s1 = str1->str;
4694 Py_UNICODE *s2 = str2->str;
4695
4696 len1 = str1->length;
4697 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004698
Guido van Rossumd57fd912000-03-10 22:53:23 +00004699 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004700 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004701
4702 c1 = *s1++;
4703 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004704
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004705 if (c1 > (1<<11) * 26)
4706 c1 += utf16Fixup[c1>>11];
4707 if (c2 > (1<<11) * 26)
4708 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004709 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004710
4711 if (c1 != c2)
4712 return (c1 < c2) ? -1 : 1;
Tim Petersced69f82003-09-16 20:30:58 +00004713
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004714 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004715 }
4716
4717 return (len1 < len2) ? -1 : (len1 != len2);
4718}
4719
Marc-André Lemburge5034372000-08-08 08:04:29 +00004720#else
4721
4722static int
4723unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4724{
4725 register int len1, len2;
4726
4727 Py_UNICODE *s1 = str1->str;
4728 Py_UNICODE *s2 = str2->str;
4729
4730 len1 = str1->length;
4731 len2 = str2->length;
Tim Petersced69f82003-09-16 20:30:58 +00004732
Marc-André Lemburge5034372000-08-08 08:04:29 +00004733 while (len1 > 0 && len2 > 0) {
Tim Petersced69f82003-09-16 20:30:58 +00004734 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004735
Fredrik Lundh45714e92001-06-26 16:39:36 +00004736 c1 = *s1++;
4737 c2 = *s2++;
4738
4739 if (c1 != c2)
4740 return (c1 < c2) ? -1 : 1;
4741
Marc-André Lemburge5034372000-08-08 08:04:29 +00004742 len1--; len2--;
4743 }
4744
4745 return (len1 < len2) ? -1 : (len1 != len2);
4746}
4747
4748#endif
4749
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750int PyUnicode_Compare(PyObject *left,
4751 PyObject *right)
4752{
4753 PyUnicodeObject *u = NULL, *v = NULL;
4754 int result;
4755
4756 /* Coerce the two arguments */
4757 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4758 if (u == NULL)
4759 goto onError;
4760 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4761 if (v == NULL)
4762 goto onError;
4763
Thomas Wouters7e474022000-07-16 12:04:32 +00004764 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765 if (v == u) {
4766 Py_DECREF(u);
4767 Py_DECREF(v);
4768 return 0;
4769 }
4770
4771 result = unicode_compare(u, v);
4772
4773 Py_DECREF(u);
4774 Py_DECREF(v);
4775 return result;
4776
4777onError:
4778 Py_XDECREF(u);
4779 Py_XDECREF(v);
4780 return -1;
4781}
4782
Guido van Rossum403d68b2000-03-13 15:55:09 +00004783int PyUnicode_Contains(PyObject *container,
4784 PyObject *element)
4785{
4786 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004787 int result, size;
4788 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004789
4790 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004791 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004792 if (v == NULL) {
4793 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004794 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004795 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004796 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004797 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004798 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004799 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004800
Barry Warsaw817918c2002-08-06 16:58:21 +00004801 size = PyUnicode_GET_SIZE(v);
4802 rhs = PyUnicode_AS_UNICODE(v);
4803 lhs = PyUnicode_AS_UNICODE(u);
4804
Guido van Rossum403d68b2000-03-13 15:55:09 +00004805 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004806 if (size == 1) {
4807 end = lhs + PyUnicode_GET_SIZE(u);
4808 while (lhs < end) {
4809 if (*lhs++ == *rhs) {
4810 result = 1;
4811 break;
4812 }
4813 }
4814 }
4815 else {
4816 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4817 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004818 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004819 result = 1;
4820 break;
4821 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004822 }
4823 }
4824
4825 Py_DECREF(u);
4826 Py_DECREF(v);
4827 return result;
4828
4829onError:
4830 Py_XDECREF(u);
4831 Py_XDECREF(v);
4832 return -1;
4833}
4834
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835/* Concat to string or Unicode object giving a new Unicode object. */
4836
4837PyObject *PyUnicode_Concat(PyObject *left,
4838 PyObject *right)
4839{
4840 PyUnicodeObject *u = NULL, *v = NULL, *w;
4841
4842 /* Coerce the two arguments */
4843 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4844 if (u == NULL)
4845 goto onError;
4846 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4847 if (v == NULL)
4848 goto onError;
4849
4850 /* Shortcuts */
4851 if (v == unicode_empty) {
4852 Py_DECREF(v);
4853 return (PyObject *)u;
4854 }
4855 if (u == unicode_empty) {
4856 Py_DECREF(u);
4857 return (PyObject *)v;
4858 }
4859
4860 /* Concat the two Unicode strings */
4861 w = _PyUnicode_New(u->length + v->length);
4862 if (w == NULL)
4863 goto onError;
4864 Py_UNICODE_COPY(w->str, u->str, u->length);
4865 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4866
4867 Py_DECREF(u);
4868 Py_DECREF(v);
4869 return (PyObject *)w;
4870
4871onError:
4872 Py_XDECREF(u);
4873 Py_XDECREF(v);
4874 return NULL;
4875}
4876
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004877PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878"S.count(sub[, start[, end]]) -> int\n\
4879\n\
4880Return the number of occurrences of substring sub in Unicode string\n\
4881S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004882interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883
4884static PyObject *
4885unicode_count(PyUnicodeObject *self, PyObject *args)
4886{
4887 PyUnicodeObject *substring;
4888 int start = 0;
4889 int end = INT_MAX;
4890 PyObject *result;
4891
Guido van Rossumb8872e62000-05-09 14:14:27 +00004892 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4893 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894 return NULL;
4895
4896 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4897 (PyObject *)substring);
4898 if (substring == NULL)
4899 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00004900
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901 if (start < 0)
4902 start += self->length;
4903 if (start < 0)
4904 start = 0;
4905 if (end > self->length)
4906 end = self->length;
4907 if (end < 0)
4908 end += self->length;
4909 if (end < 0)
4910 end = 0;
4911
4912 result = PyInt_FromLong((long) count(self, start, end, substring));
4913
4914 Py_DECREF(substring);
4915 return result;
4916}
4917
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004918PyDoc_STRVAR(encode__doc__,
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004919"S.encode([encoding[,errors]]) -> string or unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920\n\
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004921Encodes S using the codec registered for encoding. encoding defaults\n\
4922to the default encoding. errors may be given to set a different error\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004923handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004924a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4925'xmlcharrefreplace' as well as any other name registered with\n\
4926codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927
4928static PyObject *
4929unicode_encode(PyUnicodeObject *self, PyObject *args)
4930{
4931 char *encoding = NULL;
4932 char *errors = NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004933 PyObject *v;
4934
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4936 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004937 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00004938 if (v == NULL)
4939 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004940 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
4941 PyErr_Format(PyExc_TypeError,
4942 "encoder did not return a string/unicode object "
4943 "(type=%.400s)",
4944 v->ob_type->tp_name);
4945 Py_DECREF(v);
4946 return NULL;
4947 }
4948 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00004949
4950 onError:
4951 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004952}
4953
4954PyDoc_STRVAR(decode__doc__,
4955"S.decode([encoding[,errors]]) -> string or unicode\n\
4956\n\
4957Decodes S using the codec registered for encoding. encoding defaults\n\
4958to the default encoding. errors may be given to set a different error\n\
4959handling scheme. Default is 'strict' meaning that encoding errors raise\n\
4960a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
4961as well as any other name registerd with codecs.register_error that is\n\
4962able to handle UnicodeDecodeErrors.");
4963
4964static PyObject *
Marc-André Lemburg126b44c2004-07-10 12:04:20 +00004965unicode_decode(PyUnicodeObject *self, PyObject *args)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004966{
4967 char *encoding = NULL;
4968 char *errors = NULL;
4969 PyObject *v;
4970
4971 if (!PyArg_ParseTuple(args, "|ss:decode", &encoding, &errors))
4972 return NULL;
4973 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00004974 if (v == NULL)
4975 goto onError;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00004976 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
4977 PyErr_Format(PyExc_TypeError,
4978 "decoder did not return a string/unicode object "
4979 "(type=%.400s)",
4980 v->ob_type->tp_name);
4981 Py_DECREF(v);
4982 return NULL;
4983 }
4984 return v;
Marc-André Lemburg1dffb122004-07-08 19:13:55 +00004985
4986 onError:
4987 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988}
4989
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004990PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991"S.expandtabs([tabsize]) -> unicode\n\
4992\n\
4993Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004994If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004995
4996static PyObject*
4997unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4998{
4999 Py_UNICODE *e;
5000 Py_UNICODE *p;
5001 Py_UNICODE *q;
5002 int i, j;
5003 PyUnicodeObject *u;
5004 int tabsize = 8;
5005
5006 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
5007 return NULL;
5008
Thomas Wouters7e474022000-07-16 12:04:32 +00005009 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005010 i = j = 0;
5011 e = self->str + self->length;
5012 for (p = self->str; p < e; p++)
5013 if (*p == '\t') {
5014 if (tabsize > 0)
5015 j += tabsize - (j % tabsize);
5016 }
5017 else {
5018 j++;
5019 if (*p == '\n' || *p == '\r') {
5020 i += j;
5021 j = 0;
5022 }
5023 }
5024
5025 /* Second pass: create output string and fill it */
5026 u = _PyUnicode_New(i + j);
5027 if (!u)
5028 return NULL;
5029
5030 j = 0;
5031 q = u->str;
5032
5033 for (p = self->str; p < e; p++)
5034 if (*p == '\t') {
5035 if (tabsize > 0) {
5036 i = tabsize - (j % tabsize);
5037 j += i;
5038 while (i--)
5039 *q++ = ' ';
5040 }
5041 }
5042 else {
5043 j++;
5044 *q++ = *p;
5045 if (*p == '\n' || *p == '\r')
5046 j = 0;
5047 }
5048
5049 return (PyObject*) u;
5050}
5051
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005052PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053"S.find(sub [,start [,end]]) -> int\n\
5054\n\
5055Return the lowest index in S where substring sub is found,\n\
5056such that sub is contained within s[start,end]. Optional\n\
5057arguments start and end are interpreted as in slice notation.\n\
5058\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005059Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060
5061static PyObject *
5062unicode_find(PyUnicodeObject *self, PyObject *args)
5063{
5064 PyUnicodeObject *substring;
5065 int start = 0;
5066 int end = INT_MAX;
5067 PyObject *result;
5068
Guido van Rossumb8872e62000-05-09 14:14:27 +00005069 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
5070 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071 return NULL;
5072 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5073 (PyObject *)substring);
5074 if (substring == NULL)
5075 return NULL;
5076
5077 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
5078
5079 Py_DECREF(substring);
5080 return result;
5081}
5082
5083static PyObject *
5084unicode_getitem(PyUnicodeObject *self, int index)
5085{
5086 if (index < 0 || index >= self->length) {
5087 PyErr_SetString(PyExc_IndexError, "string index out of range");
5088 return NULL;
5089 }
5090
5091 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
5092}
5093
5094static long
5095unicode_hash(PyUnicodeObject *self)
5096{
Fredrik Lundhdde61642000-07-10 18:27:47 +00005097 /* Since Unicode objects compare equal to their ASCII string
5098 counterparts, they should use the individual character values
5099 as basis for their hash value. This is needed to assure that
5100 strings and Unicode objects behave in the same way as
5101 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102
Fredrik Lundhdde61642000-07-10 18:27:47 +00005103 register int len;
5104 register Py_UNICODE *p;
5105 register long x;
5106
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107 if (self->hash != -1)
5108 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00005109 len = PyUnicode_GET_SIZE(self);
5110 p = PyUnicode_AS_UNICODE(self);
5111 x = *p << 7;
5112 while (--len >= 0)
5113 x = (1000003*x) ^ *p++;
5114 x ^= PyUnicode_GET_SIZE(self);
5115 if (x == -1)
5116 x = -2;
5117 self->hash = x;
5118 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005119}
5120
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005121PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122"S.index(sub [,start [,end]]) -> int\n\
5123\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005124Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125
5126static PyObject *
5127unicode_index(PyUnicodeObject *self, PyObject *args)
5128{
5129 int result;
5130 PyUnicodeObject *substring;
5131 int start = 0;
5132 int end = INT_MAX;
5133
Guido van Rossumb8872e62000-05-09 14:14:27 +00005134 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
5135 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +00005137
Guido van Rossumd57fd912000-03-10 22:53:23 +00005138 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5139 (PyObject *)substring);
5140 if (substring == NULL)
5141 return NULL;
5142
5143 result = findstring(self, substring, start, end, 1);
5144
5145 Py_DECREF(substring);
5146 if (result < 0) {
5147 PyErr_SetString(PyExc_ValueError, "substring not found");
5148 return NULL;
5149 }
5150 return PyInt_FromLong(result);
5151}
5152
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005153PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005154"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005156Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005157at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158
5159static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005160unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161{
5162 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5163 register const Py_UNICODE *e;
5164 int cased;
5165
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166 /* Shortcut for single character strings */
5167 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005168 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005170 /* Special case for empty strings */
5171 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005172 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005173
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 e = p + PyUnicode_GET_SIZE(self);
5175 cased = 0;
5176 for (; p < e; p++) {
5177 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005178
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005180 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181 else if (!cased && Py_UNICODE_ISLOWER(ch))
5182 cased = 1;
5183 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005184 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185}
5186
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005187PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005188"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005189\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005190Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005191at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192
5193static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005194unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195{
5196 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5197 register const Py_UNICODE *e;
5198 int cased;
5199
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200 /* Shortcut for single character strings */
5201 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005202 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005204 /* Special case for empty strings */
5205 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005206 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005207
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 e = p + PyUnicode_GET_SIZE(self);
5209 cased = 0;
5210 for (; p < e; p++) {
5211 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005212
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005214 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215 else if (!cased && Py_UNICODE_ISUPPER(ch))
5216 cased = 1;
5217 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005218 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219}
5220
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005221PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005222"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005224Return True if S is a titlecased string and there is at least one\n\
5225character in S, i.e. upper- and titlecase characters may only\n\
5226follow uncased characters and lowercase characters only cased ones.\n\
5227Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228
5229static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005230unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231{
5232 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5233 register const Py_UNICODE *e;
5234 int cased, previous_is_cased;
5235
Guido van Rossumd57fd912000-03-10 22:53:23 +00005236 /* Shortcut for single character strings */
5237 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005238 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
5239 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005240
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005241 /* Special case for empty strings */
5242 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005243 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005244
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245 e = p + PyUnicode_GET_SIZE(self);
5246 cased = 0;
5247 previous_is_cased = 0;
5248 for (; p < e; p++) {
5249 register const Py_UNICODE ch = *p;
Tim Petersced69f82003-09-16 20:30:58 +00005250
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
5252 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005253 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254 previous_is_cased = 1;
5255 cased = 1;
5256 }
5257 else if (Py_UNICODE_ISLOWER(ch)) {
5258 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005259 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005260 previous_is_cased = 1;
5261 cased = 1;
5262 }
5263 else
5264 previous_is_cased = 0;
5265 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005266 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267}
5268
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005269PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005270"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005272Return True if all characters in S are whitespace\n\
5273and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274
5275static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005276unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277{
5278 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5279 register const Py_UNICODE *e;
5280
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281 /* Shortcut for single character strings */
5282 if (PyUnicode_GET_SIZE(self) == 1 &&
5283 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005284 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005286 /* Special case for empty strings */
5287 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005288 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005289
Guido van Rossumd57fd912000-03-10 22:53:23 +00005290 e = p + PyUnicode_GET_SIZE(self);
5291 for (; p < e; p++) {
5292 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005293 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005295 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296}
5297
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005298PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005299"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005300\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005301Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005302and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005303
5304static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005305unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005306{
5307 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5308 register const Py_UNICODE *e;
5309
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005310 /* Shortcut for single character strings */
5311 if (PyUnicode_GET_SIZE(self) == 1 &&
5312 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005313 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005314
5315 /* Special case for empty strings */
5316 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005317 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005318
5319 e = p + PyUnicode_GET_SIZE(self);
5320 for (; p < e; p++) {
5321 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005322 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005323 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005324 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005325}
5326
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005327PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005328"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005329\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005330Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005331and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005332
5333static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005334unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005335{
5336 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5337 register const Py_UNICODE *e;
5338
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005339 /* Shortcut for single character strings */
5340 if (PyUnicode_GET_SIZE(self) == 1 &&
5341 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005342 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005343
5344 /* Special case for empty strings */
5345 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005346 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005347
5348 e = p + PyUnicode_GET_SIZE(self);
5349 for (; p < e; p++) {
5350 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005351 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005352 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005353 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00005354}
5355
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005356PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005357"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005359Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005360False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361
5362static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005363unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364{
5365 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5366 register const Py_UNICODE *e;
5367
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 /* Shortcut for single character strings */
5369 if (PyUnicode_GET_SIZE(self) == 1 &&
5370 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005371 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005373 /* Special case for empty strings */
5374 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005375 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005376
Guido van Rossumd57fd912000-03-10 22:53:23 +00005377 e = p + PyUnicode_GET_SIZE(self);
5378 for (; p < e; p++) {
5379 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005380 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005382 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383}
5384
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005385PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005386"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387\n\
Martin v. Löwis6828e182003-10-18 09:55:08 +00005388Return True if all characters in S are digits\n\
5389and there is at least one character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005390
5391static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005392unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393{
5394 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5395 register const Py_UNICODE *e;
5396
Guido van Rossumd57fd912000-03-10 22:53:23 +00005397 /* Shortcut for single character strings */
5398 if (PyUnicode_GET_SIZE(self) == 1 &&
5399 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005400 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005402 /* Special case for empty strings */
5403 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005404 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005405
Guido van Rossumd57fd912000-03-10 22:53:23 +00005406 e = p + PyUnicode_GET_SIZE(self);
5407 for (; p < e; p++) {
5408 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005409 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005410 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005411 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005412}
5413
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005414PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005415"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005416\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005417Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005418False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419
5420static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005421unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422{
5423 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5424 register const Py_UNICODE *e;
5425
Guido van Rossumd57fd912000-03-10 22:53:23 +00005426 /* Shortcut for single character strings */
5427 if (PyUnicode_GET_SIZE(self) == 1 &&
5428 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005429 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005431 /* Special case for empty strings */
5432 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005433 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005434
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 e = p + PyUnicode_GET_SIZE(self);
5436 for (; p < e; p++) {
5437 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005438 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005439 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005440 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441}
5442
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005443PyDoc_STRVAR(iswide__doc__,
5444"S.iswide() -> bool\n\
5445\n\
5446Return True if all characters in S are wide width\n\
5447and there is at least one character in S, False otherwise.");
5448
5449static PyObject*
5450unicode_iswide(PyUnicodeObject *self)
5451{
5452 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5453 register const Py_UNICODE *e;
5454
5455 /* Shortcut for single character strings */
5456 if (PyUnicode_GET_SIZE(self) == 1 &&
5457 Py_UNICODE_ISWIDE(*p))
5458 Py_RETURN_TRUE;
5459
5460 /* Special case for empty strings */
5461 if (PyString_GET_SIZE(self) == 0)
5462 Py_RETURN_FALSE;
5463
5464 e = p + PyUnicode_GET_SIZE(self);
5465 for (; p < e; p++) {
5466 if (!Py_UNICODE_ISWIDE(*p))
5467 Py_RETURN_FALSE;
5468 }
5469 Py_RETURN_TRUE;
5470}
5471
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005472PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473"S.join(sequence) -> unicode\n\
5474\n\
5475Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005476sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005477
5478static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005479unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005480{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005481 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482}
5483
5484static int
5485unicode_length(PyUnicodeObject *self)
5486{
5487 return self->length;
5488}
5489
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005490PyDoc_STRVAR(ljust__doc__,
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00005491"S.ljust(width[, fillchar]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005492\n\
5493Return S left justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005494done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005495
5496static PyObject *
5497unicode_ljust(PyUnicodeObject *self, PyObject *args)
5498{
5499 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005500 Py_UNICODE fillchar = ' ';
5501
5502 if (!PyArg_ParseTuple(args, "i|O&:ljust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 return NULL;
5504
Tim Peters7a29bd52001-09-12 03:03:31 +00005505 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005506 Py_INCREF(self);
5507 return (PyObject*) self;
5508 }
5509
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005510 return (PyObject*) pad(self, 0, width - self->length, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511}
5512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005513PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005514"S.lower() -> unicode\n\
5515\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005516Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517
5518static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005519unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005520{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005521 return fixup(self, fixlower);
5522}
5523
/* Selectors for the strip helpers.  The values double as indices into
   stripformat[] below, so the two must stay in step. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string minus its "|O:" prefix */
#define STRIPNAME(i) (stripformat[i]+3)
5532
5533static const Py_UNICODE *
5534unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5535{
Tim Peters030a5ce2002-04-22 19:00:10 +00005536 size_t i;
5537 for (i = 0; i < n; ++i)
5538 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005539 return s+i;
5540 return NULL;
5541}
5542
5543/* externally visible for str.strip(unicode) */
5544PyObject *
5545_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5546{
5547 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5548 int len = PyUnicode_GET_SIZE(self);
5549 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5550 int seplen = PyUnicode_GET_SIZE(sepobj);
5551 int i, j;
5552
5553 i = 0;
5554 if (striptype != RIGHTSTRIP) {
5555 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5556 i++;
5557 }
5558 }
5559
5560 j = len;
5561 if (striptype != LEFTSTRIP) {
5562 do {
5563 j--;
5564 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5565 j++;
5566 }
5567
5568 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5569 Py_INCREF(self);
5570 return (PyObject*)self;
5571 }
5572 else
5573 return PyUnicode_FromUnicode(s+i, j-i);
5574}
5575
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576
5577static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005578do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005580 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5581 int len = PyUnicode_GET_SIZE(self), i, j;
5582
5583 i = 0;
5584 if (striptype != RIGHTSTRIP) {
5585 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5586 i++;
5587 }
5588 }
5589
5590 j = len;
5591 if (striptype != LEFTSTRIP) {
5592 do {
5593 j--;
5594 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5595 j++;
5596 }
5597
5598 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5599 Py_INCREF(self);
5600 return (PyObject*)self;
5601 }
5602 else
5603 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604}
5605
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005606
5607static PyObject *
5608do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5609{
5610 PyObject *sep = NULL;
5611
5612 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5613 return NULL;
5614
5615 if (sep != NULL && sep != Py_None) {
5616 if (PyUnicode_Check(sep))
5617 return _PyUnicode_XStrip(self, striptype, sep);
5618 else if (PyString_Check(sep)) {
5619 PyObject *res;
5620 sep = PyUnicode_FromObject(sep);
5621 if (sep==NULL)
5622 return NULL;
5623 res = _PyUnicode_XStrip(self, striptype, sep);
5624 Py_DECREF(sep);
5625 return res;
5626 }
5627 else {
5628 PyErr_Format(PyExc_TypeError,
5629 "%s arg must be None, unicode or str",
5630 STRIPNAME(striptype));
5631 return NULL;
5632 }
5633 }
5634
5635 return do_strip(self, striptype);
5636}
5637
5638
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005639PyDoc_STRVAR(strip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005640"S.strip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005641\n\
5642Return a copy of the string S with leading and trailing\n\
5643whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005644If chars is given and not None, remove characters in chars instead.\n\
5645If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005646
5647static PyObject *
5648unicode_strip(PyUnicodeObject *self, PyObject *args)
5649{
5650 if (PyTuple_GET_SIZE(args) == 0)
5651 return do_strip(self, BOTHSTRIP); /* Common case */
5652 else
5653 return do_argstrip(self, BOTHSTRIP, args);
5654}
5655
5656
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005657PyDoc_STRVAR(lstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005658"S.lstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005659\n\
5660Return a copy of the string S with leading whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005661If chars is given and not None, remove characters in chars instead.\n\
5662If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005663
5664static PyObject *
5665unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5666{
5667 if (PyTuple_GET_SIZE(args) == 0)
5668 return do_strip(self, LEFTSTRIP); /* Common case */
5669 else
5670 return do_argstrip(self, LEFTSTRIP, args);
5671}
5672
5673
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005674PyDoc_STRVAR(rstrip__doc__,
Neal Norwitzffe33b72003-04-10 22:35:32 +00005675"S.rstrip([chars]) -> unicode\n\
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005676\n\
5677Return a copy of the string S with trailing whitespace removed.\n\
Neal Norwitzffe33b72003-04-10 22:35:32 +00005678If chars is given and not None, remove characters in chars instead.\n\
5679If chars is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005680
5681static PyObject *
5682unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5683{
5684 if (PyTuple_GET_SIZE(args) == 0)
5685 return do_strip(self, RIGHTSTRIP); /* Common case */
5686 else
5687 return do_argstrip(self, RIGHTSTRIP, args);
5688}
5689
5690
Guido van Rossumd57fd912000-03-10 22:53:23 +00005691static PyObject*
5692unicode_repeat(PyUnicodeObject *str, int len)
5693{
5694 PyUnicodeObject *u;
5695 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005696 int nchars;
5697 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698
5699 if (len < 0)
5700 len = 0;
5701
Tim Peters7a29bd52001-09-12 03:03:31 +00005702 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005703 /* no repeat, return original string */
5704 Py_INCREF(str);
5705 return (PyObject*) str;
5706 }
Tim Peters8f422462000-09-09 06:13:41 +00005707
5708 /* ensure # of chars needed doesn't overflow int and # of bytes
5709 * needed doesn't overflow size_t
5710 */
5711 nchars = len * str->length;
5712 if (len && nchars / len != str->length) {
5713 PyErr_SetString(PyExc_OverflowError,
5714 "repeated string is too long");
5715 return NULL;
5716 }
5717 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5718 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5719 PyErr_SetString(PyExc_OverflowError,
5720 "repeated string is too long");
5721 return NULL;
5722 }
5723 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724 if (!u)
5725 return NULL;
5726
5727 p = u->str;
5728
5729 while (len-- > 0) {
5730 Py_UNICODE_COPY(p, str->str, str->length);
5731 p += str->length;
5732 }
5733
5734 return (PyObject*) u;
5735}
5736
5737PyObject *PyUnicode_Replace(PyObject *obj,
5738 PyObject *subobj,
5739 PyObject *replobj,
5740 int maxcount)
5741{
5742 PyObject *self;
5743 PyObject *str1;
5744 PyObject *str2;
5745 PyObject *result;
5746
5747 self = PyUnicode_FromObject(obj);
5748 if (self == NULL)
5749 return NULL;
5750 str1 = PyUnicode_FromObject(subobj);
5751 if (str1 == NULL) {
5752 Py_DECREF(self);
5753 return NULL;
5754 }
5755 str2 = PyUnicode_FromObject(replobj);
5756 if (str2 == NULL) {
5757 Py_DECREF(self);
5758 Py_DECREF(str1);
5759 return NULL;
5760 }
Tim Petersced69f82003-09-16 20:30:58 +00005761 result = replace((PyUnicodeObject *)self,
5762 (PyUnicodeObject *)str1,
5763 (PyUnicodeObject *)str2,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764 maxcount);
5765 Py_DECREF(self);
5766 Py_DECREF(str1);
5767 Py_DECREF(str2);
5768 return result;
5769}
5770
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005771PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772"S.replace (old, new[, maxsplit]) -> unicode\n\
5773\n\
5774Return a copy of S with all occurrences of substring\n\
5775old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005776given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777
5778static PyObject*
5779unicode_replace(PyUnicodeObject *self, PyObject *args)
5780{
5781 PyUnicodeObject *str1;
5782 PyUnicodeObject *str2;
5783 int maxcount = -1;
5784 PyObject *result;
5785
5786 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5787 return NULL;
5788 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5789 if (str1 == NULL)
5790 return NULL;
5791 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005792 if (str2 == NULL) {
5793 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005794 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005795 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005796
5797 result = replace(self, str1, str2, maxcount);
5798
5799 Py_DECREF(str1);
5800 Py_DECREF(str2);
5801 return result;
5802}
5803
5804static
5805PyObject *unicode_repr(PyObject *unicode)
5806{
5807 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5808 PyUnicode_GET_SIZE(unicode),
5809 1);
5810}
5811
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005812PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813"S.rfind(sub [,start [,end]]) -> int\n\
5814\n\
5815Return the highest index in S where substring sub is found,\n\
5816such that sub is contained within s[start,end]. Optional\n\
5817arguments start and end are interpreted as in slice notation.\n\
5818\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005819Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820
5821static PyObject *
5822unicode_rfind(PyUnicodeObject *self, PyObject *args)
5823{
5824 PyUnicodeObject *substring;
5825 int start = 0;
5826 int end = INT_MAX;
5827 PyObject *result;
5828
Guido van Rossumb8872e62000-05-09 14:14:27 +00005829 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5830 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005831 return NULL;
5832 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5833 (PyObject *)substring);
5834 if (substring == NULL)
5835 return NULL;
5836
5837 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5838
5839 Py_DECREF(substring);
5840 return result;
5841}
5842
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005843PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005844"S.rindex(sub [,start [,end]]) -> int\n\
5845\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005846Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847
5848static PyObject *
5849unicode_rindex(PyUnicodeObject *self, PyObject *args)
5850{
5851 int result;
5852 PyUnicodeObject *substring;
5853 int start = 0;
5854 int end = INT_MAX;
5855
Guido van Rossumb8872e62000-05-09 14:14:27 +00005856 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5857 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005858 return NULL;
5859 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5860 (PyObject *)substring);
5861 if (substring == NULL)
5862 return NULL;
5863
5864 result = findstring(self, substring, start, end, -1);
5865
5866 Py_DECREF(substring);
5867 if (result < 0) {
5868 PyErr_SetString(PyExc_ValueError, "substring not found");
5869 return NULL;
5870 }
5871 return PyInt_FromLong(result);
5872}
5873
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005874PyDoc_STRVAR(rjust__doc__,
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005875"S.rjust(width[, fillchar]) -> unicode\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005876\n\
5877Return S right justified in a Unicode string of length width. Padding is\n\
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005878done using the specified fill character (default is a space).");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005879
5880static PyObject *
5881unicode_rjust(PyUnicodeObject *self, PyObject *args)
5882{
5883 int width;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005884 Py_UNICODE fillchar = ' ';
5885
5886 if (!PyArg_ParseTuple(args, "i|O&:rjust", &width, convert_uc, &fillchar))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005887 return NULL;
5888
Tim Peters7a29bd52001-09-12 03:03:31 +00005889 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005890 Py_INCREF(self);
5891 return (PyObject*) self;
5892 }
5893
Raymond Hettinger4f8f9762003-11-26 08:21:35 +00005894 return (PyObject*) pad(self, width - self->length, 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005895}
5896
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897static PyObject*
5898unicode_slice(PyUnicodeObject *self, int start, int end)
5899{
5900 /* standard clamping */
5901 if (start < 0)
5902 start = 0;
5903 if (end < 0)
5904 end = 0;
5905 if (end > self->length)
5906 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005907 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 /* full slice, return original string */
5909 Py_INCREF(self);
5910 return (PyObject*) self;
5911 }
5912 if (start > end)
5913 start = end;
5914 /* copy slice */
5915 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5916 end - start);
5917}
5918
5919PyObject *PyUnicode_Split(PyObject *s,
5920 PyObject *sep,
5921 int maxsplit)
5922{
5923 PyObject *result;
Tim Petersced69f82003-09-16 20:30:58 +00005924
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925 s = PyUnicode_FromObject(s);
5926 if (s == NULL)
5927 return NULL;
5928 if (sep != NULL) {
5929 sep = PyUnicode_FromObject(sep);
5930 if (sep == NULL) {
5931 Py_DECREF(s);
5932 return NULL;
5933 }
5934 }
5935
5936 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5937
5938 Py_DECREF(s);
5939 Py_XDECREF(sep);
5940 return result;
5941}
5942
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005943PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005944"S.split([sep [,maxsplit]]) -> list of strings\n\
5945\n\
5946Return a list of the words in S, using sep as the\n\
5947delimiter string. If maxsplit is given, at most maxsplit\n\
5948splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005949is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005950
5951static PyObject*
5952unicode_split(PyUnicodeObject *self, PyObject *args)
5953{
5954 PyObject *substring = Py_None;
5955 int maxcount = -1;
5956
5957 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5958 return NULL;
5959
5960 if (substring == Py_None)
5961 return split(self, NULL, maxcount);
5962 else if (PyUnicode_Check(substring))
5963 return split(self, (PyUnicodeObject *)substring, maxcount);
5964 else
5965 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5966}
5967
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +00005968PyObject *PyUnicode_RSplit(PyObject *s,
5969 PyObject *sep,
5970 int maxsplit)
5971{
5972 PyObject *result;
5973
5974 s = PyUnicode_FromObject(s);
5975 if (s == NULL)
5976 return NULL;
5977 if (sep != NULL) {
5978 sep = PyUnicode_FromObject(sep);
5979 if (sep == NULL) {
5980 Py_DECREF(s);
5981 return NULL;
5982 }
5983 }
5984
5985 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5986
5987 Py_DECREF(s);
5988 Py_XDECREF(sep);
5989 return result;
5990}
5991
5992PyDoc_STRVAR(rsplit__doc__,
5993"S.rsplit([sep [,maxsplit]]) -> list of strings\n\
5994\n\
5995Return a list of the words in S, using sep as the\n\
5996delimiter string, starting at the end of the string and\n\
5997working to the front. If maxsplit is given, at most maxsplit\n\
5998splits are done. If sep is not specified, any whitespace string\n\
5999is a separator.");
6000
6001static PyObject*
6002unicode_rsplit(PyUnicodeObject *self, PyObject *args)
6003{
6004 PyObject *substring = Py_None;
6005 int maxcount = -1;
6006
6007 if (!PyArg_ParseTuple(args, "|Oi:rsplit", &substring, &maxcount))
6008 return NULL;
6009
6010 if (substring == Py_None)
6011 return rsplit(self, NULL, maxcount);
6012 else if (PyUnicode_Check(substring))
6013 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
6014 else
6015 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
6016}
6017
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006018PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00006019"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006020\n\
6021Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00006022Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006023is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024
6025static PyObject*
6026unicode_splitlines(PyUnicodeObject *self, PyObject *args)
6027{
Guido van Rossum86662912000-04-11 15:38:46 +00006028 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006029
Guido van Rossum86662912000-04-11 15:38:46 +00006030 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006031 return NULL;
6032
Guido van Rossum86662912000-04-11 15:38:46 +00006033 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034}
6035
6036static
6037PyObject *unicode_str(PyUnicodeObject *self)
6038{
Fred Drakee4315f52000-05-09 19:53:39 +00006039 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006040}
6041
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006042PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043"S.swapcase() -> unicode\n\
6044\n\
6045Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006046and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047
6048static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006049unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006050{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006051 return fixup(self, fixswapcase);
6052}
6053
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006054PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055"S.translate(table) -> unicode\n\
6056\n\
6057Return a copy of the string S, where all characters have been mapped\n\
6058through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00006059Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
6060Unmapped characters are left untouched. Characters mapped to None\n\
6061are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006062
6063static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006064unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006065{
Tim Petersced69f82003-09-16 20:30:58 +00006066 return PyUnicode_TranslateCharmap(self->str,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006067 self->length,
Tim Petersced69f82003-09-16 20:30:58 +00006068 table,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 "ignore");
6070}
6071
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006072PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006073"S.upper() -> unicode\n\
6074\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006075Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006076
6077static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00006078unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006079{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 return fixup(self, fixupper);
6081}
6082
Hye-Shik Chang974ed7c2004-06-02 16:49:17 +00006083PyDoc_STRVAR(width__doc__,
6084"S.width() -> unicode\n\
6085\n\
6086Return a fixed-width representation length of S.");
6087
6088static PyObject*
6089unicode_width(PyObject *self)
6090{
6091 int width = PyUnicode_GetWidth(self);
6092 if (width == -1)
6093 return NULL;
6094 else
6095 return PyInt_FromLong((long)width);
6096}
6097
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006098PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006099"S.zfill(width) -> unicode\n\
6100\n\
6101Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006102of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006103
6104static PyObject *
6105unicode_zfill(PyUnicodeObject *self, PyObject *args)
6106{
6107 int fill;
6108 PyUnicodeObject *u;
6109
6110 int width;
6111 if (!PyArg_ParseTuple(args, "i:zfill", &width))
6112 return NULL;
6113
6114 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00006115 if (PyUnicode_CheckExact(self)) {
6116 Py_INCREF(self);
6117 return (PyObject*) self;
6118 }
6119 else
6120 return PyUnicode_FromUnicode(
6121 PyUnicode_AS_UNICODE(self),
6122 PyUnicode_GET_SIZE(self)
6123 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 }
6125
6126 fill = width - self->length;
6127
6128 u = pad(self, fill, 0, '0');
6129
Walter Dörwald068325e2002-04-15 13:36:47 +00006130 if (u == NULL)
6131 return NULL;
6132
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133 if (u->str[fill] == '+' || u->str[fill] == '-') {
6134 /* move sign to beginning of string */
6135 u->str[0] = u->str[fill];
6136 u->str[fill] = '0';
6137 }
6138
6139 return (PyObject*) u;
6140}
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141
#if 0
/* Compiled out by the enclosing #if 0.  When enabled, returns the current
   value of unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
6149
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006150PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006151"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006153Return True if S starts with the specified prefix, False otherwise.\n\
6154With optional start, test S beginning at that position.\n\
6155With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156
6157static PyObject *
6158unicode_startswith(PyUnicodeObject *self,
6159 PyObject *args)
6160{
6161 PyUnicodeObject *substring;
6162 int start = 0;
6163 int end = INT_MAX;
6164 PyObject *result;
6165
Guido van Rossumb8872e62000-05-09 14:14:27 +00006166 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
6167 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006168 return NULL;
6169 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6170 (PyObject *)substring);
6171 if (substring == NULL)
6172 return NULL;
6173
Guido van Rossum77f6a652002-04-03 22:41:51 +00006174 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175
6176 Py_DECREF(substring);
6177 return result;
6178}
6179
6180
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006181PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00006182"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00006183\n\
Guido van Rossuma7132182003-04-09 19:32:45 +00006184Return True if S ends with the specified suffix, False otherwise.\n\
6185With optional start, test S beginning at that position.\n\
6186With optional end, stop comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006187
6188static PyObject *
6189unicode_endswith(PyUnicodeObject *self,
6190 PyObject *args)
6191{
6192 PyUnicodeObject *substring;
6193 int start = 0;
6194 int end = INT_MAX;
6195 PyObject *result;
6196
Guido van Rossumb8872e62000-05-09 14:14:27 +00006197 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
6198 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 return NULL;
6200 substring = (PyUnicodeObject *)PyUnicode_FromObject(
6201 (PyObject *)substring);
6202 if (substring == NULL)
6203 return NULL;
6204
Guido van Rossum77f6a652002-04-03 22:41:51 +00006205 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006206
6207 Py_DECREF(substring);
6208 return result;
6209}
6210
6211
Guido van Rossum5d9113d2003-01-29 17:58:45 +00006212
6213static PyObject *
6214unicode_getnewargs(PyUnicodeObject *v)
6215{
6216 return Py_BuildValue("(u#)", v->str, v->length);
6217}
6218
6219
/* Method table for the unicode type.  Each entry gives the Python-visible
   name, the C implementation, the calling convention flag (METH_VARARGS,
   METH_O or METH_NOARGS) and, where present, the docstring.  The table is
   terminated by the {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"iswide", (PyCFunction) unicode_iswide, METH_NOARGS, iswide__doc__},
    {"width", (PyCFunction) unicode_width, METH_NOARGS, width__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    /* registered without a docstring */
    {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
6277
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006278static PyObject *
6279unicode_mod(PyObject *v, PyObject *w)
6280{
6281 if (!PyUnicode_Check(v)) {
6282 Py_INCREF(Py_NotImplemented);
6283 return Py_NotImplemented;
6284 }
6285 return PyUnicode_Format(v, w);
6286}
6287
/* Number protocol table.  Only nb_remainder is populated (with
   unicode_mod); the members after it have no initializer and are therefore
   zero-initialized by the C rules for partial aggregate initializers. */
static PyNumberMethods unicode_as_number = {
    0,                          /*nb_add*/
    0,                          /*nb_subtract*/
    0,                          /*nb_multiply*/
    0,                          /*nb_divide*/
    unicode_mod,                /*nb_remainder*/
};
6295
/* Sequence protocol table.  Item and slice assignment slots are 0; the
   remaining slots are filled with the functions named below, each cast to
   the slot's function-pointer type. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length,           /* sq_length */
    (binaryfunc) PyUnicode_Concat,      /* sq_concat */
    (intargfunc) unicode_repeat,        /* sq_repeat */
    (intargfunc) unicode_getitem,       /* sq_item */
    (intintargfunc) unicode_slice,      /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains,     /*sq_contains*/
};
6306
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006307static PyObject*
6308unicode_subscript(PyUnicodeObject* self, PyObject* item)
6309{
6310 if (PyInt_Check(item)) {
6311 long i = PyInt_AS_LONG(item);
6312 if (i < 0)
6313 i += PyString_GET_SIZE(self);
6314 return unicode_getitem(self, i);
6315 } else if (PyLong_Check(item)) {
6316 long i = PyLong_AsLong(item);
6317 if (i == -1 && PyErr_Occurred())
6318 return NULL;
6319 if (i < 0)
6320 i += PyString_GET_SIZE(self);
6321 return unicode_getitem(self, i);
6322 } else if (PySlice_Check(item)) {
6323 int start, stop, step, slicelength, cur, i;
6324 Py_UNICODE* source_buf;
6325 Py_UNICODE* result_buf;
6326 PyObject* result;
6327
6328 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
6329 &start, &stop, &step, &slicelength) < 0) {
6330 return NULL;
6331 }
6332
6333 if (slicelength <= 0) {
6334 return PyUnicode_FromUnicode(NULL, 0);
6335 } else {
6336 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
6337 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
6338
6339 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
6340 result_buf[i] = source_buf[cur];
6341 }
Tim Petersced69f82003-09-16 20:30:58 +00006342
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006343 result = PyUnicode_FromUnicode(result_buf, slicelength);
6344 PyMem_FREE(result_buf);
6345 return result;
6346 }
6347 } else {
6348 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
6349 return NULL;
6350 }
6351}
6352
/* Mapping protocol table: length and subscript lookup are provided,
   subscript assignment is not (slot is 0). */
static PyMappingMethods unicode_as_mapping = {
    (inquiry)unicode_length,            /* mp_length */
    (binaryfunc)unicode_subscript,      /* mp_subscript */
    (objobjargproc)0,                   /* mp_ass_subscript */
};
6358
Guido van Rossumd57fd912000-03-10 22:53:23 +00006359static int
6360unicode_buffer_getreadbuf(PyUnicodeObject *self,
6361 int index,
6362 const void **ptr)
6363{
6364 if (index != 0) {
6365 PyErr_SetString(PyExc_SystemError,
6366 "accessing non-existent unicode segment");
6367 return -1;
6368 }
6369 *ptr = (void *) self->str;
6370 return PyUnicode_GET_DATA_SIZE(self);
6371}
6372
6373static int
6374unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
6375 const void **ptr)
6376{
6377 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00006378 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006379 return -1;
6380}
6381
6382static int
6383unicode_buffer_getsegcount(PyUnicodeObject *self,
6384 int *lenp)
6385{
6386 if (lenp)
6387 *lenp = PyUnicode_GET_DATA_SIZE(self);
6388 return 1;
6389}
6390
6391static int
6392unicode_buffer_getcharbuf(PyUnicodeObject *self,
6393 int index,
6394 const void **ptr)
6395{
6396 PyObject *str;
Tim Petersced69f82003-09-16 20:30:58 +00006397
Guido van Rossumd57fd912000-03-10 22:53:23 +00006398 if (index != 0) {
6399 PyErr_SetString(PyExc_SystemError,
6400 "accessing non-existent unicode segment");
6401 return -1;
6402 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006403 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006404 if (str == NULL)
6405 return -1;
6406 *ptr = (void *) PyString_AS_STRING(str);
6407 return PyString_GET_SIZE(str);
6408}
6409
6410/* Helpers for PyUnicode_Format() */
6411
6412static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00006413getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006414{
6415 int argidx = *p_argidx;
6416 if (argidx < arglen) {
6417 (*p_argidx)++;
6418 if (arglen < 0)
6419 return args;
6420 else
6421 return PyTuple_GetItem(args, argidx);
6422 }
6423 PyErr_SetString(PyExc_TypeError,
6424 "not enough arguments for format string");
6425 return NULL;
6426}
6427
/* Single-bit flags, one bit each so they can be combined in one int and
   tested with '&' (the numeric formatters test flags&F_ALT to decide
   whether to emit '#').
   NOTE(review): the names suggest the '-', '+', ' ', '#' and '0' flags of a
   % conversion specifier -- confirm against PyUnicode_Format(). */
#define F_LJUST (1<<0)
#define F_SIGN  (1<<1)
#define F_BLANK (1<<2)
#define F_ALT   (1<<3)
#define F_ZERO  (1<<4)
6433
6434static
Guido van Rossumd57fd912000-03-10 22:53:23 +00006435int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006436{
6437 register int i;
6438 int len;
6439 va_list va;
6440 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442
6443 /* First, format the string as char array, then expand to Py_UNICODE
6444 array. */
6445 charbuffer = (char *)buffer;
6446 len = vsprintf(charbuffer, format, va);
6447 for (i = len - 1; i >= 0; i--)
6448 buffer[i] = (Py_UNICODE) charbuffer[i];
6449
6450 va_end(va);
6451 return len;
6452}
6453
Guido van Rossum078151d2002-08-11 04:24:12 +00006454/* XXX To save some code duplication, formatfloat/long/int could have been
6455 shared with stringobject.c, converting from 8-bit to Unicode after the
6456 formatting is done. */
6457
Guido van Rossumd57fd912000-03-10 22:53:23 +00006458static int
6459formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006460 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006461 int flags,
6462 int prec,
6463 int type,
6464 PyObject *v)
6465{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006466 /* fmt = '%#.' + `prec` + `type`
6467 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006468 char fmt[20];
6469 double x;
Tim Petersced69f82003-09-16 20:30:58 +00006470
Guido van Rossumd57fd912000-03-10 22:53:23 +00006471 x = PyFloat_AsDouble(v);
6472 if (x == -1.0 && PyErr_Occurred())
6473 return -1;
6474 if (prec < 0)
6475 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6477 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006478 /* Worst case length calc to ensure no buffer overrun:
6479
6480 'g' formats:
6481 fmt = %#.<prec>g
6482 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6483 for any double rep.)
6484 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6485
6486 'f' formats:
6487 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6488 len = 1 + 50 + 1 + prec = 52 + prec
6489
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006490 If prec=0 the effective precision is 1 (the leading digit is
Tim Petersced69f82003-09-16 20:30:58 +00006491 always given), therefore increase the length by one.
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006492
6493 */
6494 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6495 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006496 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006497 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006498 return -1;
6499 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006500 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6501 (flags&F_ALT) ? "#" : "",
6502 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006503 return usprintf(buf, fmt, x);
6504}
6505
Tim Peters38fd5b62000-09-21 05:43:11 +00006506static PyObject*
6507formatlong(PyObject *val, int flags, int prec, int type)
6508{
6509 char *buf;
6510 int i, len;
6511 PyObject *str; /* temporary string object. */
6512 PyUnicodeObject *result;
6513
6514 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6515 if (!str)
6516 return NULL;
6517 result = _PyUnicode_New(len);
6518 for (i = 0; i < len; i++)
6519 result->str[i] = buf[i];
6520 result->str[len] = 0;
6521 Py_DECREF(str);
6522 return (PyObject*)result;
6523}
6524
Guido van Rossumd57fd912000-03-10 22:53:23 +00006525static int
6526formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006527 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006528 int flags,
6529 int prec,
6530 int type,
6531 PyObject *v)
6532{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006533 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006534 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6535 * + 1 + 1
6536 * = 24
6537 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006538 char fmt[64]; /* plenty big enough! */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006539 char *sign;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006540 long x;
6541
6542 x = PyInt_AsLong(v);
6543 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006544 return -1;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006545 if (x < 0 && type == 'u') {
6546 type = 'd';
Guido van Rossum078151d2002-08-11 04:24:12 +00006547 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006548 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
6549 sign = "-";
6550 else
6551 sign = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006553 prec = 1;
6554
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006555 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
6556 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006557 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006558 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006559 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006560 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006561 return -1;
6562 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006563
6564 if ((flags & F_ALT) &&
6565 (type == 'x' || type == 'X')) {
Tim Petersced69f82003-09-16 20:30:58 +00006566 /* When converting under %#x or %#X, there are a number
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006567 * of issues that cause pain:
6568 * - when 0 is being converted, the C standard leaves off
6569 * the '0x' or '0X', which is inconsistent with other
6570 * %#x/%#X conversions and inconsistent with Python's
6571 * hex() function
6572 * - there are platforms that violate the standard and
6573 * convert 0 with the '0x' or '0X'
6574 * (Metrowerks, Compaq Tru64)
6575 * - there are platforms that give '0x' when converting
Tim Petersced69f82003-09-16 20:30:58 +00006576 * under %#X, but convert 0 in accordance with the
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006577 * standard (OS/2 EMX)
Tim Petersced69f82003-09-16 20:30:58 +00006578 *
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006579 * We can achieve the desired consistency by inserting our
6580 * own '0x' or '0X' prefix, and substituting %x/%X in place
6581 * of %#x/%#X.
6582 *
6583 * Note that this is the same approach as used in
6584 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006585 */
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006586 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
6587 sign, type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006588 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006589 else {
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006590 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
6591 sign, (flags&F_ALT) ? "#" : "",
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006592 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006593 }
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006594 if (sign[0])
6595 return usprintf(buf, fmt, -x);
6596 else
6597 return usprintf(buf, fmt, x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006598}
6599
6600static int
6601formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006602 size_t buflen,
6603 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006605 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006606 if (PyUnicode_Check(v)) {
6607 if (PyUnicode_GET_SIZE(v) != 1)
6608 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006609 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006610 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006611
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006612 else if (PyString_Check(v)) {
Tim Petersced69f82003-09-16 20:30:58 +00006613 if (PyString_GET_SIZE(v) != 1)
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006614 goto onError;
6615 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6616 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006617
6618 else {
6619 /* Integer input truncated to a character */
6620 long x;
6621 x = PyInt_AsLong(v);
6622 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006623 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006624#ifdef Py_UNICODE_WIDE
6625 if (x < 0 || x > 0x10ffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006626 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006627 "%c arg not in range(0x110000) "
6628 "(wide Python build)");
6629 return -1;
6630 }
6631#else
6632 if (x < 0 || x > 0xffff) {
Walter Dörwald44f527f2003-04-02 16:37:24 +00006633 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006634 "%c arg not in range(0x10000) "
6635 "(narrow Python build)");
6636 return -1;
6637 }
6638#endif
6639 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640 }
6641 buf[1] = '\0';
6642 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006643
6644 onError:
6645 PyErr_SetString(PyExc_TypeError,
6646 "%c requires int or char");
6647 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006648}
6649
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006650/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
6651
6652 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
6653 chars are formatted. XXX This is a magic number. Each formatting
6654 routine does bounds checking to ensure no overflow, but a better
6655 solution may be to malloc a buffer of appropriate size for each
6656 format. For now, the current solution is sufficient.
6657*/
6658#define FORMATBUFLEN (size_t)120
6659
Guido van Rossumd57fd912000-03-10 22:53:23 +00006660PyObject *PyUnicode_Format(PyObject *format,
6661 PyObject *args)
6662{
6663 Py_UNICODE *fmt, *res;
6664 int fmtcnt, rescnt, reslen, arglen, argidx;
6665 int args_owned = 0;
6666 PyUnicodeObject *result = NULL;
6667 PyObject *dict = NULL;
6668 PyObject *uformat;
Tim Petersced69f82003-09-16 20:30:58 +00006669
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 if (format == NULL || args == NULL) {
6671 PyErr_BadInternalCall();
6672 return NULL;
6673 }
6674 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006675 if (uformat == NULL)
6676 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677 fmt = PyUnicode_AS_UNICODE(uformat);
6678 fmtcnt = PyUnicode_GET_SIZE(uformat);
6679
6680 reslen = rescnt = fmtcnt + 100;
6681 result = _PyUnicode_New(reslen);
6682 if (result == NULL)
6683 goto onError;
6684 res = PyUnicode_AS_UNICODE(result);
6685
6686 if (PyTuple_Check(args)) {
6687 arglen = PyTuple_Size(args);
6688 argidx = 0;
6689 }
6690 else {
6691 arglen = -1;
6692 argidx = -2;
6693 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006694 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6695 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696 dict = args;
6697
6698 while (--fmtcnt >= 0) {
6699 if (*fmt != '%') {
6700 if (--rescnt < 0) {
6701 rescnt = fmtcnt + 100;
6702 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006703 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006704 return NULL;
6705 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6706 --rescnt;
6707 }
6708 *res++ = *fmt++;
6709 }
6710 else {
6711 /* Got a format specifier */
6712 int flags = 0;
6713 int width = -1;
6714 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715 Py_UNICODE c = '\0';
6716 Py_UNICODE fill;
6717 PyObject *v = NULL;
6718 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006719 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720 Py_UNICODE sign;
6721 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006722 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723
6724 fmt++;
6725 if (*fmt == '(') {
6726 Py_UNICODE *keystart;
6727 int keylen;
6728 PyObject *key;
6729 int pcount = 1;
6730
6731 if (dict == NULL) {
6732 PyErr_SetString(PyExc_TypeError,
Tim Petersced69f82003-09-16 20:30:58 +00006733 "format requires a mapping");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 goto onError;
6735 }
6736 ++fmt;
6737 --fmtcnt;
6738 keystart = fmt;
6739 /* Skip over balanced parentheses */
6740 while (pcount > 0 && --fmtcnt >= 0) {
6741 if (*fmt == ')')
6742 --pcount;
6743 else if (*fmt == '(')
6744 ++pcount;
6745 fmt++;
6746 }
6747 keylen = fmt - keystart - 1;
6748 if (fmtcnt < 0 || pcount > 0) {
6749 PyErr_SetString(PyExc_ValueError,
6750 "incomplete format key");
6751 goto onError;
6752 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006753#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006754 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006755 then looked up since Python uses strings to hold
6756 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006757 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006758 key = PyUnicode_EncodeUTF8(keystart,
6759 keylen,
6760 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006761#else
6762 key = PyUnicode_FromUnicode(keystart, keylen);
6763#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006764 if (key == NULL)
6765 goto onError;
6766 if (args_owned) {
6767 Py_DECREF(args);
6768 args_owned = 0;
6769 }
6770 args = PyObject_GetItem(dict, key);
6771 Py_DECREF(key);
6772 if (args == NULL) {
6773 goto onError;
6774 }
6775 args_owned = 1;
6776 arglen = -1;
6777 argidx = -2;
6778 }
6779 while (--fmtcnt >= 0) {
6780 switch (c = *fmt++) {
6781 case '-': flags |= F_LJUST; continue;
6782 case '+': flags |= F_SIGN; continue;
6783 case ' ': flags |= F_BLANK; continue;
6784 case '#': flags |= F_ALT; continue;
6785 case '0': flags |= F_ZERO; continue;
6786 }
6787 break;
6788 }
6789 if (c == '*') {
6790 v = getnextarg(args, arglen, &argidx);
6791 if (v == NULL)
6792 goto onError;
6793 if (!PyInt_Check(v)) {
6794 PyErr_SetString(PyExc_TypeError,
6795 "* wants int");
6796 goto onError;
6797 }
6798 width = PyInt_AsLong(v);
6799 if (width < 0) {
6800 flags |= F_LJUST;
6801 width = -width;
6802 }
6803 if (--fmtcnt >= 0)
6804 c = *fmt++;
6805 }
6806 else if (c >= '0' && c <= '9') {
6807 width = c - '0';
6808 while (--fmtcnt >= 0) {
6809 c = *fmt++;
6810 if (c < '0' || c > '9')
6811 break;
6812 if ((width*10) / 10 != width) {
6813 PyErr_SetString(PyExc_ValueError,
6814 "width too big");
6815 goto onError;
6816 }
6817 width = width*10 + (c - '0');
6818 }
6819 }
6820 if (c == '.') {
6821 prec = 0;
6822 if (--fmtcnt >= 0)
6823 c = *fmt++;
6824 if (c == '*') {
6825 v = getnextarg(args, arglen, &argidx);
6826 if (v == NULL)
6827 goto onError;
6828 if (!PyInt_Check(v)) {
6829 PyErr_SetString(PyExc_TypeError,
6830 "* wants int");
6831 goto onError;
6832 }
6833 prec = PyInt_AsLong(v);
6834 if (prec < 0)
6835 prec = 0;
6836 if (--fmtcnt >= 0)
6837 c = *fmt++;
6838 }
6839 else if (c >= '0' && c <= '9') {
6840 prec = c - '0';
6841 while (--fmtcnt >= 0) {
6842 c = Py_CHARMASK(*fmt++);
6843 if (c < '0' || c > '9')
6844 break;
6845 if ((prec*10) / 10 != prec) {
6846 PyErr_SetString(PyExc_ValueError,
6847 "prec too big");
6848 goto onError;
6849 }
6850 prec = prec*10 + (c - '0');
6851 }
6852 }
6853 } /* prec */
6854 if (fmtcnt >= 0) {
6855 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006856 if (--fmtcnt >= 0)
6857 c = *fmt++;
6858 }
6859 }
6860 if (fmtcnt < 0) {
6861 PyErr_SetString(PyExc_ValueError,
6862 "incomplete format");
6863 goto onError;
6864 }
6865 if (c != '%') {
6866 v = getnextarg(args, arglen, &argidx);
6867 if (v == NULL)
6868 goto onError;
6869 }
6870 sign = 0;
6871 fill = ' ';
6872 switch (c) {
6873
6874 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006875 pbuf = formatbuf;
6876 /* presume that buffer length is at least 1 */
6877 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006878 len = 1;
6879 break;
6880
6881 case 's':
6882 case 'r':
6883 if (PyUnicode_Check(v) && c == 's') {
6884 temp = v;
6885 Py_INCREF(temp);
6886 }
6887 else {
6888 PyObject *unicode;
6889 if (c == 's')
6890 temp = PyObject_Str(v);
6891 else
6892 temp = PyObject_Repr(v);
6893 if (temp == NULL)
6894 goto onError;
6895 if (!PyString_Check(temp)) {
6896 /* XXX Note: this should never happen, since
6897 PyObject_Repr() and PyObject_Str() assure
6898 this */
6899 Py_DECREF(temp);
6900 PyErr_SetString(PyExc_TypeError,
6901 "%s argument has non-string str()");
6902 goto onError;
6903 }
Fred Drakee4315f52000-05-09 19:53:39 +00006904 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006905 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006906 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006907 "strict");
6908 Py_DECREF(temp);
6909 temp = unicode;
6910 if (temp == NULL)
6911 goto onError;
6912 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006913 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006914 len = PyUnicode_GET_SIZE(temp);
6915 if (prec >= 0 && len > prec)
6916 len = prec;
6917 break;
6918
6919 case 'i':
6920 case 'd':
6921 case 'u':
6922 case 'o':
6923 case 'x':
6924 case 'X':
6925 if (c == 'i')
6926 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006927 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006928 temp = formatlong(v, flags, prec, c);
6929 if (!temp)
6930 goto onError;
6931 pbuf = PyUnicode_AS_UNICODE(temp);
6932 len = PyUnicode_GET_SIZE(temp);
Tim Peters38fd5b62000-09-21 05:43:11 +00006933 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006934 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006935 else {
6936 pbuf = formatbuf;
6937 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6938 flags, prec, c, v);
6939 if (len < 0)
6940 goto onError;
Guido van Rossum6c9e1302003-11-29 23:52:13 +00006941 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006942 }
6943 if (flags & F_ZERO)
6944 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945 break;
6946
6947 case 'e':
6948 case 'E':
6949 case 'f':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006950 case 'F':
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 case 'g':
6952 case 'G':
Raymond Hettinger9bfe5332003-08-27 04:55:52 +00006953 if (c == 'F')
6954 c = 'f';
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006955 pbuf = formatbuf;
6956 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6957 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006958 if (len < 0)
6959 goto onError;
6960 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006961 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006962 fill = '0';
6963 break;
6964
6965 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006966 pbuf = formatbuf;
6967 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968 if (len < 0)
6969 goto onError;
6970 break;
6971
6972 default:
6973 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006974 "unsupported format character '%c' (0x%x) "
6975 "at index %i",
Tim Petersced69f82003-09-16 20:30:58 +00006976 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006977 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006978 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006979 goto onError;
6980 }
6981 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006982 if (*pbuf == '-' || *pbuf == '+') {
6983 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006984 len--;
6985 }
6986 else if (flags & F_SIGN)
6987 sign = '+';
6988 else if (flags & F_BLANK)
6989 sign = ' ';
6990 else
6991 sign = 0;
6992 }
6993 if (width < len)
6994 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006995 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996 reslen -= rescnt;
6997 rescnt = width + fmtcnt + 100;
6998 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006999 if (reslen < 0) {
7000 Py_DECREF(result);
7001 return PyErr_NoMemory();
7002 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007003 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007004 return NULL;
7005 res = PyUnicode_AS_UNICODE(result)
7006 + reslen - rescnt;
7007 }
7008 if (sign) {
7009 if (fill != ' ')
7010 *res++ = sign;
7011 rescnt--;
7012 if (width > len)
7013 width--;
7014 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007015 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
7016 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007017 assert(pbuf[1] == c);
7018 if (fill != ' ') {
7019 *res++ = *pbuf++;
7020 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00007021 }
Tim Petersfff53252001-04-12 18:38:48 +00007022 rescnt -= 2;
7023 width -= 2;
7024 if (width < 0)
7025 width = 0;
7026 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00007027 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007028 if (width > len && !(flags & F_LJUST)) {
7029 do {
7030 --rescnt;
7031 *res++ = fill;
7032 } while (--width > len);
7033 }
Tim Peters38fd5b62000-09-21 05:43:11 +00007034 if (fill == ' ') {
7035 if (sign)
7036 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00007037 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00007038 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00007039 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00007040 *res++ = *pbuf++;
7041 *res++ = *pbuf++;
7042 }
7043 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007044 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007045 res += len;
7046 rescnt -= len;
7047 while (--width >= len) {
7048 --rescnt;
7049 *res++ = ' ';
7050 }
7051 if (dict && (argidx < arglen) && c != '%') {
7052 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007053 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007054 goto onError;
7055 }
7056 Py_XDECREF(temp);
7057 } /* '%' */
7058 } /* until end */
7059 if (argidx < arglen && !dict) {
7060 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00007061 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007062 goto onError;
7063 }
7064
7065 if (args_owned) {
7066 Py_DECREF(args);
7067 }
7068 Py_DECREF(uformat);
Jeremy Hyltondeb2dc62003-09-16 03:41:45 +00007069 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007070 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071 return (PyObject *)result;
7072
7073 onError:
7074 Py_XDECREF(result);
7075 Py_DECREF(uformat);
7076 if (args_owned) {
7077 Py_DECREF(args);
7078 }
7079 return NULL;
7080}
7081
7082static PyBufferProcs unicode_as_buffer = {
7083 (getreadbufferproc) unicode_buffer_getreadbuf,
7084 (getwritebufferproc) unicode_buffer_getwritebuf,
7085 (getsegcountproc) unicode_buffer_getsegcount,
7086 (getcharbufferproc) unicode_buffer_getcharbuf,
7087};
7088
Jeremy Hylton938ace62002-07-17 16:30:39 +00007089static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00007090unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
7091
Tim Peters6d6c1a32001-08-02 04:15:00 +00007092static PyObject *
7093unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7094{
7095 PyObject *x = NULL;
7096 static char *kwlist[] = {"string", "encoding", "errors", 0};
7097 char *encoding = NULL;
7098 char *errors = NULL;
7099
Guido van Rossume023fe02001-08-30 03:12:59 +00007100 if (type != &PyUnicode_Type)
7101 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00007102 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
7103 kwlist, &x, &encoding, &errors))
7104 return NULL;
7105 if (x == NULL)
7106 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00007107 if (encoding == NULL && errors == NULL)
7108 return PyObject_Unicode(x);
7109 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00007110 return PyUnicode_FromEncodedObject(x, encoding, errors);
7111}
7112
Guido van Rossume023fe02001-08-30 03:12:59 +00007113static PyObject *
7114unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
7115{
Tim Petersaf90b3e2001-09-12 05:18:58 +00007116 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007117 int n;
7118
7119 assert(PyType_IsSubtype(type, &PyUnicode_Type));
7120 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
7121 if (tmp == NULL)
7122 return NULL;
7123 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00007124 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007125 if (pnew == NULL) {
7126 Py_DECREF(tmp);
Guido van Rossume023fe02001-08-30 03:12:59 +00007127 return NULL;
Raymond Hettingerf4667932003-06-28 20:04:25 +00007128 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007129 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
7130 if (pnew->str == NULL) {
7131 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007132 PyObject_Del(pnew);
Raymond Hettingerf4667932003-06-28 20:04:25 +00007133 Py_DECREF(tmp);
Neal Norwitzec74f2f2003-02-11 23:05:40 +00007134 return PyErr_NoMemory();
Guido van Rossume023fe02001-08-30 03:12:59 +00007135 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00007136 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
7137 pnew->length = n;
7138 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00007139 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00007140 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00007141}
7142
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00007143PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00007144"unicode(string [, encoding[, errors]]) -> object\n\
7145\n\
7146Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00007147encoding defaults to the current default string encoding.\n\
7148errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00007149
Guido van Rossumd57fd912000-03-10 22:53:23 +00007150PyTypeObject PyUnicode_Type = {
7151 PyObject_HEAD_INIT(&PyType_Type)
7152 0, /* ob_size */
7153 "unicode", /* tp_name */
7154 sizeof(PyUnicodeObject), /* tp_size */
7155 0, /* tp_itemsize */
7156 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00007157 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007158 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007159 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007160 0, /* tp_setattr */
7161 (cmpfunc) unicode_compare, /* tp_compare */
7162 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007163 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007164 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00007165 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007166 (hashfunc) unicode_hash, /* tp_hash*/
7167 0, /* tp_call*/
7168 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007169 PyObject_GenericGetAttr, /* tp_getattro */
7170 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00007172 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
7173 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007174 unicode_doc, /* tp_doc */
7175 0, /* tp_traverse */
7176 0, /* tp_clear */
7177 0, /* tp_richcompare */
7178 0, /* tp_weaklistoffset */
7179 0, /* tp_iter */
7180 0, /* tp_iternext */
7181 unicode_methods, /* tp_methods */
7182 0, /* tp_members */
7183 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00007184 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00007185 0, /* tp_dict */
7186 0, /* tp_descr_get */
7187 0, /* tp_descr_set */
7188 0, /* tp_dictoffset */
7189 0, /* tp_init */
7190 0, /* tp_alloc */
7191 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007192 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00007193};
7194
7195/* Initialize the Unicode implementation */
7196
Thomas Wouters78890102000-07-22 19:25:51 +00007197void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007198{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007199 int i;
7200
Fred Drakee4315f52000-05-09 19:53:39 +00007201 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007202 unicode_freelist = NULL;
7203 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00007205 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007206 for (i = 0; i < 256; i++)
7207 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00007208 if (PyType_Ready(&PyUnicode_Type) < 0)
7209 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00007210}
7211
7212/* Finalize the Unicode implementation */
7213
7214void
Thomas Wouters78890102000-07-22 19:25:51 +00007215_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007216{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007217 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007218 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007219
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00007220 Py_XDECREF(unicode_empty);
7221 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007222
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00007223 for (i = 0; i < 256; i++) {
7224 if (unicode_latin1[i]) {
7225 Py_DECREF(unicode_latin1[i]);
7226 unicode_latin1[i] = NULL;
7227 }
7228 }
7229
Barry Warsaw5b4c2282000-10-03 20:45:26 +00007230 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00007231 PyUnicodeObject *v = u;
7232 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00007233 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00007234 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00007235 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00007236 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007237 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00007238 unicode_freelist = NULL;
7239 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007240}
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00007241
7242/*
7243Local variables:
7244c-basic-offset: 4
7245indent-tabs-mode: nil
7246End:
7247*/