blob: 6a358da6f1e04d5e45a44a88bebd781327cd01a4 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000222 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
Guido van Rossum049cd6b2002-10-11 00:43:48 +0000264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1 || length < 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 *unicode = (PyObject *)w;
280 return 0;
281 }
282
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v, length);
286}
287
/* Internal API for use in unicodeobject.c only !
   Convenience wrapper around PyUnicode_Resize() that casts the address
   of a PyUnicodeObject* variable to PyObject**.  Both arguments are
   parenthesized so that arbitrary expressions expand safely. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293 int size)
294{
295 PyUnicodeObject *unicode;
296
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
299 if (u != NULL) {
300
301 /* Optimization for empty strings */
302 if (size == 0 && unicode_empty != NULL) {
303 Py_INCREF(unicode_empty);
304 return (PyObject *)unicode_empty;
305 }
306
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size == 1 && *u < 256) {
310 unicode = unicode_latin1[*u];
311 if (!unicode) {
312 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000313 if (!unicode)
314 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000315 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 unicode_latin1[*u] = unicode;
317 }
318 Py_INCREF(unicode);
319 return (PyObject *)unicode;
320 }
321 }
322
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 unicode = _PyUnicode_New(size);
324 if (!unicode)
325 return NULL;
326
327 /* Copy the Unicode data into the new object */
328 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330
331 return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337 int size)
338{
339 PyUnicodeObject *unicode;
340
341 if (w == NULL) {
342 PyErr_BadInternalCall();
343 return NULL;
344 }
345
346 unicode = _PyUnicode_New(size);
347 if (!unicode)
348 return NULL;
349
350 /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354 {
355 register Py_UNICODE *u;
356 register int i;
357 u = PyUnicode_AS_UNICODE(unicode);
358 for (i = size; i >= 0; i--)
359 *u++ = *w++;
360 }
361#endif
362
363 return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367 register wchar_t *w,
368 int size)
369{
370 if (unicode == NULL) {
371 PyErr_BadInternalCall();
372 return -1;
373 }
374 if (size > PyUnicode_GET_SIZE(unicode))
375 size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379 {
380 register Py_UNICODE *u;
381 register int i;
382 u = PyUnicode_AS_UNICODE(unicode);
383 for (i = size; i >= 0; i--)
384 *w++ = *u++;
385 }
386#endif
387
388 return size;
389}
390
391#endif
392
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +0000393PyObject *PyUnicode_FromOrdinal(int ordinal)
394{
395 Py_UNICODE s[2];
396
397#ifdef Py_UNICODE_WIDE
398 if (ordinal < 0 || ordinal > 0x10ffff) {
399 PyErr_SetString(PyExc_ValueError,
400 "unichr() arg not in range(0x110000) "
401 "(wide Python build)");
402 return NULL;
403 }
404#else
405 if (ordinal < 0 || ordinal > 0xffff) {
406 PyErr_SetString(PyExc_ValueError,
407 "unichr() arg not in range(0x10000) "
408 "(narrow Python build)");
409 return NULL;
410 }
411#endif
412
413 if (ordinal <= 0xffff) {
414 /* UCS-2 character */
415 s[0] = (Py_UNICODE) ordinal;
416 return PyUnicode_FromUnicode(s, 1);
417 }
418 else {
419#ifndef Py_UNICODE_WIDE
420 /* UCS-4 character. store as two surrogate characters */
421 ordinal -= 0x10000L;
422 s[0] = 0xD800 + (Py_UNICODE) (ordinal >> 10);
423 s[1] = 0xDC00 + (Py_UNICODE) (ordinal & 0x03FF);
424 return PyUnicode_FromUnicode(s, 2);
425#else
426 s[0] = (Py_UNICODE)ordinal;
427 return PyUnicode_FromUnicode(s, 1);
428#endif
429 }
430}
431
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432PyObject *PyUnicode_FromObject(register PyObject *obj)
433{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000434 /* XXX Perhaps we should make this API an alias of
435 PyObject_Unicode() instead ?! */
436 if (PyUnicode_CheckExact(obj)) {
437 Py_INCREF(obj);
438 return obj;
439 }
440 if (PyUnicode_Check(obj)) {
441 /* For a Unicode subtype that's not a Unicode object,
442 return a true Unicode object with the same data. */
443 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
444 PyUnicode_GET_SIZE(obj));
445 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000446 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
447}
448
449PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
450 const char *encoding,
451 const char *errors)
452{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000453 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000455 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000456
457 if (obj == NULL) {
458 PyErr_BadInternalCall();
459 return NULL;
460 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000461
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000462#if 0
463 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000464 that no encodings is given and then redirect to
465 PyObject_Unicode() which then applies the additional logic for
466 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000467
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000468 NOTE: This API should really only be used for object which
469 represent *encoded* Unicode !
470
471 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000472 if (PyUnicode_Check(obj)) {
473 if (encoding) {
474 PyErr_SetString(PyExc_TypeError,
475 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000476 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000477 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000478 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000479 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000480#else
481 if (PyUnicode_Check(obj)) {
482 PyErr_SetString(PyExc_TypeError,
483 "decoding Unicode is not supported");
484 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000485 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000486#endif
487
488 /* Coerce object */
489 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000490 s = PyString_AS_STRING(obj);
491 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000492 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000493 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
494 /* Overwrite the error message with something more useful in
495 case of a TypeError. */
496 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000497 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000498 "coercing to Unicode: need string or buffer, "
499 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000500 obj->ob_type->tp_name);
501 goto onError;
502 }
503
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000504 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 if (len == 0) {
506 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000507 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000509 else
510 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000511
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000512 return v;
513
514 onError:
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000515 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000516}
517
518PyObject *PyUnicode_Decode(const char *s,
519 int size,
520 const char *encoding,
521 const char *errors)
522{
523 PyObject *buffer = NULL, *unicode;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000524
525 if (encoding == NULL)
Fred Drakee4315f52000-05-09 19:53:39 +0000526 encoding = PyUnicode_GetDefaultEncoding();
527
528 /* Shortcuts for common default encodings */
529 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000530 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000531 else if (strcmp(encoding, "latin-1") == 0)
532 return PyUnicode_DecodeLatin1(s, size, errors);
533 else if (strcmp(encoding, "ascii") == 0)
534 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000535
536 /* Decode via the codec registry */
537 buffer = PyBuffer_FromMemory((void *)s, size);
538 if (buffer == NULL)
539 goto onError;
540 unicode = PyCodec_Decode(buffer, encoding, errors);
541 if (unicode == NULL)
542 goto onError;
543 if (!PyUnicode_Check(unicode)) {
544 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000545 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000546 unicode->ob_type->tp_name);
547 Py_DECREF(unicode);
548 goto onError;
549 }
550 Py_DECREF(buffer);
551 return unicode;
552
553 onError:
554 Py_XDECREF(buffer);
555 return NULL;
556}
557
558PyObject *PyUnicode_Encode(const Py_UNICODE *s,
559 int size,
560 const char *encoding,
561 const char *errors)
562{
563 PyObject *v, *unicode;
564
565 unicode = PyUnicode_FromUnicode(s, size);
566 if (unicode == NULL)
567 return NULL;
568 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
569 Py_DECREF(unicode);
570 return v;
571}
572
573PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
574 const char *encoding,
575 const char *errors)
576{
577 PyObject *v;
578
579 if (!PyUnicode_Check(unicode)) {
580 PyErr_BadArgument();
581 goto onError;
582 }
Fred Drakee4315f52000-05-09 19:53:39 +0000583
584 if (encoding == NULL)
585 encoding = PyUnicode_GetDefaultEncoding();
586
587 /* Shortcuts for common default encodings */
588 if (errors == NULL) {
589 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000590 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000591 else if (strcmp(encoding, "latin-1") == 0)
592 return PyUnicode_AsLatin1String(unicode);
593 else if (strcmp(encoding, "ascii") == 0)
594 return PyUnicode_AsASCIIString(unicode);
595 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596
597 /* Encode via the codec registry */
598 v = PyCodec_Encode(unicode, encoding, errors);
599 if (v == NULL)
600 goto onError;
601 /* XXX Should we really enforce this ? */
602 if (!PyString_Check(v)) {
603 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000604 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000605 v->ob_type->tp_name);
606 Py_DECREF(v);
607 goto onError;
608 }
609 return v;
610
611 onError:
612 return NULL;
613}
614
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000615PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
616 const char *errors)
617{
618 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
619
620 if (v)
621 return v;
622 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
623 if (v && errors == NULL)
624 ((PyUnicodeObject *)unicode)->defenc = v;
625 return v;
626}
627
Guido van Rossumd57fd912000-03-10 22:53:23 +0000628Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
629{
630 if (!PyUnicode_Check(unicode)) {
631 PyErr_BadArgument();
632 goto onError;
633 }
634 return PyUnicode_AS_UNICODE(unicode);
635
636 onError:
637 return NULL;
638}
639
640int PyUnicode_GetSize(PyObject *unicode)
641{
642 if (!PyUnicode_Check(unicode)) {
643 PyErr_BadArgument();
644 goto onError;
645 }
646 return PyUnicode_GET_SIZE(unicode);
647
648 onError:
649 return -1;
650}
651
Thomas Wouters78890102000-07-22 19:25:51 +0000652const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000653{
654 return unicode_default_encoding;
655}
656
657int PyUnicode_SetDefaultEncoding(const char *encoding)
658{
659 PyObject *v;
660
661 /* Make sure the encoding is valid. As side effect, this also
662 loads the encoding into the codec registry cache. */
663 v = _PyCodec_Lookup(encoding);
664 if (v == NULL)
665 goto onError;
666 Py_DECREF(v);
667 strncpy(unicode_default_encoding,
668 encoding,
669 sizeof(unicode_default_encoding));
670 return 0;
671
672 onError:
673 return -1;
674}
675
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000676/* error handling callback helper:
677 build arguments, call the callback and check the arguments,
678 if no exception occured, copy the replacement to the output
679 and adjust various state variables.
680 return 0 on success, -1 on error
681*/
682
683static
684int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
685 const char *encoding, const char *reason,
686 const char *input, int insize, int *startinpos, int *endinpos, PyObject **exceptionObject, const char **inptr,
687 PyObject **output, int *outpos, Py_UNICODE **outptr)
688{
689 static char *argparse = "O!i;decoding error handler must return (unicode, int) tuple";
690
691 PyObject *restuple = NULL;
692 PyObject *repunicode = NULL;
693 int outsize = PyUnicode_GET_SIZE(*output);
694 int requiredsize;
695 int newpos;
696 Py_UNICODE *repptr;
697 int repsize;
698 int res = -1;
699
700 if (*errorHandler == NULL) {
701 *errorHandler = PyCodec_LookupError(errors);
702 if (*errorHandler == NULL)
703 goto onError;
704 }
705
706 if (*exceptionObject == NULL) {
707 *exceptionObject = PyUnicodeDecodeError_Create(
708 encoding, input, insize, *startinpos, *endinpos, reason);
709 if (*exceptionObject == NULL)
710 goto onError;
711 }
712 else {
713 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
714 goto onError;
715 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
716 goto onError;
717 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
718 goto onError;
719 }
720
721 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
722 if (restuple == NULL)
723 goto onError;
724 if (!PyTuple_Check(restuple)) {
725 PyErr_Format(PyExc_TypeError, &argparse[4]);
726 goto onError;
727 }
728 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
729 goto onError;
730 if (newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +0000731 newpos = insize+newpos;
732 if (newpos<0 || newpos>insize) {
733 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", newpos);
734 goto onError;
735 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000736
737 /* need more space? (at least enough for what we
738 have+the replacement+the rest of the string (starting
739 at the new input position), so we won't have to check space
740 when there are no errors in the rest of the string) */
741 repptr = PyUnicode_AS_UNICODE(repunicode);
742 repsize = PyUnicode_GET_SIZE(repunicode);
743 requiredsize = *outpos + repsize + insize-newpos;
744 if (requiredsize > outsize) {
745 if (requiredsize<2*outsize)
746 requiredsize = 2*outsize;
747 if (PyUnicode_Resize(output, requiredsize))
748 goto onError;
749 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
750 }
751 *endinpos = newpos;
752 *inptr = input + newpos;
753 Py_UNICODE_COPY(*outptr, repptr, repsize);
754 *outptr += repsize;
755 *outpos += repsize;
756 /* we made it! */
757 res = 0;
758
759 onError:
760 Py_XDECREF(restuple);
761 return res;
762}
763
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000764/* --- UTF-7 Codec -------------------------------------------------------- */
765
766/* see RFC2152 for details */
767
/* Classification of the 128 ASCII characters for the UTF-7 codec.
   Indicates whether a character is special, i.e. cannot be directly
   encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)
   The table is only ever read, hence const. */
static
const char utf7_special[128] = {
    /* 0x00-0x0F: control characters; TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10-0x1F: control characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2F: space ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30-0x3F: 0-9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40-0x4F: @ A-O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F: P-Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60-0x6F: ` a-o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F: p-z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
786
/* True if character c has to be encoded in a UTF-7 shift sequence:
   always for c > 127 or class 1 in utf7_special[]; for whitespace
   (class 2) only if encodeWS is set; for RFC2152 Set O (class 3) only if
   encodeO is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to the corresponding base64 character. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c belongs to the base64 alphabet.  The alphabet is pure ASCII,
   so isalnum() is only consulted for values below 128: passing it
   anything outside the unsigned char range is undefined, and values in
   128..255 could be reported as alphanumeric depending on the locale. */
#define B64CHAR(c) \
    (((unsigned long)(c) < 128 && isalnum((int)(c))) || \
     (c) == '+' || (c) == '/')

/* Map a base64 character to its 6-bit value.  Only meaningful for
   characters accepted by B64CHAR(). */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
            (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 characters to out for as long as at least 6 of the `bits`
   pending bits in ch remain; `out` and `bits` are updated in place. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Emit 16-bit characters to out for as long as at least 16 of the `bits`
   pending bits in ch remain.  On a value in 0xDC00..0xDFFF the macro sets
   `surrogate`, sets `errmsg` and jumps to the `utf7Error` label, both of
   which must exist at the point of use; the next 16-bit unit after that
   is dropped. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }

821
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000822PyObject *PyUnicode_DecodeUTF7(const char *s,
823 int size,
824 const char *errors)
825{
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000826 const char *starts = s;
827 int startinpos;
828 int endinpos;
829 int outpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000830 const char *e;
831 PyUnicodeObject *unicode;
832 Py_UNICODE *p;
833 const char *errmsg = "";
834 int inShift = 0;
835 unsigned int bitsleft = 0;
836 unsigned long charsleft = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000837 int surrogate = 0;
838 PyObject *errorHandler = NULL;
839 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000840
841 unicode = _PyUnicode_New(size);
842 if (!unicode)
843 return NULL;
844 if (size == 0)
845 return (PyObject *)unicode;
846
847 p = unicode->str;
848 e = s + size;
849
850 while (s < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000851 Py_UNICODE ch;
852 restart:
853 ch = *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000854
855 if (inShift) {
856 if ((ch == '-') || !B64CHAR(ch)) {
857 inShift = 0;
858 s++;
859
860 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
861 if (bitsleft >= 6) {
862 /* The shift sequence has a partial character in it. If
863 bitsleft < 6 then we could just classify it as padding
864 but that is not the case here */
865
866 errmsg = "partial character in shift sequence";
867 goto utf7Error;
868 }
869 /* According to RFC2152 the remaining bits should be zero. We
870 choose to signal an error/insert a replacement character
871 here so indicate the potential of a misencoded character. */
872
873 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
874 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
875 errmsg = "non-zero padding bits in shift sequence";
876 goto utf7Error;
877 }
878
879 if (ch == '-') {
880 if ((s < e) && (*(s) == '-')) {
881 *p++ = '-';
882 inShift = 1;
883 }
884 } else if (SPECIAL(ch,0,0)) {
885 errmsg = "unexpected special character";
886 goto utf7Error;
887 } else {
888 *p++ = ch;
889 }
890 } else {
891 charsleft = (charsleft << 6) | UB64(ch);
892 bitsleft += 6;
893 s++;
894 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
895 }
896 }
897 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000898 startinpos = s-starts;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000899 s++;
900 if (s < e && *s == '-') {
901 s++;
902 *p++ = '+';
903 } else
904 {
905 inShift = 1;
906 bitsleft = 0;
907 }
908 }
909 else if (SPECIAL(ch,0,0)) {
910 errmsg = "unexpected special character";
911 s++;
912 goto utf7Error;
913 }
914 else {
915 *p++ = ch;
916 s++;
917 }
918 continue;
919 utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000920 outpos = p-PyUnicode_AS_UNICODE(unicode);
921 endinpos = s-starts;
922 if (unicode_decode_call_errorhandler(
923 errors, &errorHandler,
924 "utf7", errmsg,
925 starts, size, &startinpos, &endinpos, &exc, &s,
926 (PyObject **)&unicode, &outpos, &p))
927 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000928 }
929
930 if (inShift) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000931 outpos = p-PyUnicode_AS_UNICODE(unicode);
932 endinpos = size;
933 if (unicode_decode_call_errorhandler(
934 errors, &errorHandler,
935 "utf7", "unterminated shift sequence",
936 starts, size, &startinpos, &endinpos, &exc, &s,
937 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000938 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000939 if (s < e)
940 goto restart;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000941 }
942
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000943 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000944 goto onError;
945
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000946 Py_XDECREF(errorHandler);
947 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000948 return (PyObject *)unicode;
949
950onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +0000951 Py_XDECREF(errorHandler);
952 Py_XDECREF(exc);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000953 Py_DECREF(unicode);
954 return NULL;
955}
956
957
958PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
959 int size,
960 int encodeSetO,
961 int encodeWhiteSpace,
962 const char *errors)
963{
964 PyObject *v;
965 /* It might be possible to tighten this worst case */
966 unsigned int cbAllocated = 5 * size;
967 int inShift = 0;
968 int i = 0;
969 unsigned int bitsleft = 0;
970 unsigned long charsleft = 0;
971 char * out;
972 char * start;
973
974 if (size == 0)
975 return PyString_FromStringAndSize(NULL, 0);
976
977 v = PyString_FromStringAndSize(NULL, cbAllocated);
978 if (v == NULL)
979 return NULL;
980
981 start = out = PyString_AS_STRING(v);
982 for (;i < size; ++i) {
983 Py_UNICODE ch = s[i];
984
985 if (!inShift) {
986 if (ch == '+') {
987 *out++ = '+';
988 *out++ = '-';
989 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
990 charsleft = ch;
991 bitsleft = 16;
992 *out++ = '+';
993 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
994 inShift = bitsleft > 0;
995 } else {
996 *out++ = (char) ch;
997 }
998 } else {
999 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
1000 *out++ = B64(charsleft << (6-bitsleft));
1001 charsleft = 0;
1002 bitsleft = 0;
1003 /* Characters not in the BASE64 set implicitly unshift the sequence
1004 so no '-' is required, except if the character is itself a '-' */
1005 if (B64CHAR(ch) || ch == '-') {
1006 *out++ = '-';
1007 }
1008 inShift = 0;
1009 *out++ = (char) ch;
1010 } else {
1011 bitsleft += 16;
1012 charsleft = (charsleft << 16) | ch;
1013 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
1014
1015 /* If the next character is special then we dont' need to terminate
1016 the shift sequence. If the next character is not a BASE64 character
1017 or '-' then the shift sequence will be terminated implicitly and we
1018 don't have to insert a '-'. */
1019
1020 if (bitsleft == 0) {
1021 if (i + 1 < size) {
1022 Py_UNICODE ch2 = s[i+1];
1023
1024 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
1025
1026 } else if (B64CHAR(ch2) || ch2 == '-') {
1027 *out++ = '-';
1028 inShift = 0;
1029 } else {
1030 inShift = 0;
1031 }
1032
1033 }
1034 else {
1035 *out++ = '-';
1036 inShift = 0;
1037 }
1038 }
1039 }
1040 }
1041 }
1042 if (bitsleft) {
1043 *out++= B64(charsleft << (6-bitsleft) );
1044 *out++ = '-';
1045 }
1046
Tim Peters5de98422002-04-27 18:44:32 +00001047 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00001048 return v;
1049}
1050
/* The UTF-7 helper macros are not needed past this point. */
#undef DECODE
#undef ENCODE
#undef UB64
#undef B64CHAR
#undef B64
#undef SPECIAL
1057
Guido van Rossumd57fd912000-03-10 22:53:23 +00001058/* --- UTF-8 Codec -------------------------------------------------------- */
1059
/* Sequence length implied by each possible UTF-8 lead byte, indexed by
   the byte value.  A zero entry marks a byte that cannot start a
   sequence.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six,
       0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0
};
1081
Guido van Rossumd57fd912000-03-10 22:53:23 +00001082PyObject *PyUnicode_DecodeUTF8(const char *s,
1083 int size,
1084 const char *errors)
1085{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001086 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087 int n;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001088 int startinpos;
1089 int endinpos;
1090 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001091 const char *e;
1092 PyUnicodeObject *unicode;
1093 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001094 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001095 PyObject *errorHandler = NULL;
1096 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097
1098 /* Note: size will always be longer than the resulting Unicode
1099 character count */
1100 unicode = _PyUnicode_New(size);
1101 if (!unicode)
1102 return NULL;
1103 if (size == 0)
1104 return (PyObject *)unicode;
1105
1106 /* Unpack UTF-8 encoded data */
1107 p = unicode->str;
1108 e = s + size;
1109
1110 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001111 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112
1113 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001114 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 s++;
1116 continue;
1117 }
1118
1119 n = utf8_code_length[ch];
1120
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001121 if (s + n > e) {
1122 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001123 startinpos = s-starts;
1124 endinpos = size;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001125 goto utf8Error;
1126 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001127
1128 switch (n) {
1129
1130 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001131 errmsg = "unexpected code byte";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001132 startinpos = s-starts;
1133 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001134 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001135
1136 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001137 errmsg = "internal error";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001138 startinpos = s-starts;
1139 endinpos = startinpos+1;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001140 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001141
1142 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001143 if ((s[1] & 0xc0) != 0x80) {
1144 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001145 startinpos = s-starts;
1146 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001147 goto utf8Error;
1148 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001149 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001150 if (ch < 0x80) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001151 startinpos = s-starts;
1152 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001153 errmsg = "illegal encoding";
1154 goto utf8Error;
1155 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001157 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158 break;
1159
1160 case 3:
1161 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001162 (s[2] & 0xc0) != 0x80) {
1163 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001164 startinpos = s-starts;
1165 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001166 goto utf8Error;
1167 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001169 if (ch < 0x0800) {
1170 /* Note: UTF-8 encodings of surrogates are considered
1171 legal UTF-8 sequences;
1172
1173 XXX For wide builds (UCS-4) we should probably try
1174 to recombine the surrogates into a single code
1175 unit.
1176 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001177 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001178 startinpos = s-starts;
1179 endinpos = startinpos+3;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001180 goto utf8Error;
1181 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001183 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001184 break;
1185
1186 case 4:
1187 if ((s[1] & 0xc0) != 0x80 ||
1188 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001189 (s[3] & 0xc0) != 0x80) {
1190 errmsg = "invalid data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001191 startinpos = s-starts;
1192 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001193 goto utf8Error;
1194 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001195 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1196 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1197 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001198 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001199 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001200 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001201 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001202 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001203 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001204 startinpos = s-starts;
1205 endinpos = startinpos+4;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001206 goto utf8Error;
1207 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001208#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001209 *p++ = (Py_UNICODE)ch;
1210#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001211 /* compute and append the two surrogates: */
1212
1213 /* translate from 10000..10FFFF to 0..FFFF */
1214 ch -= 0x10000;
1215
1216 /* high surrogate = top 10 bits added to D800 */
1217 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1218
1219 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001220 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001221#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001222 break;
1223
1224 default:
1225 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001226 errmsg = "unsupported Unicode code range";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001227 startinpos = s-starts;
1228 endinpos = startinpos+n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001229 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230 }
1231 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001232 continue;
1233
1234 utf8Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001235 outpos = p-PyUnicode_AS_UNICODE(unicode);
1236 if (unicode_decode_call_errorhandler(
1237 errors, &errorHandler,
1238 "utf8", errmsg,
1239 starts, size, &startinpos, &endinpos, &exc, &s,
1240 (PyObject **)&unicode, &outpos, &p))
1241 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242 }
1243
1244 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001245 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001246 goto onError;
1247
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001248 Py_XDECREF(errorHandler);
1249 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250 return (PyObject *)unicode;
1251
1252onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001253 Py_XDECREF(errorHandler);
1254 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255 Py_DECREF(unicode);
1256 return NULL;
1257}
1258
Tim Peters602f7402002-04-27 18:03:26 +00001259/* Allocation strategy: if the string is short, convert into a stack buffer
1260 and allocate exactly as much space needed at the end. Else allocate the
1261 maximum possible needed (4 result bytes per Unicode character), and return
1262 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001263*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001264PyObject *
1265PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1266 int size,
1267 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001268{
Tim Peters602f7402002-04-27 18:03:26 +00001269#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001270
Tim Peters602f7402002-04-27 18:03:26 +00001271 int i; /* index into s of next input byte */
1272 PyObject *v; /* result string object */
1273 char *p; /* next free byte in output buffer */
1274 int nallocated; /* number of result bytes allocated */
1275 int nneeded; /* number of result bytes needed */
1276 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001277
Tim Peters602f7402002-04-27 18:03:26 +00001278 assert(s != NULL);
1279 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001280
Tim Peters602f7402002-04-27 18:03:26 +00001281 if (size <= MAX_SHORT_UNICHARS) {
1282 /* Write into the stack buffer; nallocated can't overflow.
1283 * At the end, we'll allocate exactly as much heap space as it
1284 * turns out we need.
1285 */
1286 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1287 v = NULL; /* will allocate after we're done */
1288 p = stackbuf;
1289 }
1290 else {
1291 /* Overallocate on the heap, and give the excess back at the end. */
1292 nallocated = size * 4;
1293 if (nallocated / 4 != size) /* overflow! */
1294 return PyErr_NoMemory();
1295 v = PyString_FromStringAndSize(NULL, nallocated);
1296 if (v == NULL)
1297 return NULL;
1298 p = PyString_AS_STRING(v);
1299 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001300
Tim Peters602f7402002-04-27 18:03:26 +00001301 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001302 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001303
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001304 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001305 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001307
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001309 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001310 *p++ = (char)(0xc0 | (ch >> 6));
1311 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001312 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001313 else {
Tim Peters602f7402002-04-27 18:03:26 +00001314 /* Encode UCS2 Unicode ordinals */
1315 if (ch < 0x10000) {
1316 /* Special case: check for high surrogate */
1317 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1318 Py_UCS4 ch2 = s[i];
1319 /* Check for low surrogate and combine the two to
1320 form a UCS4 value */
1321 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001322 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001323 i++;
1324 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001325 }
Tim Peters602f7402002-04-27 18:03:26 +00001326 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001327 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001328 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001329 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1330 *p++ = (char)(0x80 | (ch & 0x3f));
1331 continue;
1332 }
1333encodeUCS4:
1334 /* Encode UCS4 Unicode ordinals */
1335 *p++ = (char)(0xf0 | (ch >> 18));
1336 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1337 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1338 *p++ = (char)(0x80 | (ch & 0x3f));
1339 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001340 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001341
Tim Peters602f7402002-04-27 18:03:26 +00001342 if (v == NULL) {
1343 /* This was stack allocated. */
1344 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1345 assert(nneeded <= nallocated);
1346 v = PyString_FromStringAndSize(stackbuf, nneeded);
1347 }
1348 else {
1349 /* Cut back to size actually needed. */
1350 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1351 assert(nneeded <= nallocated);
1352 _PyString_Resize(&v, nneeded);
1353 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001355
Tim Peters602f7402002-04-27 18:03:26 +00001356#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357}
1358
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1360{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361 if (!PyUnicode_Check(unicode)) {
1362 PyErr_BadArgument();
1363 return NULL;
1364 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001365 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1366 PyUnicode_GET_SIZE(unicode),
1367 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001368}
1369
1370/* --- UTF-16 Codec ------------------------------------------------------- */
1371
Tim Peters772747b2001-08-09 22:21:55 +00001372PyObject *
1373PyUnicode_DecodeUTF16(const char *s,
1374 int size,
1375 const char *errors,
1376 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001377{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001378 const char *starts = s;
1379 int startinpos;
1380 int endinpos;
1381 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001382 PyUnicodeObject *unicode;
1383 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001384 const unsigned char *q, *e;
1385 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001386 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001387 /* Offsets from q for retrieving byte pairs in the right order. */
1388#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1389 int ihi = 1, ilo = 0;
1390#else
1391 int ihi = 0, ilo = 1;
1392#endif
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001393 PyObject *errorHandler = NULL;
1394 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001395
1396 /* Note: size will always be longer than the resulting Unicode
1397 character count */
1398 unicode = _PyUnicode_New(size);
1399 if (!unicode)
1400 return NULL;
1401 if (size == 0)
1402 return (PyObject *)unicode;
1403
1404 /* Unpack UTF-16 encoded data */
1405 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001406 q = (unsigned char *)s;
1407 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001408
1409 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001410 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001411
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001412 /* Check for BOM marks (U+FEFF) in the input and adjust current
1413 byte order setting accordingly. In native mode, the leading BOM
1414 mark is skipped, in all other modes, it is copied to the output
1415 stream as-is (giving a ZWNBSP character). */
1416 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001417 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001418#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001419 if (bom == 0xFEFF) {
1420 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001421 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001422 }
1423 else if (bom == 0xFFFE) {
1424 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001425 bo = 1;
1426 }
1427#else
Tim Peters772747b2001-08-09 22:21:55 +00001428 if (bom == 0xFEFF) {
1429 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001430 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001431 }
1432 else if (bom == 0xFFFE) {
1433 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001434 bo = -1;
1435 }
1436#endif
1437 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001438
Tim Peters772747b2001-08-09 22:21:55 +00001439 if (bo == -1) {
1440 /* force LE */
1441 ihi = 1;
1442 ilo = 0;
1443 }
1444 else if (bo == 1) {
1445 /* force BE */
1446 ihi = 0;
1447 ilo = 1;
1448 }
1449
1450 while (q < e) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001451 Py_UNICODE ch;
1452 /* remaing bytes at the end? (size should be even) */
1453 if (e-q<2) {
1454 errmsg = "truncated data";
1455 startinpos = ((const char *)q)-starts;
1456 endinpos = ((const char *)e)-starts;
1457 goto utf16Error;
1458 /* The remaining input chars are ignored if the callback
1459 chooses to skip the input */
1460 }
1461 ch = (q[ihi] << 8) | q[ilo];
1462
Tim Peters772747b2001-08-09 22:21:55 +00001463 q += 2;
1464
Guido van Rossumd57fd912000-03-10 22:53:23 +00001465 if (ch < 0xD800 || ch > 0xDFFF) {
1466 *p++ = ch;
1467 continue;
1468 }
1469
1470 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001471 if (q >= e) {
1472 errmsg = "unexpected end of data";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001473 startinpos = (((const char *)q)-2)-starts;
1474 endinpos = ((const char *)e)-starts;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001475 goto utf16Error;
1476 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001477 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001478 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1479 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001480 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001481#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001482 *p++ = ch;
1483 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001484#else
1485 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001486#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001487 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001488 }
1489 else {
1490 errmsg = "illegal UTF-16 surrogate";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001491 startinpos = (((const char *)q)-4)-starts;
1492 endinpos = startinpos+2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001493 goto utf16Error;
1494 }
1495
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001497 errmsg = "illegal encoding";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001498 startinpos = (((const char *)q)-2)-starts;
1499 endinpos = startinpos+2;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001500 /* Fall through to report the error */
1501
1502 utf16Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001503 outpos = p-PyUnicode_AS_UNICODE(unicode);
1504 if (unicode_decode_call_errorhandler(
1505 errors, &errorHandler,
1506 "utf16", errmsg,
1507 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
1508 (PyObject **)&unicode, &outpos, &p))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001509 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001510 }
1511
1512 if (byteorder)
1513 *byteorder = bo;
1514
1515 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001516 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001517 goto onError;
1518
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001519 Py_XDECREF(errorHandler);
1520 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001521 return (PyObject *)unicode;
1522
1523onError:
1524 Py_DECREF(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001525 Py_XDECREF(errorHandler);
1526 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001527 return NULL;
1528}
1529
Tim Peters772747b2001-08-09 22:21:55 +00001530PyObject *
1531PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1532 int size,
1533 const char *errors,
1534 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001535{
1536 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001537 unsigned char *p;
1538 int i, pairs;
1539 /* Offsets from p for storing byte pairs in the right order. */
1540#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1541 int ihi = 1, ilo = 0;
1542#else
1543 int ihi = 0, ilo = 1;
1544#endif
1545
1546#define STORECHAR(CH) \
1547 do { \
1548 p[ihi] = ((CH) >> 8) & 0xff; \
1549 p[ilo] = (CH) & 0xff; \
1550 p += 2; \
1551 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001552
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001553 for (i = pairs = 0; i < size; i++)
1554 if (s[i] >= 0x10000)
1555 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001556 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001557 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558 if (v == NULL)
1559 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001560
Tim Peters772747b2001-08-09 22:21:55 +00001561 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001563 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001564 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001565 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001566
1567 if (byteorder == -1) {
1568 /* force LE */
1569 ihi = 1;
1570 ilo = 0;
1571 }
1572 else if (byteorder == 1) {
1573 /* force BE */
1574 ihi = 0;
1575 ilo = 1;
1576 }
1577
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001578 while (size-- > 0) {
1579 Py_UNICODE ch = *s++;
1580 Py_UNICODE ch2 = 0;
1581 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001582 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1583 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001584 }
Tim Peters772747b2001-08-09 22:21:55 +00001585 STORECHAR(ch);
1586 if (ch2)
1587 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001588 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001589 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001590#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001591}
1592
1593PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1594{
1595 if (!PyUnicode_Check(unicode)) {
1596 PyErr_BadArgument();
1597 return NULL;
1598 }
1599 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1600 PyUnicode_GET_SIZE(unicode),
1601 NULL,
1602 0);
1603}
1604
1605/* --- Unicode Escape Codec ----------------------------------------------- */
1606
Fredrik Lundh06d12682001-01-24 07:59:11 +00001607static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001608
Guido van Rossumd57fd912000-03-10 22:53:23 +00001609PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1610 int size,
1611 const char *errors)
1612{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001613 const char *starts = s;
1614 int startinpos;
1615 int endinpos;
1616 int outpos;
1617 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001619 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001621 char* message;
1622 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001623 PyObject *errorHandler = NULL;
1624 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001625
Guido van Rossumd57fd912000-03-10 22:53:23 +00001626 /* Escaped strings will always be longer than the resulting
1627 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001628 length after conversion to the true value.
1629 (but if the error callback returns a long replacement string
1630 we'll have to allocate more space) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001631 v = _PyUnicode_New(size);
1632 if (v == NULL)
1633 goto onError;
1634 if (size == 0)
1635 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001636
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001637 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001638 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001639
Guido van Rossumd57fd912000-03-10 22:53:23 +00001640 while (s < end) {
1641 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001642 Py_UNICODE x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001643 int digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001644
1645 /* Non-escape characters are interpreted as Unicode ordinals */
1646 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001647 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001648 continue;
1649 }
1650
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001651 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001652 /* \ - Escapes */
1653 s++;
1654 switch (*s++) {
1655
1656 /* \x escapes */
1657 case '\n': break;
1658 case '\\': *p++ = '\\'; break;
1659 case '\'': *p++ = '\''; break;
1660 case '\"': *p++ = '\"'; break;
1661 case 'b': *p++ = '\b'; break;
1662 case 'f': *p++ = '\014'; break; /* FF */
1663 case 't': *p++ = '\t'; break;
1664 case 'n': *p++ = '\n'; break;
1665 case 'r': *p++ = '\r'; break;
1666 case 'v': *p++ = '\013'; break; /* VT */
1667 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1668
1669 /* \OOO (octal) escapes */
1670 case '0': case '1': case '2': case '3':
1671 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001672 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001674 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001675 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001676 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001677 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001678 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679 break;
1680
Fredrik Lundhccc74732001-02-18 22:13:49 +00001681 /* hex escapes */
1682 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001683 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001684 digits = 2;
1685 message = "truncated \\xXX escape";
1686 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687
Fredrik Lundhccc74732001-02-18 22:13:49 +00001688 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001689 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001690 digits = 4;
1691 message = "truncated \\uXXXX escape";
1692 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693
Fredrik Lundhccc74732001-02-18 22:13:49 +00001694 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001695 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001696 digits = 8;
1697 message = "truncated \\UXXXXXXXX escape";
1698 hexescape:
1699 chr = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001700 outpos = p-PyUnicode_AS_UNICODE(v);
1701 if (s+digits>end) {
1702 endinpos = size;
1703 if (unicode_decode_call_errorhandler(
1704 errors, &errorHandler,
1705 "unicodeescape", "end of string in escape sequence",
1706 starts, size, &startinpos, &endinpos, &exc, &s,
1707 (PyObject **)&v, &outpos, &p))
1708 goto onError;
1709 goto nextByte;
1710 }
1711 for (i = 0; i < digits; ++i) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001712 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001713 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001714 endinpos = (s+i+1)-starts;
1715 if (unicode_decode_call_errorhandler(
1716 errors, &errorHandler,
1717 "unicodeescape", message,
1718 starts, size, &startinpos, &endinpos, &exc, &s,
1719 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001720 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001721 goto nextByte;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001722 }
1723 chr = (chr<<4) & ~0xF;
1724 if (c >= '0' && c <= '9')
1725 chr += c - '0';
1726 else if (c >= 'a' && c <= 'f')
1727 chr += 10 + c - 'a';
1728 else
1729 chr += 10 + c - 'A';
1730 }
1731 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001732 if (chr == 0xffffffff)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001733 /* _decoding_error will have already written into the
1734 target buffer. */
1735 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001736 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001737 /* when we get here, chr is a 32-bit unicode character */
1738 if (chr <= 0xffff)
1739 /* UCS-2 character */
1740 *p++ = (Py_UNICODE) chr;
1741 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001742 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001743 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001744#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001745 *p++ = chr;
1746#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001747 chr -= 0x10000L;
1748 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001749 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001750#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001751 } else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001752 endinpos = s-starts;
1753 outpos = p-PyUnicode_AS_UNICODE(v);
1754 if (unicode_decode_call_errorhandler(
1755 errors, &errorHandler,
1756 "unicodeescape", "illegal Unicode character",
1757 starts, size, &startinpos, &endinpos, &exc, &s,
1758 (PyObject **)&v, &outpos, &p))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001759 goto onError;
1760 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001761 break;
1762
1763 /* \N{name} */
1764 case 'N':
1765 message = "malformed \\N character escape";
1766 if (ucnhash_CAPI == NULL) {
1767 /* load the unicode data module */
1768 PyObject *m, *v;
1769 m = PyImport_ImportModule("unicodedata");
1770 if (m == NULL)
1771 goto ucnhashError;
1772 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1773 Py_DECREF(m);
1774 if (v == NULL)
1775 goto ucnhashError;
1776 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1777 Py_DECREF(v);
1778 if (ucnhash_CAPI == NULL)
1779 goto ucnhashError;
1780 }
1781 if (*s == '{') {
1782 const char *start = s+1;
1783 /* look for the closing brace */
1784 while (*s != '}' && s < end)
1785 s++;
1786 if (s > start && s < end && *s == '}') {
1787 /* found a name. look it up in the unicode database */
1788 message = "unknown Unicode character name";
1789 s++;
1790 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1791 goto store;
1792 }
1793 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001794 endinpos = s-starts;
1795 outpos = p-PyUnicode_AS_UNICODE(v);
1796 if (unicode_decode_call_errorhandler(
1797 errors, &errorHandler,
1798 "unicodeescape", message,
1799 starts, size, &startinpos, &endinpos, &exc, &s,
1800 (PyObject **)&v, &outpos, &p))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001801 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001802 break;
1803
1804 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001805 if (s > end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001806 message = "\\ at end of string";
1807 s--;
1808 endinpos = s-starts;
1809 outpos = p-PyUnicode_AS_UNICODE(v);
1810 if (unicode_decode_call_errorhandler(
1811 errors, &errorHandler,
1812 "unicodeescape", message,
1813 starts, size, &startinpos, &endinpos, &exc, &s,
1814 (PyObject **)&v, &outpos, &p))
Walter Dörwald8c077222002-03-25 11:16:18 +00001815 goto onError;
1816 }
1817 else {
1818 *p++ = '\\';
1819 *p++ = (unsigned char)s[-1];
1820 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001821 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001822 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001823 nextByte:
1824 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001825 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001826 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
1827 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001828 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001829
Fredrik Lundhccc74732001-02-18 22:13:49 +00001830ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001831 PyErr_SetString(
1832 PyExc_UnicodeError,
1833 "\\N escapes not supported (can't load unicodedata module)"
1834 );
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001835 Py_XDECREF(errorHandler);
1836 Py_XDECREF(exc);
Fredrik Lundhf6056062001-01-20 11:15:25 +00001837 return NULL;
1838
Fredrik Lundhccc74732001-02-18 22:13:49 +00001839onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00001841 Py_XDECREF(errorHandler);
1842 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001843 return NULL;
1844}
1845
1846/* Return a Unicode-Escape string version of the Unicode object.
1847
1848 If quotes is true, the string is enclosed in u"" or u'' quotes as
1849 appropriate.
1850
1851*/
1852
Barry Warsaw51ac5802000-03-20 16:36:48 +00001853static const Py_UNICODE *findchar(const Py_UNICODE *s,
1854 int size,
1855 Py_UNICODE ch);
1856
Guido van Rossumd57fd912000-03-10 22:53:23 +00001857static
1858PyObject *unicodeescape_string(const Py_UNICODE *s,
1859 int size,
1860 int quotes)
1861{
1862 PyObject *repr;
1863 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001865 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866
1867 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1868 if (repr == NULL)
1869 return NULL;
1870
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001871 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001872
1873 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001874 *p++ = 'u';
1875 *p++ = (findchar(s, size, '\'') &&
1876 !findchar(s, size, '"')) ? '"' : '\'';
1877 }
1878 while (size-- > 0) {
1879 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001880
Guido van Rossumd57fd912000-03-10 22:53:23 +00001881 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001882 if (quotes &&
1883 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001884 *p++ = '\\';
1885 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001886 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001887 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001888
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001889#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001890 /* Map 21-bit characters to '\U00xxxxxx' */
1891 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001892 int offset = p - PyString_AS_STRING(repr);
1893
1894 /* Resize the string if necessary */
1895 if (offset + 12 > PyString_GET_SIZE(repr)) {
1896 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001897 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001898 p = PyString_AS_STRING(repr) + offset;
1899 }
1900
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001901 *p++ = '\\';
1902 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001903 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1904 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1905 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1906 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1907 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1908 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1909 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001910 *p++ = hexdigit[ch & 0x0000000F];
1911 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001912 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001913#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001914 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1915 else if (ch >= 0xD800 && ch < 0xDC00) {
1916 Py_UNICODE ch2;
1917 Py_UCS4 ucs;
1918
1919 ch2 = *s++;
1920 size--;
1921 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1922 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1923 *p++ = '\\';
1924 *p++ = 'U';
1925 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1926 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1927 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1928 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1929 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1930 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1931 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1932 *p++ = hexdigit[ucs & 0x0000000F];
1933 continue;
1934 }
1935 /* Fall through: isolated surrogates are copied as-is */
1936 s--;
1937 size++;
1938 }
1939
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001941 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001942 *p++ = '\\';
1943 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001944 *p++ = hexdigit[(ch >> 12) & 0x000F];
1945 *p++ = hexdigit[(ch >> 8) & 0x000F];
1946 *p++ = hexdigit[(ch >> 4) & 0x000F];
1947 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001948 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001949
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001950 /* Map special whitespace to '\t', \n', '\r' */
1951 else if (ch == '\t') {
1952 *p++ = '\\';
1953 *p++ = 't';
1954 }
1955 else if (ch == '\n') {
1956 *p++ = '\\';
1957 *p++ = 'n';
1958 }
1959 else if (ch == '\r') {
1960 *p++ = '\\';
1961 *p++ = 'r';
1962 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001963
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001964 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001965 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001967 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001968 *p++ = hexdigit[(ch >> 4) & 0x000F];
1969 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001970 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001971
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972 /* Copy everything else as-is */
1973 else
1974 *p++ = (char) ch;
1975 }
1976 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001977 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001978
1979 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00001980 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001981 return repr;
1982}
1983
1984PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1985 int size)
1986{
1987 return unicodeescape_string(s, size, 0);
1988}
1989
1990PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1991{
1992 if (!PyUnicode_Check(unicode)) {
1993 PyErr_BadArgument();
1994 return NULL;
1995 }
1996 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1997 PyUnicode_GET_SIZE(unicode));
1998}
1999
2000/* --- Raw Unicode Escape Codec ------------------------------------------- */
2001
2002PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
2003 int size,
2004 const char *errors)
2005{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002006 const char *starts = s;
2007 int startinpos;
2008 int endinpos;
2009 int outpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010 PyUnicodeObject *v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002011 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002012 const char *end;
2013 const char *bs;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002014 PyObject *errorHandler = NULL;
2015 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002016
2017 /* Escaped strings will always be longer than the resulting
2018 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002019 length after conversion to the true value. (But decoding error
2020 handler might have to resize the string) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 v = _PyUnicode_New(size);
2022 if (v == NULL)
2023 goto onError;
2024 if (size == 0)
2025 return (PyObject *)v;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002026 p = PyUnicode_AS_UNICODE(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002027 end = s + size;
2028 while (s < end) {
2029 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00002030 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002031 int i;
2032
2033 /* Non-escape characters are interpreted as Unicode ordinals */
2034 if (*s != '\\') {
2035 *p++ = (unsigned char)*s++;
2036 continue;
2037 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002038 startinpos = s-starts;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039
2040 /* \u-escapes are only interpreted iff the number of leading
2041 backslashes if odd */
2042 bs = s;
2043 for (;s < end;) {
2044 if (*s != '\\')
2045 break;
2046 *p++ = (unsigned char)*s++;
2047 }
2048 if (((s - bs) & 1) == 0 ||
2049 s >= end ||
2050 *s != 'u') {
2051 continue;
2052 }
2053 p--;
2054 s++;
2055
2056 /* \uXXXX with 4 hex digits */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002057 outpos = p-PyUnicode_AS_UNICODE(v);
2058 for (x = 0, i = 0; i < 4; ++i, ++s) {
2059 c = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 if (!isxdigit(c)) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002061 endinpos = s-starts;
2062 if (unicode_decode_call_errorhandler(
2063 errors, &errorHandler,
2064 "rawunicodeescape", "truncated \\uXXXX",
2065 starts, size, &startinpos, &endinpos, &exc, &s,
2066 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002067 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002068 goto nextByte;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 }
2070 x = (x<<4) & ~0xF;
2071 if (c >= '0' && c <= '9')
2072 x += c - '0';
2073 else if (c >= 'a' && c <= 'f')
2074 x += 10 + c - 'a';
2075 else
2076 x += 10 + c - 'A';
2077 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002078 *p++ = x;
2079 nextByte:
2080 ;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002081 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002082 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002083 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002084 Py_XDECREF(errorHandler);
2085 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086 return (PyObject *)v;
2087
2088 onError:
2089 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002090 Py_XDECREF(errorHandler);
2091 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092 return NULL;
2093}
2094
2095PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
2096 int size)
2097{
2098 PyObject *repr;
2099 char *p;
2100 char *q;
2101
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00002102 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00002103
2104 repr = PyString_FromStringAndSize(NULL, 6 * size);
2105 if (repr == NULL)
2106 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002107 if (size == 0)
2108 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109
2110 p = q = PyString_AS_STRING(repr);
2111 while (size-- > 0) {
2112 Py_UNICODE ch = *s++;
2113 /* Map 16-bit characters to '\uxxxx' */
2114 if (ch >= 256) {
2115 *p++ = '\\';
2116 *p++ = 'u';
2117 *p++ = hexdigit[(ch >> 12) & 0xf];
2118 *p++ = hexdigit[(ch >> 8) & 0xf];
2119 *p++ = hexdigit[(ch >> 4) & 0xf];
2120 *p++ = hexdigit[ch & 15];
2121 }
2122 /* Copy everything else as-is */
2123 else
2124 *p++ = (char) ch;
2125 }
2126 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00002127 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002128 return repr;
2129}
2130
2131PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2132{
2133 if (!PyUnicode_Check(unicode)) {
2134 PyErr_BadArgument();
2135 return NULL;
2136 }
2137 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2138 PyUnicode_GET_SIZE(unicode));
2139}
2140
2141/* --- Latin-1 Codec ------------------------------------------------------ */
2142
2143PyObject *PyUnicode_DecodeLatin1(const char *s,
2144 int size,
2145 const char *errors)
2146{
2147 PyUnicodeObject *v;
2148 Py_UNICODE *p;
2149
2150 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002151 if (size == 1 && *(unsigned char*)s < 256) {
2152 Py_UNICODE r = *(unsigned char*)s;
2153 return PyUnicode_FromUnicode(&r, 1);
2154 }
2155
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156 v = _PyUnicode_New(size);
2157 if (v == NULL)
2158 goto onError;
2159 if (size == 0)
2160 return (PyObject *)v;
2161 p = PyUnicode_AS_UNICODE(v);
2162 while (size-- > 0)
2163 *p++ = (unsigned char)*s++;
2164 return (PyObject *)v;
2165
2166 onError:
2167 Py_XDECREF(v);
2168 return NULL;
2169}
2170
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002171/* create or adjust a UnicodeEncodeError */
2172static void make_encode_exception(PyObject **exceptionObject,
2173 const char *encoding,
2174 const Py_UNICODE *unicode, int size,
2175 int startpos, int endpos,
2176 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002178 if (*exceptionObject == NULL) {
2179 *exceptionObject = PyUnicodeEncodeError_Create(
2180 encoding, unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181 }
2182 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002183 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
2184 goto onError;
2185 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
2186 goto onError;
2187 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
2188 goto onError;
2189 return;
2190 onError:
2191 Py_DECREF(*exceptionObject);
2192 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002193 }
2194}
2195
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002196/* raises a UnicodeEncodeError */
2197static void raise_encode_exception(PyObject **exceptionObject,
2198 const char *encoding,
2199 const Py_UNICODE *unicode, int size,
2200 int startpos, int endpos,
2201 const char *reason)
2202{
2203 make_encode_exception(exceptionObject,
2204 encoding, unicode, size, startpos, endpos, reason);
2205 if (*exceptionObject != NULL)
2206 PyCodec_StrictErrors(*exceptionObject);
2207}
2208
2209/* error handling callback helper:
2210 build arguments, call the callback and check the arguments,
2211 put the result into newpos and return the replacement string, which
2212 has to be freed by the caller */
2213static PyObject *unicode_encode_call_errorhandler(const char *errors,
2214 PyObject **errorHandler,
2215 const char *encoding, const char *reason,
2216 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
2217 int startpos, int endpos,
2218 int *newpos)
2219{
2220 static char *argparse = "O!i;encoding error handler must return (unicode, int) tuple";
2221
2222 PyObject *restuple;
2223 PyObject *resunicode;
2224
2225 if (*errorHandler == NULL) {
2226 *errorHandler = PyCodec_LookupError(errors);
2227 if (*errorHandler == NULL)
2228 return NULL;
2229 }
2230
2231 make_encode_exception(exceptionObject,
2232 encoding, unicode, size, startpos, endpos, reason);
2233 if (*exceptionObject == NULL)
2234 return NULL;
2235
2236 restuple = PyObject_CallFunctionObjArgs(
2237 *errorHandler, *exceptionObject, NULL);
2238 if (restuple == NULL)
2239 return NULL;
2240 if (!PyTuple_Check(restuple)) {
2241 PyErr_Format(PyExc_TypeError, &argparse[4]);
2242 Py_DECREF(restuple);
2243 return NULL;
2244 }
2245 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
2246 &resunicode, newpos)) {
2247 Py_DECREF(restuple);
2248 return NULL;
2249 }
2250 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00002251 *newpos = size+*newpos;
2252 if (*newpos<0 || *newpos>size) {
2253 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
2254 Py_DECREF(restuple);
2255 return NULL;
2256 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002257 Py_INCREF(resunicode);
2258 Py_DECREF(restuple);
2259 return resunicode;
2260}
2261
2262static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
2263 int size,
2264 const char *errors,
2265 int limit)
2266{
2267 /* output object */
2268 PyObject *res;
2269 /* pointers to the beginning and end+1 of input */
2270 const Py_UNICODE *startp = p;
2271 const Py_UNICODE *endp = p + size;
2272 /* pointer to the beginning of the unencodable characters */
2273 /* const Py_UNICODE *badp = NULL; */
2274 /* pointer into the output */
2275 char *str;
2276 /* current output position */
2277 int respos = 0;
2278 int ressize;
2279 char *encoding = (limit == 256) ? "latin-1" : "ascii";
2280 char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
2281 PyObject *errorHandler = NULL;
2282 PyObject *exc = NULL;
2283 /* the following variable is used for caching string comparisons
2284 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
2285 int known_errorHandler = -1;
2286
2287 /* allocate enough for a simple encoding without
2288 replacements, if we need more, we'll resize */
2289 res = PyString_FromStringAndSize(NULL, size);
2290 if (res == NULL)
2291 goto onError;
2292 if (size == 0)
2293 return res;
2294 str = PyString_AS_STRING(res);
2295 ressize = size;
2296
2297 while (p<endp) {
2298 Py_UNICODE c = *p;
2299
2300 /* can we encode this? */
2301 if (c<limit) {
2302 /* no overflow check, because we know that the space is enough */
2303 *str++ = (char)c;
2304 ++p;
2305 }
2306 else {
2307 int unicodepos = p-startp;
2308 int requiredsize;
2309 PyObject *repunicode;
2310 int repsize;
2311 int newpos;
2312 int respos;
2313 Py_UNICODE *uni2;
2314 /* startpos for collecting unencodable chars */
2315 const Py_UNICODE *collstart = p;
2316 const Py_UNICODE *collend = p;
2317 /* find all unecodable characters */
2318 while ((collend < endp) && ((*collend)>=limit))
2319 ++collend;
2320 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
2321 if (known_errorHandler==-1) {
2322 if ((errors==NULL) || (!strcmp(errors, "strict")))
2323 known_errorHandler = 1;
2324 else if (!strcmp(errors, "replace"))
2325 known_errorHandler = 2;
2326 else if (!strcmp(errors, "ignore"))
2327 known_errorHandler = 3;
2328 else if (!strcmp(errors, "xmlcharrefreplace"))
2329 known_errorHandler = 4;
2330 else
2331 known_errorHandler = 0;
2332 }
2333 switch (known_errorHandler) {
2334 case 1: /* strict */
2335 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
2336 goto onError;
2337 case 2: /* replace */
2338 while (collstart++<collend)
2339 *str++ = '?'; /* fall through */
2340 case 3: /* ignore */
2341 p = collend;
2342 break;
2343 case 4: /* xmlcharrefreplace */
2344 respos = str-PyString_AS_STRING(res);
2345 /* determine replacement size (temporarily (mis)uses p) */
2346 for (p = collstart, repsize = 0; p < collend; ++p) {
2347 if (*p<10)
2348 repsize += 2+1+1;
2349 else if (*p<100)
2350 repsize += 2+2+1;
2351 else if (*p<1000)
2352 repsize += 2+3+1;
2353 else if (*p<10000)
2354 repsize += 2+4+1;
2355 else if (*p<100000)
2356 repsize += 2+5+1;
2357 else if (*p<1000000)
2358 repsize += 2+6+1;
2359 else
2360 repsize += 2+7+1;
2361 }
2362 requiredsize = respos+repsize+(endp-collend);
2363 if (requiredsize > ressize) {
2364 if (requiredsize<2*ressize)
2365 requiredsize = 2*ressize;
2366 if (_PyString_Resize(&res, requiredsize))
2367 goto onError;
2368 str = PyString_AS_STRING(res) + respos;
2369 ressize = requiredsize;
2370 }
2371 /* generate replacement (temporarily (mis)uses p) */
2372 for (p = collstart; p < collend; ++p) {
2373 str += sprintf(str, "&#%d;", (int)*p);
2374 }
2375 p = collend;
2376 break;
2377 default:
2378 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2379 encoding, reason, startp, size, &exc,
2380 collstart-startp, collend-startp, &newpos);
2381 if (repunicode == NULL)
2382 goto onError;
2383 /* need more space? (at least enough for what we
2384 have+the replacement+the rest of the string, so
2385 we won't have to check space for encodable characters) */
2386 respos = str-PyString_AS_STRING(res);
2387 repsize = PyUnicode_GET_SIZE(repunicode);
2388 requiredsize = respos+repsize+(endp-collend);
2389 if (requiredsize > ressize) {
2390 if (requiredsize<2*ressize)
2391 requiredsize = 2*ressize;
2392 if (_PyString_Resize(&res, requiredsize)) {
2393 Py_DECREF(repunicode);
2394 goto onError;
2395 }
2396 str = PyString_AS_STRING(res) + respos;
2397 ressize = requiredsize;
2398 }
2399 /* check if there is anything unencodable in the replacement
2400 and copy it to the output */
2401 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
2402 c = *uni2;
2403 if (c >= limit) {
2404 raise_encode_exception(&exc, encoding, startp, size,
2405 unicodepos, unicodepos+1, reason);
2406 Py_DECREF(repunicode);
2407 goto onError;
2408 }
2409 *str = (char)c;
2410 }
2411 p = startp + newpos;
2412 Py_DECREF(repunicode);
2413 }
2414 }
2415 }
2416 /* Resize if we allocated to much */
2417 respos = str-PyString_AS_STRING(res);
2418 if (respos<ressize)
2419 /* If this falls res will be NULL */
2420 _PyString_Resize(&res, respos);
2421 Py_XDECREF(errorHandler);
2422 Py_XDECREF(exc);
2423 return res;
2424
2425 onError:
2426 Py_XDECREF(res);
2427 Py_XDECREF(errorHandler);
2428 Py_XDECREF(exc);
2429 return NULL;
2430}
2431
Guido van Rossumd57fd912000-03-10 22:53:23 +00002432PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2433 int size,
2434 const char *errors)
2435{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002436 return unicode_encode_ucs1(p, size, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002437}
2438
2439PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2440{
2441 if (!PyUnicode_Check(unicode)) {
2442 PyErr_BadArgument();
2443 return NULL;
2444 }
2445 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2446 PyUnicode_GET_SIZE(unicode),
2447 NULL);
2448}
2449
2450/* --- 7-bit ASCII Codec -------------------------------------------------- */
2451
Guido van Rossumd57fd912000-03-10 22:53:23 +00002452PyObject *PyUnicode_DecodeASCII(const char *s,
2453 int size,
2454 const char *errors)
2455{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002456 const char *starts = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002457 PyUnicodeObject *v;
2458 Py_UNICODE *p;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002459 int startinpos;
2460 int endinpos;
2461 int outpos;
2462 const char *e;
2463 PyObject *errorHandler = NULL;
2464 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465
2466 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002467 if (size == 1 && *(unsigned char*)s < 128) {
2468 Py_UNICODE r = *(unsigned char*)s;
2469 return PyUnicode_FromUnicode(&r, 1);
2470 }
2471
Guido van Rossumd57fd912000-03-10 22:53:23 +00002472 v = _PyUnicode_New(size);
2473 if (v == NULL)
2474 goto onError;
2475 if (size == 0)
2476 return (PyObject *)v;
2477 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002478 e = s + size;
2479 while (s < e) {
2480 register unsigned char c = (unsigned char)*s;
2481 if (c < 128) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002482 *p++ = c;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002483 ++s;
2484 }
2485 else {
2486 startinpos = s-starts;
2487 endinpos = startinpos + 1;
2488 outpos = p-PyUnicode_AS_UNICODE(v);
2489 if (unicode_decode_call_errorhandler(
2490 errors, &errorHandler,
2491 "ascii", "ordinal not in range(128)",
2492 starts, size, &startinpos, &endinpos, &exc, &s,
2493 (PyObject **)&v, &outpos, &p))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002494 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002495 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002496 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002497 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002498 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002499 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002500 Py_XDECREF(errorHandler);
2501 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502 return (PyObject *)v;
2503
2504 onError:
2505 Py_XDECREF(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002506 Py_XDECREF(errorHandler);
2507 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508 return NULL;
2509}
2510
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2512 int size,
2513 const char *errors)
2514{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002515 return unicode_encode_ucs1(p, size, errors, 128);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002516}
2517
2518PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2519{
2520 if (!PyUnicode_Check(unicode)) {
2521 PyErr_BadArgument();
2522 return NULL;
2523 }
2524 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2525 PyUnicode_GET_SIZE(unicode),
2526 NULL);
2527}
2528
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002529#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002530
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002531/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002532
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002533PyObject *PyUnicode_DecodeMBCS(const char *s,
2534 int size,
2535 const char *errors)
2536{
2537 PyUnicodeObject *v;
2538 Py_UNICODE *p;
2539
2540 /* First get the size of the result */
2541 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002542 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002543 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2544
2545 v = _PyUnicode_New(usize);
2546 if (v == NULL)
2547 return NULL;
2548 if (usize == 0)
2549 return (PyObject *)v;
2550 p = PyUnicode_AS_UNICODE(v);
2551 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2552 Py_DECREF(v);
2553 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2554 }
2555
2556 return (PyObject *)v;
2557}
2558
2559PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2560 int size,
2561 const char *errors)
2562{
2563 PyObject *repr;
2564 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002565 DWORD mbcssize;
2566
2567 /* If there are no characters, bail now! */
2568 if (size==0)
2569 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002570
2571 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002572 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002573 if (mbcssize==0)
2574 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2575
2576 repr = PyString_FromStringAndSize(NULL, mbcssize);
2577 if (repr == NULL)
2578 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002579 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002580 return repr;
2581
2582 /* Do the conversion */
2583 s = PyString_AS_STRING(repr);
2584 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2585 Py_DECREF(repr);
2586 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2587 }
2588 return repr;
2589}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002590
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002591#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002592
Guido van Rossumd57fd912000-03-10 22:53:23 +00002593/* --- Character Mapping Codec -------------------------------------------- */
2594
Guido van Rossumd57fd912000-03-10 22:53:23 +00002595PyObject *PyUnicode_DecodeCharmap(const char *s,
2596 int size,
2597 PyObject *mapping,
2598 const char *errors)
2599{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002600 const char *starts = s;
2601 int startinpos;
2602 int endinpos;
2603 int outpos;
2604 const char *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605 PyUnicodeObject *v;
2606 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002607 int extrachars = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002608 PyObject *errorHandler = NULL;
2609 PyObject *exc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610
2611 /* Default to Latin-1 */
2612 if (mapping == NULL)
2613 return PyUnicode_DecodeLatin1(s, size, errors);
2614
2615 v = _PyUnicode_New(size);
2616 if (v == NULL)
2617 goto onError;
2618 if (size == 0)
2619 return (PyObject *)v;
2620 p = PyUnicode_AS_UNICODE(v);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002621 e = s + size;
2622 while (s < e) {
2623 unsigned char ch = *s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002624 PyObject *w, *x;
2625
2626 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2627 w = PyInt_FromLong((long)ch);
2628 if (w == NULL)
2629 goto onError;
2630 x = PyObject_GetItem(mapping, w);
2631 Py_DECREF(w);
2632 if (x == NULL) {
2633 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002634 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002635 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002636 x = Py_None;
2637 Py_INCREF(x);
2638 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002639 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002640 }
2641
2642 /* Apply mapping */
2643 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002644 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002645 if (value < 0 || value > 65535) {
2646 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002647 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002648 Py_DECREF(x);
2649 goto onError;
2650 }
2651 *p++ = (Py_UNICODE)value;
2652 }
2653 else if (x == Py_None) {
2654 /* undefined mapping */
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002655 outpos = p-PyUnicode_AS_UNICODE(v);
2656 startinpos = s-starts;
2657 endinpos = startinpos+1;
2658 if (unicode_decode_call_errorhandler(
2659 errors, &errorHandler,
2660 "charmap", "character maps to <undefined>",
2661 starts, size, &startinpos, &endinpos, &exc, &s,
2662 (PyObject **)&v, &outpos, &p)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002663 Py_DECREF(x);
2664 goto onError;
2665 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002666 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002667 }
2668 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002669 int targetsize = PyUnicode_GET_SIZE(x);
2670
2671 if (targetsize == 1)
2672 /* 1-1 mapping */
2673 *p++ = *PyUnicode_AS_UNICODE(x);
2674
2675 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002677 if (targetsize > extrachars) {
2678 /* resize first */
2679 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2680 int needed = (targetsize - extrachars) + \
2681 (targetsize << 2);
2682 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002683 if (_PyUnicode_Resize(&v,
2684 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002685 Py_DECREF(x);
2686 goto onError;
2687 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002688 p = PyUnicode_AS_UNICODE(v) + oldpos;
2689 }
2690 Py_UNICODE_COPY(p,
2691 PyUnicode_AS_UNICODE(x),
2692 targetsize);
2693 p += targetsize;
2694 extrachars -= targetsize;
2695 }
2696 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002697 }
2698 else {
2699 /* wrong return value */
2700 PyErr_SetString(PyExc_TypeError,
2701 "character mapping must return integer, None or unicode");
2702 Py_DECREF(x);
2703 goto onError;
2704 }
2705 Py_DECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002706 ++s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002707 }
2708 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002709 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002710 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002711 Py_XDECREF(errorHandler);
2712 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002713 return (PyObject *)v;
2714
2715 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002716 Py_XDECREF(errorHandler);
2717 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718 Py_XDECREF(v);
2719 return NULL;
2720}
2721
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002722/* Lookup the character ch in the mapping. If the character
2723 can't be found, Py_None is returned (or NULL, if another
2724 error occured). */
2725static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002726{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002727 PyObject *w = PyInt_FromLong((long)c);
2728 PyObject *x;
2729
2730 if (w == NULL)
2731 return NULL;
2732 x = PyObject_GetItem(mapping, w);
2733 Py_DECREF(w);
2734 if (x == NULL) {
2735 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2736 /* No mapping found means: mapping is undefined. */
2737 PyErr_Clear();
2738 x = Py_None;
2739 Py_INCREF(x);
2740 return x;
2741 } else
2742 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002743 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00002744 else if (x == Py_None)
2745 return x;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002746 else if (PyInt_Check(x)) {
2747 long value = PyInt_AS_LONG(x);
2748 if (value < 0 || value > 255) {
2749 PyErr_SetString(PyExc_TypeError,
2750 "character mapping must be in range(256)");
2751 Py_DECREF(x);
2752 return NULL;
2753 }
2754 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002756 else if (PyString_Check(x))
2757 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002758 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002759 /* wrong return value */
2760 PyErr_SetString(PyExc_TypeError,
2761 "character mapping must return integer, None or str");
2762 Py_DECREF(x);
2763 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002764 }
2765}
2766
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002767/* lookup the character, put the result in the output string and adjust
2768 various state variables. Reallocate the output string if not enough
2769 space is available. Return a new reference to the object that
2770 was put in the output buffer, or Py_None, if the mapping was undefined
2771 (in which case no character was written) or NULL, if a
2772 reallocation error ocurred. The called must decref the result */
2773static
2774PyObject *charmapencode_output(Py_UNICODE c, PyObject *mapping,
2775 PyObject **outobj, int *outpos)
2776{
2777 PyObject *rep = charmapencode_lookup(c, mapping);
2778
2779 if (rep==NULL)
2780 return NULL;
2781 else if (rep==Py_None)
2782 return rep;
2783 else {
2784 char *outstart = PyString_AS_STRING(*outobj);
2785 int outsize = PyString_GET_SIZE(*outobj);
2786 if (PyInt_Check(rep)) {
2787 int requiredsize = *outpos+1;
2788 if (outsize<requiredsize) {
2789 /* exponentially overallocate to minimize reallocations */
2790 if (requiredsize < 2*outsize)
2791 requiredsize = 2*outsize;
2792 if (_PyString_Resize(outobj, requiredsize)) {
2793 Py_DECREF(rep);
2794 return NULL;
2795 }
2796 outstart = PyString_AS_STRING(*outobj);
2797 }
2798 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
2799 }
2800 else {
2801 const char *repchars = PyString_AS_STRING(rep);
2802 int repsize = PyString_GET_SIZE(rep);
2803 int requiredsize = *outpos+repsize;
2804 if (outsize<requiredsize) {
2805 /* exponentially overallocate to minimize reallocations */
2806 if (requiredsize < 2*outsize)
2807 requiredsize = 2*outsize;
2808 if (_PyString_Resize(outobj, requiredsize)) {
2809 Py_DECREF(rep);
2810 return NULL;
2811 }
2812 outstart = PyString_AS_STRING(*outobj);
2813 }
2814 memcpy(outstart + *outpos, repchars, repsize);
2815 *outpos += repsize;
2816 }
2817 }
2818 return rep;
2819}
2820
2821/* handle an error in PyUnicode_EncodeCharmap
2822 Return 0 on success, -1 on error */
2823static
2824int charmap_encoding_error(
2825 const Py_UNICODE *p, int size, int *inpos, PyObject *mapping,
2826 PyObject **exceptionObject,
2827 int *known_errorHandler, PyObject *errorHandler, const char *errors,
2828 PyObject **res, int *respos)
2829{
2830 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
2831 int repsize;
2832 int newpos;
2833 Py_UNICODE *uni2;
2834 /* startpos for collecting unencodable chars */
2835 int collstartpos = *inpos;
2836 int collendpos = *inpos+1;
2837 int collpos;
2838 char *encoding = "charmap";
2839 char *reason = "character maps to <undefined>";
2840
2841 PyObject *x;
2842 /* find all unencodable characters */
2843 while (collendpos < size) {
2844 x = charmapencode_lookup(p[collendpos], mapping);
2845 if (x==NULL)
2846 return -1;
2847 else if (x!=Py_None) {
2848 Py_DECREF(x);
2849 break;
2850 }
2851 Py_DECREF(x);
2852 ++collendpos;
2853 }
2854 /* cache callback name lookup
2855 * (if not done yet, i.e. it's the first error) */
2856 if (*known_errorHandler==-1) {
2857 if ((errors==NULL) || (!strcmp(errors, "strict")))
2858 *known_errorHandler = 1;
2859 else if (!strcmp(errors, "replace"))
2860 *known_errorHandler = 2;
2861 else if (!strcmp(errors, "ignore"))
2862 *known_errorHandler = 3;
2863 else if (!strcmp(errors, "xmlcharrefreplace"))
2864 *known_errorHandler = 4;
2865 else
2866 *known_errorHandler = 0;
2867 }
2868 switch (*known_errorHandler) {
2869 case 1: /* strict */
2870 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2871 return -1;
2872 case 2: /* replace */
2873 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
2874 x = charmapencode_output('?', mapping, res, respos);
2875 if (x==NULL) {
2876 return -1;
2877 }
2878 else if (x==Py_None) {
2879 Py_DECREF(x);
2880 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2881 return -1;
2882 }
2883 Py_DECREF(x);
2884 }
2885 /* fall through */
2886 case 3: /* ignore */
2887 *inpos = collendpos;
2888 break;
2889 case 4: /* xmlcharrefreplace */
2890 /* generate replacement (temporarily (mis)uses p) */
2891 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
2892 char buffer[2+29+1+1];
2893 char *cp;
2894 sprintf(buffer, "&#%d;", (int)p[collpos]);
2895 for (cp = buffer; *cp; ++cp) {
2896 x = charmapencode_output(*cp, mapping, res, respos);
2897 if (x==NULL)
2898 return -1;
2899 else if (x==Py_None) {
2900 Py_DECREF(x);
2901 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2902 return -1;
2903 }
2904 Py_DECREF(x);
2905 }
2906 }
2907 *inpos = collendpos;
2908 break;
2909 default:
2910 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
2911 encoding, reason, p, size, exceptionObject,
2912 collstartpos, collendpos, &newpos);
2913 if (repunicode == NULL)
2914 return -1;
2915 /* generate replacement */
2916 repsize = PyUnicode_GET_SIZE(repunicode);
2917 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
2918 x = charmapencode_output(*uni2, mapping, res, respos);
2919 if (x==NULL) {
2920 Py_DECREF(repunicode);
2921 return -1;
2922 }
2923 else if (x==Py_None) {
2924 Py_DECREF(repunicode);
2925 Py_DECREF(x);
2926 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
2927 return -1;
2928 }
2929 Py_DECREF(x);
2930 }
2931 *inpos = newpos;
2932 Py_DECREF(repunicode);
2933 }
2934 return 0;
2935}
2936
Guido van Rossumd57fd912000-03-10 22:53:23 +00002937PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2938 int size,
2939 PyObject *mapping,
2940 const char *errors)
2941{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002942 /* output object */
2943 PyObject *res = NULL;
2944 /* current input position */
2945 int inpos = 0;
2946 /* current output position */
2947 int respos = 0;
2948 PyObject *errorHandler = NULL;
2949 PyObject *exc = NULL;
2950 /* the following variable is used for caching string comparisons
2951 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
2952 * 3=ignore, 4=xmlcharrefreplace */
2953 int known_errorHandler = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002954
2955 /* Default to Latin-1 */
2956 if (mapping == NULL)
2957 return PyUnicode_EncodeLatin1(p, size, errors);
2958
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002959 /* allocate enough for a simple encoding without
2960 replacements, if we need more, we'll resize */
2961 res = PyString_FromStringAndSize(NULL, size);
2962 if (res == NULL)
2963 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002964 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002965 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002966
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002967 while (inpos<size) {
2968 /* try to encode it */
2969 PyObject *x = charmapencode_output(p[inpos], mapping, &res, &respos);
2970 if (x==NULL) /* error */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002971 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002972 if (x==Py_None) { /* unencodable character */
2973 if (charmap_encoding_error(p, size, &inpos, mapping,
2974 &exc,
2975 &known_errorHandler, errorHandler, errors,
2976 &res, &respos))
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002977 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002979 else
2980 /* done with this character => adjust input position */
2981 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982 Py_DECREF(x);
2983 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002984
Walter Dörwald3aeb6322002-09-02 13:14:32 +00002985 /* Resize if we allocated to much */
2986 if (respos<PyString_GET_SIZE(res)) {
2987 if (_PyString_Resize(&res, respos))
2988 goto onError;
2989 }
2990 Py_XDECREF(exc);
2991 Py_XDECREF(errorHandler);
2992 return res;
2993
2994 onError:
2995 Py_XDECREF(res);
2996 Py_XDECREF(exc);
2997 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002998 return NULL;
2999}
3000
3001PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
3002 PyObject *mapping)
3003{
3004 if (!PyUnicode_Check(unicode) || mapping == NULL) {
3005 PyErr_BadArgument();
3006 return NULL;
3007 }
3008 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
3009 PyUnicode_GET_SIZE(unicode),
3010 mapping,
3011 NULL);
3012}
3013
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003014/* create or adjust a UnicodeTranslateError */
3015static void make_translate_exception(PyObject **exceptionObject,
3016 const Py_UNICODE *unicode, int size,
3017 int startpos, int endpos,
3018 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003019{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003020 if (*exceptionObject == NULL) {
3021 *exceptionObject = PyUnicodeTranslateError_Create(
3022 unicode, size, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003023 }
3024 else {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003025 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
3026 goto onError;
3027 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
3028 goto onError;
3029 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
3030 goto onError;
3031 return;
3032 onError:
3033 Py_DECREF(*exceptionObject);
3034 *exceptionObject = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035 }
3036}
3037
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003038/* raises a UnicodeTranslateError */
3039static void raise_translate_exception(PyObject **exceptionObject,
3040 const Py_UNICODE *unicode, int size,
3041 int startpos, int endpos,
3042 const char *reason)
3043{
3044 make_translate_exception(exceptionObject,
3045 unicode, size, startpos, endpos, reason);
3046 if (*exceptionObject != NULL)
3047 PyCodec_StrictErrors(*exceptionObject);
3048}
3049
3050/* error handling callback helper:
3051 build arguments, call the callback and check the arguments,
3052 put the result into newpos and return the replacement string, which
3053 has to be freed by the caller */
3054static PyObject *unicode_translate_call_errorhandler(const char *errors,
3055 PyObject **errorHandler,
3056 const char *reason,
3057 const Py_UNICODE *unicode, int size, PyObject **exceptionObject,
3058 int startpos, int endpos,
3059 int *newpos)
3060{
3061 static char *argparse = "O!i;translating error handler must return (unicode, int) tuple";
3062
3063 PyObject *restuple;
3064 PyObject *resunicode;
3065
3066 if (*errorHandler == NULL) {
3067 *errorHandler = PyCodec_LookupError(errors);
3068 if (*errorHandler == NULL)
3069 return NULL;
3070 }
3071
3072 make_translate_exception(exceptionObject,
3073 unicode, size, startpos, endpos, reason);
3074 if (*exceptionObject == NULL)
3075 return NULL;
3076
3077 restuple = PyObject_CallFunctionObjArgs(
3078 *errorHandler, *exceptionObject, NULL);
3079 if (restuple == NULL)
3080 return NULL;
3081 if (!PyTuple_Check(restuple)) {
3082 PyErr_Format(PyExc_TypeError, &argparse[4]);
3083 Py_DECREF(restuple);
3084 return NULL;
3085 }
3086 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3087 &resunicode, newpos)) {
3088 Py_DECREF(restuple);
3089 return NULL;
3090 }
3091 if (*newpos<0)
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00003092 *newpos = size+*newpos;
3093 if (*newpos<0 || *newpos>size) {
3094 PyErr_Format(PyExc_IndexError, "position %d from error handler out of bounds", *newpos);
3095 Py_DECREF(restuple);
3096 return NULL;
3097 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003098 Py_INCREF(resunicode);
3099 Py_DECREF(restuple);
3100 return resunicode;
3101}
3102
3103/* Lookup the character ch in the mapping and put the result in result,
3104 which must be decrefed by the caller.
3105 Return 0 on success, -1 on error */
3106static
3107int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
3108{
3109 PyObject *w = PyInt_FromLong((long)c);
3110 PyObject *x;
3111
3112 if (w == NULL)
3113 return -1;
3114 x = PyObject_GetItem(mapping, w);
3115 Py_DECREF(w);
3116 if (x == NULL) {
3117 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
3118 /* No mapping found means: use 1:1 mapping. */
3119 PyErr_Clear();
3120 *result = NULL;
3121 return 0;
3122 } else
3123 return -1;
3124 }
3125 else if (x == Py_None) {
3126 *result = x;
3127 return 0;
3128 }
3129 else if (PyInt_Check(x)) {
3130 long value = PyInt_AS_LONG(x);
3131 long max = PyUnicode_GetMax();
3132 if (value < 0 || value > max) {
3133 PyErr_Format(PyExc_TypeError,
3134 "character mapping must be in range(0x%lx)", max+1);
3135 Py_DECREF(x);
3136 return -1;
3137 }
3138 *result = x;
3139 return 0;
3140 }
3141 else if (PyUnicode_Check(x)) {
3142 *result = x;
3143 return 0;
3144 }
3145 else {
3146 /* wrong return value */
3147 PyErr_SetString(PyExc_TypeError,
3148 "character mapping must return integer, None or unicode");
3149 return -1;
3150 }
3151}
3152/* ensure that *outobj is at least requiredsize characters long,
3153if not reallocate and adjust various state variables.
3154Return 0 on success, -1 on error */
3155static
3156int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp, int *outsize,
3157 int requiredsize)
3158{
3159 if (requiredsize > *outsize) {
3160 /* remember old output position */
3161 int outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
3162 /* exponentially overallocate to minimize reallocations */
3163 if (requiredsize < 2 * *outsize)
3164 requiredsize = 2 * *outsize;
3165 if (_PyUnicode_Resize(outobj, requiredsize))
3166 return -1;
3167 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
3168 *outsize = requiredsize;
3169 }
3170 return 0;
3171}
3172/* lookup the character, put the result in the output string and adjust
3173 various state variables. Return a new reference to the object that
3174 was put in the output buffer in *result, or Py_None, if the mapping was
3175 undefined (in which case no character was written).
3176 The called must decref result.
3177 Return 0 on success, -1 on error. */
3178static
3179int charmaptranslate_output(Py_UNICODE c, PyObject *mapping,
3180 PyObject **outobj, int *outsize, Py_UNICODE **outp, PyObject **res)
3181{
3182 if (charmaptranslate_lookup(c, mapping, res))
3183 return -1;
3184 if (*res==NULL) {
3185 /* not found => default to 1:1 mapping */
3186 *(*outp)++ = (Py_UNICODE)c;
3187 }
3188 else if (*res==Py_None)
3189 ;
3190 else if (PyInt_Check(*res)) {
3191 /* no overflow check, because we know that the space is enough */
3192 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
3193 }
3194 else if (PyUnicode_Check(*res)) {
3195 int repsize = PyUnicode_GET_SIZE(*res);
3196 if (repsize==1) {
3197 /* no overflow check, because we know that the space is enough */
3198 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
3199 }
3200 else if (repsize!=0) {
3201 /* more than one character */
3202 int requiredsize = *outsize + repsize - 1;
3203 if (charmaptranslate_makespace(outobj, outp, outsize, requiredsize))
3204 return -1;
3205 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
3206 *outp += repsize;
3207 }
3208 }
3209 else
3210 return -1;
3211 return 0;
3212}
3213
3214PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 int size,
3216 PyObject *mapping,
3217 const char *errors)
3218{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003219 /* output object */
3220 PyObject *res = NULL;
3221 /* pointers to the beginning and end+1 of input */
3222 const Py_UNICODE *startp = p;
3223 const Py_UNICODE *endp = p + size;
3224 /* pointer into the output */
3225 Py_UNICODE *str;
3226 /* current output position */
3227 int respos = 0;
3228 int ressize;
3229 char *reason = "character maps to <undefined>";
3230 PyObject *errorHandler = NULL;
3231 PyObject *exc = NULL;
3232 /* the following variable is used for caching string comparisons
3233 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
3234 * 3=ignore, 4=xmlcharrefreplace */
3235 int known_errorHandler = -1;
3236
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 if (mapping == NULL) {
3238 PyErr_BadArgument();
3239 return NULL;
3240 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003241
3242 /* allocate enough for a simple 1:1 translation without
3243 replacements, if we need more, we'll resize */
3244 res = PyUnicode_FromUnicode(NULL, size);
3245 if (res == NULL)
3246 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 if (size == 0)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003248 return res;
3249 str = PyUnicode_AS_UNICODE(res);
3250 ressize = size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003251
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003252 while (p<endp) {
3253 /* try to encode it */
3254 PyObject *x = NULL;
3255 if (charmaptranslate_output(*p, mapping, &res, &ressize, &str, &x)) {
3256 Py_XDECREF(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257 goto onError;
3258 }
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00003259 Py_XDECREF(x);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003260 if (x!=Py_None) /* it worked => adjust input pointer */
3261 ++p;
3262 else { /* untranslatable character */
3263 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
3264 int repsize;
3265 int newpos;
3266 Py_UNICODE *uni2;
3267 /* startpos for collecting untranslatable chars */
3268 const Py_UNICODE *collstart = p;
3269 const Py_UNICODE *collend = p+1;
3270 const Py_UNICODE *coll;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003272 /* find all untranslatable characters */
3273 while (collend < endp) {
3274 if (charmaptranslate_lookup(*collend, mapping, &x))
3275 goto onError;
3276 Py_XDECREF(x);
3277 if (x!=Py_None)
3278 break;
3279 ++collend;
3280 }
3281 /* cache callback name lookup
3282 * (if not done yet, i.e. it's the first error) */
3283 if (known_errorHandler==-1) {
3284 if ((errors==NULL) || (!strcmp(errors, "strict")))
3285 known_errorHandler = 1;
3286 else if (!strcmp(errors, "replace"))
3287 known_errorHandler = 2;
3288 else if (!strcmp(errors, "ignore"))
3289 known_errorHandler = 3;
3290 else if (!strcmp(errors, "xmlcharrefreplace"))
3291 known_errorHandler = 4;
3292 else
3293 known_errorHandler = 0;
3294 }
3295 switch (known_errorHandler) {
3296 case 1: /* strict */
3297 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
3298 goto onError;
3299 case 2: /* replace */
3300 /* No need to check for space, this is a 1:1 replacement */
3301 for (coll = collstart; coll<collend; ++coll)
3302 *str++ = '?';
3303 /* fall through */
3304 case 3: /* ignore */
3305 p = collend;
3306 break;
3307 case 4: /* xmlcharrefreplace */
3308 /* generate replacement (temporarily (mis)uses p) */
3309 for (p = collstart; p < collend; ++p) {
3310 char buffer[2+29+1+1];
3311 char *cp;
3312 sprintf(buffer, "&#%d;", (int)*p);
3313 if (charmaptranslate_makespace(&res, &str, &ressize,
3314 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
3315 goto onError;
3316 for (cp = buffer; *cp; ++cp)
3317 *str++ = *cp;
3318 }
3319 p = collend;
3320 break;
3321 default:
3322 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
3323 reason, startp, size, &exc,
3324 collstart-startp, collend-startp, &newpos);
3325 if (repunicode == NULL)
3326 goto onError;
3327 /* generate replacement */
3328 repsize = PyUnicode_GET_SIZE(repunicode);
3329 if (charmaptranslate_makespace(&res, &str, &ressize,
3330 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
3331 Py_DECREF(repunicode);
3332 goto onError;
3333 }
3334 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
3335 *str++ = *uni2;
3336 p = startp + newpos;
3337 Py_DECREF(repunicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 }
3339 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003341 /* Resize if we allocated to much */
3342 respos = str-PyUnicode_AS_UNICODE(res);
3343 if (respos<ressize) {
3344 if (_PyUnicode_Resize(&res, respos))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00003345 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003346 }
3347 Py_XDECREF(exc);
3348 Py_XDECREF(errorHandler);
3349 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003351 onError:
3352 Py_XDECREF(res);
3353 Py_XDECREF(exc);
3354 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003355 return NULL;
3356}
3357
3358PyObject *PyUnicode_Translate(PyObject *str,
3359 PyObject *mapping,
3360 const char *errors)
3361{
3362 PyObject *result;
3363
3364 str = PyUnicode_FromObject(str);
3365 if (str == NULL)
3366 goto onError;
3367 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
3368 PyUnicode_GET_SIZE(str),
3369 mapping,
3370 errors);
3371 Py_DECREF(str);
3372 return result;
3373
3374 onError:
3375 Py_XDECREF(str);
3376 return NULL;
3377}
3378
Guido van Rossum9e896b32000-04-05 20:11:21 +00003379/* --- Decimal Encoder ---------------------------------------------------- */
3380
3381int PyUnicode_EncodeDecimal(Py_UNICODE *s,
3382 int length,
3383 char *output,
3384 const char *errors)
3385{
3386 Py_UNICODE *p, *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003387 PyObject *errorHandler = NULL;
3388 PyObject *exc = NULL;
3389 const char *encoding = "decimal";
3390 const char *reason = "invalid decimal Unicode string";
3391 /* the following variable is used for caching string comparisons
3392 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3393 int known_errorHandler = -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003394
3395 if (output == NULL) {
3396 PyErr_BadArgument();
3397 return -1;
3398 }
3399
3400 p = s;
3401 end = s + length;
3402 while (p < end) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003403 register Py_UNICODE ch = *p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003404 int decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003405 PyObject *repunicode;
3406 int repsize;
3407 int newpos;
3408 Py_UNICODE *uni2;
3409 Py_UNICODE *collstart;
3410 Py_UNICODE *collend;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003411
3412 if (Py_UNICODE_ISSPACE(ch)) {
3413 *output++ = ' ';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003414 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003415 continue;
3416 }
3417 decimal = Py_UNICODE_TODECIMAL(ch);
3418 if (decimal >= 0) {
3419 *output++ = '0' + decimal;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003420 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003421 continue;
3422 }
Guido van Rossumba477042000-04-06 18:18:10 +00003423 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00003424 *output++ = (char)ch;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003425 ++p;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003426 continue;
3427 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003428 /* All other characters are considered unencodable */
3429 collstart = p;
3430 collend = p+1;
3431 while (collend < end) {
3432 if ((0 < *collend && *collend < 256) ||
3433 !Py_UNICODE_ISSPACE(*collend) ||
3434 Py_UNICODE_TODECIMAL(*collend))
3435 break;
Guido van Rossum9e896b32000-04-05 20:11:21 +00003436 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003437 /* cache callback name lookup
3438 * (if not done yet, i.e. it's the first error) */
3439 if (known_errorHandler==-1) {
3440 if ((errors==NULL) || (!strcmp(errors, "strict")))
3441 known_errorHandler = 1;
3442 else if (!strcmp(errors, "replace"))
3443 known_errorHandler = 2;
3444 else if (!strcmp(errors, "ignore"))
3445 known_errorHandler = 3;
3446 else if (!strcmp(errors, "xmlcharrefreplace"))
3447 known_errorHandler = 4;
3448 else
3449 known_errorHandler = 0;
3450 }
3451 switch (known_errorHandler) {
3452 case 1: /* strict */
3453 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
3454 goto onError;
3455 case 2: /* replace */
3456 for (p = collstart; p < collend; ++p)
3457 *output++ = '?';
3458 /* fall through */
3459 case 3: /* ignore */
3460 p = collend;
3461 break;
3462 case 4: /* xmlcharrefreplace */
3463 /* generate replacement (temporarily (mis)uses p) */
3464 for (p = collstart; p < collend; ++p)
3465 output += sprintf(output, "&#%d;", (int)*p);
3466 p = collend;
3467 break;
3468 default:
3469 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3470 encoding, reason, s, length, &exc,
3471 collstart-s, collend-s, &newpos);
3472 if (repunicode == NULL)
3473 goto onError;
3474 /* generate replacement */
3475 repsize = PyUnicode_GET_SIZE(repunicode);
3476 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
3477 Py_UNICODE ch = *uni2;
3478 if (Py_UNICODE_ISSPACE(ch))
3479 *output++ = ' ';
3480 else {
3481 decimal = Py_UNICODE_TODECIMAL(ch);
3482 if (decimal >= 0)
3483 *output++ = '0' + decimal;
3484 else if (0 < ch && ch < 256)
3485 *output++ = (char)ch;
3486 else {
3487 Py_DECREF(repunicode);
3488 raise_encode_exception(&exc, encoding,
3489 s, length, collstart-s, collend-s, reason);
3490 goto onError;
3491 }
3492 }
3493 }
3494 p = s + newpos;
3495 Py_DECREF(repunicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003496 }
3497 }
3498 /* 0-terminate the output string */
3499 *output++ = '\0';
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003500 Py_XDECREF(exc);
3501 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003502 return 0;
3503
3504 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00003505 Py_XDECREF(exc);
3506 Py_XDECREF(errorHandler);
Guido van Rossum9e896b32000-04-05 20:11:21 +00003507 return -1;
3508}
3509
Guido van Rossumd57fd912000-03-10 22:53:23 +00003510/* --- Helpers ------------------------------------------------------------ */
3511
3512static
3513int count(PyUnicodeObject *self,
3514 int start,
3515 int end,
3516 PyUnicodeObject *substring)
3517{
3518 int count = 0;
3519
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00003520 if (start < 0)
3521 start += self->length;
3522 if (start < 0)
3523 start = 0;
3524 if (end > self->length)
3525 end = self->length;
3526 if (end < 0)
3527 end += self->length;
3528 if (end < 0)
3529 end = 0;
3530
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003531 if (substring->length == 0)
3532 return (end - start + 1);
3533
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534 end -= substring->length;
3535
3536 while (start <= end)
3537 if (Py_UNICODE_MATCH(self, start, substring)) {
3538 count++;
3539 start += substring->length;
3540 } else
3541 start++;
3542
3543 return count;
3544}
3545
3546int PyUnicode_Count(PyObject *str,
3547 PyObject *substr,
3548 int start,
3549 int end)
3550{
3551 int result;
3552
3553 str = PyUnicode_FromObject(str);
3554 if (str == NULL)
3555 return -1;
3556 substr = PyUnicode_FromObject(substr);
3557 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00003558 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559 return -1;
3560 }
3561
3562 result = count((PyUnicodeObject *)str,
3563 start, end,
3564 (PyUnicodeObject *)substr);
3565
3566 Py_DECREF(str);
3567 Py_DECREF(substr);
3568 return result;
3569}
3570
3571static
3572int findstring(PyUnicodeObject *self,
3573 PyUnicodeObject *substring,
3574 int start,
3575 int end,
3576 int direction)
3577{
3578 if (start < 0)
3579 start += self->length;
3580 if (start < 0)
3581 start = 0;
3582
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583 if (end > self->length)
3584 end = self->length;
3585 if (end < 0)
3586 end += self->length;
3587 if (end < 0)
3588 end = 0;
3589
Guido van Rossum76afbd92002-08-20 17:29:29 +00003590 if (substring->length == 0)
3591 return (direction > 0) ? start : end;
3592
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593 end -= substring->length;
3594
3595 if (direction < 0) {
3596 for (; end >= start; end--)
3597 if (Py_UNICODE_MATCH(self, end, substring))
3598 return end;
3599 } else {
3600 for (; start <= end; start++)
3601 if (Py_UNICODE_MATCH(self, start, substring))
3602 return start;
3603 }
3604
3605 return -1;
3606}
3607
3608int PyUnicode_Find(PyObject *str,
3609 PyObject *substr,
3610 int start,
3611 int end,
3612 int direction)
3613{
3614 int result;
3615
3616 str = PyUnicode_FromObject(str);
3617 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003618 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003619 substr = PyUnicode_FromObject(substr);
3620 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00003621 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00003622 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623 }
3624
3625 result = findstring((PyUnicodeObject *)str,
3626 (PyUnicodeObject *)substr,
3627 start, end, direction);
3628 Py_DECREF(str);
3629 Py_DECREF(substr);
3630 return result;
3631}
3632
3633static
3634int tailmatch(PyUnicodeObject *self,
3635 PyUnicodeObject *substring,
3636 int start,
3637 int end,
3638 int direction)
3639{
3640 if (start < 0)
3641 start += self->length;
3642 if (start < 0)
3643 start = 0;
3644
3645 if (substring->length == 0)
3646 return 1;
3647
3648 if (end > self->length)
3649 end = self->length;
3650 if (end < 0)
3651 end += self->length;
3652 if (end < 0)
3653 end = 0;
3654
3655 end -= substring->length;
3656 if (end < start)
3657 return 0;
3658
3659 if (direction > 0) {
3660 if (Py_UNICODE_MATCH(self, end, substring))
3661 return 1;
3662 } else {
3663 if (Py_UNICODE_MATCH(self, start, substring))
3664 return 1;
3665 }
3666
3667 return 0;
3668}
3669
3670int PyUnicode_Tailmatch(PyObject *str,
3671 PyObject *substr,
3672 int start,
3673 int end,
3674 int direction)
3675{
3676 int result;
3677
3678 str = PyUnicode_FromObject(str);
3679 if (str == NULL)
3680 return -1;
3681 substr = PyUnicode_FromObject(substr);
3682 if (substr == NULL) {
3683 Py_DECREF(substr);
3684 return -1;
3685 }
3686
3687 result = tailmatch((PyUnicodeObject *)str,
3688 (PyUnicodeObject *)substr,
3689 start, end, direction);
3690 Py_DECREF(str);
3691 Py_DECREF(substr);
3692 return result;
3693}
3694
3695static
3696const Py_UNICODE *findchar(const Py_UNICODE *s,
3697 int size,
3698 Py_UNICODE ch)
3699{
3700 /* like wcschr, but doesn't stop at NULL characters */
3701
3702 while (size-- > 0) {
3703 if (*s == ch)
3704 return s;
3705 s++;
3706 }
3707
3708 return NULL;
3709}
3710
3711/* Apply fixfct filter to the Unicode object self and return a
3712 reference to the modified object */
3713
3714static
3715PyObject *fixup(PyUnicodeObject *self,
3716 int (*fixfct)(PyUnicodeObject *s))
3717{
3718
3719 PyUnicodeObject *u;
3720
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003721 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003722 if (u == NULL)
3723 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003724
3725 Py_UNICODE_COPY(u->str, self->str, self->length);
3726
Tim Peters7a29bd52001-09-12 03:03:31 +00003727 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003728 /* fixfct should return TRUE if it modified the buffer. If
3729 FALSE, return a reference to the original buffer instead
3730 (to save space, not time) */
3731 Py_INCREF(self);
3732 Py_DECREF(u);
3733 return (PyObject*) self;
3734 }
3735 return (PyObject*) u;
3736}
3737
3738static
3739int fixupper(PyUnicodeObject *self)
3740{
3741 int len = self->length;
3742 Py_UNICODE *s = self->str;
3743 int status = 0;
3744
3745 while (len-- > 0) {
3746 register Py_UNICODE ch;
3747
3748 ch = Py_UNICODE_TOUPPER(*s);
3749 if (ch != *s) {
3750 status = 1;
3751 *s = ch;
3752 }
3753 s++;
3754 }
3755
3756 return status;
3757}
3758
3759static
3760int fixlower(PyUnicodeObject *self)
3761{
3762 int len = self->length;
3763 Py_UNICODE *s = self->str;
3764 int status = 0;
3765
3766 while (len-- > 0) {
3767 register Py_UNICODE ch;
3768
3769 ch = Py_UNICODE_TOLOWER(*s);
3770 if (ch != *s) {
3771 status = 1;
3772 *s = ch;
3773 }
3774 s++;
3775 }
3776
3777 return status;
3778}
3779
3780static
3781int fixswapcase(PyUnicodeObject *self)
3782{
3783 int len = self->length;
3784 Py_UNICODE *s = self->str;
3785 int status = 0;
3786
3787 while (len-- > 0) {
3788 if (Py_UNICODE_ISUPPER(*s)) {
3789 *s = Py_UNICODE_TOLOWER(*s);
3790 status = 1;
3791 } else if (Py_UNICODE_ISLOWER(*s)) {
3792 *s = Py_UNICODE_TOUPPER(*s);
3793 status = 1;
3794 }
3795 s++;
3796 }
3797
3798 return status;
3799}
3800
3801static
3802int fixcapitalize(PyUnicodeObject *self)
3803{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003804 int len = self->length;
3805 Py_UNICODE *s = self->str;
3806 int status = 0;
3807
3808 if (len == 0)
3809 return 0;
3810 if (Py_UNICODE_ISLOWER(*s)) {
3811 *s = Py_UNICODE_TOUPPER(*s);
3812 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003813 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003814 s++;
3815 while (--len > 0) {
3816 if (Py_UNICODE_ISUPPER(*s)) {
3817 *s = Py_UNICODE_TOLOWER(*s);
3818 status = 1;
3819 }
3820 s++;
3821 }
3822 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823}
3824
3825static
3826int fixtitle(PyUnicodeObject *self)
3827{
3828 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3829 register Py_UNICODE *e;
3830 int previous_is_cased;
3831
3832 /* Shortcut for single character strings */
3833 if (PyUnicode_GET_SIZE(self) == 1) {
3834 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3835 if (*p != ch) {
3836 *p = ch;
3837 return 1;
3838 }
3839 else
3840 return 0;
3841 }
3842
3843 e = p + PyUnicode_GET_SIZE(self);
3844 previous_is_cased = 0;
3845 for (; p < e; p++) {
3846 register const Py_UNICODE ch = *p;
3847
3848 if (previous_is_cased)
3849 *p = Py_UNICODE_TOLOWER(ch);
3850 else
3851 *p = Py_UNICODE_TOTITLE(ch);
3852
3853 if (Py_UNICODE_ISLOWER(ch) ||
3854 Py_UNICODE_ISUPPER(ch) ||
3855 Py_UNICODE_ISTITLE(ch))
3856 previous_is_cased = 1;
3857 else
3858 previous_is_cased = 0;
3859 }
3860 return 1;
3861}
3862
3863PyObject *PyUnicode_Join(PyObject *separator,
3864 PyObject *seq)
3865{
3866 Py_UNICODE *sep;
3867 int seplen;
3868 PyUnicodeObject *res = NULL;
3869 int reslen = 0;
3870 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871 int sz = 100;
3872 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003873 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003874
Tim Peters2cfe3682001-05-05 05:36:48 +00003875 it = PyObject_GetIter(seq);
3876 if (it == NULL)
3877 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003878
3879 if (separator == NULL) {
3880 Py_UNICODE blank = ' ';
3881 sep = &blank;
3882 seplen = 1;
3883 }
3884 else {
3885 separator = PyUnicode_FromObject(separator);
3886 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003887 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003888 sep = PyUnicode_AS_UNICODE(separator);
3889 seplen = PyUnicode_GET_SIZE(separator);
3890 }
3891
3892 res = _PyUnicode_New(sz);
3893 if (res == NULL)
3894 goto onError;
3895 p = PyUnicode_AS_UNICODE(res);
3896 reslen = 0;
3897
Tim Peters2cfe3682001-05-05 05:36:48 +00003898 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003899 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003900 PyObject *item = PyIter_Next(it);
3901 if (item == NULL) {
3902 if (PyErr_Occurred())
3903 goto onError;
3904 break;
3905 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906 if (!PyUnicode_Check(item)) {
3907 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003908 if (!PyString_Check(item)) {
3909 PyErr_Format(PyExc_TypeError,
3910 "sequence item %i: expected string or Unicode,"
3911 " %.80s found",
3912 i, item->ob_type->tp_name);
3913 Py_DECREF(item);
3914 goto onError;
3915 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003916 v = PyUnicode_FromObject(item);
3917 Py_DECREF(item);
3918 item = v;
3919 if (item == NULL)
3920 goto onError;
3921 }
3922 itemlen = PyUnicode_GET_SIZE(item);
3923 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003924 if (_PyUnicode_Resize(&res, sz*2)) {
3925 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003926 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003927 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003928 sz *= 2;
3929 p = PyUnicode_AS_UNICODE(res) + reslen;
3930 }
3931 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003932 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933 p += seplen;
3934 reslen += seplen;
3935 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003936 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003937 p += itemlen;
3938 reslen += itemlen;
3939 Py_DECREF(item);
3940 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003941 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003942 goto onError;
3943
3944 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003945 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003946 return (PyObject *)res;
3947
3948 onError:
3949 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003950 Py_XDECREF(res);
3951 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003952 return NULL;
3953}
3954
3955static
3956PyUnicodeObject *pad(PyUnicodeObject *self,
3957 int left,
3958 int right,
3959 Py_UNICODE fill)
3960{
3961 PyUnicodeObject *u;
3962
3963 if (left < 0)
3964 left = 0;
3965 if (right < 0)
3966 right = 0;
3967
Tim Peters7a29bd52001-09-12 03:03:31 +00003968 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003969 Py_INCREF(self);
3970 return self;
3971 }
3972
3973 u = _PyUnicode_New(left + self->length + right);
3974 if (u) {
3975 if (left)
3976 Py_UNICODE_FILL(u->str, fill, left);
3977 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3978 if (right)
3979 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3980 }
3981
3982 return u;
3983}
3984
/* Append the slice data[left:right] to list as a new Unicode object.

   The expansion site must provide a PyObject *str scratch variable, a
   PyObject *list and an onError label, which is jumped to when either
   the slice cannot be created or the append fails.  The macro expands
   to a single statement and must be followed by a semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3995
3996static
3997PyObject *split_whitespace(PyUnicodeObject *self,
3998 PyObject *list,
3999 int maxcount)
4000{
4001 register int i;
4002 register int j;
4003 int len = self->length;
4004 PyObject *str;
4005
4006 for (i = j = 0; i < len; ) {
4007 /* find a token */
4008 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4009 i++;
4010 j = i;
4011 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
4012 i++;
4013 if (j < i) {
4014 if (maxcount-- <= 0)
4015 break;
4016 SPLIT_APPEND(self->str, j, i);
4017 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
4018 i++;
4019 j = i;
4020 }
4021 }
4022 if (j < len) {
4023 SPLIT_APPEND(self->str, j, len);
4024 }
4025 return list;
4026
4027 onError:
4028 Py_DECREF(list);
4029 return NULL;
4030}
4031
4032PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00004033 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004034{
4035 register int i;
4036 register int j;
4037 int len;
4038 PyObject *list;
4039 PyObject *str;
4040 Py_UNICODE *data;
4041
4042 string = PyUnicode_FromObject(string);
4043 if (string == NULL)
4044 return NULL;
4045 data = PyUnicode_AS_UNICODE(string);
4046 len = PyUnicode_GET_SIZE(string);
4047
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048 list = PyList_New(0);
4049 if (!list)
4050 goto onError;
4051
4052 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00004053 int eol;
4054
Guido van Rossumd57fd912000-03-10 22:53:23 +00004055 /* Find a line and append it */
4056 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
4057 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058
4059 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00004060 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004061 if (i < len) {
4062 if (data[i] == '\r' && i + 1 < len &&
4063 data[i+1] == '\n')
4064 i += 2;
4065 else
4066 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00004067 if (keepends)
4068 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004069 }
Guido van Rossum86662912000-04-11 15:38:46 +00004070 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071 j = i;
4072 }
4073 if (j < len) {
4074 SPLIT_APPEND(data, j, len);
4075 }
4076
4077 Py_DECREF(string);
4078 return list;
4079
4080 onError:
4081 Py_DECREF(list);
4082 Py_DECREF(string);
4083 return NULL;
4084}
4085
4086static
4087PyObject *split_char(PyUnicodeObject *self,
4088 PyObject *list,
4089 Py_UNICODE ch,
4090 int maxcount)
4091{
4092 register int i;
4093 register int j;
4094 int len = self->length;
4095 PyObject *str;
4096
4097 for (i = j = 0; i < len; ) {
4098 if (self->str[i] == ch) {
4099 if (maxcount-- <= 0)
4100 break;
4101 SPLIT_APPEND(self->str, j, i);
4102 i = j = i + 1;
4103 } else
4104 i++;
4105 }
4106 if (j <= len) {
4107 SPLIT_APPEND(self->str, j, len);
4108 }
4109 return list;
4110
4111 onError:
4112 Py_DECREF(list);
4113 return NULL;
4114}
4115
4116static
4117PyObject *split_substring(PyUnicodeObject *self,
4118 PyObject *list,
4119 PyUnicodeObject *substring,
4120 int maxcount)
4121{
4122 register int i;
4123 register int j;
4124 int len = self->length;
4125 int sublen = substring->length;
4126 PyObject *str;
4127
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00004128 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129 if (Py_UNICODE_MATCH(self, i, substring)) {
4130 if (maxcount-- <= 0)
4131 break;
4132 SPLIT_APPEND(self->str, j, i);
4133 i = j = i + sublen;
4134 } else
4135 i++;
4136 }
4137 if (j <= len) {
4138 SPLIT_APPEND(self->str, j, len);
4139 }
4140 return list;
4141
4142 onError:
4143 Py_DECREF(list);
4144 return NULL;
4145}
4146
4147#undef SPLIT_APPEND
4148
4149static
4150PyObject *split(PyUnicodeObject *self,
4151 PyUnicodeObject *substring,
4152 int maxcount)
4153{
4154 PyObject *list;
4155
4156 if (maxcount < 0)
4157 maxcount = INT_MAX;
4158
4159 list = PyList_New(0);
4160 if (!list)
4161 return NULL;
4162
4163 if (substring == NULL)
4164 return split_whitespace(self,list,maxcount);
4165
4166 else if (substring->length == 1)
4167 return split_char(self,list,substring->str[0],maxcount);
4168
4169 else if (substring->length == 0) {
4170 Py_DECREF(list);
4171 PyErr_SetString(PyExc_ValueError, "empty separator");
4172 return NULL;
4173 }
4174 else
4175 return split_substring(self,list,substring,maxcount);
4176}
4177
4178static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179PyObject *replace(PyUnicodeObject *self,
4180 PyUnicodeObject *str1,
4181 PyUnicodeObject *str2,
4182 int maxcount)
4183{
4184 PyUnicodeObject *u;
4185
4186 if (maxcount < 0)
4187 maxcount = INT_MAX;
4188
4189 if (str1->length == 1 && str2->length == 1) {
4190 int i;
4191
4192 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00004193 if (!findchar(self->str, self->length, str1->str[0]) &&
4194 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004195 /* nothing to replace, return original string */
4196 Py_INCREF(self);
4197 u = self;
4198 } else {
4199 Py_UNICODE u1 = str1->str[0];
4200 Py_UNICODE u2 = str2->str[0];
4201
4202 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004203 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004204 self->length
4205 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004206 if (u != NULL) {
4207 Py_UNICODE_COPY(u->str, self->str,
4208 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004209 for (i = 0; i < u->length; i++)
4210 if (u->str[i] == u1) {
4211 if (--maxcount < 0)
4212 break;
4213 u->str[i] = u2;
4214 }
4215 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004217
4218 } else {
4219 int n, i;
4220 Py_UNICODE *p;
4221
4222 /* replace strings */
4223 n = count(self, 0, self->length, str1);
4224 if (n > maxcount)
4225 n = maxcount;
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004226 if (n == 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004227 /* nothing to replace, return original string */
Guido van Rossum2023c9b2002-08-23 18:50:21 +00004228 if (PyUnicode_CheckExact(self)) {
4229 Py_INCREF(self);
4230 u = self;
4231 }
4232 else {
4233 u = (PyUnicodeObject *)
4234 PyUnicode_FromUnicode(self->str, self->length);
4235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004236 } else {
4237 u = _PyUnicode_New(
4238 self->length + n * (str2->length - str1->length));
4239 if (u) {
4240 i = 0;
4241 p = u->str;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004242 if (str1->length > 0) {
4243 while (i <= self->length - str1->length)
4244 if (Py_UNICODE_MATCH(self, i, str1)) {
4245 /* replace string segment */
4246 Py_UNICODE_COPY(p, str2->str, str2->length);
4247 p += str2->length;
4248 i += str1->length;
4249 if (--n <= 0) {
4250 /* copy remaining part */
4251 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4252 break;
4253 }
4254 } else
4255 *p++ = self->str[i++];
4256 } else {
4257 while (n > 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004258 Py_UNICODE_COPY(p, str2->str, str2->length);
4259 p += str2->length;
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004260 if (--n <= 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004261 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004262 *p++ = self->str[i++];
Guido van Rossum8b1a6d62002-08-23 18:21:28 +00004263 }
4264 Py_UNICODE_COPY(p, self->str+i, self->length-i);
4265 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004266 }
4267 }
4268 }
4269
4270 return (PyObject *) u;
4271}
4272
4273/* --- Unicode Object Methods --------------------------------------------- */
4274
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004275PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276"S.title() -> unicode\n\
4277\n\
4278Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004279characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280
4281static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004282unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284 return fixup(self, fixtitle);
4285}
4286
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004287PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288"S.capitalize() -> unicode\n\
4289\n\
4290Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004291have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292
4293static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004294unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004295{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004296 return fixup(self, fixcapitalize);
4297}
4298
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled method implementation: split self on whitespace, capitalize
   every word with fixup()/fixcapitalize and join the words again with
   the default separator of PyUnicode_Join().  Returns NULL on error. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *capped = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* swap the capitalized word into the list slot */
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
4336
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004337PyDoc_STRVAR(center__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004338"S.center(width) -> unicode\n\
4339\n\
4340Return S centered in a Unicode string of length width. Padding is done\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004341using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004342
4343static PyObject *
4344unicode_center(PyUnicodeObject *self, PyObject *args)
4345{
4346 int marg, left;
4347 int width;
4348
4349 if (!PyArg_ParseTuple(args, "i:center", &width))
4350 return NULL;
4351
Tim Peters7a29bd52001-09-12 03:03:31 +00004352 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004353 Py_INCREF(self);
4354 return (PyObject*) self;
4355 }
4356
4357 marg = width - self->length;
4358 left = marg / 2 + (marg & width & 1);
4359
4360 return (PyObject*) pad(self, left, marg - left, ' ');
4361}
4362
Marc-André Lemburge5034372000-08-08 08:04:29 +00004363#if 0
4364
4365/* This code should go into some future Unicode collation support
4366 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00004367 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00004368
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004369/* speedy UTF-16 code point order comparison */
4370/* gleaned from: */
4371/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
4372
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004373static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004374{
4375 0, 0, 0, 0, 0, 0, 0, 0,
4376 0, 0, 0, 0, 0, 0, 0, 0,
4377 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004378 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004379};
4380
Guido van Rossumd57fd912000-03-10 22:53:23 +00004381static int
4382unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4383{
4384 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004385
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386 Py_UNICODE *s1 = str1->str;
4387 Py_UNICODE *s2 = str2->str;
4388
4389 len1 = str1->length;
4390 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004391
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00004393 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004394
4395 c1 = *s1++;
4396 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00004397
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004398 if (c1 > (1<<11) * 26)
4399 c1 += utf16Fixup[c1>>11];
4400 if (c2 > (1<<11) * 26)
4401 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004402 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00004403
4404 if (c1 != c2)
4405 return (c1 < c2) ? -1 : 1;
4406
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00004407 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408 }
4409
4410 return (len1 < len2) ? -1 : (len1 != len2);
4411}
4412
Marc-André Lemburge5034372000-08-08 08:04:29 +00004413#else
4414
4415static int
4416unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
4417{
4418 register int len1, len2;
4419
4420 Py_UNICODE *s1 = str1->str;
4421 Py_UNICODE *s2 = str2->str;
4422
4423 len1 = str1->length;
4424 len2 = str2->length;
4425
4426 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00004427 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00004428
Fredrik Lundh45714e92001-06-26 16:39:36 +00004429 c1 = *s1++;
4430 c2 = *s2++;
4431
4432 if (c1 != c2)
4433 return (c1 < c2) ? -1 : 1;
4434
Marc-André Lemburge5034372000-08-08 08:04:29 +00004435 len1--; len2--;
4436 }
4437
4438 return (len1 < len2) ? -1 : (len1 != len2);
4439}
4440
4441#endif
4442
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443int PyUnicode_Compare(PyObject *left,
4444 PyObject *right)
4445{
4446 PyUnicodeObject *u = NULL, *v = NULL;
4447 int result;
4448
4449 /* Coerce the two arguments */
4450 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4451 if (u == NULL)
4452 goto onError;
4453 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4454 if (v == NULL)
4455 goto onError;
4456
Thomas Wouters7e474022000-07-16 12:04:32 +00004457 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004458 if (v == u) {
4459 Py_DECREF(u);
4460 Py_DECREF(v);
4461 return 0;
4462 }
4463
4464 result = unicode_compare(u, v);
4465
4466 Py_DECREF(u);
4467 Py_DECREF(v);
4468 return result;
4469
4470onError:
4471 Py_XDECREF(u);
4472 Py_XDECREF(v);
4473 return -1;
4474}
4475
Guido van Rossum403d68b2000-03-13 15:55:09 +00004476int PyUnicode_Contains(PyObject *container,
4477 PyObject *element)
4478{
4479 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00004480 int result, size;
4481 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004482
4483 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004484 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004485 if (v == NULL) {
4486 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00004487 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00004488 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00004489 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00004490 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
Marc-André Lemburg9cd87aa2002-10-23 09:02:46 +00004491 if (u == NULL)
Guido van Rossum9e896b32000-04-05 20:11:21 +00004492 goto onError;
Guido van Rossum403d68b2000-03-13 15:55:09 +00004493
Barry Warsaw817918c2002-08-06 16:58:21 +00004494 size = PyUnicode_GET_SIZE(v);
4495 rhs = PyUnicode_AS_UNICODE(v);
4496 lhs = PyUnicode_AS_UNICODE(u);
4497
Guido van Rossum403d68b2000-03-13 15:55:09 +00004498 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00004499 if (size == 1) {
4500 end = lhs + PyUnicode_GET_SIZE(u);
4501 while (lhs < end) {
4502 if (*lhs++ == *rhs) {
4503 result = 1;
4504 break;
4505 }
4506 }
4507 }
4508 else {
4509 end = lhs + (PyUnicode_GET_SIZE(u) - size);
4510 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00004511 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00004512 result = 1;
4513 break;
4514 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00004515 }
4516 }
4517
4518 Py_DECREF(u);
4519 Py_DECREF(v);
4520 return result;
4521
4522onError:
4523 Py_XDECREF(u);
4524 Py_XDECREF(v);
4525 return -1;
4526}
4527
Guido van Rossumd57fd912000-03-10 22:53:23 +00004528/* Concat to string or Unicode object giving a new Unicode object. */
4529
4530PyObject *PyUnicode_Concat(PyObject *left,
4531 PyObject *right)
4532{
4533 PyUnicodeObject *u = NULL, *v = NULL, *w;
4534
4535 /* Coerce the two arguments */
4536 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
4537 if (u == NULL)
4538 goto onError;
4539 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
4540 if (v == NULL)
4541 goto onError;
4542
4543 /* Shortcuts */
4544 if (v == unicode_empty) {
4545 Py_DECREF(v);
4546 return (PyObject *)u;
4547 }
4548 if (u == unicode_empty) {
4549 Py_DECREF(u);
4550 return (PyObject *)v;
4551 }
4552
4553 /* Concat the two Unicode strings */
4554 w = _PyUnicode_New(u->length + v->length);
4555 if (w == NULL)
4556 goto onError;
4557 Py_UNICODE_COPY(w->str, u->str, u->length);
4558 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
4559
4560 Py_DECREF(u);
4561 Py_DECREF(v);
4562 return (PyObject *)w;
4563
4564onError:
4565 Py_XDECREF(u);
4566 Py_XDECREF(v);
4567 return NULL;
4568}
4569
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004570PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004571"S.count(sub[, start[, end]]) -> int\n\
4572\n\
4573Return the number of occurrences of substring sub in Unicode string\n\
4574S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004575interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004576
4577static PyObject *
4578unicode_count(PyUnicodeObject *self, PyObject *args)
4579{
4580 PyUnicodeObject *substring;
4581 int start = 0;
4582 int end = INT_MAX;
4583 PyObject *result;
4584
Guido van Rossumb8872e62000-05-09 14:14:27 +00004585 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
4586 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587 return NULL;
4588
4589 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4590 (PyObject *)substring);
4591 if (substring == NULL)
4592 return NULL;
4593
Guido van Rossumd57fd912000-03-10 22:53:23 +00004594 if (start < 0)
4595 start += self->length;
4596 if (start < 0)
4597 start = 0;
4598 if (end > self->length)
4599 end = self->length;
4600 if (end < 0)
4601 end += self->length;
4602 if (end < 0)
4603 end = 0;
4604
4605 result = PyInt_FromLong((long) count(self, start, end, substring));
4606
4607 Py_DECREF(substring);
4608 return result;
4609}
4610
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004611PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612"S.encode([encoding[,errors]]) -> string\n\
4613\n\
Fred Drakee4315f52000-05-09 19:53:39 +00004614Return an encoded string version of S. Default encoding is the current\n\
4615default string encoding. errors may be given to set a different error\n\
4616handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004617a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
4618'xmlcharrefreplace' as well as any other name registered with\n\
4619codecs.register_error that can handle UnicodeEncodeErrors.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004620
4621static PyObject *
4622unicode_encode(PyUnicodeObject *self, PyObject *args)
4623{
4624 char *encoding = NULL;
4625 char *errors = NULL;
4626 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
4627 return NULL;
4628 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
4629}
4630
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004631PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004632"S.expandtabs([tabsize]) -> unicode\n\
4633\n\
4634Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004635If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004636
4637static PyObject*
4638unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
4639{
4640 Py_UNICODE *e;
4641 Py_UNICODE *p;
4642 Py_UNICODE *q;
4643 int i, j;
4644 PyUnicodeObject *u;
4645 int tabsize = 8;
4646
4647 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
4648 return NULL;
4649
Thomas Wouters7e474022000-07-16 12:04:32 +00004650 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004651 i = j = 0;
4652 e = self->str + self->length;
4653 for (p = self->str; p < e; p++)
4654 if (*p == '\t') {
4655 if (tabsize > 0)
4656 j += tabsize - (j % tabsize);
4657 }
4658 else {
4659 j++;
4660 if (*p == '\n' || *p == '\r') {
4661 i += j;
4662 j = 0;
4663 }
4664 }
4665
4666 /* Second pass: create output string and fill it */
4667 u = _PyUnicode_New(i + j);
4668 if (!u)
4669 return NULL;
4670
4671 j = 0;
4672 q = u->str;
4673
4674 for (p = self->str; p < e; p++)
4675 if (*p == '\t') {
4676 if (tabsize > 0) {
4677 i = tabsize - (j % tabsize);
4678 j += i;
4679 while (i--)
4680 *q++ = ' ';
4681 }
4682 }
4683 else {
4684 j++;
4685 *q++ = *p;
4686 if (*p == '\n' || *p == '\r')
4687 j = 0;
4688 }
4689
4690 return (PyObject*) u;
4691}
4692
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004693PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694"S.find(sub [,start [,end]]) -> int\n\
4695\n\
4696Return the lowest index in S where substring sub is found,\n\
4697such that sub is contained within s[start,end]. Optional\n\
4698arguments start and end are interpreted as in slice notation.\n\
4699\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004700Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004701
4702static PyObject *
4703unicode_find(PyUnicodeObject *self, PyObject *args)
4704{
4705 PyUnicodeObject *substring;
4706 int start = 0;
4707 int end = INT_MAX;
4708 PyObject *result;
4709
Guido van Rossumb8872e62000-05-09 14:14:27 +00004710 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4711 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004712 return NULL;
4713 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4714 (PyObject *)substring);
4715 if (substring == NULL)
4716 return NULL;
4717
4718 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4719
4720 Py_DECREF(substring);
4721 return result;
4722}
4723
4724static PyObject *
4725unicode_getitem(PyUnicodeObject *self, int index)
4726{
4727 if (index < 0 || index >= self->length) {
4728 PyErr_SetString(PyExc_IndexError, "string index out of range");
4729 return NULL;
4730 }
4731
4732 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4733}
4734
4735static long
4736unicode_hash(PyUnicodeObject *self)
4737{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004738 /* Since Unicode objects compare equal to their ASCII string
4739 counterparts, they should use the individual character values
4740 as basis for their hash value. This is needed to assure that
4741 strings and Unicode objects behave in the same way as
4742 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743
Fredrik Lundhdde61642000-07-10 18:27:47 +00004744 register int len;
4745 register Py_UNICODE *p;
4746 register long x;
4747
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748 if (self->hash != -1)
4749 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004750 len = PyUnicode_GET_SIZE(self);
4751 p = PyUnicode_AS_UNICODE(self);
4752 x = *p << 7;
4753 while (--len >= 0)
4754 x = (1000003*x) ^ *p++;
4755 x ^= PyUnicode_GET_SIZE(self);
4756 if (x == -1)
4757 x = -2;
4758 self->hash = x;
4759 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760}
4761
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004762PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763"S.index(sub [,start [,end]]) -> int\n\
4764\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004765Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766
4767static PyObject *
4768unicode_index(PyUnicodeObject *self, PyObject *args)
4769{
4770 int result;
4771 PyUnicodeObject *substring;
4772 int start = 0;
4773 int end = INT_MAX;
4774
Guido van Rossumb8872e62000-05-09 14:14:27 +00004775 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4776 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777 return NULL;
4778
4779 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4780 (PyObject *)substring);
4781 if (substring == NULL)
4782 return NULL;
4783
4784 result = findstring(self, substring, start, end, 1);
4785
4786 Py_DECREF(substring);
4787 if (result < 0) {
4788 PyErr_SetString(PyExc_ValueError, "substring not found");
4789 return NULL;
4790 }
4791 return PyInt_FromLong(result);
4792}
4793
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004794PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004795"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004797Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004798at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799
4800static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004801unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802{
4803 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4804 register const Py_UNICODE *e;
4805 int cased;
4806
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807 /* Shortcut for single character strings */
4808 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004809 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004811 /* Special case for empty strings */
4812 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004813 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004814
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815 e = p + PyUnicode_GET_SIZE(self);
4816 cased = 0;
4817 for (; p < e; p++) {
4818 register const Py_UNICODE ch = *p;
4819
4820 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004821 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 else if (!cased && Py_UNICODE_ISLOWER(ch))
4823 cased = 1;
4824 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004825 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826}
4827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004828PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004829"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004831Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004832at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833
4834static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004835unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004836{
4837 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4838 register const Py_UNICODE *e;
4839 int cased;
4840
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 /* Shortcut for single character strings */
4842 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004843 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004845 /* Special case for empty strings */
4846 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004847 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004848
Guido van Rossumd57fd912000-03-10 22:53:23 +00004849 e = p + PyUnicode_GET_SIZE(self);
4850 cased = 0;
4851 for (; p < e; p++) {
4852 register const Py_UNICODE ch = *p;
4853
4854 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004855 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 else if (!cased && Py_UNICODE_ISUPPER(ch))
4857 cased = 1;
4858 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004859 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860}
4861
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004862PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004863"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004865Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4866characters may only follow uncased characters and lowercase characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004867only cased ones. Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868
4869static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004870unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871{
4872 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4873 register const Py_UNICODE *e;
4874 int cased, previous_is_cased;
4875
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876 /* Shortcut for single character strings */
4877 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004878 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4879 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004880
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004881 /* Special case for empty strings */
4882 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004883 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004884
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885 e = p + PyUnicode_GET_SIZE(self);
4886 cased = 0;
4887 previous_is_cased = 0;
4888 for (; p < e; p++) {
4889 register const Py_UNICODE ch = *p;
4890
4891 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4892 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004893 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894 previous_is_cased = 1;
4895 cased = 1;
4896 }
4897 else if (Py_UNICODE_ISLOWER(ch)) {
4898 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004899 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900 previous_is_cased = 1;
4901 cased = 1;
4902 }
4903 else
4904 previous_is_cased = 0;
4905 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004906 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907}
4908
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004909PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004910"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004911\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004912Return True if there are only whitespace characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004913False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004914
4915static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004916unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917{
4918 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4919 register const Py_UNICODE *e;
4920
Guido van Rossumd57fd912000-03-10 22:53:23 +00004921 /* Shortcut for single character strings */
4922 if (PyUnicode_GET_SIZE(self) == 1 &&
4923 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004924 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004926 /* Special case for empty strings */
4927 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004928 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004929
Guido van Rossumd57fd912000-03-10 22:53:23 +00004930 e = p + PyUnicode_GET_SIZE(self);
4931 for (; p < e; p++) {
4932 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004933 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004934 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004935 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004936}
4937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004938PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004939"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004940\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004941Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004942and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004943
4944static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004945unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004946{
4947 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4948 register const Py_UNICODE *e;
4949
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004950 /* Shortcut for single character strings */
4951 if (PyUnicode_GET_SIZE(self) == 1 &&
4952 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004953 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004954
4955 /* Special case for empty strings */
4956 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004957 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004958
4959 e = p + PyUnicode_GET_SIZE(self);
4960 for (; p < e; p++) {
4961 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004962 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004963 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004964 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004965}
4966
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004967PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004968"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004969\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004970Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004971and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004972
4973static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004974unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004975{
4976 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4977 register const Py_UNICODE *e;
4978
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004979 /* Shortcut for single character strings */
4980 if (PyUnicode_GET_SIZE(self) == 1 &&
4981 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004982 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004983
4984 /* Special case for empty strings */
4985 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004986 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004987
4988 e = p + PyUnicode_GET_SIZE(self);
4989 for (; p < e; p++) {
4990 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004991 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004992 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004993 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004994}
4995
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004996PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004997"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004999Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005000False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001
5002static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005003unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005004{
5005 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5006 register const Py_UNICODE *e;
5007
Guido van Rossumd57fd912000-03-10 22:53:23 +00005008 /* Shortcut for single character strings */
5009 if (PyUnicode_GET_SIZE(self) == 1 &&
5010 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005011 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005013 /* Special case for empty strings */
5014 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005015 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005016
Guido van Rossumd57fd912000-03-10 22:53:23 +00005017 e = p + PyUnicode_GET_SIZE(self);
5018 for (; p < e; p++) {
5019 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005020 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005021 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005022 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005023}
5024
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005025PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005026"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005027\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005028Return True if there are only digit characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005029False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030
5031static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005032unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005033{
5034 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5035 register const Py_UNICODE *e;
5036
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037 /* Shortcut for single character strings */
5038 if (PyUnicode_GET_SIZE(self) == 1 &&
5039 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005040 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005041
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005042 /* Special case for empty strings */
5043 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005044 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005045
Guido van Rossumd57fd912000-03-10 22:53:23 +00005046 e = p + PyUnicode_GET_SIZE(self);
5047 for (; p < e; p++) {
5048 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005049 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005051 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005052}
5053
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005054PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005055"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005056\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005057Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005058False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059
5060static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005061unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062{
5063 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5064 register const Py_UNICODE *e;
5065
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066 /* Shortcut for single character strings */
5067 if (PyUnicode_GET_SIZE(self) == 1 &&
5068 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005069 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005071 /* Special case for empty strings */
5072 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00005073 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00005074
Guido van Rossumd57fd912000-03-10 22:53:23 +00005075 e = p + PyUnicode_GET_SIZE(self);
5076 for (; p < e; p++) {
5077 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00005078 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00005080 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005081}
5082
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005083PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084"S.join(sequence) -> unicode\n\
5085\n\
5086Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005087sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005088
5089static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005090unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005092 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005093}
5094
5095static int
5096unicode_length(PyUnicodeObject *self)
5097{
5098 return self->length;
5099}
5100
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005101PyDoc_STRVAR(ljust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102"S.ljust(width) -> unicode\n\
5103\n\
5104Return S left justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005105done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106
5107static PyObject *
5108unicode_ljust(PyUnicodeObject *self, PyObject *args)
5109{
5110 int width;
5111 if (!PyArg_ParseTuple(args, "i:ljust", &width))
5112 return NULL;
5113
Tim Peters7a29bd52001-09-12 03:03:31 +00005114 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005115 Py_INCREF(self);
5116 return (PyObject*) self;
5117 }
5118
5119 return (PyObject*) pad(self, 0, width - self->length, ' ');
5120}
5121
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005122PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123"S.lower() -> unicode\n\
5124\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005125Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005126
5127static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005128unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005130 return fixup(self, fixlower);
5131}
5132
/* Strip modes shared by the strip/lstrip/rstrip implementations.
   The values double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by strip mode */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for a strip mode: the format string minus its leading
   three characters ("|O:") */
#define STRIPNAME(i) (stripformat[i]+3)
5141
5142static const Py_UNICODE *
5143unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
5144{
Tim Peters030a5ce2002-04-22 19:00:10 +00005145 size_t i;
5146 for (i = 0; i < n; ++i)
5147 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005148 return s+i;
5149 return NULL;
5150}
5151
5152/* externally visible for str.strip(unicode) */
5153PyObject *
5154_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
5155{
5156 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5157 int len = PyUnicode_GET_SIZE(self);
5158 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
5159 int seplen = PyUnicode_GET_SIZE(sepobj);
5160 int i, j;
5161
5162 i = 0;
5163 if (striptype != RIGHTSTRIP) {
5164 while (i < len && unicode_memchr(sep, s[i], seplen)) {
5165 i++;
5166 }
5167 }
5168
5169 j = len;
5170 if (striptype != LEFTSTRIP) {
5171 do {
5172 j--;
5173 } while (j >= i && unicode_memchr(sep, s[j], seplen));
5174 j++;
5175 }
5176
5177 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5178 Py_INCREF(self);
5179 return (PyObject*)self;
5180 }
5181 else
5182 return PyUnicode_FromUnicode(s+i, j-i);
5183}
5184
Guido van Rossumd57fd912000-03-10 22:53:23 +00005185
5186static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005187do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005189 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
5190 int len = PyUnicode_GET_SIZE(self), i, j;
5191
5192 i = 0;
5193 if (striptype != RIGHTSTRIP) {
5194 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
5195 i++;
5196 }
5197 }
5198
5199 j = len;
5200 if (striptype != LEFTSTRIP) {
5201 do {
5202 j--;
5203 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
5204 j++;
5205 }
5206
5207 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
5208 Py_INCREF(self);
5209 return (PyObject*)self;
5210 }
5211 else
5212 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213}
5214
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005215
5216static PyObject *
5217do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
5218{
5219 PyObject *sep = NULL;
5220
5221 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
5222 return NULL;
5223
5224 if (sep != NULL && sep != Py_None) {
5225 if (PyUnicode_Check(sep))
5226 return _PyUnicode_XStrip(self, striptype, sep);
5227 else if (PyString_Check(sep)) {
5228 PyObject *res;
5229 sep = PyUnicode_FromObject(sep);
5230 if (sep==NULL)
5231 return NULL;
5232 res = _PyUnicode_XStrip(self, striptype, sep);
5233 Py_DECREF(sep);
5234 return res;
5235 }
5236 else {
5237 PyErr_Format(PyExc_TypeError,
5238 "%s arg must be None, unicode or str",
5239 STRIPNAME(striptype));
5240 return NULL;
5241 }
5242 }
5243
5244 return do_strip(self, striptype);
5245}
5246
5247
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005248PyDoc_STRVAR(strip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005249"S.strip([sep]) -> unicode\n\
5250\n\
5251Return a copy of the string S with leading and trailing\n\
5252whitespace removed.\n\
5253If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005254If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005255
5256static PyObject *
5257unicode_strip(PyUnicodeObject *self, PyObject *args)
5258{
5259 if (PyTuple_GET_SIZE(args) == 0)
5260 return do_strip(self, BOTHSTRIP); /* Common case */
5261 else
5262 return do_argstrip(self, BOTHSTRIP, args);
5263}
5264
5265
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005266PyDoc_STRVAR(lstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005267"S.lstrip([sep]) -> unicode\n\
5268\n\
5269Return a copy of the string S with leading whitespace removed.\n\
5270If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005271If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005272
5273static PyObject *
5274unicode_lstrip(PyUnicodeObject *self, PyObject *args)
5275{
5276 if (PyTuple_GET_SIZE(args) == 0)
5277 return do_strip(self, LEFTSTRIP); /* Common case */
5278 else
5279 return do_argstrip(self, LEFTSTRIP, args);
5280}
5281
5282
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005283PyDoc_STRVAR(rstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005284"S.rstrip([sep]) -> unicode\n\
5285\n\
5286Return a copy of the string S with trailing whitespace removed.\n\
5287If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005288If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005289
5290static PyObject *
5291unicode_rstrip(PyUnicodeObject *self, PyObject *args)
5292{
5293 if (PyTuple_GET_SIZE(args) == 0)
5294 return do_strip(self, RIGHTSTRIP); /* Common case */
5295 else
5296 return do_argstrip(self, RIGHTSTRIP, args);
5297}
5298
5299
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300static PyObject*
5301unicode_repeat(PyUnicodeObject *str, int len)
5302{
5303 PyUnicodeObject *u;
5304 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00005305 int nchars;
5306 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307
5308 if (len < 0)
5309 len = 0;
5310
Tim Peters7a29bd52001-09-12 03:03:31 +00005311 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312 /* no repeat, return original string */
5313 Py_INCREF(str);
5314 return (PyObject*) str;
5315 }
Tim Peters8f422462000-09-09 06:13:41 +00005316
5317 /* ensure # of chars needed doesn't overflow int and # of bytes
5318 * needed doesn't overflow size_t
5319 */
5320 nchars = len * str->length;
5321 if (len && nchars / len != str->length) {
5322 PyErr_SetString(PyExc_OverflowError,
5323 "repeated string is too long");
5324 return NULL;
5325 }
5326 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
5327 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
5328 PyErr_SetString(PyExc_OverflowError,
5329 "repeated string is too long");
5330 return NULL;
5331 }
5332 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005333 if (!u)
5334 return NULL;
5335
5336 p = u->str;
5337
5338 while (len-- > 0) {
5339 Py_UNICODE_COPY(p, str->str, str->length);
5340 p += str->length;
5341 }
5342
5343 return (PyObject*) u;
5344}
5345
5346PyObject *PyUnicode_Replace(PyObject *obj,
5347 PyObject *subobj,
5348 PyObject *replobj,
5349 int maxcount)
5350{
5351 PyObject *self;
5352 PyObject *str1;
5353 PyObject *str2;
5354 PyObject *result;
5355
5356 self = PyUnicode_FromObject(obj);
5357 if (self == NULL)
5358 return NULL;
5359 str1 = PyUnicode_FromObject(subobj);
5360 if (str1 == NULL) {
5361 Py_DECREF(self);
5362 return NULL;
5363 }
5364 str2 = PyUnicode_FromObject(replobj);
5365 if (str2 == NULL) {
5366 Py_DECREF(self);
5367 Py_DECREF(str1);
5368 return NULL;
5369 }
5370 result = replace((PyUnicodeObject *)self,
5371 (PyUnicodeObject *)str1,
5372 (PyUnicodeObject *)str2,
5373 maxcount);
5374 Py_DECREF(self);
5375 Py_DECREF(str1);
5376 Py_DECREF(str2);
5377 return result;
5378}
5379
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005380PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005381"S.replace (old, new[, maxsplit]) -> unicode\n\
5382\n\
5383Return a copy of S with all occurrences of substring\n\
5384old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005385given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005386
5387static PyObject*
5388unicode_replace(PyUnicodeObject *self, PyObject *args)
5389{
5390 PyUnicodeObject *str1;
5391 PyUnicodeObject *str2;
5392 int maxcount = -1;
5393 PyObject *result;
5394
5395 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
5396 return NULL;
5397 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
5398 if (str1 == NULL)
5399 return NULL;
5400 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005401 if (str2 == NULL) {
5402 Py_DECREF(str1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005403 return NULL;
Walter Dörwaldf6b56ae2003-02-09 23:42:56 +00005404 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405
5406 result = replace(self, str1, str2, maxcount);
5407
5408 Py_DECREF(str1);
5409 Py_DECREF(str2);
5410 return result;
5411}
5412
5413static
5414PyObject *unicode_repr(PyObject *unicode)
5415{
5416 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
5417 PyUnicode_GET_SIZE(unicode),
5418 1);
5419}
5420
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005421PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005422"S.rfind(sub [,start [,end]]) -> int\n\
5423\n\
5424Return the highest index in S where substring sub is found,\n\
5425such that sub is contained within s[start,end]. Optional\n\
5426arguments start and end are interpreted as in slice notation.\n\
5427\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005428Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429
5430static PyObject *
5431unicode_rfind(PyUnicodeObject *self, PyObject *args)
5432{
5433 PyUnicodeObject *substring;
5434 int start = 0;
5435 int end = INT_MAX;
5436 PyObject *result;
5437
Guido van Rossumb8872e62000-05-09 14:14:27 +00005438 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
5439 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 return NULL;
5441 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5442 (PyObject *)substring);
5443 if (substring == NULL)
5444 return NULL;
5445
5446 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
5447
5448 Py_DECREF(substring);
5449 return result;
5450}
5451
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005452PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453"S.rindex(sub [,start [,end]]) -> int\n\
5454\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005455Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456
5457static PyObject *
5458unicode_rindex(PyUnicodeObject *self, PyObject *args)
5459{
5460 int result;
5461 PyUnicodeObject *substring;
5462 int start = 0;
5463 int end = INT_MAX;
5464
Guido van Rossumb8872e62000-05-09 14:14:27 +00005465 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
5466 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005467 return NULL;
5468 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5469 (PyObject *)substring);
5470 if (substring == NULL)
5471 return NULL;
5472
5473 result = findstring(self, substring, start, end, -1);
5474
5475 Py_DECREF(substring);
5476 if (result < 0) {
5477 PyErr_SetString(PyExc_ValueError, "substring not found");
5478 return NULL;
5479 }
5480 return PyInt_FromLong(result);
5481}
5482
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005483PyDoc_STRVAR(rjust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484"S.rjust(width) -> unicode\n\
5485\n\
5486Return S right justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005487done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488
5489static PyObject *
5490unicode_rjust(PyUnicodeObject *self, PyObject *args)
5491{
5492 int width;
5493 if (!PyArg_ParseTuple(args, "i:rjust", &width))
5494 return NULL;
5495
Tim Peters7a29bd52001-09-12 03:03:31 +00005496 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497 Py_INCREF(self);
5498 return (PyObject*) self;
5499 }
5500
5501 return (PyObject*) pad(self, width - self->length, 0, ' ');
5502}
5503
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504static PyObject*
5505unicode_slice(PyUnicodeObject *self, int start, int end)
5506{
5507 /* standard clamping */
5508 if (start < 0)
5509 start = 0;
5510 if (end < 0)
5511 end = 0;
5512 if (end > self->length)
5513 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00005514 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005515 /* full slice, return original string */
5516 Py_INCREF(self);
5517 return (PyObject*) self;
5518 }
5519 if (start > end)
5520 start = end;
5521 /* copy slice */
5522 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
5523 end - start);
5524}
5525
5526PyObject *PyUnicode_Split(PyObject *s,
5527 PyObject *sep,
5528 int maxsplit)
5529{
5530 PyObject *result;
5531
5532 s = PyUnicode_FromObject(s);
5533 if (s == NULL)
5534 return NULL;
5535 if (sep != NULL) {
5536 sep = PyUnicode_FromObject(sep);
5537 if (sep == NULL) {
5538 Py_DECREF(s);
5539 return NULL;
5540 }
5541 }
5542
5543 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
5544
5545 Py_DECREF(s);
5546 Py_XDECREF(sep);
5547 return result;
5548}
5549
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005550PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551"S.split([sep [,maxsplit]]) -> list of strings\n\
5552\n\
5553Return a list of the words in S, using sep as the\n\
5554delimiter string. If maxsplit is given, at most maxsplit\n\
5555splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005556is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005557
5558static PyObject*
5559unicode_split(PyUnicodeObject *self, PyObject *args)
5560{
5561 PyObject *substring = Py_None;
5562 int maxcount = -1;
5563
5564 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
5565 return NULL;
5566
5567 if (substring == Py_None)
5568 return split(self, NULL, maxcount);
5569 else if (PyUnicode_Check(substring))
5570 return split(self, (PyUnicodeObject *)substring, maxcount);
5571 else
5572 return PyUnicode_Split((PyObject *)self, substring, maxcount);
5573}
5574
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005575PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00005576"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577\n\
5578Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00005579Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005580is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005581
5582static PyObject*
5583unicode_splitlines(PyUnicodeObject *self, PyObject *args)
5584{
Guido van Rossum86662912000-04-11 15:38:46 +00005585 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005586
Guido van Rossum86662912000-04-11 15:38:46 +00005587 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005588 return NULL;
5589
Guido van Rossum86662912000-04-11 15:38:46 +00005590 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005591}
5592
5593static
5594PyObject *unicode_str(PyUnicodeObject *self)
5595{
Fred Drakee4315f52000-05-09 19:53:39 +00005596 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005597}
5598
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005599PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005600"S.swapcase() -> unicode\n\
5601\n\
5602Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005603and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005604
5605static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005606unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005607{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005608 return fixup(self, fixswapcase);
5609}
5610
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005611PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005612"S.translate(table) -> unicode\n\
5613\n\
5614Return a copy of the string S, where all characters have been mapped\n\
5615through the given translation table, which must be a mapping of\n\
Walter Dörwald5c1ee172002-09-04 20:31:32 +00005616Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
5617Unmapped characters are left untouched. Characters mapped to None\n\
5618are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005619
5620static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005621unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005623 return PyUnicode_TranslateCharmap(self->str,
5624 self->length,
5625 table,
5626 "ignore");
5627}
5628
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005629PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005630"S.upper() -> unicode\n\
5631\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005632Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633
5634static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005635unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005636{
Guido van Rossumd57fd912000-03-10 22:53:23 +00005637 return fixup(self, fixupper);
5638}
5639
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005640PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005641"S.zfill(width) -> unicode\n\
5642\n\
5643Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005644of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005645
5646static PyObject *
5647unicode_zfill(PyUnicodeObject *self, PyObject *args)
5648{
5649 int fill;
5650 PyUnicodeObject *u;
5651
5652 int width;
5653 if (!PyArg_ParseTuple(args, "i:zfill", &width))
5654 return NULL;
5655
5656 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00005657 if (PyUnicode_CheckExact(self)) {
5658 Py_INCREF(self);
5659 return (PyObject*) self;
5660 }
5661 else
5662 return PyUnicode_FromUnicode(
5663 PyUnicode_AS_UNICODE(self),
5664 PyUnicode_GET_SIZE(self)
5665 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00005666 }
5667
5668 fill = width - self->length;
5669
5670 u = pad(self, fill, 0, '0');
5671
Walter Dörwald068325e2002-04-15 13:36:47 +00005672 if (u == NULL)
5673 return NULL;
5674
Guido van Rossumd57fd912000-03-10 22:53:23 +00005675 if (u->str[fill] == '+' || u->str[fill] == '-') {
5676 /* move sign to beginning of string */
5677 u->str[0] = u->str[fill];
5678 u->str[fill] = '0';
5679 }
5680
5681 return (PyObject*) u;
5682}
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683
#if 0
/* Disabled debugging helper: report unicode_freelist_size as a Python
   int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
5691
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005692PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005693"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005695Return True if S starts with the specified prefix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005696optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005697comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698
5699static PyObject *
5700unicode_startswith(PyUnicodeObject *self,
5701 PyObject *args)
5702{
5703 PyUnicodeObject *substring;
5704 int start = 0;
5705 int end = INT_MAX;
5706 PyObject *result;
5707
Guido van Rossumb8872e62000-05-09 14:14:27 +00005708 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
5709 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005710 return NULL;
5711 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5712 (PyObject *)substring);
5713 if (substring == NULL)
5714 return NULL;
5715
Guido van Rossum77f6a652002-04-03 22:41:51 +00005716 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717
5718 Py_DECREF(substring);
5719 return result;
5720}
5721
5722
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005723PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00005724"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005725\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00005726Return True if S ends with the specified suffix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005728comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729
5730static PyObject *
5731unicode_endswith(PyUnicodeObject *self,
5732 PyObject *args)
5733{
5734 PyUnicodeObject *substring;
5735 int start = 0;
5736 int end = INT_MAX;
5737 PyObject *result;
5738
Guido van Rossumb8872e62000-05-09 14:14:27 +00005739 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
5740 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005741 return NULL;
5742 substring = (PyUnicodeObject *)PyUnicode_FromObject(
5743 (PyObject *)substring);
5744 if (substring == NULL)
5745 return NULL;
5746
Guido van Rossum77f6a652002-04-03 22:41:51 +00005747 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748
5749 Py_DECREF(substring);
5750 return result;
5751}
5752
5753
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005754
5755static PyObject *
5756unicode_getnewargs(PyUnicodeObject *v)
5757{
5758 return Py_BuildValue("(u#)", v->str, v->length);
5759}
5760
5761
Guido van Rossumd57fd912000-03-10 22:53:23 +00005762static PyMethodDef unicode_methods[] = {
5763
5764 /* Order is according to common usage: often used methods should
5765 appear first, since lookup is done sequentially. */
5766
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005767 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
5768 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
5769 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
5770 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
5771 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
5772 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
5773 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
5774 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
5775 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
5776 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
5777 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
5778 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
5779 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005780 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005781/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
5782 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
5783 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
5784 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005785 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005786 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005787 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005788 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
5789 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
5790 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
5791 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
5792 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
5793 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
5794 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
5795 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
5796 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
5797 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
5798 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
5799 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
5800 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
5801 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005802 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00005803#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005804 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005805#endif
5806
5807#if 0
5808 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005809 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005810#endif
5811
Guido van Rossum5d9113d2003-01-29 17:58:45 +00005812 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 {NULL, NULL}
5814};
5815
Neil Schemenauerce30bc92002-11-18 16:10:18 +00005816static PyObject *
5817unicode_mod(PyObject *v, PyObject *w)
5818{
5819 if (!PyUnicode_Check(v)) {
5820 Py_INCREF(Py_NotImplemented);
5821 return Py_NotImplemented;
5822 }
5823 return PyUnicode_Format(v, w);
5824}
5825
5826static PyNumberMethods unicode_as_number = {
5827 0, /*nb_add*/
5828 0, /*nb_subtract*/
5829 0, /*nb_multiply*/
5830 0, /*nb_divide*/
5831 unicode_mod, /*nb_remainder*/
5832};
5833
Guido van Rossumd57fd912000-03-10 22:53:23 +00005834static PySequenceMethods unicode_as_sequence = {
5835 (inquiry) unicode_length, /* sq_length */
5836 (binaryfunc) PyUnicode_Concat, /* sq_concat */
5837 (intargfunc) unicode_repeat, /* sq_repeat */
5838 (intargfunc) unicode_getitem, /* sq_item */
5839 (intintargfunc) unicode_slice, /* sq_slice */
5840 0, /* sq_ass_item */
5841 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00005842 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843};
5844
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005845static PyObject*
5846unicode_subscript(PyUnicodeObject* self, PyObject* item)
5847{
5848 if (PyInt_Check(item)) {
5849 long i = PyInt_AS_LONG(item);
5850 if (i < 0)
5851 i += PyString_GET_SIZE(self);
5852 return unicode_getitem(self, i);
5853 } else if (PyLong_Check(item)) {
5854 long i = PyLong_AsLong(item);
5855 if (i == -1 && PyErr_Occurred())
5856 return NULL;
5857 if (i < 0)
5858 i += PyString_GET_SIZE(self);
5859 return unicode_getitem(self, i);
5860 } else if (PySlice_Check(item)) {
5861 int start, stop, step, slicelength, cur, i;
5862 Py_UNICODE* source_buf;
5863 Py_UNICODE* result_buf;
5864 PyObject* result;
5865
5866 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5867 &start, &stop, &step, &slicelength) < 0) {
5868 return NULL;
5869 }
5870
5871 if (slicelength <= 0) {
5872 return PyUnicode_FromUnicode(NULL, 0);
5873 } else {
5874 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5875 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5876
5877 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5878 result_buf[i] = source_buf[cur];
5879 }
5880
5881 result = PyUnicode_FromUnicode(result_buf, slicelength);
5882 PyMem_FREE(result_buf);
5883 return result;
5884 }
5885 } else {
5886 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5887 return NULL;
5888 }
5889}
5890
5891static PyMappingMethods unicode_as_mapping = {
5892 (inquiry)unicode_length, /* mp_length */
5893 (binaryfunc)unicode_subscript, /* mp_subscript */
5894 (objobjargproc)0, /* mp_ass_subscript */
5895};
5896
Guido van Rossumd57fd912000-03-10 22:53:23 +00005897static int
5898unicode_buffer_getreadbuf(PyUnicodeObject *self,
5899 int index,
5900 const void **ptr)
5901{
5902 if (index != 0) {
5903 PyErr_SetString(PyExc_SystemError,
5904 "accessing non-existent unicode segment");
5905 return -1;
5906 }
5907 *ptr = (void *) self->str;
5908 return PyUnicode_GET_DATA_SIZE(self);
5909}
5910
5911static int
5912unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5913 const void **ptr)
5914{
5915 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00005916 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005917 return -1;
5918}
5919
5920static int
5921unicode_buffer_getsegcount(PyUnicodeObject *self,
5922 int *lenp)
5923{
5924 if (lenp)
5925 *lenp = PyUnicode_GET_DATA_SIZE(self);
5926 return 1;
5927}
5928
5929static int
5930unicode_buffer_getcharbuf(PyUnicodeObject *self,
5931 int index,
5932 const void **ptr)
5933{
5934 PyObject *str;
5935
5936 if (index != 0) {
5937 PyErr_SetString(PyExc_SystemError,
5938 "accessing non-existent unicode segment");
5939 return -1;
5940 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005941 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 if (str == NULL)
5943 return -1;
5944 *ptr = (void *) PyString_AS_STRING(str);
5945 return PyString_GET_SIZE(str);
5946}
5947
5948/* Helpers for PyUnicode_Format() */
5949
5950static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005951getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952{
5953 int argidx = *p_argidx;
5954 if (argidx < arglen) {
5955 (*p_argidx)++;
5956 if (arglen < 0)
5957 return args;
5958 else
5959 return PyTuple_GetItem(args, argidx);
5960 }
5961 PyErr_SetString(PyExc_TypeError,
5962 "not enough arguments for format string");
5963 return NULL;
5964}
5965
/* Flag bits passed as the `flags` argument of the format helpers.  In
   the code visible here only F_ALT is consulted: when set, it adds '#'
   to the C format string that is generated.
   NOTE(review): the other bits presumably mirror the '-', '+', ' ' and
   '0' conversion flags -- confirm against the parser in
   PyUnicode_Format(). */
#define F_LJUST (1<<0)
#define F_SIGN (1<<1)
#define F_BLANK (1<<2)
#define F_ALT (1<<3)
#define F_ZERO (1<<4)
5971
5972static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005973int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974{
5975 register int i;
5976 int len;
5977 va_list va;
5978 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005980
5981 /* First, format the string as char array, then expand to Py_UNICODE
5982 array. */
5983 charbuffer = (char *)buffer;
5984 len = vsprintf(charbuffer, format, va);
5985 for (i = len - 1; i >= 0; i--)
5986 buffer[i] = (Py_UNICODE) charbuffer[i];
5987
5988 va_end(va);
5989 return len;
5990}
5991
Guido van Rossum078151d2002-08-11 04:24:12 +00005992/* XXX To save some code duplication, formatfloat/long/int could have been
5993 shared with stringobject.c, converting from 8-bit to Unicode after the
5994 formatting is done. */
5995
Guido van Rossumd57fd912000-03-10 22:53:23 +00005996static int
5997formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005998 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999 int flags,
6000 int prec,
6001 int type,
6002 PyObject *v)
6003{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006004 /* fmt = '%#.' + `prec` + `type`
6005 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00006006 char fmt[20];
6007 double x;
6008
6009 x = PyFloat_AsDouble(v);
6010 if (x == -1.0 && PyErr_Occurred())
6011 return -1;
6012 if (prec < 0)
6013 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006014 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
6015 type = 'g';
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006016 /* Worst case length calc to ensure no buffer overrun:
6017
6018 'g' formats:
6019 fmt = %#.<prec>g
6020 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
6021 for any double rep.)
6022 len = 1 + prec + 1 + 2 + 5 = 9 + prec
6023
6024 'f' formats:
6025 buf = '-' + [0-9]*x + '.' + [0-9]*prec (with x < 50)
6026 len = 1 + 50 + 1 + prec = 52 + prec
6027
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006028 If prec=0 the effective precision is 1 (the leading digit is
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006029 always given), therefore increase the length by one.
6030
6031 */
6032 if ((type == 'g' && buflen <= (size_t)10 + (size_t)prec) ||
6033 (type == 'f' && buflen <= (size_t)53 + (size_t)prec)) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006034 PyErr_SetString(PyExc_OverflowError,
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006035 "formatted float is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006036 return -1;
6037 }
Marc-André Lemburg79f57832002-12-29 19:44:06 +00006038 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
6039 (flags&F_ALT) ? "#" : "",
6040 prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006041 return usprintf(buf, fmt, x);
6042}
6043
Tim Peters38fd5b62000-09-21 05:43:11 +00006044static PyObject*
6045formatlong(PyObject *val, int flags, int prec, int type)
6046{
6047 char *buf;
6048 int i, len;
6049 PyObject *str; /* temporary string object. */
6050 PyUnicodeObject *result;
6051
6052 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
6053 if (!str)
6054 return NULL;
6055 result = _PyUnicode_New(len);
6056 for (i = 0; i < len; i++)
6057 result->str[i] = buf[i];
6058 result->str[len] = 0;
6059 Py_DECREF(str);
6060 return (PyObject*)result;
6061}
6062
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063static int
6064formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006065 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066 int flags,
6067 int prec,
6068 int type,
6069 PyObject *v)
6070{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006071 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006072 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
6073 * + 1 + 1
6074 * = 24
6075 */
Tim Peters38fd5b62000-09-21 05:43:11 +00006076 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006077 long x;
6078
6079 x = PyInt_AsLong(v);
6080 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006081 return -1;
Guido van Rossum078151d2002-08-11 04:24:12 +00006082 if (x < 0 && type != 'd' && type != 'i') {
Guido van Rossum54df53a2002-08-14 18:38:27 +00006083 if (PyErr_Warn(PyExc_FutureWarning,
Guido van Rossum078151d2002-08-11 04:24:12 +00006084 "%u/%o/%x/%X of negative int will return "
6085 "a signed string in Python 2.4 and up") < 0)
6086 return -1;
6087 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006088 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006089 prec = 1;
6090
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006091 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006092 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
6093 */
6094 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006095 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006096 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006097 return -1;
6098 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006099
6100 if ((flags & F_ALT) &&
6101 (type == 'x' || type == 'X')) {
6102 /* When converting under %#x or %#X, there are a number
6103 * of issues that cause pain:
6104 * - when 0 is being converted, the C standard leaves off
6105 * the '0x' or '0X', which is inconsistent with other
6106 * %#x/%#X conversions and inconsistent with Python's
6107 * hex() function
6108 * - there are platforms that violate the standard and
6109 * convert 0 with the '0x' or '0X'
6110 * (Metrowerks, Compaq Tru64)
6111 * - there are platforms that give '0x' when converting
6112 * under %#X, but convert 0 in accordance with the
6113 * standard (OS/2 EMX)
6114 *
6115 * We can achieve the desired consistency by inserting our
6116 * own '0x' or '0X' prefix, and substituting %x/%X in place
6117 * of %#x/%#X.
6118 *
6119 * Note that this is the same approach as used in
6120 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006121 */
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006122 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
6123 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00006124 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00006125 else {
6126 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
6127 (flags&F_ALT) ? "#" : "",
6128 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00006129 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130 return usprintf(buf, fmt, x);
6131}
6132
6133static int
6134formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006135 size_t buflen,
6136 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006137{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006138 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006139 if (PyUnicode_Check(v)) {
6140 if (PyUnicode_GET_SIZE(v) != 1)
6141 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006142 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006143 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006145 else if (PyString_Check(v)) {
6146 if (PyString_GET_SIZE(v) != 1)
6147 goto onError;
6148 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
6149 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006150
6151 else {
6152 /* Integer input truncated to a character */
6153 long x;
6154 x = PyInt_AsLong(v);
6155 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006156 goto onError;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00006157#ifdef Py_UNICODE_WIDE
6158 if (x < 0 || x > 0x10ffff) {
6159 PyErr_SetString(PyExc_ValueError,
6160 "%c arg not in range(0x110000) "
6161 "(wide Python build)");
6162 return -1;
6163 }
6164#else
6165 if (x < 0 || x > 0xffff) {
6166 PyErr_SetString(PyExc_ValueError,
6167 "%c arg not in range(0x10000) "
6168 "(narrow Python build)");
6169 return -1;
6170 }
6171#endif
6172 buf[0] = (Py_UNICODE) x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006173 }
6174 buf[1] = '\0';
6175 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006176
6177 onError:
6178 PyErr_SetString(PyExc_TypeError,
6179 "%c requires int or char");
6180 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181}
6182
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the cast cannot bind to
   neighbouring tokens (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
6192
Guido van Rossumd57fd912000-03-10 22:53:23 +00006193PyObject *PyUnicode_Format(PyObject *format,
6194 PyObject *args)
6195{
6196 Py_UNICODE *fmt, *res;
6197 int fmtcnt, rescnt, reslen, arglen, argidx;
6198 int args_owned = 0;
6199 PyUnicodeObject *result = NULL;
6200 PyObject *dict = NULL;
6201 PyObject *uformat;
6202
6203 if (format == NULL || args == NULL) {
6204 PyErr_BadInternalCall();
6205 return NULL;
6206 }
6207 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00006208 if (uformat == NULL)
6209 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006210 fmt = PyUnicode_AS_UNICODE(uformat);
6211 fmtcnt = PyUnicode_GET_SIZE(uformat);
6212
6213 reslen = rescnt = fmtcnt + 100;
6214 result = _PyUnicode_New(reslen);
6215 if (result == NULL)
6216 goto onError;
6217 res = PyUnicode_AS_UNICODE(result);
6218
6219 if (PyTuple_Check(args)) {
6220 arglen = PyTuple_Size(args);
6221 argidx = 0;
6222 }
6223 else {
6224 arglen = -1;
6225 argidx = -2;
6226 }
Neal Norwitz80a1bf42002-11-12 23:01:12 +00006227 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args) &&
6228 !PyObject_TypeCheck(args, &PyBaseString_Type))
Guido van Rossumd57fd912000-03-10 22:53:23 +00006229 dict = args;
6230
6231 while (--fmtcnt >= 0) {
6232 if (*fmt != '%') {
6233 if (--rescnt < 0) {
6234 rescnt = fmtcnt + 100;
6235 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006236 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237 return NULL;
6238 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
6239 --rescnt;
6240 }
6241 *res++ = *fmt++;
6242 }
6243 else {
6244 /* Got a format specifier */
6245 int flags = 0;
6246 int width = -1;
6247 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248 Py_UNICODE c = '\0';
6249 Py_UNICODE fill;
6250 PyObject *v = NULL;
6251 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006252 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006253 Py_UNICODE sign;
6254 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006255 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006256
6257 fmt++;
6258 if (*fmt == '(') {
6259 Py_UNICODE *keystart;
6260 int keylen;
6261 PyObject *key;
6262 int pcount = 1;
6263
6264 if (dict == NULL) {
6265 PyErr_SetString(PyExc_TypeError,
6266 "format requires a mapping");
6267 goto onError;
6268 }
6269 ++fmt;
6270 --fmtcnt;
6271 keystart = fmt;
6272 /* Skip over balanced parentheses */
6273 while (pcount > 0 && --fmtcnt >= 0) {
6274 if (*fmt == ')')
6275 --pcount;
6276 else if (*fmt == '(')
6277 ++pcount;
6278 fmt++;
6279 }
6280 keylen = fmt - keystart - 1;
6281 if (fmtcnt < 0 || pcount > 0) {
6282 PyErr_SetString(PyExc_ValueError,
6283 "incomplete format key");
6284 goto onError;
6285 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006286#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00006287 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00006288 then looked up since Python uses strings to hold
6289 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00006290 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006291 key = PyUnicode_EncodeUTF8(keystart,
6292 keylen,
6293 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00006294#else
6295 key = PyUnicode_FromUnicode(keystart, keylen);
6296#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00006297 if (key == NULL)
6298 goto onError;
6299 if (args_owned) {
6300 Py_DECREF(args);
6301 args_owned = 0;
6302 }
6303 args = PyObject_GetItem(dict, key);
6304 Py_DECREF(key);
6305 if (args == NULL) {
6306 goto onError;
6307 }
6308 args_owned = 1;
6309 arglen = -1;
6310 argidx = -2;
6311 }
6312 while (--fmtcnt >= 0) {
6313 switch (c = *fmt++) {
6314 case '-': flags |= F_LJUST; continue;
6315 case '+': flags |= F_SIGN; continue;
6316 case ' ': flags |= F_BLANK; continue;
6317 case '#': flags |= F_ALT; continue;
6318 case '0': flags |= F_ZERO; continue;
6319 }
6320 break;
6321 }
6322 if (c == '*') {
6323 v = getnextarg(args, arglen, &argidx);
6324 if (v == NULL)
6325 goto onError;
6326 if (!PyInt_Check(v)) {
6327 PyErr_SetString(PyExc_TypeError,
6328 "* wants int");
6329 goto onError;
6330 }
6331 width = PyInt_AsLong(v);
6332 if (width < 0) {
6333 flags |= F_LJUST;
6334 width = -width;
6335 }
6336 if (--fmtcnt >= 0)
6337 c = *fmt++;
6338 }
6339 else if (c >= '0' && c <= '9') {
6340 width = c - '0';
6341 while (--fmtcnt >= 0) {
6342 c = *fmt++;
6343 if (c < '0' || c > '9')
6344 break;
6345 if ((width*10) / 10 != width) {
6346 PyErr_SetString(PyExc_ValueError,
6347 "width too big");
6348 goto onError;
6349 }
6350 width = width*10 + (c - '0');
6351 }
6352 }
6353 if (c == '.') {
6354 prec = 0;
6355 if (--fmtcnt >= 0)
6356 c = *fmt++;
6357 if (c == '*') {
6358 v = getnextarg(args, arglen, &argidx);
6359 if (v == NULL)
6360 goto onError;
6361 if (!PyInt_Check(v)) {
6362 PyErr_SetString(PyExc_TypeError,
6363 "* wants int");
6364 goto onError;
6365 }
6366 prec = PyInt_AsLong(v);
6367 if (prec < 0)
6368 prec = 0;
6369 if (--fmtcnt >= 0)
6370 c = *fmt++;
6371 }
6372 else if (c >= '0' && c <= '9') {
6373 prec = c - '0';
6374 while (--fmtcnt >= 0) {
6375 c = Py_CHARMASK(*fmt++);
6376 if (c < '0' || c > '9')
6377 break;
6378 if ((prec*10) / 10 != prec) {
6379 PyErr_SetString(PyExc_ValueError,
6380 "prec too big");
6381 goto onError;
6382 }
6383 prec = prec*10 + (c - '0');
6384 }
6385 }
6386 } /* prec */
6387 if (fmtcnt >= 0) {
6388 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389 if (--fmtcnt >= 0)
6390 c = *fmt++;
6391 }
6392 }
6393 if (fmtcnt < 0) {
6394 PyErr_SetString(PyExc_ValueError,
6395 "incomplete format");
6396 goto onError;
6397 }
6398 if (c != '%') {
6399 v = getnextarg(args, arglen, &argidx);
6400 if (v == NULL)
6401 goto onError;
6402 }
6403 sign = 0;
6404 fill = ' ';
6405 switch (c) {
6406
6407 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006408 pbuf = formatbuf;
6409 /* presume that buffer length is at least 1 */
6410 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006411 len = 1;
6412 break;
6413
6414 case 's':
6415 case 'r':
6416 if (PyUnicode_Check(v) && c == 's') {
6417 temp = v;
6418 Py_INCREF(temp);
6419 }
6420 else {
6421 PyObject *unicode;
6422 if (c == 's')
6423 temp = PyObject_Str(v);
6424 else
6425 temp = PyObject_Repr(v);
6426 if (temp == NULL)
6427 goto onError;
6428 if (!PyString_Check(temp)) {
6429 /* XXX Note: this should never happen, since
6430 PyObject_Repr() and PyObject_Str() assure
6431 this */
6432 Py_DECREF(temp);
6433 PyErr_SetString(PyExc_TypeError,
6434 "%s argument has non-string str()");
6435 goto onError;
6436 }
Fred Drakee4315f52000-05-09 19:53:39 +00006437 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00006439 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440 "strict");
6441 Py_DECREF(temp);
6442 temp = unicode;
6443 if (temp == NULL)
6444 goto onError;
6445 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006446 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006447 len = PyUnicode_GET_SIZE(temp);
6448 if (prec >= 0 && len > prec)
6449 len = prec;
6450 break;
6451
6452 case 'i':
6453 case 'd':
6454 case 'u':
6455 case 'o':
6456 case 'x':
6457 case 'X':
6458 if (c == 'i')
6459 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00006460 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006461 temp = formatlong(v, flags, prec, c);
6462 if (!temp)
6463 goto onError;
6464 pbuf = PyUnicode_AS_UNICODE(temp);
6465 len = PyUnicode_GET_SIZE(temp);
6466 /* unbounded ints can always produce
6467 a sign character! */
6468 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006470 else {
6471 pbuf = formatbuf;
6472 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6473 flags, prec, c, v);
6474 if (len < 0)
6475 goto onError;
6476 /* only d conversion is signed */
6477 sign = c == 'd';
6478 }
6479 if (flags & F_ZERO)
6480 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00006481 break;
6482
6483 case 'e':
6484 case 'E':
6485 case 'f':
6486 case 'g':
6487 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006488 pbuf = formatbuf;
6489 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
6490 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006491 if (len < 0)
6492 goto onError;
6493 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00006494 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006495 fill = '0';
6496 break;
6497
6498 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006499 pbuf = formatbuf;
6500 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006501 if (len < 0)
6502 goto onError;
6503 break;
6504
6505 default:
6506 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00006507 "unsupported format character '%c' (0x%x) "
6508 "at index %i",
Neal Norwitza0378e12002-09-13 13:47:06 +00006509 (31<=c && c<=126) ? (char)c : '?',
Marc-André Lemburg24e53b62002-09-24 09:32:14 +00006510 (int)c,
Guido van Rossumefc11882002-09-12 14:43:41 +00006511 (int)(fmt -1 - PyUnicode_AS_UNICODE(uformat)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00006512 goto onError;
6513 }
6514 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00006515 if (*pbuf == '-' || *pbuf == '+') {
6516 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006517 len--;
6518 }
6519 else if (flags & F_SIGN)
6520 sign = '+';
6521 else if (flags & F_BLANK)
6522 sign = ' ';
6523 else
6524 sign = 0;
6525 }
6526 if (width < len)
6527 width = len;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006528 if (rescnt - (sign != 0) < width) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006529 reslen -= rescnt;
6530 rescnt = width + fmtcnt + 100;
6531 reslen += rescnt;
Guido van Rossum049cd6b2002-10-11 00:43:48 +00006532 if (reslen < 0) {
6533 Py_DECREF(result);
6534 return PyErr_NoMemory();
6535 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006536 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537 return NULL;
6538 res = PyUnicode_AS_UNICODE(result)
6539 + reslen - rescnt;
6540 }
6541 if (sign) {
6542 if (fill != ' ')
6543 *res++ = sign;
6544 rescnt--;
6545 if (width > len)
6546 width--;
6547 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006548 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
6549 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006550 assert(pbuf[1] == c);
6551 if (fill != ' ') {
6552 *res++ = *pbuf++;
6553 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00006554 }
Tim Petersfff53252001-04-12 18:38:48 +00006555 rescnt -= 2;
6556 width -= 2;
6557 if (width < 0)
6558 width = 0;
6559 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00006560 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 if (width > len && !(flags & F_LJUST)) {
6562 do {
6563 --rescnt;
6564 *res++ = fill;
6565 } while (--width > len);
6566 }
Tim Peters38fd5b62000-09-21 05:43:11 +00006567 if (fill == ' ') {
6568 if (sign)
6569 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00006570 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00006571 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00006572 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00006573 *res++ = *pbuf++;
6574 *res++ = *pbuf++;
6575 }
6576 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006577 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006578 res += len;
6579 rescnt -= len;
6580 while (--width >= len) {
6581 --rescnt;
6582 *res++ = ' ';
6583 }
6584 if (dict && (argidx < arglen) && c != '%') {
6585 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006586 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006587 goto onError;
6588 }
6589 Py_XDECREF(temp);
6590 } /* '%' */
6591 } /* until end */
6592 if (argidx < arglen && !dict) {
6593 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00006594 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006595 goto onError;
6596 }
6597
6598 if (args_owned) {
6599 Py_DECREF(args);
6600 }
6601 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006602 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006603 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006604 return (PyObject *)result;
6605
6606 onError:
6607 Py_XDECREF(result);
6608 Py_DECREF(uformat);
6609 if (args_owned) {
6610 Py_DECREF(args);
6611 }
6612 return NULL;
6613}
6614
6615static PyBufferProcs unicode_as_buffer = {
6616 (getreadbufferproc) unicode_buffer_getreadbuf,
6617 (getwritebufferproc) unicode_buffer_getwritebuf,
6618 (getsegcountproc) unicode_buffer_getsegcount,
6619 (getcharbufferproc) unicode_buffer_getcharbuf,
6620};
6621
Jeremy Hylton938ace62002-07-17 16:30:39 +00006622static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00006623unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
6624
Tim Peters6d6c1a32001-08-02 04:15:00 +00006625static PyObject *
6626unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6627{
6628 PyObject *x = NULL;
6629 static char *kwlist[] = {"string", "encoding", "errors", 0};
6630 char *encoding = NULL;
6631 char *errors = NULL;
6632
Guido van Rossume023fe02001-08-30 03:12:59 +00006633 if (type != &PyUnicode_Type)
6634 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00006635 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
6636 kwlist, &x, &encoding, &errors))
6637 return NULL;
6638 if (x == NULL)
6639 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00006640 if (encoding == NULL && errors == NULL)
6641 return PyObject_Unicode(x);
6642 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00006643 return PyUnicode_FromEncodedObject(x, encoding, errors);
6644}
6645
Guido van Rossume023fe02001-08-30 03:12:59 +00006646static PyObject *
6647unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
6648{
Tim Petersaf90b3e2001-09-12 05:18:58 +00006649 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006650 int n;
6651
6652 assert(PyType_IsSubtype(type, &PyUnicode_Type));
6653 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
6654 if (tmp == NULL)
6655 return NULL;
6656 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00006657 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
6658 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00006659 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00006660 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
6661 if (pnew->str == NULL) {
6662 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006663 PyObject_Del(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00006664 return NULL;
6665 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00006666 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
6667 pnew->length = n;
6668 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00006669 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00006670 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00006671}
6672
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00006673PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00006674"unicode(string [, encoding[, errors]]) -> object\n\
6675\n\
6676Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00006677encoding defaults to the current default string encoding.\n\
6678errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00006679
Guido van Rossumd57fd912000-03-10 22:53:23 +00006680PyTypeObject PyUnicode_Type = {
6681 PyObject_HEAD_INIT(&PyType_Type)
6682 0, /* ob_size */
6683 "unicode", /* tp_name */
6684 sizeof(PyUnicodeObject), /* tp_size */
6685 0, /* tp_itemsize */
6686 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00006687 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006688 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006689 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006690 0, /* tp_setattr */
6691 (cmpfunc) unicode_compare, /* tp_compare */
6692 (reprfunc) unicode_repr, /* tp_repr */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006693 &unicode_as_number, /* tp_as_number */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006694 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00006695 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006696 (hashfunc) unicode_hash, /* tp_hash*/
6697 0, /* tp_call*/
6698 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006699 PyObject_GenericGetAttr, /* tp_getattro */
6700 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006701 &unicode_as_buffer, /* tp_as_buffer */
Neil Schemenauerce30bc92002-11-18 16:10:18 +00006702 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
6703 Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006704 unicode_doc, /* tp_doc */
6705 0, /* tp_traverse */
6706 0, /* tp_clear */
6707 0, /* tp_richcompare */
6708 0, /* tp_weaklistoffset */
6709 0, /* tp_iter */
6710 0, /* tp_iternext */
6711 unicode_methods, /* tp_methods */
6712 0, /* tp_members */
6713 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00006714 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00006715 0, /* tp_dict */
6716 0, /* tp_descr_get */
6717 0, /* tp_descr_set */
6718 0, /* tp_dictoffset */
6719 0, /* tp_init */
6720 0, /* tp_alloc */
6721 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006722 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006723};
6724
6725/* Initialize the Unicode implementation */
6726
Thomas Wouters78890102000-07-22 19:25:51 +00006727void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006729 int i;
6730
Fred Drakee4315f52000-05-09 19:53:39 +00006731 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006732 unicode_freelist = NULL;
6733 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006734 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00006735 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006736 for (i = 0; i < 256; i++)
6737 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00006738 if (PyType_Ready(&PyUnicode_Type) < 0)
6739 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740}
6741
6742/* Finalize the Unicode implementation */
6743
6744void
Thomas Wouters78890102000-07-22 19:25:51 +00006745_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006746{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006747 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006748 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006749
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00006750 Py_XDECREF(unicode_empty);
6751 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006752
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00006753 for (i = 0; i < 256; i++) {
6754 if (unicode_latin1[i]) {
6755 Py_DECREF(unicode_latin1[i]);
6756 unicode_latin1[i] = NULL;
6757 }
6758 }
6759
Barry Warsaw5b4c2282000-10-03 20:45:26 +00006760 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006761 PyUnicodeObject *v = u;
6762 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00006763 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00006764 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00006765 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00006766 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006767 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00006768 unicode_freelist = NULL;
6769 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006770}