blob: 6ca709b8d308894ffcd2f0529e89ddc4231456b7 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
#include "Python.h"

#include <limits.h>

#include "unicodeobject.h"
#include "ucnhash.h"

#ifdef MS_WINDOWS
#include <windows.h>
#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Limit for the Unicode object free list: unicode_dealloc() stops
   recycling objects once the list holds this many entries. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian unless
   WORDS_BIGENDIAN is defined. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif

/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/

/* Free list for Unicode objects.  It is a singly linked list: the
   first pointer-sized word of each parked object is reused as the
   link to the next one (see unicode_dealloc() and _PyUnicode_New()). */
static PyUnicodeObject *unicode_freelist;
/* Number of objects currently parked on unicode_freelist. */
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by the character's ordinal;
   entries are filled in lazily by PyUnicode_FromUnicode(). */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000222 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 *unicode = (PyObject *)w;
280 return 0;
281 }
282
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v, length);
286}
287
288/* Internal API for use in unicodeobject.c only ! */
289#define _PyUnicode_Resize(unicodevar, length) \
290 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293 int size)
294{
295 PyUnicodeObject *unicode;
296
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
299 if (u != NULL) {
300
301 /* Optimization for empty strings */
302 if (size == 0 && unicode_empty != NULL) {
303 Py_INCREF(unicode_empty);
304 return (PyObject *)unicode_empty;
305 }
306
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size == 1 && *u < 256) {
310 unicode = unicode_latin1[*u];
311 if (!unicode) {
312 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000313 if (!unicode)
314 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000315 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 unicode_latin1[*u] = unicode;
317 }
318 Py_INCREF(unicode);
319 return (PyObject *)unicode;
320 }
321 }
322
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 unicode = _PyUnicode_New(size);
324 if (!unicode)
325 return NULL;
326
327 /* Copy the Unicode data into the new object */
328 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330
331 return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337 int size)
338{
339 PyUnicodeObject *unicode;
340
341 if (w == NULL) {
342 PyErr_BadInternalCall();
343 return NULL;
344 }
345
346 unicode = _PyUnicode_New(size);
347 if (!unicode)
348 return NULL;
349
350 /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354 {
355 register Py_UNICODE *u;
356 register int i;
357 u = PyUnicode_AS_UNICODE(unicode);
358 for (i = size; i >= 0; i--)
359 *u++ = *w++;
360 }
361#endif
362
363 return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367 register wchar_t *w,
368 int size)
369{
370 if (unicode == NULL) {
371 PyErr_BadInternalCall();
372 return -1;
373 }
374 if (size > PyUnicode_GET_SIZE(unicode))
375 size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379 {
380 register Py_UNICODE *u;
381 register int i;
382 u = PyUnicode_AS_UNICODE(unicode);
383 for (i = size; i >= 0; i--)
384 *w++ = *u++;
385 }
386#endif
387
388 return size;
389}
390
391#endif
392
393PyObject *PyUnicode_FromObject(register PyObject *obj)
394{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000395 /* XXX Perhaps we should make this API an alias of
396 PyObject_Unicode() instead ?! */
397 if (PyUnicode_CheckExact(obj)) {
398 Py_INCREF(obj);
399 return obj;
400 }
401 if (PyUnicode_Check(obj)) {
402 /* For a Unicode subtype that's not a Unicode object,
403 return a true Unicode object with the same data. */
404 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
405 PyUnicode_GET_SIZE(obj));
406 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000407 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
408}
409
410PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
411 const char *encoding,
412 const char *errors)
413{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000414 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000416 int owned = 0;
417 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418
419 if (obj == NULL) {
420 PyErr_BadInternalCall();
421 return NULL;
422 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000423
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000424#if 0
425 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000426 that no encodings is given and then redirect to
427 PyObject_Unicode() which then applies the additional logic for
428 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000429
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000430 NOTE: This API should really only be used for object which
431 represent *encoded* Unicode !
432
433 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000434 if (PyUnicode_Check(obj)) {
435 if (encoding) {
436 PyErr_SetString(PyExc_TypeError,
437 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000438 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000439 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000440 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000441 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000442#else
443 if (PyUnicode_Check(obj)) {
444 PyErr_SetString(PyExc_TypeError,
445 "decoding Unicode is not supported");
446 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000448#endif
449
450 /* Coerce object */
451 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000452 s = PyString_AS_STRING(obj);
453 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000454 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000455 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
456 /* Overwrite the error message with something more useful in
457 case of a TypeError. */
458 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000459 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000460 "coercing to Unicode: need string or buffer, "
461 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000462 obj->ob_type->tp_name);
463 goto onError;
464 }
465
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000466 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 if (len == 0) {
468 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000469 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000471 else
472 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000473
Greg Steinaf36a3a2000-07-17 09:04:43 +0000474 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000475 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000476 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000477 return v;
478
479 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000480 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000481 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000482 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484}
485
486PyObject *PyUnicode_Decode(const char *s,
487 int size,
488 const char *encoding,
489 const char *errors)
490{
491 PyObject *buffer = NULL, *unicode;
492
Fred Drakee4315f52000-05-09 19:53:39 +0000493 if (encoding == NULL)
494 encoding = PyUnicode_GetDefaultEncoding();
495
496 /* Shortcuts for common default encodings */
497 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000499 else if (strcmp(encoding, "latin-1") == 0)
500 return PyUnicode_DecodeLatin1(s, size, errors);
501 else if (strcmp(encoding, "ascii") == 0)
502 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503
504 /* Decode via the codec registry */
505 buffer = PyBuffer_FromMemory((void *)s, size);
506 if (buffer == NULL)
507 goto onError;
508 unicode = PyCodec_Decode(buffer, encoding, errors);
509 if (unicode == NULL)
510 goto onError;
511 if (!PyUnicode_Check(unicode)) {
512 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000513 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 unicode->ob_type->tp_name);
515 Py_DECREF(unicode);
516 goto onError;
517 }
518 Py_DECREF(buffer);
519 return unicode;
520
521 onError:
522 Py_XDECREF(buffer);
523 return NULL;
524}
525
526PyObject *PyUnicode_Encode(const Py_UNICODE *s,
527 int size,
528 const char *encoding,
529 const char *errors)
530{
531 PyObject *v, *unicode;
532
533 unicode = PyUnicode_FromUnicode(s, size);
534 if (unicode == NULL)
535 return NULL;
536 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
537 Py_DECREF(unicode);
538 return v;
539}
540
541PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
542 const char *encoding,
543 const char *errors)
544{
545 PyObject *v;
546
547 if (!PyUnicode_Check(unicode)) {
548 PyErr_BadArgument();
549 goto onError;
550 }
Fred Drakee4315f52000-05-09 19:53:39 +0000551
552 if (encoding == NULL)
553 encoding = PyUnicode_GetDefaultEncoding();
554
555 /* Shortcuts for common default encodings */
556 if (errors == NULL) {
557 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000558 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000559 else if (strcmp(encoding, "latin-1") == 0)
560 return PyUnicode_AsLatin1String(unicode);
561 else if (strcmp(encoding, "ascii") == 0)
562 return PyUnicode_AsASCIIString(unicode);
563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000564
565 /* Encode via the codec registry */
566 v = PyCodec_Encode(unicode, encoding, errors);
567 if (v == NULL)
568 goto onError;
569 /* XXX Should we really enforce this ? */
570 if (!PyString_Check(v)) {
571 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000572 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573 v->ob_type->tp_name);
574 Py_DECREF(v);
575 goto onError;
576 }
577 return v;
578
579 onError:
580 return NULL;
581}
582
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000583PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
584 const char *errors)
585{
586 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
587
588 if (v)
589 return v;
590 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
591 if (v && errors == NULL)
592 ((PyUnicodeObject *)unicode)->defenc = v;
593 return v;
594}
595
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
597{
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
601 }
602 return PyUnicode_AS_UNICODE(unicode);
603
604 onError:
605 return NULL;
606}
607
608int PyUnicode_GetSize(PyObject *unicode)
609{
610 if (!PyUnicode_Check(unicode)) {
611 PyErr_BadArgument();
612 goto onError;
613 }
614 return PyUnicode_GET_SIZE(unicode);
615
616 onError:
617 return -1;
618}
619
Thomas Wouters78890102000-07-22 19:25:51 +0000620const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000621{
622 return unicode_default_encoding;
623}
624
625int PyUnicode_SetDefaultEncoding(const char *encoding)
626{
627 PyObject *v;
628
629 /* Make sure the encoding is valid. As side effect, this also
630 loads the encoding into the codec registry cache. */
631 v = _PyCodec_Lookup(encoding);
632 if (v == NULL)
633 goto onError;
634 Py_DECREF(v);
635 strncpy(unicode_default_encoding,
636 encoding,
637 sizeof(unicode_default_encoding));
638 return 0;
639
640 onError:
641 return -1;
642}
643
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000644/* --- UTF-7 Codec -------------------------------------------------------- */
645
/* see RFC2152 for details */

static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* True if character c has to be base64-encoded: anything above 127,
   anything marked 1 in utf7_special, plus whitespace (class 2) when
   encodeWS is set and Set O characters (class 3) when encodeO is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to the corresponding base64 character. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a character of the base64 alphabet.  isalnum() is only
   defined for values representable as unsigned char (and EOF), so
   values outside the 7-bit range are rejected before it is called. */
#define B64CHAR(c) \
    ((((c) & ~0x7F) == 0 && isalnum(c)) || (c) == '+' || (c) == '/')

/* Map a base64 character to its 6-bit value (inverse of B64). */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 characters to out for every complete group of 6 bits
   held in ch; bits is the number of pending bits and is updated. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* Emit one Py_UNICODE to out for every complete group of 16 bits held
   in ch; bits is the number of pending bits and is updated.  Uses the
   enclosing function's errmsg variable and utf7Error label.
   NOTE(review): the range tested below, 0xDC00..0xDFFF, is the low
   surrogate range although the comments speak of the high surrogate
   -- confirm the intended check. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */\
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
701
702static
703int utf7_decoding_error(Py_UNICODE **dest,
704 const char *errors,
705 const char *details)
706{
707 if ((errors == NULL) ||
708 (strcmp(errors,"strict") == 0)) {
709 PyErr_Format(PyExc_UnicodeError,
710 "UTF-7 decoding error: %.400s",
711 details);
712 return -1;
713 }
714 else if (strcmp(errors,"ignore") == 0) {
715 return 0;
716 }
717 else if (strcmp(errors,"replace") == 0) {
718 if (dest != NULL) {
719 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
720 (*dest)++;
721 }
722 return 0;
723 }
724 else {
725 PyErr_Format(PyExc_ValueError,
726 "UTF-7 decoding error; unknown error handling code: %.400s",
727 errors);
728 return -1;
729 }
730}
731
732PyObject *PyUnicode_DecodeUTF7(const char *s,
733 int size,
734 const char *errors)
735{
736 const char *e;
737 PyUnicodeObject *unicode;
738 Py_UNICODE *p;
739 const char *errmsg = "";
740 int inShift = 0;
741 unsigned int bitsleft = 0;
742 unsigned long charsleft = 0;
743 int surrogate = 0;
744
745 unicode = _PyUnicode_New(size);
746 if (!unicode)
747 return NULL;
748 if (size == 0)
749 return (PyObject *)unicode;
750
751 p = unicode->str;
752 e = s + size;
753
754 while (s < e) {
755 Py_UNICODE ch = *s;
756
757 if (inShift) {
758 if ((ch == '-') || !B64CHAR(ch)) {
759 inShift = 0;
760 s++;
761
762 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
763 if (bitsleft >= 6) {
764 /* The shift sequence has a partial character in it. If
765 bitsleft < 6 then we could just classify it as padding
766 but that is not the case here */
767
768 errmsg = "partial character in shift sequence";
769 goto utf7Error;
770 }
771 /* According to RFC2152 the remaining bits should be zero. We
772 choose to signal an error/insert a replacement character
773 here so indicate the potential of a misencoded character. */
774
775 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
776 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
777 errmsg = "non-zero padding bits in shift sequence";
778 goto utf7Error;
779 }
780
781 if (ch == '-') {
782 if ((s < e) && (*(s) == '-')) {
783 *p++ = '-';
784 inShift = 1;
785 }
786 } else if (SPECIAL(ch,0,0)) {
787 errmsg = "unexpected special character";
788 goto utf7Error;
789 } else {
790 *p++ = ch;
791 }
792 } else {
793 charsleft = (charsleft << 6) | UB64(ch);
794 bitsleft += 6;
795 s++;
796 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
797 }
798 }
799 else if ( ch == '+' ) {
800 s++;
801 if (s < e && *s == '-') {
802 s++;
803 *p++ = '+';
804 } else
805 {
806 inShift = 1;
807 bitsleft = 0;
808 }
809 }
810 else if (SPECIAL(ch,0,0)) {
811 errmsg = "unexpected special character";
812 s++;
813 goto utf7Error;
814 }
815 else {
816 *p++ = ch;
817 s++;
818 }
819 continue;
820 utf7Error:
821 if (utf7_decoding_error(&p, errors, errmsg))
822 goto onError;
823 }
824
825 if (inShift) {
826 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
827 goto onError;
828 }
829
830 if (_PyUnicode_Resize(&unicode, p - unicode->str))
831 goto onError;
832
833 return (PyObject *)unicode;
834
835onError:
836 Py_DECREF(unicode);
837 return NULL;
838}
839
840
841PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
842 int size,
843 int encodeSetO,
844 int encodeWhiteSpace,
845 const char *errors)
846{
847 PyObject *v;
848 /* It might be possible to tighten this worst case */
849 unsigned int cbAllocated = 5 * size;
850 int inShift = 0;
851 int i = 0;
852 unsigned int bitsleft = 0;
853 unsigned long charsleft = 0;
854 char * out;
855 char * start;
856
857 if (size == 0)
858 return PyString_FromStringAndSize(NULL, 0);
859
860 v = PyString_FromStringAndSize(NULL, cbAllocated);
861 if (v == NULL)
862 return NULL;
863
864 start = out = PyString_AS_STRING(v);
865 for (;i < size; ++i) {
866 Py_UNICODE ch = s[i];
867
868 if (!inShift) {
869 if (ch == '+') {
870 *out++ = '+';
871 *out++ = '-';
872 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
873 charsleft = ch;
874 bitsleft = 16;
875 *out++ = '+';
876 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
877 inShift = bitsleft > 0;
878 } else {
879 *out++ = (char) ch;
880 }
881 } else {
882 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
883 *out++ = B64(charsleft << (6-bitsleft));
884 charsleft = 0;
885 bitsleft = 0;
886 /* Characters not in the BASE64 set implicitly unshift the sequence
887 so no '-' is required, except if the character is itself a '-' */
888 if (B64CHAR(ch) || ch == '-') {
889 *out++ = '-';
890 }
891 inShift = 0;
892 *out++ = (char) ch;
893 } else {
894 bitsleft += 16;
895 charsleft = (charsleft << 16) | ch;
896 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
897
898 /* If the next character is special then we dont' need to terminate
899 the shift sequence. If the next character is not a BASE64 character
900 or '-' then the shift sequence will be terminated implicitly and we
901 don't have to insert a '-'. */
902
903 if (bitsleft == 0) {
904 if (i + 1 < size) {
905 Py_UNICODE ch2 = s[i+1];
906
907 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
908
909 } else if (B64CHAR(ch2) || ch2 == '-') {
910 *out++ = '-';
911 inShift = 0;
912 } else {
913 inShift = 0;
914 }
915
916 }
917 else {
918 *out++ = '-';
919 inShift = 0;
920 }
921 }
922 }
923 }
924 }
925 if (bitsleft) {
926 *out++= B64(charsleft << (6-bitsleft) );
927 *out++ = '-';
928 }
929
Tim Peters5de98422002-04-27 18:44:32 +0000930 _PyString_Resize(&v, out - start);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000931 return v;
932}
933
934#undef SPECIAL
935#undef B64
936#undef B64CHAR
937#undef UB64
938#undef ENCODE
939#undef DECODE
940
Guido van Rossumd57fd912000-03-10 22:53:23 +0000941/* --- UTF-8 Codec -------------------------------------------------------- */
942
/* Lookup table mapping the first byte of a UTF-8 sequence to the total
   length of that sequence in bytes.  A zero entry marks a byte that is
   not a legal prefix (0x80-0xBF, 0xFE and 0xFF).  See RFC 2279 for
   details.

   The table is only ever read, so it is const-qualified. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: four bytes, 0xF8 - 0xFB: five, 0xFC - 0xFD: six,
       0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
964
965static
966int utf8_decoding_error(const char **source,
967 Py_UNICODE **dest,
968 const char *errors,
969 const char *details)
970{
971 if ((errors == NULL) ||
972 (strcmp(errors,"strict") == 0)) {
973 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000974 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000975 details);
976 return -1;
977 }
978 else if (strcmp(errors,"ignore") == 0) {
979 (*source)++;
980 return 0;
981 }
982 else if (strcmp(errors,"replace") == 0) {
983 (*source)++;
984 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
985 (*dest)++;
986 return 0;
987 }
988 else {
989 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000990 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000991 errors);
992 return -1;
993 }
994}
995
Guido van Rossumd57fd912000-03-10 22:53:23 +0000996PyObject *PyUnicode_DecodeUTF8(const char *s,
997 int size,
998 const char *errors)
999{
1000 int n;
1001 const char *e;
1002 PyUnicodeObject *unicode;
1003 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001004 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001005
1006 /* Note: size will always be longer than the resulting Unicode
1007 character count */
1008 unicode = _PyUnicode_New(size);
1009 if (!unicode)
1010 return NULL;
1011 if (size == 0)
1012 return (PyObject *)unicode;
1013
1014 /* Unpack UTF-8 encoded data */
1015 p = unicode->str;
1016 e = s + size;
1017
1018 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001019 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001020
1021 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001022 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023 s++;
1024 continue;
1025 }
1026
1027 n = utf8_code_length[ch];
1028
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001029 if (s + n > e) {
1030 errmsg = "unexpected end of data";
1031 goto utf8Error;
1032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001033
1034 switch (n) {
1035
1036 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001037 errmsg = "unexpected code byte";
1038 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001039
1040 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001041 errmsg = "internal error";
1042 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043
1044 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001045 if ((s[1] & 0xc0) != 0x80) {
1046 errmsg = "invalid data";
1047 goto utf8Error;
1048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001049 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001050 if (ch < 0x80) {
1051 errmsg = "illegal encoding";
1052 goto utf8Error;
1053 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001054 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001055 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056 break;
1057
1058 case 3:
1059 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001060 (s[2] & 0xc0) != 0x80) {
1061 errmsg = "invalid data";
1062 goto utf8Error;
1063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001064 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001065 if (ch < 0x0800) {
1066 /* Note: UTF-8 encodings of surrogates are considered
1067 legal UTF-8 sequences;
1068
1069 XXX For wide builds (UCS-4) we should probably try
1070 to recombine the surrogates into a single code
1071 unit.
1072 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001073 errmsg = "illegal encoding";
1074 goto utf8Error;
1075 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001076 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001077 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001078 break;
1079
1080 case 4:
1081 if ((s[1] & 0xc0) != 0x80 ||
1082 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001083 (s[3] & 0xc0) != 0x80) {
1084 errmsg = "invalid data";
1085 goto utf8Error;
1086 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001087 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1088 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1089 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001090 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001091 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001092 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001093 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001094 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001095 errmsg = "illegal encoding";
1096 goto utf8Error;
1097 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001098#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001099 *p++ = (Py_UNICODE)ch;
1100#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001101 /* compute and append the two surrogates: */
1102
1103 /* translate from 10000..10FFFF to 0..FFFF */
1104 ch -= 0x10000;
1105
1106 /* high surrogate = top 10 bits added to D800 */
1107 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1108
1109 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001110 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001111#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 break;
1113
1114 default:
1115 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001116 errmsg = "unsupported Unicode code range";
1117 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001118 }
1119 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001120 continue;
1121
1122 utf8Error:
1123 if (utf8_decoding_error(&s, &p, errors, errmsg))
1124 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001125 }
1126
1127 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001128 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001129 goto onError;
1130
1131 return (PyObject *)unicode;
1132
1133onError:
1134 Py_DECREF(unicode);
1135 return NULL;
1136}
1137
Tim Peters602f7402002-04-27 18:03:26 +00001138/* Allocation strategy: if the string is short, convert into a stack buffer
1139 and allocate exactly as much space needed at the end. Else allocate the
1140 maximum possible needed (4 result bytes per Unicode character), and return
1141 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001142*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001143PyObject *
1144PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1145 int size,
1146 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001147{
Tim Peters602f7402002-04-27 18:03:26 +00001148#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001149
Tim Peters602f7402002-04-27 18:03:26 +00001150 int i; /* index into s of next input byte */
1151 PyObject *v; /* result string object */
1152 char *p; /* next free byte in output buffer */
1153 int nallocated; /* number of result bytes allocated */
1154 int nneeded; /* number of result bytes needed */
1155 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001156
Tim Peters602f7402002-04-27 18:03:26 +00001157 assert(s != NULL);
1158 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159
Tim Peters602f7402002-04-27 18:03:26 +00001160 if (size <= MAX_SHORT_UNICHARS) {
1161 /* Write into the stack buffer; nallocated can't overflow.
1162 * At the end, we'll allocate exactly as much heap space as it
1163 * turns out we need.
1164 */
1165 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1166 v = NULL; /* will allocate after we're done */
1167 p = stackbuf;
1168 }
1169 else {
1170 /* Overallocate on the heap, and give the excess back at the end. */
1171 nallocated = size * 4;
1172 if (nallocated / 4 != size) /* overflow! */
1173 return PyErr_NoMemory();
1174 v = PyString_FromStringAndSize(NULL, nallocated);
1175 if (v == NULL)
1176 return NULL;
1177 p = PyString_AS_STRING(v);
1178 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001179
Tim Peters602f7402002-04-27 18:03:26 +00001180 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001181 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001182
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001183 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001184 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001186
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001188 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001189 *p++ = (char)(0xc0 | (ch >> 6));
1190 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001191 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001192 else {
Tim Peters602f7402002-04-27 18:03:26 +00001193 /* Encode UCS2 Unicode ordinals */
1194 if (ch < 0x10000) {
1195 /* Special case: check for high surrogate */
1196 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1197 Py_UCS4 ch2 = s[i];
1198 /* Check for low surrogate and combine the two to
1199 form a UCS4 value */
1200 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001201 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001202 i++;
1203 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001204 }
Tim Peters602f7402002-04-27 18:03:26 +00001205 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001206 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001207 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001208 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1209 *p++ = (char)(0x80 | (ch & 0x3f));
1210 continue;
1211 }
1212encodeUCS4:
1213 /* Encode UCS4 Unicode ordinals */
1214 *p++ = (char)(0xf0 | (ch >> 18));
1215 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1216 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1217 *p++ = (char)(0x80 | (ch & 0x3f));
1218 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001220
Tim Peters602f7402002-04-27 18:03:26 +00001221 if (v == NULL) {
1222 /* This was stack allocated. */
1223 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1224 assert(nneeded <= nallocated);
1225 v = PyString_FromStringAndSize(stackbuf, nneeded);
1226 }
1227 else {
1228 /* Cut back to size actually needed. */
1229 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1230 assert(nneeded <= nallocated);
1231 _PyString_Resize(&v, nneeded);
1232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001233 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001234
Tim Peters602f7402002-04-27 18:03:26 +00001235#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236}
1237
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1239{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 if (!PyUnicode_Check(unicode)) {
1241 PyErr_BadArgument();
1242 return NULL;
1243 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001244 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1245 PyUnicode_GET_SIZE(unicode),
1246 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247}
1248
1249/* --- UTF-16 Codec ------------------------------------------------------- */
1250
1251static
Tim Peters772747b2001-08-09 22:21:55 +00001252int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 const char *errors,
1254 const char *details)
1255{
1256 if ((errors == NULL) ||
1257 (strcmp(errors,"strict") == 0)) {
1258 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001259 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001260 details);
1261 return -1;
1262 }
1263 else if (strcmp(errors,"ignore") == 0) {
1264 return 0;
1265 }
1266 else if (strcmp(errors,"replace") == 0) {
1267 if (dest) {
1268 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1269 (*dest)++;
1270 }
1271 return 0;
1272 }
1273 else {
1274 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001275 "UTF-16 decoding error; "
1276 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277 errors);
1278 return -1;
1279 }
1280}
1281
Tim Peters772747b2001-08-09 22:21:55 +00001282PyObject *
1283PyUnicode_DecodeUTF16(const char *s,
1284 int size,
1285 const char *errors,
1286 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001287{
1288 PyUnicodeObject *unicode;
1289 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001290 const unsigned char *q, *e;
1291 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001292 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001293 /* Offsets from q for retrieving byte pairs in the right order. */
1294#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1295 int ihi = 1, ilo = 0;
1296#else
1297 int ihi = 0, ilo = 1;
1298#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001299
1300 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001301 if (size & 1) {
1302 if (utf16_decoding_error(NULL, errors, "truncated data"))
1303 return NULL;
1304 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001305 }
1306
1307 /* Note: size will always be longer than the resulting Unicode
1308 character count */
1309 unicode = _PyUnicode_New(size);
1310 if (!unicode)
1311 return NULL;
1312 if (size == 0)
1313 return (PyObject *)unicode;
1314
1315 /* Unpack UTF-16 encoded data */
1316 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001317 q = (unsigned char *)s;
1318 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001319
1320 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001321 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001322
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001323 /* Check for BOM marks (U+FEFF) in the input and adjust current
1324 byte order setting accordingly. In native mode, the leading BOM
1325 mark is skipped, in all other modes, it is copied to the output
1326 stream as-is (giving a ZWNBSP character). */
1327 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001328 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001329#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001330 if (bom == 0xFEFF) {
1331 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001332 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001333 }
1334 else if (bom == 0xFFFE) {
1335 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001336 bo = 1;
1337 }
1338#else
Tim Peters772747b2001-08-09 22:21:55 +00001339 if (bom == 0xFEFF) {
1340 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001341 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001342 }
1343 else if (bom == 0xFFFE) {
1344 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001345 bo = -1;
1346 }
1347#endif
1348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349
Tim Peters772747b2001-08-09 22:21:55 +00001350 if (bo == -1) {
1351 /* force LE */
1352 ihi = 1;
1353 ilo = 0;
1354 }
1355 else if (bo == 1) {
1356 /* force BE */
1357 ihi = 0;
1358 ilo = 1;
1359 }
1360
1361 while (q < e) {
1362 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1363 q += 2;
1364
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 if (ch < 0xD800 || ch > 0xDFFF) {
1366 *p++ = ch;
1367 continue;
1368 }
1369
1370 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001371 if (q >= e) {
1372 errmsg = "unexpected end of data";
1373 goto utf16Error;
1374 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001375 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001376 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1377 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001378 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001379#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001380 *p++ = ch;
1381 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001382#else
1383 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001384#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001385 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001386 }
1387 else {
1388 errmsg = "illegal UTF-16 surrogate";
1389 goto utf16Error;
1390 }
1391
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001393 errmsg = "illegal encoding";
1394 /* Fall through to report the error */
1395
1396 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001397 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001398 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001399 }
1400
1401 if (byteorder)
1402 *byteorder = bo;
1403
1404 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001405 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001406 goto onError;
1407
1408 return (PyObject *)unicode;
1409
1410onError:
1411 Py_DECREF(unicode);
1412 return NULL;
1413}
1414
Tim Peters772747b2001-08-09 22:21:55 +00001415PyObject *
1416PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1417 int size,
1418 const char *errors,
1419 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001420{
1421 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001422 unsigned char *p;
1423 int i, pairs;
1424 /* Offsets from p for storing byte pairs in the right order. */
1425#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1426 int ihi = 1, ilo = 0;
1427#else
1428 int ihi = 0, ilo = 1;
1429#endif
1430
1431#define STORECHAR(CH) \
1432 do { \
1433 p[ihi] = ((CH) >> 8) & 0xff; \
1434 p[ilo] = (CH) & 0xff; \
1435 p += 2; \
1436 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001437
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001438 for (i = pairs = 0; i < size; i++)
1439 if (s[i] >= 0x10000)
1440 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001442 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443 if (v == NULL)
1444 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445
Tim Peters772747b2001-08-09 22:21:55 +00001446 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001448 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001449 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001450 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001451
1452 if (byteorder == -1) {
1453 /* force LE */
1454 ihi = 1;
1455 ilo = 0;
1456 }
1457 else if (byteorder == 1) {
1458 /* force BE */
1459 ihi = 0;
1460 ilo = 1;
1461 }
1462
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001463 while (size-- > 0) {
1464 Py_UNICODE ch = *s++;
1465 Py_UNICODE ch2 = 0;
1466 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001467 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1468 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469 }
Tim Peters772747b2001-08-09 22:21:55 +00001470 STORECHAR(ch);
1471 if (ch2)
1472 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001473 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001474 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001475#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001476}
1477
1478PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1479{
1480 if (!PyUnicode_Check(unicode)) {
1481 PyErr_BadArgument();
1482 return NULL;
1483 }
1484 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1485 PyUnicode_GET_SIZE(unicode),
1486 NULL,
1487 0);
1488}
1489
1490/* --- Unicode Escape Codec ----------------------------------------------- */
1491
1492static
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001493int unicodeescape_decoding_error(Py_UNICODE **x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494 const char *errors,
1495 const char *details)
1496{
1497 if ((errors == NULL) ||
1498 (strcmp(errors,"strict") == 0)) {
1499 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001500 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501 details);
1502 return -1;
1503 }
1504 else if (strcmp(errors,"ignore") == 0) {
1505 return 0;
1506 }
1507 else if (strcmp(errors,"replace") == 0) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001508 **x = Py_UNICODE_REPLACEMENT_CHARACTER;
1509 (*x)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001510 return 0;
1511 }
1512 else {
1513 PyErr_Format(PyExc_ValueError,
1514 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001515 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001516 errors);
1517 return -1;
1518 }
1519}
1520
Fredrik Lundh06d12682001-01-24 07:59:11 +00001521static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001522
Guido van Rossumd57fd912000-03-10 22:53:23 +00001523PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1524 int size,
1525 const char *errors)
1526{
1527 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001528 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001530 char* message;
1531 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1532
Guido van Rossumd57fd912000-03-10 22:53:23 +00001533 /* Escaped strings will always be longer than the resulting
1534 Unicode string, so we start with size here and then reduce the
1535 length after conversion to the true value. */
1536 v = _PyUnicode_New(size);
1537 if (v == NULL)
1538 goto onError;
1539 if (size == 0)
1540 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001541
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 p = buf = PyUnicode_AS_UNICODE(v);
1543 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001544
Guido van Rossumd57fd912000-03-10 22:53:23 +00001545 while (s < end) {
1546 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001547 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001548 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549
1550 /* Non-escape characters are interpreted as Unicode ordinals */
1551 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001552 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001553 continue;
1554 }
1555
1556 /* \ - Escapes */
1557 s++;
1558 switch (*s++) {
1559
1560 /* \x escapes */
1561 case '\n': break;
1562 case '\\': *p++ = '\\'; break;
1563 case '\'': *p++ = '\''; break;
1564 case '\"': *p++ = '\"'; break;
1565 case 'b': *p++ = '\b'; break;
1566 case 'f': *p++ = '\014'; break; /* FF */
1567 case 't': *p++ = '\t'; break;
1568 case 'n': *p++ = '\n'; break;
1569 case 'r': *p++ = '\r'; break;
1570 case 'v': *p++ = '\013'; break; /* VT */
1571 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1572
1573 /* \OOO (octal) escapes */
1574 case '0': case '1': case '2': case '3':
1575 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001576 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001577 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001578 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001579 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001580 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001581 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001582 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001583 break;
1584
Fredrik Lundhccc74732001-02-18 22:13:49 +00001585 /* hex escapes */
1586 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001588 digits = 2;
1589 message = "truncated \\xXX escape";
1590 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001591
Fredrik Lundhccc74732001-02-18 22:13:49 +00001592 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001594 digits = 4;
1595 message = "truncated \\uXXXX escape";
1596 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001597
Fredrik Lundhccc74732001-02-18 22:13:49 +00001598 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001599 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001600 digits = 8;
1601 message = "truncated \\UXXXXXXXX escape";
1602 hexescape:
1603 chr = 0;
1604 for (i = 0; i < digits; i++) {
1605 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001606 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001607 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001608 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001609 chr = 0xffffffff;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001610 i++;
1611 break;
1612 }
1613 chr = (chr<<4) & ~0xF;
1614 if (c >= '0' && c <= '9')
1615 chr += c - '0';
1616 else if (c >= 'a' && c <= 'f')
1617 chr += 10 + c - 'a';
1618 else
1619 chr += 10 + c - 'A';
1620 }
1621 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001622 if (chr == 0xffffffff)
1623 /* _decoding_error will have already written into the
1624 target buffer. */
1625 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001626 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001627 /* when we get here, chr is a 32-bit unicode character */
1628 if (chr <= 0xffff)
1629 /* UCS-2 character */
1630 *p++ = (Py_UNICODE) chr;
1631 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001632 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001633 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001634#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001635 *p++ = chr;
1636#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001637 chr -= 0x10000L;
1638 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001639 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001640#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001641 } else {
1642 if (unicodeescape_decoding_error(
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001643 &p, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001644 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001645 )
1646 goto onError;
1647 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001648 break;
1649
1650 /* \N{name} */
1651 case 'N':
1652 message = "malformed \\N character escape";
1653 if (ucnhash_CAPI == NULL) {
1654 /* load the unicode data module */
1655 PyObject *m, *v;
1656 m = PyImport_ImportModule("unicodedata");
1657 if (m == NULL)
1658 goto ucnhashError;
1659 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1660 Py_DECREF(m);
1661 if (v == NULL)
1662 goto ucnhashError;
1663 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1664 Py_DECREF(v);
1665 if (ucnhash_CAPI == NULL)
1666 goto ucnhashError;
1667 }
1668 if (*s == '{') {
1669 const char *start = s+1;
1670 /* look for the closing brace */
1671 while (*s != '}' && s < end)
1672 s++;
1673 if (s > start && s < end && *s == '}') {
1674 /* found a name. look it up in the unicode database */
1675 message = "unknown Unicode character name";
1676 s++;
1677 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1678 goto store;
1679 }
1680 }
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001681 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001682 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001683 break;
1684
1685 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001686 if (s > end) {
1687 if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
1688 goto onError;
1689 }
1690 else {
1691 *p++ = '\\';
1692 *p++ = (unsigned char)s[-1];
1693 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001694 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001695 }
1696 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001697 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Walter Dörwald8c077222002-03-25 11:16:18 +00001698 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001699 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001700
Fredrik Lundhccc74732001-02-18 22:13:49 +00001701ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001702 PyErr_SetString(
1703 PyExc_UnicodeError,
1704 "\\N escapes not supported (can't load unicodedata module)"
1705 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001706 return NULL;
1707
Fredrik Lundhccc74732001-02-18 22:13:49 +00001708onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001709 Py_XDECREF(v);
1710 return NULL;
1711}
1712
1713/* Return a Unicode-Escape string version of the Unicode object.
1714
1715 If quotes is true, the string is enclosed in u"" or u'' quotes as
1716 appropriate.
1717
1718*/
1719
Barry Warsaw51ac5802000-03-20 16:36:48 +00001720static const Py_UNICODE *findchar(const Py_UNICODE *s,
1721 int size,
1722 Py_UNICODE ch);
1723
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724static
1725PyObject *unicodeescape_string(const Py_UNICODE *s,
1726 int size,
1727 int quotes)
1728{
1729 PyObject *repr;
1730 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001732 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733
1734 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1735 if (repr == NULL)
1736 return NULL;
1737
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001738 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739
1740 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741 *p++ = 'u';
1742 *p++ = (findchar(s, size, '\'') &&
1743 !findchar(s, size, '"')) ? '"' : '\'';
1744 }
1745 while (size-- > 0) {
1746 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001747
Guido van Rossumd57fd912000-03-10 22:53:23 +00001748 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001749 if (quotes &&
1750 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 *p++ = '\\';
1752 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001753 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001755
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001756#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001757 /* Map 21-bit characters to '\U00xxxxxx' */
1758 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001759 int offset = p - PyString_AS_STRING(repr);
1760
1761 /* Resize the string if necessary */
1762 if (offset + 12 > PyString_GET_SIZE(repr)) {
1763 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001764 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001765 p = PyString_AS_STRING(repr) + offset;
1766 }
1767
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001768 *p++ = '\\';
1769 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001770 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1771 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1772 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1773 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1774 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1775 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1776 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001777 *p++ = hexdigit[ch & 0x0000000F];
1778 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001779 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001780#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001781 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1782 else if (ch >= 0xD800 && ch < 0xDC00) {
1783 Py_UNICODE ch2;
1784 Py_UCS4 ucs;
1785
1786 ch2 = *s++;
1787 size--;
1788 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1789 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1790 *p++ = '\\';
1791 *p++ = 'U';
1792 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1793 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1794 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1795 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1796 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1797 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1798 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1799 *p++ = hexdigit[ucs & 0x0000000F];
1800 continue;
1801 }
1802 /* Fall through: isolated surrogates are copied as-is */
1803 s--;
1804 size++;
1805 }
1806
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001808 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809 *p++ = '\\';
1810 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001811 *p++ = hexdigit[(ch >> 12) & 0x000F];
1812 *p++ = hexdigit[(ch >> 8) & 0x000F];
1813 *p++ = hexdigit[(ch >> 4) & 0x000F];
1814 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001816
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001817 /* Map special whitespace to '\t', \n', '\r' */
1818 else if (ch == '\t') {
1819 *p++ = '\\';
1820 *p++ = 't';
1821 }
1822 else if (ch == '\n') {
1823 *p++ = '\\';
1824 *p++ = 'n';
1825 }
1826 else if (ch == '\r') {
1827 *p++ = '\\';
1828 *p++ = 'r';
1829 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001830
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001831 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001832 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001833 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001834 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001835 *p++ = hexdigit[(ch >> 4) & 0x000F];
1836 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001838
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 /* Copy everything else as-is */
1840 else
1841 *p++ = (char) ch;
1842 }
1843 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001844 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845
1846 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00001847 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 return repr;
1849}
1850
1851PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1852 int size)
1853{
1854 return unicodeescape_string(s, size, 0);
1855}
1856
1857PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1858{
1859 if (!PyUnicode_Check(unicode)) {
1860 PyErr_BadArgument();
1861 return NULL;
1862 }
1863 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1864 PyUnicode_GET_SIZE(unicode));
1865}
1866
1867/* --- Raw Unicode Escape Codec ------------------------------------------- */
1868
1869PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1870 int size,
1871 const char *errors)
1872{
1873 PyUnicodeObject *v;
1874 Py_UNICODE *p, *buf;
1875 const char *end;
1876 const char *bs;
1877
1878 /* Escaped strings will always be longer than the resulting
1879 Unicode string, so we start with size here and then reduce the
1880 length after conversion to the true value. */
1881 v = _PyUnicode_New(size);
1882 if (v == NULL)
1883 goto onError;
1884 if (size == 0)
1885 return (PyObject *)v;
1886 p = buf = PyUnicode_AS_UNICODE(v);
1887 end = s + size;
1888 while (s < end) {
1889 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001890 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001891 int i;
1892
1893 /* Non-escape characters are interpreted as Unicode ordinals */
1894 if (*s != '\\') {
1895 *p++ = (unsigned char)*s++;
1896 continue;
1897 }
1898
1899 /* \u-escapes are only interpreted iff the number of leading
1900 backslashes if odd */
1901 bs = s;
1902 for (;s < end;) {
1903 if (*s != '\\')
1904 break;
1905 *p++ = (unsigned char)*s++;
1906 }
1907 if (((s - bs) & 1) == 0 ||
1908 s >= end ||
1909 *s != 'u') {
1910 continue;
1911 }
1912 p--;
1913 s++;
1914
1915 /* \uXXXX with 4 hex digits */
1916 for (x = 0, i = 0; i < 4; i++) {
1917 c = (unsigned char)s[i];
1918 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001919 if (unicodeescape_decoding_error(&p, errors,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920 "truncated \\uXXXX"))
1921 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001922 x = 0xffffffff;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001923 i++;
1924 break;
1925 }
1926 x = (x<<4) & ~0xF;
1927 if (c >= '0' && c <= '9')
1928 x += c - '0';
1929 else if (c >= 'a' && c <= 'f')
1930 x += 10 + c - 'a';
1931 else
1932 x += 10 + c - 'A';
1933 }
1934 s += i;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001935 if (x != 0xffffffff)
1936 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001938 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001939 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940 return (PyObject *)v;
1941
1942 onError:
1943 Py_XDECREF(v);
1944 return NULL;
1945}
1946
1947PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1948 int size)
1949{
1950 PyObject *repr;
1951 char *p;
1952 char *q;
1953
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001954 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955
1956 repr = PyString_FromStringAndSize(NULL, 6 * size);
1957 if (repr == NULL)
1958 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001959 if (size == 0)
1960 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001961
1962 p = q = PyString_AS_STRING(repr);
1963 while (size-- > 0) {
1964 Py_UNICODE ch = *s++;
1965 /* Map 16-bit characters to '\uxxxx' */
1966 if (ch >= 256) {
1967 *p++ = '\\';
1968 *p++ = 'u';
1969 *p++ = hexdigit[(ch >> 12) & 0xf];
1970 *p++ = hexdigit[(ch >> 8) & 0xf];
1971 *p++ = hexdigit[(ch >> 4) & 0xf];
1972 *p++ = hexdigit[ch & 15];
1973 }
1974 /* Copy everything else as-is */
1975 else
1976 *p++ = (char) ch;
1977 }
1978 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00001979 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001980 return repr;
1981}
1982
1983PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1984{
1985 if (!PyUnicode_Check(unicode)) {
1986 PyErr_BadArgument();
1987 return NULL;
1988 }
1989 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1990 PyUnicode_GET_SIZE(unicode));
1991}
1992
1993/* --- Latin-1 Codec ------------------------------------------------------ */
1994
1995PyObject *PyUnicode_DecodeLatin1(const char *s,
1996 int size,
1997 const char *errors)
1998{
1999 PyUnicodeObject *v;
2000 Py_UNICODE *p;
2001
2002 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002003 if (size == 1 && *(unsigned char*)s < 256) {
2004 Py_UNICODE r = *(unsigned char*)s;
2005 return PyUnicode_FromUnicode(&r, 1);
2006 }
2007
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 v = _PyUnicode_New(size);
2009 if (v == NULL)
2010 goto onError;
2011 if (size == 0)
2012 return (PyObject *)v;
2013 p = PyUnicode_AS_UNICODE(v);
2014 while (size-- > 0)
2015 *p++ = (unsigned char)*s++;
2016 return (PyObject *)v;
2017
2018 onError:
2019 Py_XDECREF(v);
2020 return NULL;
2021}
2022
2023static
2024int latin1_encoding_error(const Py_UNICODE **source,
2025 char **dest,
2026 const char *errors,
2027 const char *details)
2028{
2029 if ((errors == NULL) ||
2030 (strcmp(errors,"strict") == 0)) {
2031 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002032 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033 details);
2034 return -1;
2035 }
2036 else if (strcmp(errors,"ignore") == 0) {
2037 return 0;
2038 }
2039 else if (strcmp(errors,"replace") == 0) {
2040 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002041 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042 return 0;
2043 }
2044 else {
2045 PyErr_Format(PyExc_ValueError,
2046 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002047 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 errors);
2049 return -1;
2050 }
2051}
2052
2053PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2054 int size,
2055 const char *errors)
2056{
2057 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002058 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002059
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 repr = PyString_FromStringAndSize(NULL, size);
2061 if (repr == NULL)
2062 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002063 if (size == 0)
2064 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065
2066 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002067 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 while (size-- > 0) {
2069 Py_UNICODE ch = *p++;
2070 if (ch >= 256) {
2071 if (latin1_encoding_error(&p, &s, errors,
2072 "ordinal not in range(256)"))
2073 goto onError;
2074 }
2075 else
2076 *s++ = (char)ch;
2077 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002078 /* Resize if error handling skipped some characters */
2079 if (s - start < PyString_GET_SIZE(repr))
Tim Peters5de98422002-04-27 18:44:32 +00002080 _PyString_Resize(&repr, s - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002081 return repr;
2082
2083 onError:
2084 Py_DECREF(repr);
2085 return NULL;
2086}
2087
2088PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2089{
2090 if (!PyUnicode_Check(unicode)) {
2091 PyErr_BadArgument();
2092 return NULL;
2093 }
2094 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2095 PyUnicode_GET_SIZE(unicode),
2096 NULL);
2097}
2098
2099/* --- 7-bit ASCII Codec -------------------------------------------------- */
2100
2101static
2102int ascii_decoding_error(const char **source,
2103 Py_UNICODE **dest,
2104 const char *errors,
2105 const char *details)
2106{
2107 if ((errors == NULL) ||
2108 (strcmp(errors,"strict") == 0)) {
2109 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002110 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002111 details);
2112 return -1;
2113 }
2114 else if (strcmp(errors,"ignore") == 0) {
2115 return 0;
2116 }
2117 else if (strcmp(errors,"replace") == 0) {
2118 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2119 (*dest)++;
2120 return 0;
2121 }
2122 else {
2123 PyErr_Format(PyExc_ValueError,
2124 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002125 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002126 errors);
2127 return -1;
2128 }
2129}
2130
2131PyObject *PyUnicode_DecodeASCII(const char *s,
2132 int size,
2133 const char *errors)
2134{
2135 PyUnicodeObject *v;
2136 Py_UNICODE *p;
2137
2138 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002139 if (size == 1 && *(unsigned char*)s < 128) {
2140 Py_UNICODE r = *(unsigned char*)s;
2141 return PyUnicode_FromUnicode(&r, 1);
2142 }
2143
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 v = _PyUnicode_New(size);
2145 if (v == NULL)
2146 goto onError;
2147 if (size == 0)
2148 return (PyObject *)v;
2149 p = PyUnicode_AS_UNICODE(v);
2150 while (size-- > 0) {
2151 register unsigned char c;
2152
2153 c = (unsigned char)*s++;
2154 if (c < 128)
2155 *p++ = c;
2156 else if (ascii_decoding_error(&s, &p, errors,
2157 "ordinal not in range(128)"))
2158 goto onError;
2159 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002160 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002161 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002162 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163 return (PyObject *)v;
2164
2165 onError:
2166 Py_XDECREF(v);
2167 return NULL;
2168}
2169
2170static
2171int ascii_encoding_error(const Py_UNICODE **source,
2172 char **dest,
2173 const char *errors,
2174 const char *details)
2175{
2176 if ((errors == NULL) ||
2177 (strcmp(errors,"strict") == 0)) {
2178 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002179 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 details);
2181 return -1;
2182 }
2183 else if (strcmp(errors,"ignore") == 0) {
2184 return 0;
2185 }
2186 else if (strcmp(errors,"replace") == 0) {
2187 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002188 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002189 return 0;
2190 }
2191 else {
2192 PyErr_Format(PyExc_ValueError,
2193 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002194 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195 errors);
2196 return -1;
2197 }
2198}
2199
2200PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2201 int size,
2202 const char *errors)
2203{
2204 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002205 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002206
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 repr = PyString_FromStringAndSize(NULL, size);
2208 if (repr == NULL)
2209 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002210 if (size == 0)
2211 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212
2213 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002214 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215 while (size-- > 0) {
2216 Py_UNICODE ch = *p++;
2217 if (ch >= 128) {
2218 if (ascii_encoding_error(&p, &s, errors,
2219 "ordinal not in range(128)"))
2220 goto onError;
2221 }
2222 else
2223 *s++ = (char)ch;
2224 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002225 /* Resize if error handling skipped some characters */
2226 if (s - start < PyString_GET_SIZE(repr))
Tim Peters5de98422002-04-27 18:44:32 +00002227 _PyString_Resize(&repr, s - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 return repr;
2229
2230 onError:
2231 Py_DECREF(repr);
2232 return NULL;
2233}
2234
2235PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2236{
2237 if (!PyUnicode_Check(unicode)) {
2238 PyErr_BadArgument();
2239 return NULL;
2240 }
2241 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2242 PyUnicode_GET_SIZE(unicode),
2243 NULL);
2244}
2245
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002246#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002247
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002248/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002249
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002250PyObject *PyUnicode_DecodeMBCS(const char *s,
2251 int size,
2252 const char *errors)
2253{
2254 PyUnicodeObject *v;
2255 Py_UNICODE *p;
2256
2257 /* First get the size of the result */
2258 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002259 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002260 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2261
2262 v = _PyUnicode_New(usize);
2263 if (v == NULL)
2264 return NULL;
2265 if (usize == 0)
2266 return (PyObject *)v;
2267 p = PyUnicode_AS_UNICODE(v);
2268 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2269 Py_DECREF(v);
2270 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2271 }
2272
2273 return (PyObject *)v;
2274}
2275
2276PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2277 int size,
2278 const char *errors)
2279{
2280 PyObject *repr;
2281 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002282 DWORD mbcssize;
2283
2284 /* If there are no characters, bail now! */
2285 if (size==0)
2286 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002287
2288 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002289 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002290 if (mbcssize==0)
2291 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2292
2293 repr = PyString_FromStringAndSize(NULL, mbcssize);
2294 if (repr == NULL)
2295 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002296 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002297 return repr;
2298
2299 /* Do the conversion */
2300 s = PyString_AS_STRING(repr);
2301 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2302 Py_DECREF(repr);
2303 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2304 }
2305 return repr;
2306}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002307
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002308#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002309
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310/* --- Character Mapping Codec -------------------------------------------- */
2311
2312static
2313int charmap_decoding_error(const char **source,
2314 Py_UNICODE **dest,
2315 const char *errors,
2316 const char *details)
2317{
2318 if ((errors == NULL) ||
2319 (strcmp(errors,"strict") == 0)) {
2320 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002321 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002322 details);
2323 return -1;
2324 }
2325 else if (strcmp(errors,"ignore") == 0) {
2326 return 0;
2327 }
2328 else if (strcmp(errors,"replace") == 0) {
2329 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2330 (*dest)++;
2331 return 0;
2332 }
2333 else {
2334 PyErr_Format(PyExc_ValueError,
2335 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002336 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002337 errors);
2338 return -1;
2339 }
2340}
2341
2342PyObject *PyUnicode_DecodeCharmap(const char *s,
2343 int size,
2344 PyObject *mapping,
2345 const char *errors)
2346{
2347 PyUnicodeObject *v;
2348 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002349 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002350
2351 /* Default to Latin-1 */
2352 if (mapping == NULL)
2353 return PyUnicode_DecodeLatin1(s, size, errors);
2354
2355 v = _PyUnicode_New(size);
2356 if (v == NULL)
2357 goto onError;
2358 if (size == 0)
2359 return (PyObject *)v;
2360 p = PyUnicode_AS_UNICODE(v);
2361 while (size-- > 0) {
2362 unsigned char ch = *s++;
2363 PyObject *w, *x;
2364
2365 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2366 w = PyInt_FromLong((long)ch);
2367 if (w == NULL)
2368 goto onError;
2369 x = PyObject_GetItem(mapping, w);
2370 Py_DECREF(w);
2371 if (x == NULL) {
2372 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002373 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002374 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002375 x = Py_None;
2376 Py_INCREF(x);
2377 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002378 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002379 }
2380
2381 /* Apply mapping */
2382 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002383 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002384 if (value < 0 || value > 65535) {
2385 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002386 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002387 Py_DECREF(x);
2388 goto onError;
2389 }
2390 *p++ = (Py_UNICODE)value;
2391 }
2392 else if (x == Py_None) {
2393 /* undefined mapping */
2394 if (charmap_decoding_error(&s, &p, errors,
2395 "character maps to <undefined>")) {
2396 Py_DECREF(x);
2397 goto onError;
2398 }
2399 }
2400 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002401 int targetsize = PyUnicode_GET_SIZE(x);
2402
2403 if (targetsize == 1)
2404 /* 1-1 mapping */
2405 *p++ = *PyUnicode_AS_UNICODE(x);
2406
2407 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002408 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002409 if (targetsize > extrachars) {
2410 /* resize first */
2411 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2412 int needed = (targetsize - extrachars) + \
2413 (targetsize << 2);
2414 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002415 if (_PyUnicode_Resize(&v,
2416 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002417 Py_DECREF(x);
2418 goto onError;
2419 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002420 p = PyUnicode_AS_UNICODE(v) + oldpos;
2421 }
2422 Py_UNICODE_COPY(p,
2423 PyUnicode_AS_UNICODE(x),
2424 targetsize);
2425 p += targetsize;
2426 extrachars -= targetsize;
2427 }
2428 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002429 }
2430 else {
2431 /* wrong return value */
2432 PyErr_SetString(PyExc_TypeError,
2433 "character mapping must return integer, None or unicode");
2434 Py_DECREF(x);
2435 goto onError;
2436 }
2437 Py_DECREF(x);
2438 }
2439 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002440 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002441 goto onError;
2442 return (PyObject *)v;
2443
2444 onError:
2445 Py_XDECREF(v);
2446 return NULL;
2447}
2448
2449static
2450int charmap_encoding_error(const Py_UNICODE **source,
2451 char **dest,
2452 const char *errors,
2453 const char *details)
2454{
2455 if ((errors == NULL) ||
2456 (strcmp(errors,"strict") == 0)) {
2457 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002458 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002459 details);
2460 return -1;
2461 }
2462 else if (strcmp(errors,"ignore") == 0) {
2463 return 0;
2464 }
2465 else if (strcmp(errors,"replace") == 0) {
2466 **dest = '?';
2467 (*dest)++;
2468 return 0;
2469 }
2470 else {
2471 PyErr_Format(PyExc_ValueError,
2472 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002473 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474 errors);
2475 return -1;
2476 }
2477}
2478
2479PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2480 int size,
2481 PyObject *mapping,
2482 const char *errors)
2483{
2484 PyObject *v;
2485 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002486 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487
2488 /* Default to Latin-1 */
2489 if (mapping == NULL)
2490 return PyUnicode_EncodeLatin1(p, size, errors);
2491
2492 v = PyString_FromStringAndSize(NULL, size);
2493 if (v == NULL)
2494 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002495 if (size == 0)
2496 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497 s = PyString_AS_STRING(v);
2498 while (size-- > 0) {
2499 Py_UNICODE ch = *p++;
2500 PyObject *w, *x;
2501
2502 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2503 w = PyInt_FromLong((long)ch);
2504 if (w == NULL)
2505 goto onError;
2506 x = PyObject_GetItem(mapping, w);
2507 Py_DECREF(w);
2508 if (x == NULL) {
2509 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002510 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002512 x = Py_None;
2513 Py_INCREF(x);
2514 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002515 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002516 }
2517
2518 /* Apply mapping */
2519 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002520 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521 if (value < 0 || value > 255) {
2522 PyErr_SetString(PyExc_TypeError,
2523 "character mapping must be in range(256)");
2524 Py_DECREF(x);
2525 goto onError;
2526 }
2527 *s++ = (char)value;
2528 }
2529 else if (x == Py_None) {
2530 /* undefined mapping */
2531 if (charmap_encoding_error(&p, &s, errors,
2532 "character maps to <undefined>")) {
2533 Py_DECREF(x);
2534 goto onError;
2535 }
2536 }
2537 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002538 int targetsize = PyString_GET_SIZE(x);
2539
2540 if (targetsize == 1)
2541 /* 1-1 mapping */
2542 *s++ = *PyString_AS_STRING(x);
2543
2544 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002546 if (targetsize > extrachars) {
2547 /* resize first */
2548 int oldpos = (int)(s - PyString_AS_STRING(v));
2549 int needed = (targetsize - extrachars) + \
2550 (targetsize << 2);
2551 extrachars += needed;
2552 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002553 Py_DECREF(x);
2554 goto onError;
2555 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002556 s = PyString_AS_STRING(v) + oldpos;
2557 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002558 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002559 s += targetsize;
2560 extrachars -= targetsize;
2561 }
2562 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563 }
2564 else {
2565 /* wrong return value */
2566 PyErr_SetString(PyExc_TypeError,
2567 "character mapping must return integer, None or unicode");
2568 Py_DECREF(x);
2569 goto onError;
2570 }
2571 Py_DECREF(x);
2572 }
2573 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
Tim Peters5de98422002-04-27 18:44:32 +00002574 _PyString_Resize(&v, (int)(s - PyString_AS_STRING(v)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575 return v;
2576
2577 onError:
Tim Peters5de98422002-04-27 18:44:32 +00002578 Py_XDECREF(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002579 return NULL;
2580}
2581
2582PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2583 PyObject *mapping)
2584{
2585 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2586 PyErr_BadArgument();
2587 return NULL;
2588 }
2589 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2590 PyUnicode_GET_SIZE(unicode),
2591 mapping,
2592 NULL);
2593}
2594
2595static
2596int translate_error(const Py_UNICODE **source,
2597 Py_UNICODE **dest,
2598 const char *errors,
2599 const char *details)
2600{
2601 if ((errors == NULL) ||
2602 (strcmp(errors,"strict") == 0)) {
2603 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002604 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605 details);
2606 return -1;
2607 }
2608 else if (strcmp(errors,"ignore") == 0) {
2609 return 0;
2610 }
2611 else if (strcmp(errors,"replace") == 0) {
2612 **dest = '?';
2613 (*dest)++;
2614 return 0;
2615 }
2616 else {
2617 PyErr_Format(PyExc_ValueError,
2618 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002619 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620 errors);
2621 return -1;
2622 }
2623}
2624
2625PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2626 int size,
2627 PyObject *mapping,
2628 const char *errors)
2629{
2630 PyUnicodeObject *v;
2631 Py_UNICODE *p;
2632
2633 if (mapping == NULL) {
2634 PyErr_BadArgument();
2635 return NULL;
2636 }
2637
2638 /* Output will never be longer than input */
2639 v = _PyUnicode_New(size);
2640 if (v == NULL)
2641 goto onError;
2642 if (size == 0)
2643 goto done;
2644 p = PyUnicode_AS_UNICODE(v);
2645 while (size-- > 0) {
2646 Py_UNICODE ch = *s++;
2647 PyObject *w, *x;
2648
2649 /* Get mapping */
2650 w = PyInt_FromLong(ch);
2651 if (w == NULL)
2652 goto onError;
2653 x = PyObject_GetItem(mapping, w);
2654 Py_DECREF(w);
2655 if (x == NULL) {
2656 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2657 /* No mapping found: default to 1-1 mapping */
2658 PyErr_Clear();
2659 *p++ = ch;
2660 continue;
2661 }
2662 goto onError;
2663 }
2664
2665 /* Apply mapping */
2666 if (PyInt_Check(x))
2667 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2668 else if (x == Py_None) {
2669 /* undefined mapping */
2670 if (translate_error(&s, &p, errors,
2671 "character maps to <undefined>")) {
2672 Py_DECREF(x);
2673 goto onError;
2674 }
2675 }
2676 else if (PyUnicode_Check(x)) {
2677 if (PyUnicode_GET_SIZE(x) != 1) {
2678 /* 1-n mapping */
2679 PyErr_SetString(PyExc_NotImplementedError,
2680 "1-n mappings are currently not implemented");
2681 Py_DECREF(x);
2682 goto onError;
2683 }
2684 *p++ = *PyUnicode_AS_UNICODE(x);
2685 }
2686 else {
2687 /* wrong return value */
2688 PyErr_SetString(PyExc_TypeError,
2689 "translate mapping must return integer, None or unicode");
2690 Py_DECREF(x);
2691 goto onError;
2692 }
2693 Py_DECREF(x);
2694 }
2695 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002696 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002697 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698
2699 done:
2700 return (PyObject *)v;
2701
2702 onError:
2703 Py_XDECREF(v);
2704 return NULL;
2705}
2706
2707PyObject *PyUnicode_Translate(PyObject *str,
2708 PyObject *mapping,
2709 const char *errors)
2710{
2711 PyObject *result;
2712
2713 str = PyUnicode_FromObject(str);
2714 if (str == NULL)
2715 goto onError;
2716 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2717 PyUnicode_GET_SIZE(str),
2718 mapping,
2719 errors);
2720 Py_DECREF(str);
2721 return result;
2722
2723 onError:
2724 Py_XDECREF(str);
2725 return NULL;
2726}
2727
Guido van Rossum9e896b32000-04-05 20:11:21 +00002728/* --- Decimal Encoder ---------------------------------------------------- */
2729
2730int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2731 int length,
2732 char *output,
2733 const char *errors)
2734{
2735 Py_UNICODE *p, *end;
2736
2737 if (output == NULL) {
2738 PyErr_BadArgument();
2739 return -1;
2740 }
2741
2742 p = s;
2743 end = s + length;
2744 while (p < end) {
2745 register Py_UNICODE ch = *p++;
2746 int decimal;
2747
2748 if (Py_UNICODE_ISSPACE(ch)) {
2749 *output++ = ' ';
2750 continue;
2751 }
2752 decimal = Py_UNICODE_TODECIMAL(ch);
2753 if (decimal >= 0) {
2754 *output++ = '0' + decimal;
2755 continue;
2756 }
Guido van Rossumba477042000-04-06 18:18:10 +00002757 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002758 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002759 continue;
2760 }
2761 /* All other characters are considered invalid */
2762 if (errors == NULL || strcmp(errors, "strict") == 0) {
2763 PyErr_SetString(PyExc_ValueError,
2764 "invalid decimal Unicode string");
2765 goto onError;
2766 }
2767 else if (strcmp(errors, "ignore") == 0)
2768 continue;
2769 else if (strcmp(errors, "replace") == 0) {
2770 *output++ = '?';
2771 continue;
2772 }
2773 }
2774 /* 0-terminate the output string */
2775 *output++ = '\0';
2776 return 0;
2777
2778 onError:
2779 return -1;
2780}
2781
Guido van Rossumd57fd912000-03-10 22:53:23 +00002782/* --- Helpers ------------------------------------------------------------ */
2783
2784static
2785int count(PyUnicodeObject *self,
2786 int start,
2787 int end,
2788 PyUnicodeObject *substring)
2789{
2790 int count = 0;
2791
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002792 if (start < 0)
2793 start += self->length;
2794 if (start < 0)
2795 start = 0;
2796 if (end > self->length)
2797 end = self->length;
2798 if (end < 0)
2799 end += self->length;
2800 if (end < 0)
2801 end = 0;
2802
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002803 if (substring->length == 0)
2804 return (end - start + 1);
2805
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806 end -= substring->length;
2807
2808 while (start <= end)
2809 if (Py_UNICODE_MATCH(self, start, substring)) {
2810 count++;
2811 start += substring->length;
2812 } else
2813 start++;
2814
2815 return count;
2816}
2817
2818int PyUnicode_Count(PyObject *str,
2819 PyObject *substr,
2820 int start,
2821 int end)
2822{
2823 int result;
2824
2825 str = PyUnicode_FromObject(str);
2826 if (str == NULL)
2827 return -1;
2828 substr = PyUnicode_FromObject(substr);
2829 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002830 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831 return -1;
2832 }
2833
2834 result = count((PyUnicodeObject *)str,
2835 start, end,
2836 (PyUnicodeObject *)substr);
2837
2838 Py_DECREF(str);
2839 Py_DECREF(substr);
2840 return result;
2841}
2842
2843static
2844int findstring(PyUnicodeObject *self,
2845 PyUnicodeObject *substring,
2846 int start,
2847 int end,
2848 int direction)
2849{
2850 if (start < 0)
2851 start += self->length;
2852 if (start < 0)
2853 start = 0;
2854
2855 if (substring->length == 0)
2856 return start;
2857
2858 if (end > self->length)
2859 end = self->length;
2860 if (end < 0)
2861 end += self->length;
2862 if (end < 0)
2863 end = 0;
2864
2865 end -= substring->length;
2866
2867 if (direction < 0) {
2868 for (; end >= start; end--)
2869 if (Py_UNICODE_MATCH(self, end, substring))
2870 return end;
2871 } else {
2872 for (; start <= end; start++)
2873 if (Py_UNICODE_MATCH(self, start, substring))
2874 return start;
2875 }
2876
2877 return -1;
2878}
2879
2880int PyUnicode_Find(PyObject *str,
2881 PyObject *substr,
2882 int start,
2883 int end,
2884 int direction)
2885{
2886 int result;
2887
2888 str = PyUnicode_FromObject(str);
2889 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00002890 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891 substr = PyUnicode_FromObject(substr);
2892 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00002893 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00002894 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895 }
2896
2897 result = findstring((PyUnicodeObject *)str,
2898 (PyUnicodeObject *)substr,
2899 start, end, direction);
2900 Py_DECREF(str);
2901 Py_DECREF(substr);
2902 return result;
2903}
2904
2905static
2906int tailmatch(PyUnicodeObject *self,
2907 PyUnicodeObject *substring,
2908 int start,
2909 int end,
2910 int direction)
2911{
2912 if (start < 0)
2913 start += self->length;
2914 if (start < 0)
2915 start = 0;
2916
2917 if (substring->length == 0)
2918 return 1;
2919
2920 if (end > self->length)
2921 end = self->length;
2922 if (end < 0)
2923 end += self->length;
2924 if (end < 0)
2925 end = 0;
2926
2927 end -= substring->length;
2928 if (end < start)
2929 return 0;
2930
2931 if (direction > 0) {
2932 if (Py_UNICODE_MATCH(self, end, substring))
2933 return 1;
2934 } else {
2935 if (Py_UNICODE_MATCH(self, start, substring))
2936 return 1;
2937 }
2938
2939 return 0;
2940}
2941
2942int PyUnicode_Tailmatch(PyObject *str,
2943 PyObject *substr,
2944 int start,
2945 int end,
2946 int direction)
2947{
2948 int result;
2949
2950 str = PyUnicode_FromObject(str);
2951 if (str == NULL)
2952 return -1;
2953 substr = PyUnicode_FromObject(substr);
2954 if (substr == NULL) {
2955 Py_DECREF(substr);
2956 return -1;
2957 }
2958
2959 result = tailmatch((PyUnicodeObject *)str,
2960 (PyUnicodeObject *)substr,
2961 start, end, direction);
2962 Py_DECREF(str);
2963 Py_DECREF(substr);
2964 return result;
2965}
2966
2967static
2968const Py_UNICODE *findchar(const Py_UNICODE *s,
2969 int size,
2970 Py_UNICODE ch)
2971{
2972 /* like wcschr, but doesn't stop at NULL characters */
2973
2974 while (size-- > 0) {
2975 if (*s == ch)
2976 return s;
2977 s++;
2978 }
2979
2980 return NULL;
2981}
2982
2983/* Apply fixfct filter to the Unicode object self and return a
2984 reference to the modified object */
2985
2986static
2987PyObject *fixup(PyUnicodeObject *self,
2988 int (*fixfct)(PyUnicodeObject *s))
2989{
2990
2991 PyUnicodeObject *u;
2992
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002993 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002994 if (u == NULL)
2995 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002996
2997 Py_UNICODE_COPY(u->str, self->str, self->length);
2998
Tim Peters7a29bd52001-09-12 03:03:31 +00002999 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000 /* fixfct should return TRUE if it modified the buffer. If
3001 FALSE, return a reference to the original buffer instead
3002 (to save space, not time) */
3003 Py_INCREF(self);
3004 Py_DECREF(u);
3005 return (PyObject*) self;
3006 }
3007 return (PyObject*) u;
3008}
3009
3010static
3011int fixupper(PyUnicodeObject *self)
3012{
3013 int len = self->length;
3014 Py_UNICODE *s = self->str;
3015 int status = 0;
3016
3017 while (len-- > 0) {
3018 register Py_UNICODE ch;
3019
3020 ch = Py_UNICODE_TOUPPER(*s);
3021 if (ch != *s) {
3022 status = 1;
3023 *s = ch;
3024 }
3025 s++;
3026 }
3027
3028 return status;
3029}
3030
3031static
3032int fixlower(PyUnicodeObject *self)
3033{
3034 int len = self->length;
3035 Py_UNICODE *s = self->str;
3036 int status = 0;
3037
3038 while (len-- > 0) {
3039 register Py_UNICODE ch;
3040
3041 ch = Py_UNICODE_TOLOWER(*s);
3042 if (ch != *s) {
3043 status = 1;
3044 *s = ch;
3045 }
3046 s++;
3047 }
3048
3049 return status;
3050}
3051
3052static
3053int fixswapcase(PyUnicodeObject *self)
3054{
3055 int len = self->length;
3056 Py_UNICODE *s = self->str;
3057 int status = 0;
3058
3059 while (len-- > 0) {
3060 if (Py_UNICODE_ISUPPER(*s)) {
3061 *s = Py_UNICODE_TOLOWER(*s);
3062 status = 1;
3063 } else if (Py_UNICODE_ISLOWER(*s)) {
3064 *s = Py_UNICODE_TOUPPER(*s);
3065 status = 1;
3066 }
3067 s++;
3068 }
3069
3070 return status;
3071}
3072
3073static
3074int fixcapitalize(PyUnicodeObject *self)
3075{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003076 int len = self->length;
3077 Py_UNICODE *s = self->str;
3078 int status = 0;
3079
3080 if (len == 0)
3081 return 0;
3082 if (Py_UNICODE_ISLOWER(*s)) {
3083 *s = Py_UNICODE_TOUPPER(*s);
3084 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003086 s++;
3087 while (--len > 0) {
3088 if (Py_UNICODE_ISUPPER(*s)) {
3089 *s = Py_UNICODE_TOLOWER(*s);
3090 status = 1;
3091 }
3092 s++;
3093 }
3094 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095}
3096
3097static
3098int fixtitle(PyUnicodeObject *self)
3099{
3100 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3101 register Py_UNICODE *e;
3102 int previous_is_cased;
3103
3104 /* Shortcut for single character strings */
3105 if (PyUnicode_GET_SIZE(self) == 1) {
3106 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3107 if (*p != ch) {
3108 *p = ch;
3109 return 1;
3110 }
3111 else
3112 return 0;
3113 }
3114
3115 e = p + PyUnicode_GET_SIZE(self);
3116 previous_is_cased = 0;
3117 for (; p < e; p++) {
3118 register const Py_UNICODE ch = *p;
3119
3120 if (previous_is_cased)
3121 *p = Py_UNICODE_TOLOWER(ch);
3122 else
3123 *p = Py_UNICODE_TOTITLE(ch);
3124
3125 if (Py_UNICODE_ISLOWER(ch) ||
3126 Py_UNICODE_ISUPPER(ch) ||
3127 Py_UNICODE_ISTITLE(ch))
3128 previous_is_cased = 1;
3129 else
3130 previous_is_cased = 0;
3131 }
3132 return 1;
3133}
3134
3135PyObject *PyUnicode_Join(PyObject *separator,
3136 PyObject *seq)
3137{
3138 Py_UNICODE *sep;
3139 int seplen;
3140 PyUnicodeObject *res = NULL;
3141 int reslen = 0;
3142 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143 int sz = 100;
3144 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003145 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146
Tim Peters2cfe3682001-05-05 05:36:48 +00003147 it = PyObject_GetIter(seq);
3148 if (it == NULL)
3149 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003150
3151 if (separator == NULL) {
3152 Py_UNICODE blank = ' ';
3153 sep = &blank;
3154 seplen = 1;
3155 }
3156 else {
3157 separator = PyUnicode_FromObject(separator);
3158 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003159 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 sep = PyUnicode_AS_UNICODE(separator);
3161 seplen = PyUnicode_GET_SIZE(separator);
3162 }
3163
3164 res = _PyUnicode_New(sz);
3165 if (res == NULL)
3166 goto onError;
3167 p = PyUnicode_AS_UNICODE(res);
3168 reslen = 0;
3169
Tim Peters2cfe3682001-05-05 05:36:48 +00003170 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003172 PyObject *item = PyIter_Next(it);
3173 if (item == NULL) {
3174 if (PyErr_Occurred())
3175 goto onError;
3176 break;
3177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003178 if (!PyUnicode_Check(item)) {
3179 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003180 if (!PyString_Check(item)) {
3181 PyErr_Format(PyExc_TypeError,
3182 "sequence item %i: expected string or Unicode,"
3183 " %.80s found",
3184 i, item->ob_type->tp_name);
3185 Py_DECREF(item);
3186 goto onError;
3187 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003188 v = PyUnicode_FromObject(item);
3189 Py_DECREF(item);
3190 item = v;
3191 if (item == NULL)
3192 goto onError;
3193 }
3194 itemlen = PyUnicode_GET_SIZE(item);
3195 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003196 if (_PyUnicode_Resize(&res, sz*2)) {
3197 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200 sz *= 2;
3201 p = PyUnicode_AS_UNICODE(res) + reslen;
3202 }
3203 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003204 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205 p += seplen;
3206 reslen += seplen;
3207 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003208 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 p += itemlen;
3210 reslen += itemlen;
3211 Py_DECREF(item);
3212 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003213 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 goto onError;
3215
3216 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003217 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 return (PyObject *)res;
3219
3220 onError:
3221 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003222 Py_XDECREF(res);
3223 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224 return NULL;
3225}
3226
3227static
3228PyUnicodeObject *pad(PyUnicodeObject *self,
3229 int left,
3230 int right,
3231 Py_UNICODE fill)
3232{
3233 PyUnicodeObject *u;
3234
3235 if (left < 0)
3236 left = 0;
3237 if (right < 0)
3238 right = 0;
3239
Tim Peters7a29bd52001-09-12 03:03:31 +00003240 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241 Py_INCREF(self);
3242 return self;
3243 }
3244
3245 u = _PyUnicode_New(left + self->length + right);
3246 if (u) {
3247 if (left)
3248 Py_UNICODE_FILL(u->str, fill, left);
3249 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3250 if (right)
3251 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3252 }
3253
3254 return u;
3255}
3256
/* Append data[left:right] to the result list as a new Unicode object.

   The macro relies on names in the calling function: a PyObject *str
   scratch variable, the PyObject *list being filled, and an onError
   label that is jumped to if the object cannot be created or
   appended.  It expands to a single statement, so it is safe under an
   unbraced if/else, and its arguments are parenthesized so that
   expressions can be passed. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* The list holds its own reference now */                      \
        Py_DECREF(str);                                                 \
    } while (0)
3267
3268static
3269PyObject *split_whitespace(PyUnicodeObject *self,
3270 PyObject *list,
3271 int maxcount)
3272{
3273 register int i;
3274 register int j;
3275 int len = self->length;
3276 PyObject *str;
3277
3278 for (i = j = 0; i < len; ) {
3279 /* find a token */
3280 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3281 i++;
3282 j = i;
3283 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3284 i++;
3285 if (j < i) {
3286 if (maxcount-- <= 0)
3287 break;
3288 SPLIT_APPEND(self->str, j, i);
3289 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3290 i++;
3291 j = i;
3292 }
3293 }
3294 if (j < len) {
3295 SPLIT_APPEND(self->str, j, len);
3296 }
3297 return list;
3298
3299 onError:
3300 Py_DECREF(list);
3301 return NULL;
3302}
3303
3304PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003305 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306{
3307 register int i;
3308 register int j;
3309 int len;
3310 PyObject *list;
3311 PyObject *str;
3312 Py_UNICODE *data;
3313
3314 string = PyUnicode_FromObject(string);
3315 if (string == NULL)
3316 return NULL;
3317 data = PyUnicode_AS_UNICODE(string);
3318 len = PyUnicode_GET_SIZE(string);
3319
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320 list = PyList_New(0);
3321 if (!list)
3322 goto onError;
3323
3324 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003325 int eol;
3326
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327 /* Find a line and append it */
3328 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3329 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330
3331 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003332 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003333 if (i < len) {
3334 if (data[i] == '\r' && i + 1 < len &&
3335 data[i+1] == '\n')
3336 i += 2;
3337 else
3338 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003339 if (keepends)
3340 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341 }
Guido van Rossum86662912000-04-11 15:38:46 +00003342 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343 j = i;
3344 }
3345 if (j < len) {
3346 SPLIT_APPEND(data, j, len);
3347 }
3348
3349 Py_DECREF(string);
3350 return list;
3351
3352 onError:
3353 Py_DECREF(list);
3354 Py_DECREF(string);
3355 return NULL;
3356}
3357
3358static
3359PyObject *split_char(PyUnicodeObject *self,
3360 PyObject *list,
3361 Py_UNICODE ch,
3362 int maxcount)
3363{
3364 register int i;
3365 register int j;
3366 int len = self->length;
3367 PyObject *str;
3368
3369 for (i = j = 0; i < len; ) {
3370 if (self->str[i] == ch) {
3371 if (maxcount-- <= 0)
3372 break;
3373 SPLIT_APPEND(self->str, j, i);
3374 i = j = i + 1;
3375 } else
3376 i++;
3377 }
3378 if (j <= len) {
3379 SPLIT_APPEND(self->str, j, len);
3380 }
3381 return list;
3382
3383 onError:
3384 Py_DECREF(list);
3385 return NULL;
3386}
3387
3388static
3389PyObject *split_substring(PyUnicodeObject *self,
3390 PyObject *list,
3391 PyUnicodeObject *substring,
3392 int maxcount)
3393{
3394 register int i;
3395 register int j;
3396 int len = self->length;
3397 int sublen = substring->length;
3398 PyObject *str;
3399
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003400 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003401 if (Py_UNICODE_MATCH(self, i, substring)) {
3402 if (maxcount-- <= 0)
3403 break;
3404 SPLIT_APPEND(self->str, j, i);
3405 i = j = i + sublen;
3406 } else
3407 i++;
3408 }
3409 if (j <= len) {
3410 SPLIT_APPEND(self->str, j, len);
3411 }
3412 return list;
3413
3414 onError:
3415 Py_DECREF(list);
3416 return NULL;
3417}
3418
3419#undef SPLIT_APPEND
3420
3421static
3422PyObject *split(PyUnicodeObject *self,
3423 PyUnicodeObject *substring,
3424 int maxcount)
3425{
3426 PyObject *list;
3427
3428 if (maxcount < 0)
3429 maxcount = INT_MAX;
3430
3431 list = PyList_New(0);
3432 if (!list)
3433 return NULL;
3434
3435 if (substring == NULL)
3436 return split_whitespace(self,list,maxcount);
3437
3438 else if (substring->length == 1)
3439 return split_char(self,list,substring->str[0],maxcount);
3440
3441 else if (substring->length == 0) {
3442 Py_DECREF(list);
3443 PyErr_SetString(PyExc_ValueError, "empty separator");
3444 return NULL;
3445 }
3446 else
3447 return split_substring(self,list,substring,maxcount);
3448}
3449
3450static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003451PyObject *replace(PyUnicodeObject *self,
3452 PyUnicodeObject *str1,
3453 PyUnicodeObject *str2,
3454 int maxcount)
3455{
3456 PyUnicodeObject *u;
3457
3458 if (maxcount < 0)
3459 maxcount = INT_MAX;
3460
3461 if (str1->length == 1 && str2->length == 1) {
3462 int i;
3463
3464 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003465 if (!findchar(self->str, self->length, str1->str[0]) &&
3466 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 /* nothing to replace, return original string */
3468 Py_INCREF(self);
3469 u = self;
3470 } else {
3471 Py_UNICODE u1 = str1->str[0];
3472 Py_UNICODE u2 = str2->str[0];
3473
3474 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003475 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003476 self->length
3477 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003478 if (u != NULL) {
3479 Py_UNICODE_COPY(u->str, self->str,
3480 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003481 for (i = 0; i < u->length; i++)
3482 if (u->str[i] == u1) {
3483 if (--maxcount < 0)
3484 break;
3485 u->str[i] = u2;
3486 }
3487 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003488 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003489
3490 } else {
3491 int n, i;
3492 Py_UNICODE *p;
3493
3494 /* replace strings */
3495 n = count(self, 0, self->length, str1);
3496 if (n > maxcount)
3497 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003498 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499 /* nothing to replace, return original string */
3500 Py_INCREF(self);
3501 u = self;
3502 } else {
3503 u = _PyUnicode_New(
3504 self->length + n * (str2->length - str1->length));
3505 if (u) {
3506 i = 0;
3507 p = u->str;
3508 while (i <= self->length - str1->length)
3509 if (Py_UNICODE_MATCH(self, i, str1)) {
3510 /* replace string segment */
3511 Py_UNICODE_COPY(p, str2->str, str2->length);
3512 p += str2->length;
3513 i += str1->length;
3514 if (--n <= 0) {
3515 /* copy remaining part */
3516 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3517 break;
3518 }
3519 } else
3520 *p++ = self->str[i++];
3521 }
3522 }
3523 }
3524
3525 return (PyObject *) u;
3526}
3527
3528/* --- Unicode Object Methods --------------------------------------------- */
3529
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003530PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531"S.title() -> unicode\n\
3532\n\
3533Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003534characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003535
3536static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003537unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003538{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003539 return fixup(self, fixtitle);
3540}
3541
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003542PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543"S.capitalize() -> unicode\n\
3544\n\
3545Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003546have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003547
3548static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003549unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003550{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551 return fixup(self, fixcapitalize);
3552}
3553
#if 0
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* S.capwords() (compiled out): split self at whitespace, capitalize
   each word with the fixcapitalize filter and join the words with the
   default separator of PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    int i;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
3591
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003592PyDoc_STRVAR(center__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593"S.center(width) -> unicode\n\
3594\n\
3595Return S centered in a Unicode string of length width. Padding is done\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003596using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597
3598static PyObject *
3599unicode_center(PyUnicodeObject *self, PyObject *args)
3600{
3601 int marg, left;
3602 int width;
3603
3604 if (!PyArg_ParseTuple(args, "i:center", &width))
3605 return NULL;
3606
Tim Peters7a29bd52001-09-12 03:03:31 +00003607 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608 Py_INCREF(self);
3609 return (PyObject*) self;
3610 }
3611
3612 marg = width - self->length;
3613 left = marg / 2 + (marg & width & 1);
3614
3615 return (PyObject*) pad(self, left, marg - left, ' ');
3616}
3617
Marc-André Lemburge5034372000-08-08 08:04:29 +00003618#if 0
3619
3620/* This code should go into some future Unicode collation support
3621 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003622 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003623
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003624/* speedy UTF-16 code point order comparison */
3625/* gleaned from: */
3626/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3627
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003628static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003629{
3630 0, 0, 0, 0, 0, 0, 0, 0,
3631 0, 0, 0, 0, 0, 0, 0, 0,
3632 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003633 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003634};
3635
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636static int
3637unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3638{
3639 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003640
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641 Py_UNICODE *s1 = str1->str;
3642 Py_UNICODE *s2 = str2->str;
3643
3644 len1 = str1->length;
3645 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003646
Guido van Rossumd57fd912000-03-10 22:53:23 +00003647 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003648 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003649
3650 c1 = *s1++;
3651 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003652
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003653 if (c1 > (1<<11) * 26)
3654 c1 += utf16Fixup[c1>>11];
3655 if (c2 > (1<<11) * 26)
3656 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003657 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003658
3659 if (c1 != c2)
3660 return (c1 < c2) ? -1 : 1;
3661
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003662 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663 }
3664
3665 return (len1 < len2) ? -1 : (len1 != len2);
3666}
3667
Marc-André Lemburge5034372000-08-08 08:04:29 +00003668#else
3669
3670static int
3671unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3672{
3673 register int len1, len2;
3674
3675 Py_UNICODE *s1 = str1->str;
3676 Py_UNICODE *s2 = str2->str;
3677
3678 len1 = str1->length;
3679 len2 = str2->length;
3680
3681 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003682 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003683
Fredrik Lundh45714e92001-06-26 16:39:36 +00003684 c1 = *s1++;
3685 c2 = *s2++;
3686
3687 if (c1 != c2)
3688 return (c1 < c2) ? -1 : 1;
3689
Marc-André Lemburge5034372000-08-08 08:04:29 +00003690 len1--; len2--;
3691 }
3692
3693 return (len1 < len2) ? -1 : (len1 != len2);
3694}
3695
3696#endif
3697
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698int PyUnicode_Compare(PyObject *left,
3699 PyObject *right)
3700{
3701 PyUnicodeObject *u = NULL, *v = NULL;
3702 int result;
3703
3704 /* Coerce the two arguments */
3705 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3706 if (u == NULL)
3707 goto onError;
3708 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3709 if (v == NULL)
3710 goto onError;
3711
Thomas Wouters7e474022000-07-16 12:04:32 +00003712 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 if (v == u) {
3714 Py_DECREF(u);
3715 Py_DECREF(v);
3716 return 0;
3717 }
3718
3719 result = unicode_compare(u, v);
3720
3721 Py_DECREF(u);
3722 Py_DECREF(v);
3723 return result;
3724
3725onError:
3726 Py_XDECREF(u);
3727 Py_XDECREF(v);
3728 return -1;
3729}
3730
Guido van Rossum403d68b2000-03-13 15:55:09 +00003731int PyUnicode_Contains(PyObject *container,
3732 PyObject *element)
3733{
3734 PyUnicodeObject *u = NULL, *v = NULL;
3735 int result;
3736 register const Py_UNICODE *p, *e;
3737 register Py_UNICODE ch;
3738
3739 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003740 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003741 if (v == NULL) {
3742 PyErr_SetString(PyExc_TypeError,
3743 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003744 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003745 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003746 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3747 if (u == NULL) {
3748 Py_DECREF(v);
3749 goto onError;
3750 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003751
3752 /* Check v in u */
3753 if (PyUnicode_GET_SIZE(v) != 1) {
3754 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003755 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003756 goto onError;
3757 }
3758 ch = *PyUnicode_AS_UNICODE(v);
3759 p = PyUnicode_AS_UNICODE(u);
3760 e = p + PyUnicode_GET_SIZE(u);
3761 result = 0;
3762 while (p < e) {
3763 if (*p++ == ch) {
3764 result = 1;
3765 break;
3766 }
3767 }
3768
3769 Py_DECREF(u);
3770 Py_DECREF(v);
3771 return result;
3772
3773onError:
3774 Py_XDECREF(u);
3775 Py_XDECREF(v);
3776 return -1;
3777}
3778
Guido van Rossumd57fd912000-03-10 22:53:23 +00003779/* Concat to string or Unicode object giving a new Unicode object. */
3780
3781PyObject *PyUnicode_Concat(PyObject *left,
3782 PyObject *right)
3783{
3784 PyUnicodeObject *u = NULL, *v = NULL, *w;
3785
3786 /* Coerce the two arguments */
3787 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3788 if (u == NULL)
3789 goto onError;
3790 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3791 if (v == NULL)
3792 goto onError;
3793
3794 /* Shortcuts */
3795 if (v == unicode_empty) {
3796 Py_DECREF(v);
3797 return (PyObject *)u;
3798 }
3799 if (u == unicode_empty) {
3800 Py_DECREF(u);
3801 return (PyObject *)v;
3802 }
3803
3804 /* Concat the two Unicode strings */
3805 w = _PyUnicode_New(u->length + v->length);
3806 if (w == NULL)
3807 goto onError;
3808 Py_UNICODE_COPY(w->str, u->str, u->length);
3809 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3810
3811 Py_DECREF(u);
3812 Py_DECREF(v);
3813 return (PyObject *)w;
3814
3815onError:
3816 Py_XDECREF(u);
3817 Py_XDECREF(v);
3818 return NULL;
3819}
3820
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003821PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003822"S.count(sub[, start[, end]]) -> int\n\
3823\n\
3824Return the number of occurrences of substring sub in Unicode string\n\
3825S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003826interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003827
3828static PyObject *
3829unicode_count(PyUnicodeObject *self, PyObject *args)
3830{
3831 PyUnicodeObject *substring;
3832 int start = 0;
3833 int end = INT_MAX;
3834 PyObject *result;
3835
Guido van Rossumb8872e62000-05-09 14:14:27 +00003836 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3837 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003838 return NULL;
3839
3840 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3841 (PyObject *)substring);
3842 if (substring == NULL)
3843 return NULL;
3844
Guido van Rossumd57fd912000-03-10 22:53:23 +00003845 if (start < 0)
3846 start += self->length;
3847 if (start < 0)
3848 start = 0;
3849 if (end > self->length)
3850 end = self->length;
3851 if (end < 0)
3852 end += self->length;
3853 if (end < 0)
3854 end = 0;
3855
3856 result = PyInt_FromLong((long) count(self, start, end, substring));
3857
3858 Py_DECREF(substring);
3859 return result;
3860}
3861
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003862PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003863"S.encode([encoding[,errors]]) -> string\n\
3864\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003865Return an encoded string version of S. Default encoding is the current\n\
3866default string encoding. errors may be given to set a different error\n\
3867handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003868a ValueError. Other possible values are 'ignore' and 'replace'.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869
3870static PyObject *
3871unicode_encode(PyUnicodeObject *self, PyObject *args)
3872{
3873 char *encoding = NULL;
3874 char *errors = NULL;
3875 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3876 return NULL;
3877 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3878}
3879
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003880PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003881"S.expandtabs([tabsize]) -> unicode\n\
3882\n\
3883Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003884If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003885
3886static PyObject*
3887unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3888{
3889 Py_UNICODE *e;
3890 Py_UNICODE *p;
3891 Py_UNICODE *q;
3892 int i, j;
3893 PyUnicodeObject *u;
3894 int tabsize = 8;
3895
3896 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3897 return NULL;
3898
Thomas Wouters7e474022000-07-16 12:04:32 +00003899 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003900 i = j = 0;
3901 e = self->str + self->length;
3902 for (p = self->str; p < e; p++)
3903 if (*p == '\t') {
3904 if (tabsize > 0)
3905 j += tabsize - (j % tabsize);
3906 }
3907 else {
3908 j++;
3909 if (*p == '\n' || *p == '\r') {
3910 i += j;
3911 j = 0;
3912 }
3913 }
3914
3915 /* Second pass: create output string and fill it */
3916 u = _PyUnicode_New(i + j);
3917 if (!u)
3918 return NULL;
3919
3920 j = 0;
3921 q = u->str;
3922
3923 for (p = self->str; p < e; p++)
3924 if (*p == '\t') {
3925 if (tabsize > 0) {
3926 i = tabsize - (j % tabsize);
3927 j += i;
3928 while (i--)
3929 *q++ = ' ';
3930 }
3931 }
3932 else {
3933 j++;
3934 *q++ = *p;
3935 if (*p == '\n' || *p == '\r')
3936 j = 0;
3937 }
3938
3939 return (PyObject*) u;
3940}
3941
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003942PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943"S.find(sub [,start [,end]]) -> int\n\
3944\n\
3945Return the lowest index in S where substring sub is found,\n\
3946such that sub is contained within s[start,end]. Optional\n\
3947arguments start and end are interpreted as in slice notation.\n\
3948\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003949Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950
3951static PyObject *
3952unicode_find(PyUnicodeObject *self, PyObject *args)
3953{
3954 PyUnicodeObject *substring;
3955 int start = 0;
3956 int end = INT_MAX;
3957 PyObject *result;
3958
Guido van Rossumb8872e62000-05-09 14:14:27 +00003959 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3960 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003961 return NULL;
3962 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3963 (PyObject *)substring);
3964 if (substring == NULL)
3965 return NULL;
3966
3967 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3968
3969 Py_DECREF(substring);
3970 return result;
3971}
3972
3973static PyObject *
3974unicode_getitem(PyUnicodeObject *self, int index)
3975{
3976 if (index < 0 || index >= self->length) {
3977 PyErr_SetString(PyExc_IndexError, "string index out of range");
3978 return NULL;
3979 }
3980
3981 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3982}
3983
3984static long
3985unicode_hash(PyUnicodeObject *self)
3986{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003987 /* Since Unicode objects compare equal to their ASCII string
3988 counterparts, they should use the individual character values
3989 as basis for their hash value. This is needed to assure that
3990 strings and Unicode objects behave in the same way as
3991 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003992
Fredrik Lundhdde61642000-07-10 18:27:47 +00003993 register int len;
3994 register Py_UNICODE *p;
3995 register long x;
3996
Guido van Rossumd57fd912000-03-10 22:53:23 +00003997 if (self->hash != -1)
3998 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003999 len = PyUnicode_GET_SIZE(self);
4000 p = PyUnicode_AS_UNICODE(self);
4001 x = *p << 7;
4002 while (--len >= 0)
4003 x = (1000003*x) ^ *p++;
4004 x ^= PyUnicode_GET_SIZE(self);
4005 if (x == -1)
4006 x = -2;
4007 self->hash = x;
4008 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004009}
4010
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004011PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004012"S.index(sub [,start [,end]]) -> int\n\
4013\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004014Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015
4016static PyObject *
4017unicode_index(PyUnicodeObject *self, PyObject *args)
4018{
4019 int result;
4020 PyUnicodeObject *substring;
4021 int start = 0;
4022 int end = INT_MAX;
4023
Guido van Rossumb8872e62000-05-09 14:14:27 +00004024 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4025 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026 return NULL;
4027
4028 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4029 (PyObject *)substring);
4030 if (substring == NULL)
4031 return NULL;
4032
4033 result = findstring(self, substring, start, end, 1);
4034
4035 Py_DECREF(substring);
4036 if (result < 0) {
4037 PyErr_SetString(PyExc_ValueError, "substring not found");
4038 return NULL;
4039 }
4040 return PyInt_FromLong(result);
4041}
4042
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004043PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004044"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004045\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004046Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004047at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048
4049static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004050unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051{
4052 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4053 register const Py_UNICODE *e;
4054 int cased;
4055
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056 /* Shortcut for single character strings */
4057 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004058 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004060 /* Special case for empty strings */
4061 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004062 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004063
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064 e = p + PyUnicode_GET_SIZE(self);
4065 cased = 0;
4066 for (; p < e; p++) {
4067 register const Py_UNICODE ch = *p;
4068
4069 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004070 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071 else if (!cased && Py_UNICODE_ISLOWER(ch))
4072 cased = 1;
4073 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004074 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075}
4076
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004077PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004078"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004079\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004080Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004081at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082
4083static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004084unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085{
4086 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4087 register const Py_UNICODE *e;
4088 int cased;
4089
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090 /* Shortcut for single character strings */
4091 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004092 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004093
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004094 /* Special case for empty strings */
4095 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004096 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004097
Guido van Rossumd57fd912000-03-10 22:53:23 +00004098 e = p + PyUnicode_GET_SIZE(self);
4099 cased = 0;
4100 for (; p < e; p++) {
4101 register const Py_UNICODE ch = *p;
4102
4103 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004104 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105 else if (!cased && Py_UNICODE_ISUPPER(ch))
4106 cased = 1;
4107 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004108 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004109}
4110
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004111PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004112"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004114Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4115characters may only follow uncased characters and lowercase characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004116only cased ones. Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117
4118static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004119unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120{
4121 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4122 register const Py_UNICODE *e;
4123 int cased, previous_is_cased;
4124
Guido van Rossumd57fd912000-03-10 22:53:23 +00004125 /* Shortcut for single character strings */
4126 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004127 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4128 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004130 /* Special case for empty strings */
4131 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004132 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004133
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134 e = p + PyUnicode_GET_SIZE(self);
4135 cased = 0;
4136 previous_is_cased = 0;
4137 for (; p < e; p++) {
4138 register const Py_UNICODE ch = *p;
4139
4140 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4141 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004142 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143 previous_is_cased = 1;
4144 cased = 1;
4145 }
4146 else if (Py_UNICODE_ISLOWER(ch)) {
4147 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004148 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004149 previous_is_cased = 1;
4150 cased = 1;
4151 }
4152 else
4153 previous_is_cased = 0;
4154 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004155 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004156}
4157
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004158PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004159"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004160\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004161Return True if there are only whitespace characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004162False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004163
4164static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004165unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166{
4167 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4168 register const Py_UNICODE *e;
4169
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170 /* Shortcut for single character strings */
4171 if (PyUnicode_GET_SIZE(self) == 1 &&
4172 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004173 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004174
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004175 /* Special case for empty strings */
4176 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004177 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004178
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179 e = p + PyUnicode_GET_SIZE(self);
4180 for (; p < e; p++) {
4181 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004182 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004183 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004184 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185}
4186
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004187PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004188"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004189\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004190Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004191and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004192
4193static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004194unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004195{
4196 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4197 register const Py_UNICODE *e;
4198
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004199 /* Shortcut for single character strings */
4200 if (PyUnicode_GET_SIZE(self) == 1 &&
4201 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004202 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004203
4204 /* Special case for empty strings */
4205 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004206 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004207
4208 e = p + PyUnicode_GET_SIZE(self);
4209 for (; p < e; p++) {
4210 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004211 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004212 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004213 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004214}
4215
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004216PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004217"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004218\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004219Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004220and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004221
4222static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004223unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004224{
4225 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4226 register const Py_UNICODE *e;
4227
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004228 /* Shortcut for single character strings */
4229 if (PyUnicode_GET_SIZE(self) == 1 &&
4230 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004231 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004232
4233 /* Special case for empty strings */
4234 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004235 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004236
4237 e = p + PyUnicode_GET_SIZE(self);
4238 for (; p < e; p++) {
4239 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004240 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004241 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004242 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004243}
4244
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004245PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004246"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004247\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004248Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004249False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004250
4251static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004252unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004253{
4254 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4255 register const Py_UNICODE *e;
4256
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257 /* Shortcut for single character strings */
4258 if (PyUnicode_GET_SIZE(self) == 1 &&
4259 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004260 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004261
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004262 /* Special case for empty strings */
4263 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004264 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004265
Guido van Rossumd57fd912000-03-10 22:53:23 +00004266 e = p + PyUnicode_GET_SIZE(self);
4267 for (; p < e; p++) {
4268 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004269 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004270 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004271 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004272}
4273
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004274PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004275"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004277Return True if there are only digit characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004278False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279
4280static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004281unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004282{
4283 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4284 register const Py_UNICODE *e;
4285
Guido van Rossumd57fd912000-03-10 22:53:23 +00004286 /* Shortcut for single character strings */
4287 if (PyUnicode_GET_SIZE(self) == 1 &&
4288 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004289 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004290
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004291 /* Special case for empty strings */
4292 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004293 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004294
Guido van Rossumd57fd912000-03-10 22:53:23 +00004295 e = p + PyUnicode_GET_SIZE(self);
4296 for (; p < e; p++) {
4297 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004298 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004300 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004301}
4302
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004303PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004304"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004305\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004306Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004307False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004308
4309static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004310unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311{
4312 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4313 register const Py_UNICODE *e;
4314
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315 /* Shortcut for single character strings */
4316 if (PyUnicode_GET_SIZE(self) == 1 &&
4317 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004318 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004319
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004320 /* Special case for empty strings */
4321 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004322 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004323
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324 e = p + PyUnicode_GET_SIZE(self);
4325 for (; p < e; p++) {
4326 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004327 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004329 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330}
4331
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004332PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004333"S.join(sequence) -> unicode\n\
4334\n\
4335Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004336sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004337
/* Method wrapper for S.join(sequence): passes self and the single
   argument straight through to PyUnicode_Join() and returns whatever
   that call returns. */
static PyObject*
unicode_join(PyObject *self, PyObject *data)
{
    return PyUnicode_Join(self, data);
}
4343
/* Return the value stored in the object's length field. */
static int
unicode_length(PyUnicodeObject *self)
{
    return self->length;
}
4349
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004350PyDoc_STRVAR(ljust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351"S.ljust(width) -> unicode\n\
4352\n\
4353Return S left justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004354done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004355
4356static PyObject *
4357unicode_ljust(PyUnicodeObject *self, PyObject *args)
4358{
4359 int width;
4360 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4361 return NULL;
4362
Tim Peters7a29bd52001-09-12 03:03:31 +00004363 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364 Py_INCREF(self);
4365 return (PyObject*) self;
4366 }
4367
4368 return (PyObject*) pad(self, 0, width - self->length, ' ');
4369}
4370
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004371PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004372"S.lower() -> unicode\n\
4373\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004374Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004375
/* Method wrapper for S.lower(): delegates to fixup() with fixlower as
   the per-string fix function and returns its result. */
static PyObject*
unicode_lower(PyUnicodeObject *self)
{
    return fixup(self, fixlower);
}
4381
/* Strip-type selectors passed to the strip helpers; they say which
   end(s) of the string are trimmed and also index stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple format strings, one per strip type. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip type i: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
4390
4391static const Py_UNICODE *
4392unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
4393{
Tim Peters030a5ce2002-04-22 19:00:10 +00004394 size_t i;
4395 for (i = 0; i < n; ++i)
4396 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004397 return s+i;
4398 return NULL;
4399}
4400
4401/* externally visible for str.strip(unicode) */
4402PyObject *
4403_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
4404{
4405 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
4406 int len = PyUnicode_GET_SIZE(self);
4407 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
4408 int seplen = PyUnicode_GET_SIZE(sepobj);
4409 int i, j;
4410
4411 i = 0;
4412 if (striptype != RIGHTSTRIP) {
4413 while (i < len && unicode_memchr(sep, s[i], seplen)) {
4414 i++;
4415 }
4416 }
4417
4418 j = len;
4419 if (striptype != LEFTSTRIP) {
4420 do {
4421 j--;
4422 } while (j >= i && unicode_memchr(sep, s[j], seplen));
4423 j++;
4424 }
4425
4426 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
4427 Py_INCREF(self);
4428 return (PyObject*)self;
4429 }
4430 else
4431 return PyUnicode_FromUnicode(s+i, j-i);
4432}
4433
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434
4435static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004436do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004437{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004438 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
4439 int len = PyUnicode_GET_SIZE(self), i, j;
4440
4441 i = 0;
4442 if (striptype != RIGHTSTRIP) {
4443 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
4444 i++;
4445 }
4446 }
4447
4448 j = len;
4449 if (striptype != LEFTSTRIP) {
4450 do {
4451 j--;
4452 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
4453 j++;
4454 }
4455
4456 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
4457 Py_INCREF(self);
4458 return (PyObject*)self;
4459 }
4460 else
4461 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004462}
4463
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004464
4465static PyObject *
4466do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
4467{
4468 PyObject *sep = NULL;
4469
4470 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
4471 return NULL;
4472
4473 if (sep != NULL && sep != Py_None) {
4474 if (PyUnicode_Check(sep))
4475 return _PyUnicode_XStrip(self, striptype, sep);
4476 else if (PyString_Check(sep)) {
4477 PyObject *res;
4478 sep = PyUnicode_FromObject(sep);
4479 if (sep==NULL)
4480 return NULL;
4481 res = _PyUnicode_XStrip(self, striptype, sep);
4482 Py_DECREF(sep);
4483 return res;
4484 }
4485 else {
4486 PyErr_Format(PyExc_TypeError,
4487 "%s arg must be None, unicode or str",
4488 STRIPNAME(striptype));
4489 return NULL;
4490 }
4491 }
4492
4493 return do_strip(self, striptype);
4494}
4495
4496
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004497PyDoc_STRVAR(strip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004498"S.strip([sep]) -> unicode\n\
4499\n\
4500Return a copy of the string S with leading and trailing\n\
4501whitespace removed.\n\
4502If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004503If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004504
4505static PyObject *
4506unicode_strip(PyUnicodeObject *self, PyObject *args)
4507{
4508 if (PyTuple_GET_SIZE(args) == 0)
4509 return do_strip(self, BOTHSTRIP); /* Common case */
4510 else
4511 return do_argstrip(self, BOTHSTRIP, args);
4512}
4513
4514
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004515PyDoc_STRVAR(lstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004516"S.lstrip([sep]) -> unicode\n\
4517\n\
4518Return a copy of the string S with leading whitespace removed.\n\
4519If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004520If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004521
4522static PyObject *
4523unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4524{
4525 if (PyTuple_GET_SIZE(args) == 0)
4526 return do_strip(self, LEFTSTRIP); /* Common case */
4527 else
4528 return do_argstrip(self, LEFTSTRIP, args);
4529}
4530
4531
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004532PyDoc_STRVAR(rstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004533"S.rstrip([sep]) -> unicode\n\
4534\n\
4535Return a copy of the string S with trailing whitespace removed.\n\
4536If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004537If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004538
4539static PyObject *
4540unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4541{
4542 if (PyTuple_GET_SIZE(args) == 0)
4543 return do_strip(self, RIGHTSTRIP); /* Common case */
4544 else
4545 return do_argstrip(self, RIGHTSTRIP, args);
4546}
4547
4548
Guido van Rossumd57fd912000-03-10 22:53:23 +00004549static PyObject*
4550unicode_repeat(PyUnicodeObject *str, int len)
4551{
4552 PyUnicodeObject *u;
4553 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004554 int nchars;
4555 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004556
4557 if (len < 0)
4558 len = 0;
4559
Tim Peters7a29bd52001-09-12 03:03:31 +00004560 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561 /* no repeat, return original string */
4562 Py_INCREF(str);
4563 return (PyObject*) str;
4564 }
Tim Peters8f422462000-09-09 06:13:41 +00004565
4566 /* ensure # of chars needed doesn't overflow int and # of bytes
4567 * needed doesn't overflow size_t
4568 */
4569 nchars = len * str->length;
4570 if (len && nchars / len != str->length) {
4571 PyErr_SetString(PyExc_OverflowError,
4572 "repeated string is too long");
4573 return NULL;
4574 }
4575 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4576 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4577 PyErr_SetString(PyExc_OverflowError,
4578 "repeated string is too long");
4579 return NULL;
4580 }
4581 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004582 if (!u)
4583 return NULL;
4584
4585 p = u->str;
4586
4587 while (len-- > 0) {
4588 Py_UNICODE_COPY(p, str->str, str->length);
4589 p += str->length;
4590 }
4591
4592 return (PyObject*) u;
4593}
4594
4595PyObject *PyUnicode_Replace(PyObject *obj,
4596 PyObject *subobj,
4597 PyObject *replobj,
4598 int maxcount)
4599{
4600 PyObject *self;
4601 PyObject *str1;
4602 PyObject *str2;
4603 PyObject *result;
4604
4605 self = PyUnicode_FromObject(obj);
4606 if (self == NULL)
4607 return NULL;
4608 str1 = PyUnicode_FromObject(subobj);
4609 if (str1 == NULL) {
4610 Py_DECREF(self);
4611 return NULL;
4612 }
4613 str2 = PyUnicode_FromObject(replobj);
4614 if (str2 == NULL) {
4615 Py_DECREF(self);
4616 Py_DECREF(str1);
4617 return NULL;
4618 }
4619 result = replace((PyUnicodeObject *)self,
4620 (PyUnicodeObject *)str1,
4621 (PyUnicodeObject *)str2,
4622 maxcount);
4623 Py_DECREF(self);
4624 Py_DECREF(str1);
4625 Py_DECREF(str2);
4626 return result;
4627}
4628
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004629PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004630"S.replace (old, new[, maxsplit]) -> unicode\n\
4631\n\
4632Return a copy of S with all occurrences of substring\n\
4633old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004634given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004635
4636static PyObject*
4637unicode_replace(PyUnicodeObject *self, PyObject *args)
4638{
4639 PyUnicodeObject *str1;
4640 PyUnicodeObject *str2;
4641 int maxcount = -1;
4642 PyObject *result;
4643
4644 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4645 return NULL;
4646 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4647 if (str1 == NULL)
4648 return NULL;
4649 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4650 if (str2 == NULL)
4651 return NULL;
4652
4653 result = replace(self, str1, str2, maxcount);
4654
4655 Py_DECREF(str1);
4656 Py_DECREF(str2);
4657 return result;
4658}
4659
4660static
4661PyObject *unicode_repr(PyObject *unicode)
4662{
4663 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4664 PyUnicode_GET_SIZE(unicode),
4665 1);
4666}
4667
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004668PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004669"S.rfind(sub [,start [,end]]) -> int\n\
4670\n\
4671Return the highest index in S where substring sub is found,\n\
4672such that sub is contained within s[start,end]. Optional\n\
4673arguments start and end are interpreted as in slice notation.\n\
4674\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004675Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004676
4677static PyObject *
4678unicode_rfind(PyUnicodeObject *self, PyObject *args)
4679{
4680 PyUnicodeObject *substring;
4681 int start = 0;
4682 int end = INT_MAX;
4683 PyObject *result;
4684
Guido van Rossumb8872e62000-05-09 14:14:27 +00004685 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4686 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687 return NULL;
4688 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4689 (PyObject *)substring);
4690 if (substring == NULL)
4691 return NULL;
4692
4693 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4694
4695 Py_DECREF(substring);
4696 return result;
4697}
4698
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004699PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004700"S.rindex(sub [,start [,end]]) -> int\n\
4701\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004702Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004703
4704static PyObject *
4705unicode_rindex(PyUnicodeObject *self, PyObject *args)
4706{
4707 int result;
4708 PyUnicodeObject *substring;
4709 int start = 0;
4710 int end = INT_MAX;
4711
Guido van Rossumb8872e62000-05-09 14:14:27 +00004712 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4713 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004714 return NULL;
4715 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4716 (PyObject *)substring);
4717 if (substring == NULL)
4718 return NULL;
4719
4720 result = findstring(self, substring, start, end, -1);
4721
4722 Py_DECREF(substring);
4723 if (result < 0) {
4724 PyErr_SetString(PyExc_ValueError, "substring not found");
4725 return NULL;
4726 }
4727 return PyInt_FromLong(result);
4728}
4729
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004730PyDoc_STRVAR(rjust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731"S.rjust(width) -> unicode\n\
4732\n\
4733Return S right justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004734done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735
4736static PyObject *
4737unicode_rjust(PyUnicodeObject *self, PyObject *args)
4738{
4739 int width;
4740 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4741 return NULL;
4742
Tim Peters7a29bd52001-09-12 03:03:31 +00004743 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 Py_INCREF(self);
4745 return (PyObject*) self;
4746 }
4747
4748 return (PyObject*) pad(self, width - self->length, 0, ' ');
4749}
4750
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751static PyObject*
4752unicode_slice(PyUnicodeObject *self, int start, int end)
4753{
4754 /* standard clamping */
4755 if (start < 0)
4756 start = 0;
4757 if (end < 0)
4758 end = 0;
4759 if (end > self->length)
4760 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004761 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762 /* full slice, return original string */
4763 Py_INCREF(self);
4764 return (PyObject*) self;
4765 }
4766 if (start > end)
4767 start = end;
4768 /* copy slice */
4769 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4770 end - start);
4771}
4772
4773PyObject *PyUnicode_Split(PyObject *s,
4774 PyObject *sep,
4775 int maxsplit)
4776{
4777 PyObject *result;
4778
4779 s = PyUnicode_FromObject(s);
4780 if (s == NULL)
4781 return NULL;
4782 if (sep != NULL) {
4783 sep = PyUnicode_FromObject(sep);
4784 if (sep == NULL) {
4785 Py_DECREF(s);
4786 return NULL;
4787 }
4788 }
4789
4790 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4791
4792 Py_DECREF(s);
4793 Py_XDECREF(sep);
4794 return result;
4795}
4796
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004797PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798"S.split([sep [,maxsplit]]) -> list of strings\n\
4799\n\
4800Return a list of the words in S, using sep as the\n\
4801delimiter string. If maxsplit is given, at most maxsplit\n\
4802splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004803is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004804
4805static PyObject*
4806unicode_split(PyUnicodeObject *self, PyObject *args)
4807{
4808 PyObject *substring = Py_None;
4809 int maxcount = -1;
4810
4811 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4812 return NULL;
4813
4814 if (substring == Py_None)
4815 return split(self, NULL, maxcount);
4816 else if (PyUnicode_Check(substring))
4817 return split(self, (PyUnicodeObject *)substring, maxcount);
4818 else
4819 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4820}
4821
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004822PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00004823"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824\n\
4825Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004826Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004827is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004828
4829static PyObject*
4830unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4831{
Guido van Rossum86662912000-04-11 15:38:46 +00004832 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833
Guido van Rossum86662912000-04-11 15:38:46 +00004834 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835 return NULL;
4836
Guido van Rossum86662912000-04-11 15:38:46 +00004837 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004838}
4839
4840static
4841PyObject *unicode_str(PyUnicodeObject *self)
4842{
Fred Drakee4315f52000-05-09 19:53:39 +00004843 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844}
4845
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004846PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004847"S.swapcase() -> unicode\n\
4848\n\
4849Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004850and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851
4852static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004853unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004855 return fixup(self, fixswapcase);
4856}
4857
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004858PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859"S.translate(table) -> unicode\n\
4860\n\
4861Return a copy of the string S, where all characters have been mapped\n\
4862through the given translation table, which must be a mapping of\n\
4863Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004864are left untouched. Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865
4866static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004867unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869 return PyUnicode_TranslateCharmap(self->str,
4870 self->length,
4871 table,
4872 "ignore");
4873}
4874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004875PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004876"S.upper() -> unicode\n\
4877\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004878Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879
4880static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004881unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883 return fixup(self, fixupper);
4884}
4885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004886PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004887"S.zfill(width) -> unicode\n\
4888\n\
4889Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004890of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004891
4892static PyObject *
4893unicode_zfill(PyUnicodeObject *self, PyObject *args)
4894{
4895 int fill;
4896 PyUnicodeObject *u;
4897
4898 int width;
4899 if (!PyArg_ParseTuple(args, "i:zfill", &width))
4900 return NULL;
4901
4902 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00004903 if (PyUnicode_CheckExact(self)) {
4904 Py_INCREF(self);
4905 return (PyObject*) self;
4906 }
4907 else
4908 return PyUnicode_FromUnicode(
4909 PyUnicode_AS_UNICODE(self),
4910 PyUnicode_GET_SIZE(self)
4911 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00004912 }
4913
4914 fill = width - self->length;
4915
4916 u = pad(self, fill, 0, '0');
4917
Walter Dörwald068325e2002-04-15 13:36:47 +00004918 if (u == NULL)
4919 return NULL;
4920
Guido van Rossumd57fd912000-03-10 22:53:23 +00004921 if (u->str[fill] == '+' || u->str[fill] == '-') {
4922 /* move sign to beginning of string */
4923 u->str[0] = u->str[fill];
4924 u->str[fill] = '0';
4925 }
4926
4927 return (PyObject*) u;
4928}
Guido van Rossumd57fd912000-03-10 22:53:23 +00004929
#if 0
/* Compiled out: returns unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
4937
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004938PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004939"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004941Return True if S starts with the specified prefix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004942optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004943comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004944
4945static PyObject *
4946unicode_startswith(PyUnicodeObject *self,
4947 PyObject *args)
4948{
4949 PyUnicodeObject *substring;
4950 int start = 0;
4951 int end = INT_MAX;
4952 PyObject *result;
4953
Guido van Rossumb8872e62000-05-09 14:14:27 +00004954 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4955 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956 return NULL;
4957 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4958 (PyObject *)substring);
4959 if (substring == NULL)
4960 return NULL;
4961
Guido van Rossum77f6a652002-04-03 22:41:51 +00004962 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004963
4964 Py_DECREF(substring);
4965 return result;
4966}
4967
4968
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004969PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004970"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004971\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004972Return True if S ends with the specified suffix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004973optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004974comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004975
4976static PyObject *
4977unicode_endswith(PyUnicodeObject *self,
4978 PyObject *args)
4979{
4980 PyUnicodeObject *substring;
4981 int start = 0;
4982 int end = INT_MAX;
4983 PyObject *result;
4984
Guido van Rossumb8872e62000-05-09 14:14:27 +00004985 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4986 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004987 return NULL;
4988 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4989 (PyObject *)substring);
4990 if (substring == NULL)
4991 return NULL;
4992
Guido van Rossum77f6a652002-04-03 22:41:51 +00004993 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004994
4995 Py_DECREF(substring);
4996 return result;
4997}
4998
4999
5000static PyMethodDef unicode_methods[] = {
5001
5002 /* Order is according to common usage: often used methods should
5003 appear first, since lookup is done sequentially. */
5004
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005005 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
5006 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
5007 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
5008 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
5009 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
5010 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
5011 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
5012 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
5013 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
5014 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
5015 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
5016 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
5017 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005018 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005019/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
5020 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
5021 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
5022 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005023 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005024 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005025 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005026 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
5027 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
5028 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
5029 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
5030 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
5031 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
5032 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
5033 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
5034 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
5035 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
5036 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
5037 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
5038 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
5039 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005040 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00005041#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005042 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005043#endif
5044
5045#if 0
5046 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005047 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005048#endif
5049
5050 {NULL, NULL}
5051};
5052
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053static PySequenceMethods unicode_as_sequence = {
5054 (inquiry) unicode_length, /* sq_length */
5055 (binaryfunc) PyUnicode_Concat, /* sq_concat */
5056 (intargfunc) unicode_repeat, /* sq_repeat */
5057 (intargfunc) unicode_getitem, /* sq_item */
5058 (intintargfunc) unicode_slice, /* sq_slice */
5059 0, /* sq_ass_item */
5060 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00005061 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062};
5063
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005064static PyObject*
5065unicode_subscript(PyUnicodeObject* self, PyObject* item)
5066{
5067 if (PyInt_Check(item)) {
5068 long i = PyInt_AS_LONG(item);
5069 if (i < 0)
5070 i += PyString_GET_SIZE(self);
5071 return unicode_getitem(self, i);
5072 } else if (PyLong_Check(item)) {
5073 long i = PyLong_AsLong(item);
5074 if (i == -1 && PyErr_Occurred())
5075 return NULL;
5076 if (i < 0)
5077 i += PyString_GET_SIZE(self);
5078 return unicode_getitem(self, i);
5079 } else if (PySlice_Check(item)) {
5080 int start, stop, step, slicelength, cur, i;
5081 Py_UNICODE* source_buf;
5082 Py_UNICODE* result_buf;
5083 PyObject* result;
5084
5085 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5086 &start, &stop, &step, &slicelength) < 0) {
5087 return NULL;
5088 }
5089
5090 if (slicelength <= 0) {
5091 return PyUnicode_FromUnicode(NULL, 0);
5092 } else {
5093 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5094 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5095
5096 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5097 result_buf[i] = source_buf[cur];
5098 }
5099
5100 result = PyUnicode_FromUnicode(result_buf, slicelength);
5101 PyMem_FREE(result_buf);
5102 return result;
5103 }
5104 } else {
5105 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5106 return NULL;
5107 }
5108}
5109
5110static PyMappingMethods unicode_as_mapping = {
5111 (inquiry)unicode_length, /* mp_length */
5112 (binaryfunc)unicode_subscript, /* mp_subscript */
5113 (objobjargproc)0, /* mp_ass_subscript */
5114};
5115
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116static int
5117unicode_buffer_getreadbuf(PyUnicodeObject *self,
5118 int index,
5119 const void **ptr)
5120{
5121 if (index != 0) {
5122 PyErr_SetString(PyExc_SystemError,
5123 "accessing non-existent unicode segment");
5124 return -1;
5125 }
5126 *ptr = (void *) self->str;
5127 return PyUnicode_GET_DATA_SIZE(self);
5128}
5129
5130static int
5131unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5132 const void **ptr)
5133{
5134 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00005135 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 return -1;
5137}
5138
5139static int
5140unicode_buffer_getsegcount(PyUnicodeObject *self,
5141 int *lenp)
5142{
5143 if (lenp)
5144 *lenp = PyUnicode_GET_DATA_SIZE(self);
5145 return 1;
5146}
5147
5148static int
5149unicode_buffer_getcharbuf(PyUnicodeObject *self,
5150 int index,
5151 const void **ptr)
5152{
5153 PyObject *str;
5154
5155 if (index != 0) {
5156 PyErr_SetString(PyExc_SystemError,
5157 "accessing non-existent unicode segment");
5158 return -1;
5159 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005160 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005161 if (str == NULL)
5162 return -1;
5163 *ptr = (void *) PyString_AS_STRING(str);
5164 return PyString_GET_SIZE(str);
5165}
5166
5167/* Helpers for PyUnicode_Format() */
5168
5169static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005170getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171{
5172 int argidx = *p_argidx;
5173 if (argidx < arglen) {
5174 (*p_argidx)++;
5175 if (arglen < 0)
5176 return args;
5177 else
5178 return PyTuple_GetItem(args, argidx);
5179 }
5180 PyErr_SetString(PyExc_TypeError,
5181 "not enough arguments for format string");
5182 return NULL;
5183}
5184
/* Format-specifier flag bits; each occupies its own bit so they can be
   OR'ed together in a single int. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
5190
5191static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193{
5194 register int i;
5195 int len;
5196 va_list va;
5197 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199
5200 /* First, format the string as char array, then expand to Py_UNICODE
5201 array. */
5202 charbuffer = (char *)buffer;
5203 len = vsprintf(charbuffer, format, va);
5204 for (i = len - 1; i >= 0; i--)
5205 buffer[i] = (Py_UNICODE) charbuffer[i];
5206
5207 va_end(va);
5208 return len;
5209}
5210
5211static int
5212formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005213 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214 int flags,
5215 int prec,
5216 int type,
5217 PyObject *v)
5218{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005219 /* fmt = '%#.' + `prec` + `type`
5220 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221 char fmt[20];
5222 double x;
5223
5224 x = PyFloat_AsDouble(v);
5225 if (x == -1.0 && PyErr_Occurred())
5226 return -1;
5227 if (prec < 0)
5228 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5230 type = 'g';
Barry Warsawe5c492d2001-11-28 21:00:41 +00005231 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5232 (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005233 /* worst case length calc to ensure no buffer overrun:
5234 fmt = %#.<prec>g
5235 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5236 for any double rep.)
5237 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5238 If prec=0 the effective precision is 1 (the leading digit is
5239 always given), therefore increase by one to 10+prec. */
5240 if (buflen <= (size_t)10 + (size_t)prec) {
5241 PyErr_SetString(PyExc_OverflowError,
5242 "formatted float is too long (precision too long?)");
5243 return -1;
5244 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245 return usprintf(buf, fmt, x);
5246}
5247
Tim Peters38fd5b62000-09-21 05:43:11 +00005248static PyObject*
5249formatlong(PyObject *val, int flags, int prec, int type)
5250{
5251 char *buf;
5252 int i, len;
5253 PyObject *str; /* temporary string object. */
5254 PyUnicodeObject *result;
5255
5256 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5257 if (!str)
5258 return NULL;
5259 result = _PyUnicode_New(len);
5260 for (i = 0; i < len; i++)
5261 result->str[i] = buf[i];
5262 result->str[len] = 0;
5263 Py_DECREF(str);
5264 return (PyObject*)result;
5265}
5266
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267static int
5268formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005269 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270 int flags,
5271 int prec,
5272 int type,
5273 PyObject *v)
5274{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005275 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005276 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5277 * + 1 + 1
5278 * = 24
5279 */
Tim Peters38fd5b62000-09-21 05:43:11 +00005280 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281 long x;
5282
5283 x = PyInt_AsLong(v);
5284 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005285 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005287 prec = 1;
5288
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005289 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005290 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
5291 */
5292 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005293 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005294 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005295 return -1;
5296 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005297
5298 if ((flags & F_ALT) &&
5299 (type == 'x' || type == 'X')) {
5300 /* When converting under %#x or %#X, there are a number
5301 * of issues that cause pain:
5302 * - when 0 is being converted, the C standard leaves off
5303 * the '0x' or '0X', which is inconsistent with other
5304 * %#x/%#X conversions and inconsistent with Python's
5305 * hex() function
5306 * - there are platforms that violate the standard and
5307 * convert 0 with the '0x' or '0X'
5308 * (Metrowerks, Compaq Tru64)
5309 * - there are platforms that give '0x' when converting
5310 * under %#X, but convert 0 in accordance with the
5311 * standard (OS/2 EMX)
5312 *
5313 * We can achieve the desired consistency by inserting our
5314 * own '0x' or '0X' prefix, and substituting %x/%X in place
5315 * of %#x/%#X.
5316 *
5317 * Note that this is the same approach as used in
5318 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005319 */
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005320 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
5321 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005322 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005323 else {
5324 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5325 (flags&F_ALT) ? "#" : "",
5326 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005327 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005328 return usprintf(buf, fmt, x);
5329}
5330
5331static int
5332formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005333 size_t buflen,
5334 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005335{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005336 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005337 if (PyUnicode_Check(v)) {
5338 if (PyUnicode_GET_SIZE(v) != 1)
5339 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005341 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005342
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005343 else if (PyString_Check(v)) {
5344 if (PyString_GET_SIZE(v) != 1)
5345 goto onError;
5346 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348
5349 else {
5350 /* Integer input truncated to a character */
5351 long x;
5352 x = PyInt_AsLong(v);
5353 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005354 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005355 buf[0] = (char) x;
5356 }
5357 buf[1] = '\0';
5358 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005359
5360 onError:
5361 PyErr_SetString(PyExc_TypeError,
5362 "%c requires int or char");
5363 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364}
5365
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
5375
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376PyObject *PyUnicode_Format(PyObject *format,
5377 PyObject *args)
5378{
5379 Py_UNICODE *fmt, *res;
5380 int fmtcnt, rescnt, reslen, arglen, argidx;
5381 int args_owned = 0;
5382 PyUnicodeObject *result = NULL;
5383 PyObject *dict = NULL;
5384 PyObject *uformat;
5385
5386 if (format == NULL || args == NULL) {
5387 PyErr_BadInternalCall();
5388 return NULL;
5389 }
5390 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005391 if (uformat == NULL)
5392 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005393 fmt = PyUnicode_AS_UNICODE(uformat);
5394 fmtcnt = PyUnicode_GET_SIZE(uformat);
5395
5396 reslen = rescnt = fmtcnt + 100;
5397 result = _PyUnicode_New(reslen);
5398 if (result == NULL)
5399 goto onError;
5400 res = PyUnicode_AS_UNICODE(result);
5401
5402 if (PyTuple_Check(args)) {
5403 arglen = PyTuple_Size(args);
5404 argidx = 0;
5405 }
5406 else {
5407 arglen = -1;
5408 argidx = -2;
5409 }
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005410 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005411 dict = args;
5412
5413 while (--fmtcnt >= 0) {
5414 if (*fmt != '%') {
5415 if (--rescnt < 0) {
5416 rescnt = fmtcnt + 100;
5417 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005418 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005419 return NULL;
5420 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5421 --rescnt;
5422 }
5423 *res++ = *fmt++;
5424 }
5425 else {
5426 /* Got a format specifier */
5427 int flags = 0;
5428 int width = -1;
5429 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005430 Py_UNICODE c = '\0';
5431 Py_UNICODE fill;
5432 PyObject *v = NULL;
5433 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005434 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005435 Py_UNICODE sign;
5436 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005437 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005438
5439 fmt++;
5440 if (*fmt == '(') {
5441 Py_UNICODE *keystart;
5442 int keylen;
5443 PyObject *key;
5444 int pcount = 1;
5445
5446 if (dict == NULL) {
5447 PyErr_SetString(PyExc_TypeError,
5448 "format requires a mapping");
5449 goto onError;
5450 }
5451 ++fmt;
5452 --fmtcnt;
5453 keystart = fmt;
5454 /* Skip over balanced parentheses */
5455 while (pcount > 0 && --fmtcnt >= 0) {
5456 if (*fmt == ')')
5457 --pcount;
5458 else if (*fmt == '(')
5459 ++pcount;
5460 fmt++;
5461 }
5462 keylen = fmt - keystart - 1;
5463 if (fmtcnt < 0 || pcount > 0) {
5464 PyErr_SetString(PyExc_ValueError,
5465 "incomplete format key");
5466 goto onError;
5467 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005468#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00005469 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005470 then looked up since Python uses strings to hold
5471 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005472 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005473 key = PyUnicode_EncodeUTF8(keystart,
5474 keylen,
5475 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005476#else
5477 key = PyUnicode_FromUnicode(keystart, keylen);
5478#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479 if (key == NULL)
5480 goto onError;
5481 if (args_owned) {
5482 Py_DECREF(args);
5483 args_owned = 0;
5484 }
5485 args = PyObject_GetItem(dict, key);
5486 Py_DECREF(key);
5487 if (args == NULL) {
5488 goto onError;
5489 }
5490 args_owned = 1;
5491 arglen = -1;
5492 argidx = -2;
5493 }
5494 while (--fmtcnt >= 0) {
5495 switch (c = *fmt++) {
5496 case '-': flags |= F_LJUST; continue;
5497 case '+': flags |= F_SIGN; continue;
5498 case ' ': flags |= F_BLANK; continue;
5499 case '#': flags |= F_ALT; continue;
5500 case '0': flags |= F_ZERO; continue;
5501 }
5502 break;
5503 }
5504 if (c == '*') {
5505 v = getnextarg(args, arglen, &argidx);
5506 if (v == NULL)
5507 goto onError;
5508 if (!PyInt_Check(v)) {
5509 PyErr_SetString(PyExc_TypeError,
5510 "* wants int");
5511 goto onError;
5512 }
5513 width = PyInt_AsLong(v);
5514 if (width < 0) {
5515 flags |= F_LJUST;
5516 width = -width;
5517 }
5518 if (--fmtcnt >= 0)
5519 c = *fmt++;
5520 }
5521 else if (c >= '0' && c <= '9') {
5522 width = c - '0';
5523 while (--fmtcnt >= 0) {
5524 c = *fmt++;
5525 if (c < '0' || c > '9')
5526 break;
5527 if ((width*10) / 10 != width) {
5528 PyErr_SetString(PyExc_ValueError,
5529 "width too big");
5530 goto onError;
5531 }
5532 width = width*10 + (c - '0');
5533 }
5534 }
5535 if (c == '.') {
5536 prec = 0;
5537 if (--fmtcnt >= 0)
5538 c = *fmt++;
5539 if (c == '*') {
5540 v = getnextarg(args, arglen, &argidx);
5541 if (v == NULL)
5542 goto onError;
5543 if (!PyInt_Check(v)) {
5544 PyErr_SetString(PyExc_TypeError,
5545 "* wants int");
5546 goto onError;
5547 }
5548 prec = PyInt_AsLong(v);
5549 if (prec < 0)
5550 prec = 0;
5551 if (--fmtcnt >= 0)
5552 c = *fmt++;
5553 }
5554 else if (c >= '0' && c <= '9') {
5555 prec = c - '0';
5556 while (--fmtcnt >= 0) {
5557 c = Py_CHARMASK(*fmt++);
5558 if (c < '0' || c > '9')
5559 break;
5560 if ((prec*10) / 10 != prec) {
5561 PyErr_SetString(PyExc_ValueError,
5562 "prec too big");
5563 goto onError;
5564 }
5565 prec = prec*10 + (c - '0');
5566 }
5567 }
5568 } /* prec */
5569 if (fmtcnt >= 0) {
5570 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 if (--fmtcnt >= 0)
5572 c = *fmt++;
5573 }
5574 }
5575 if (fmtcnt < 0) {
5576 PyErr_SetString(PyExc_ValueError,
5577 "incomplete format");
5578 goto onError;
5579 }
5580 if (c != '%') {
5581 v = getnextarg(args, arglen, &argidx);
5582 if (v == NULL)
5583 goto onError;
5584 }
5585 sign = 0;
5586 fill = ' ';
5587 switch (c) {
5588
5589 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005590 pbuf = formatbuf;
5591 /* presume that buffer length is at least 1 */
5592 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 len = 1;
5594 break;
5595
5596 case 's':
5597 case 'r':
5598 if (PyUnicode_Check(v) && c == 's') {
5599 temp = v;
5600 Py_INCREF(temp);
5601 }
5602 else {
5603 PyObject *unicode;
5604 if (c == 's')
5605 temp = PyObject_Str(v);
5606 else
5607 temp = PyObject_Repr(v);
5608 if (temp == NULL)
5609 goto onError;
5610 if (!PyString_Check(temp)) {
5611 /* XXX Note: this should never happen, since
5612 PyObject_Repr() and PyObject_Str() assure
5613 this */
5614 Py_DECREF(temp);
5615 PyErr_SetString(PyExc_TypeError,
5616 "%s argument has non-string str()");
5617 goto onError;
5618 }
Fred Drakee4315f52000-05-09 19:53:39 +00005619 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005620 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005621 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005622 "strict");
5623 Py_DECREF(temp);
5624 temp = unicode;
5625 if (temp == NULL)
5626 goto onError;
5627 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005628 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005629 len = PyUnicode_GET_SIZE(temp);
5630 if (prec >= 0 && len > prec)
5631 len = prec;
5632 break;
5633
5634 case 'i':
5635 case 'd':
5636 case 'u':
5637 case 'o':
5638 case 'x':
5639 case 'X':
5640 if (c == 'i')
5641 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005642 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005643 temp = formatlong(v, flags, prec, c);
5644 if (!temp)
5645 goto onError;
5646 pbuf = PyUnicode_AS_UNICODE(temp);
5647 len = PyUnicode_GET_SIZE(temp);
5648 /* unbounded ints can always produce
5649 a sign character! */
5650 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005651 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005652 else {
5653 pbuf = formatbuf;
5654 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5655 flags, prec, c, v);
5656 if (len < 0)
5657 goto onError;
5658 /* only d conversion is signed */
5659 sign = c == 'd';
5660 }
5661 if (flags & F_ZERO)
5662 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005663 break;
5664
5665 case 'e':
5666 case 'E':
5667 case 'f':
5668 case 'g':
5669 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005670 pbuf = formatbuf;
5671 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5672 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005673 if (len < 0)
5674 goto onError;
5675 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005676 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005677 fill = '0';
5678 break;
5679
5680 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005681 pbuf = formatbuf;
5682 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 if (len < 0)
5684 goto onError;
5685 break;
5686
5687 default:
5688 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005689 "unsupported format character '%c' (0x%x) "
5690 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005691 (31<=c && c<=126) ? c : '?',
5692 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005693 goto onError;
5694 }
5695 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005696 if (*pbuf == '-' || *pbuf == '+') {
5697 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 len--;
5699 }
5700 else if (flags & F_SIGN)
5701 sign = '+';
5702 else if (flags & F_BLANK)
5703 sign = ' ';
5704 else
5705 sign = 0;
5706 }
5707 if (width < len)
5708 width = len;
5709 if (rescnt < width + (sign != 0)) {
5710 reslen -= rescnt;
5711 rescnt = width + fmtcnt + 100;
5712 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005713 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005714 return NULL;
5715 res = PyUnicode_AS_UNICODE(result)
5716 + reslen - rescnt;
5717 }
5718 if (sign) {
5719 if (fill != ' ')
5720 *res++ = sign;
5721 rescnt--;
5722 if (width > len)
5723 width--;
5724 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005725 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5726 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005727 assert(pbuf[1] == c);
5728 if (fill != ' ') {
5729 *res++ = *pbuf++;
5730 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005731 }
Tim Petersfff53252001-04-12 18:38:48 +00005732 rescnt -= 2;
5733 width -= 2;
5734 if (width < 0)
5735 width = 0;
5736 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005737 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005738 if (width > len && !(flags & F_LJUST)) {
5739 do {
5740 --rescnt;
5741 *res++ = fill;
5742 } while (--width > len);
5743 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005744 if (fill == ' ') {
5745 if (sign)
5746 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005747 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005748 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005749 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005750 *res++ = *pbuf++;
5751 *res++ = *pbuf++;
5752 }
5753 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005754 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 res += len;
5756 rescnt -= len;
5757 while (--width >= len) {
5758 --rescnt;
5759 *res++ = ' ';
5760 }
5761 if (dict && (argidx < arglen) && c != '%') {
5762 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00005763 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764 goto onError;
5765 }
5766 Py_XDECREF(temp);
5767 } /* '%' */
5768 } /* until end */
5769 if (argidx < arglen && !dict) {
5770 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00005771 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005772 goto onError;
5773 }
5774
5775 if (args_owned) {
5776 Py_DECREF(args);
5777 }
5778 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005779 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005780 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005781 return (PyObject *)result;
5782
5783 onError:
5784 Py_XDECREF(result);
5785 Py_DECREF(uformat);
5786 if (args_owned) {
5787 Py_DECREF(args);
5788 }
5789 return NULL;
5790}
5791
5792static PyBufferProcs unicode_as_buffer = {
5793 (getreadbufferproc) unicode_buffer_getreadbuf,
5794 (getwritebufferproc) unicode_buffer_getwritebuf,
5795 (getsegcountproc) unicode_buffer_getsegcount,
5796 (getcharbufferproc) unicode_buffer_getcharbuf,
5797};
5798
Jeremy Hylton938ace62002-07-17 16:30:39 +00005799static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +00005800unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5801
Tim Peters6d6c1a32001-08-02 04:15:00 +00005802static PyObject *
5803unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5804{
5805 PyObject *x = NULL;
5806 static char *kwlist[] = {"string", "encoding", "errors", 0};
5807 char *encoding = NULL;
5808 char *errors = NULL;
5809
Guido van Rossume023fe02001-08-30 03:12:59 +00005810 if (type != &PyUnicode_Type)
5811 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005812 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5813 kwlist, &x, &encoding, &errors))
5814 return NULL;
5815 if (x == NULL)
5816 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00005817 if (encoding == NULL && errors == NULL)
5818 return PyObject_Unicode(x);
5819 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00005820 return PyUnicode_FromEncodedObject(x, encoding, errors);
5821}
5822
Guido van Rossume023fe02001-08-30 03:12:59 +00005823static PyObject *
5824unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5825{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005826 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005827 int n;
5828
5829 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5830 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5831 if (tmp == NULL)
5832 return NULL;
5833 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005834 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5835 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005836 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005837 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5838 if (pnew->str == NULL) {
5839 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005840 PyObject_Del(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005841 return NULL;
5842 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005843 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5844 pnew->length = n;
5845 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005846 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005847 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005848}
5849
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00005850PyDoc_STRVAR(unicode_doc,
Tim Peters6d6c1a32001-08-02 04:15:00 +00005851"unicode(string [, encoding[, errors]]) -> object\n\
5852\n\
5853Create a new Unicode object from the given encoded string.\n\
Skip Montanaro35b37a52002-07-26 16:22:46 +00005854encoding defaults to the current default string encoding.\n\
5855errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00005856
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857PyTypeObject PyUnicode_Type = {
5858 PyObject_HEAD_INIT(&PyType_Type)
5859 0, /* ob_size */
5860 "unicode", /* tp_name */
5861 sizeof(PyUnicodeObject), /* tp_size */
5862 0, /* tp_itemsize */
5863 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00005864 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005865 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005866 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 0, /* tp_setattr */
5868 (cmpfunc) unicode_compare, /* tp_compare */
5869 (reprfunc) unicode_repr, /* tp_repr */
5870 0, /* tp_as_number */
5871 &unicode_as_sequence, /* tp_as_sequence */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005872 &unicode_as_mapping, /* tp_as_mapping */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005873 (hashfunc) unicode_hash, /* tp_hash*/
5874 0, /* tp_call*/
5875 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005876 PyObject_GenericGetAttr, /* tp_getattro */
5877 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005878 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005879 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005880 unicode_doc, /* tp_doc */
5881 0, /* tp_traverse */
5882 0, /* tp_clear */
5883 0, /* tp_richcompare */
5884 0, /* tp_weaklistoffset */
5885 0, /* tp_iter */
5886 0, /* tp_iternext */
5887 unicode_methods, /* tp_methods */
5888 0, /* tp_members */
5889 0, /* tp_getset */
Guido van Rossumcacfc072002-05-24 19:01:59 +00005890 &PyBaseString_Type, /* tp_base */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005891 0, /* tp_dict */
5892 0, /* tp_descr_get */
5893 0, /* tp_descr_set */
5894 0, /* tp_dictoffset */
5895 0, /* tp_init */
5896 0, /* tp_alloc */
5897 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005898 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005899};
5900
5901/* Initialize the Unicode implementation */
5902
Thomas Wouters78890102000-07-22 19:25:51 +00005903void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005904{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005905 int i;
5906
Fred Drakee4315f52000-05-09 19:53:39 +00005907 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005908 unicode_freelist = NULL;
5909 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005911 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005912 for (i = 0; i < 256; i++)
5913 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00005914 if (PyType_Ready(&PyUnicode_Type) < 0)
5915 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916}
5917
5918/* Finalize the Unicode implementation */
5919
5920void
Thomas Wouters78890102000-07-22 19:25:51 +00005921_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005923 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005924 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005925
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005926 Py_XDECREF(unicode_empty);
5927 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005928
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005929 for (i = 0; i < 256; i++) {
5930 if (unicode_latin1[i]) {
5931 Py_DECREF(unicode_latin1[i]);
5932 unicode_latin1[i] = NULL;
5933 }
5934 }
5935
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005936 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937 PyUnicodeObject *v = u;
5938 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005939 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005940 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005941 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005942 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005944 unicode_freelist = NULL;
5945 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946}