blob: 03b5dbd9704d2a8b652c28c1cc2f394c27865bc0 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000044#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000045#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Limit for the Unicode object free list: unicode_dealloc() stops
   recycling objects once this many are parked on the list. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   The limit is compared against the object's length, i.e. it is
   counted in Py_UNICODE elements rather than bytes.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.
   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN
   is defined, selected by the WORDS_BIGENDIAN configuration macro. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
/* --- Globals ------------------------------------------------------------

   The globals are initialized by the _PyUnicode_Init() API and should
   not be used before calling that API.

*/

/* Free list for Unicode objects.  The list is singly linked: the first
   pointer-sized word of each parked object is reused as the "next" link.
   unicode_freelist_size counts the parked objects and is capped at
   MAX_UNICODE_FREELIST_SIZE by unicode_dealloc(). */
static PyUnicodeObject *unicode_freelist;
static int unicode_freelist_size;

/* The empty Unicode object is shared to improve performance. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by code point and filled lazily
   by PyUnicode_FromUnicode(); unused slots are NULL. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
/* We allocate one more Py_UNICODE element (not just one byte) to make
   sure the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000222 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 *unicode = (PyObject *)w;
280 return 0;
281 }
282
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v, length);
286}
287
/* Internal API for use in unicodeobject.c only !

   Convenience wrapper around PyUnicode_Resize(): `unicodevar` is the
   address of a Unicode object pointer variable, which is cast to the
   PyObject ** that the public function expects.  As with
   PyUnicode_Resize(), the variable may be updated to point at a
   different object. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293 int size)
294{
295 PyUnicodeObject *unicode;
296
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
299 if (u != NULL) {
300
301 /* Optimization for empty strings */
302 if (size == 0 && unicode_empty != NULL) {
303 Py_INCREF(unicode_empty);
304 return (PyObject *)unicode_empty;
305 }
306
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size == 1 && *u < 256) {
310 unicode = unicode_latin1[*u];
311 if (!unicode) {
312 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000313 if (!unicode)
314 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000315 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 unicode_latin1[*u] = unicode;
317 }
318 Py_INCREF(unicode);
319 return (PyObject *)unicode;
320 }
321 }
322
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 unicode = _PyUnicode_New(size);
324 if (!unicode)
325 return NULL;
326
327 /* Copy the Unicode data into the new object */
328 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330
331 return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337 int size)
338{
339 PyUnicodeObject *unicode;
340
341 if (w == NULL) {
342 PyErr_BadInternalCall();
343 return NULL;
344 }
345
346 unicode = _PyUnicode_New(size);
347 if (!unicode)
348 return NULL;
349
350 /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354 {
355 register Py_UNICODE *u;
356 register int i;
357 u = PyUnicode_AS_UNICODE(unicode);
358 for (i = size; i >= 0; i--)
359 *u++ = *w++;
360 }
361#endif
362
363 return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367 register wchar_t *w,
368 int size)
369{
370 if (unicode == NULL) {
371 PyErr_BadInternalCall();
372 return -1;
373 }
374 if (size > PyUnicode_GET_SIZE(unicode))
375 size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379 {
380 register Py_UNICODE *u;
381 register int i;
382 u = PyUnicode_AS_UNICODE(unicode);
383 for (i = size; i >= 0; i--)
384 *w++ = *u++;
385 }
386#endif
387
388 return size;
389}
390
391#endif
392
393PyObject *PyUnicode_FromObject(register PyObject *obj)
394{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000395 /* XXX Perhaps we should make this API an alias of
396 PyObject_Unicode() instead ?! */
397 if (PyUnicode_CheckExact(obj)) {
398 Py_INCREF(obj);
399 return obj;
400 }
401 if (PyUnicode_Check(obj)) {
402 /* For a Unicode subtype that's not a Unicode object,
403 return a true Unicode object with the same data. */
404 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
405 PyUnicode_GET_SIZE(obj));
406 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000407 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
408}
409
410PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
411 const char *encoding,
412 const char *errors)
413{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000414 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000416 int owned = 0;
417 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418
419 if (obj == NULL) {
420 PyErr_BadInternalCall();
421 return NULL;
422 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000423
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000424#if 0
425 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000426 that no encodings is given and then redirect to
427 PyObject_Unicode() which then applies the additional logic for
428 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000429
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000430 NOTE: This API should really only be used for object which
431 represent *encoded* Unicode !
432
433 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000434 if (PyUnicode_Check(obj)) {
435 if (encoding) {
436 PyErr_SetString(PyExc_TypeError,
437 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000438 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000439 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000440 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000441 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000442#else
443 if (PyUnicode_Check(obj)) {
444 PyErr_SetString(PyExc_TypeError,
445 "decoding Unicode is not supported");
446 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000448#endif
449
450 /* Coerce object */
451 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000452 s = PyString_AS_STRING(obj);
453 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000454 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000455 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
456 /* Overwrite the error message with something more useful in
457 case of a TypeError. */
458 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000459 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000460 "coercing to Unicode: need string or buffer, "
461 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000462 obj->ob_type->tp_name);
463 goto onError;
464 }
465
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000466 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 if (len == 0) {
468 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000469 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000471 else
472 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000473
Greg Steinaf36a3a2000-07-17 09:04:43 +0000474 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000475 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000476 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000477 return v;
478
479 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000480 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000481 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000482 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484}
485
486PyObject *PyUnicode_Decode(const char *s,
487 int size,
488 const char *encoding,
489 const char *errors)
490{
491 PyObject *buffer = NULL, *unicode;
492
Fred Drakee4315f52000-05-09 19:53:39 +0000493 if (encoding == NULL)
494 encoding = PyUnicode_GetDefaultEncoding();
495
496 /* Shortcuts for common default encodings */
497 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000499 else if (strcmp(encoding, "latin-1") == 0)
500 return PyUnicode_DecodeLatin1(s, size, errors);
501 else if (strcmp(encoding, "ascii") == 0)
502 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503
504 /* Decode via the codec registry */
505 buffer = PyBuffer_FromMemory((void *)s, size);
506 if (buffer == NULL)
507 goto onError;
508 unicode = PyCodec_Decode(buffer, encoding, errors);
509 if (unicode == NULL)
510 goto onError;
511 if (!PyUnicode_Check(unicode)) {
512 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000513 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 unicode->ob_type->tp_name);
515 Py_DECREF(unicode);
516 goto onError;
517 }
518 Py_DECREF(buffer);
519 return unicode;
520
521 onError:
522 Py_XDECREF(buffer);
523 return NULL;
524}
525
526PyObject *PyUnicode_Encode(const Py_UNICODE *s,
527 int size,
528 const char *encoding,
529 const char *errors)
530{
531 PyObject *v, *unicode;
532
533 unicode = PyUnicode_FromUnicode(s, size);
534 if (unicode == NULL)
535 return NULL;
536 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
537 Py_DECREF(unicode);
538 return v;
539}
540
541PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
542 const char *encoding,
543 const char *errors)
544{
545 PyObject *v;
546
547 if (!PyUnicode_Check(unicode)) {
548 PyErr_BadArgument();
549 goto onError;
550 }
Fred Drakee4315f52000-05-09 19:53:39 +0000551
552 if (encoding == NULL)
553 encoding = PyUnicode_GetDefaultEncoding();
554
555 /* Shortcuts for common default encodings */
556 if (errors == NULL) {
557 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000558 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000559 else if (strcmp(encoding, "latin-1") == 0)
560 return PyUnicode_AsLatin1String(unicode);
561 else if (strcmp(encoding, "ascii") == 0)
562 return PyUnicode_AsASCIIString(unicode);
563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000564
565 /* Encode via the codec registry */
566 v = PyCodec_Encode(unicode, encoding, errors);
567 if (v == NULL)
568 goto onError;
569 /* XXX Should we really enforce this ? */
570 if (!PyString_Check(v)) {
571 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000572 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573 v->ob_type->tp_name);
574 Py_DECREF(v);
575 goto onError;
576 }
577 return v;
578
579 onError:
580 return NULL;
581}
582
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000583PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
584 const char *errors)
585{
586 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
587
588 if (v)
589 return v;
590 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
591 if (v && errors == NULL)
592 ((PyUnicodeObject *)unicode)->defenc = v;
593 return v;
594}
595
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
597{
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
601 }
602 return PyUnicode_AS_UNICODE(unicode);
603
604 onError:
605 return NULL;
606}
607
608int PyUnicode_GetSize(PyObject *unicode)
609{
610 if (!PyUnicode_Check(unicode)) {
611 PyErr_BadArgument();
612 goto onError;
613 }
614 return PyUnicode_GET_SIZE(unicode);
615
616 onError:
617 return -1;
618}
619
Thomas Wouters78890102000-07-22 19:25:51 +0000620const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000621{
622 return unicode_default_encoding;
623}
624
625int PyUnicode_SetDefaultEncoding(const char *encoding)
626{
627 PyObject *v;
628
629 /* Make sure the encoding is valid. As side effect, this also
630 loads the encoding into the codec registry cache. */
631 v = _PyCodec_Lookup(encoding);
632 if (v == NULL)
633 goto onError;
634 Py_DECREF(v);
635 strncpy(unicode_default_encoding,
636 encoding,
637 sizeof(unicode_default_encoding));
638 return 0;
639
640 onError:
641 return -1;
642}
643
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000644/* --- UTF-7 Codec -------------------------------------------------------- */
645
646/* see RFC2152 for details */
647
/* Classification of the 128 ASCII characters for the UTF-7 codec,
   indexed by character code.  Consulted through the SPECIAL() macro. */
static
char utf7_special[128] = {
    /* indicate whether a UTF-7 character is special i.e. cannot be directly
       encoded:
           0 - not special
           1 - special
           2 - whitespace (optional)
           3 - RFC2152 Set O (optional) */
    /* 0x00-0x0F: controls; TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10-0x1F: controls */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20-0x2F: space and punctuation */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30-0x3F: digits and punctuation */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40-0x4F: '@' and upper case letters */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F: upper case letters and punctuation */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60-0x6F: '`' and lower case letters */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F: lower case letters, punctuation and DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};
666
/* SPECIAL(c, encodeO, encodeWS): true if character c cannot be written
   directly in UTF-7.  Everything above 127 and every class-1 character
   is special; class-2 (whitespace) and class-3 (Set O) characters are
   special only if the corresponding flag is non-zero.
   Note: c is evaluated several times. */
#define SPECIAL(c, encodeO, encodeWS) \
        (((c)>127 || utf7_special[(c)] == 1) || \
         (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): the base64 digit for the low 6 bits of n. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])
/* B64CHAR(c): true if c is a base64 digit.
   NOTE(review): isalnum() is handed Py_UNICODE values that may lie
   outside the unsigned char range -- confirm this is safe on all
   supported platforms. */
#define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/')
/* UB64(c): the 6-bit value of base64 digit c; only meaningful when
   B64CHAR(c) holds. */
#define UB64(c)        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                        (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits): while at least 6 bits are pending in the
   accumulator ch, emit the top 6 of them through *out++ as a base64
   digit and decrease bits accordingly.  Modifies out and bits. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least 16 bits are pending
   in the accumulator ch, extract the top 16 as one code unit and
   either store it through *out++ or report an error.

   Relies on the invoking function providing a local `errmsg` variable
   and a `utf7Error` label.  Modifies out, bits and surrogate.

   NOTE(review): the range tested is 0xDC00..0xDFFF although the
   comments speak of a high surrogate starting a pair -- confirm the
   intended range. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate
               so let's not bother seeing if the low surrogate is correct or not */\
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    } \

702static
703int utf7_decoding_error(Py_UNICODE **dest,
704 const char *errors,
705 const char *details)
706{
707 if ((errors == NULL) ||
708 (strcmp(errors,"strict") == 0)) {
709 PyErr_Format(PyExc_UnicodeError,
710 "UTF-7 decoding error: %.400s",
711 details);
712 return -1;
713 }
714 else if (strcmp(errors,"ignore") == 0) {
715 return 0;
716 }
717 else if (strcmp(errors,"replace") == 0) {
718 if (dest != NULL) {
719 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
720 (*dest)++;
721 }
722 return 0;
723 }
724 else {
725 PyErr_Format(PyExc_ValueError,
726 "UTF-7 decoding error; unknown error handling code: %.400s",
727 errors);
728 return -1;
729 }
730}
731
/* Decode `size` bytes of UTF-7 data starting at `s` (see RFC2152) and
   return a new Unicode object, or NULL with an exception set.

   `errors` selects the error handling mode and is interpreted by
   utf7_decoding_error().

   The decoder is a small state machine:
   - outside a shift sequence, '+' starts one ("+-" is a literal '+'),
     special characters are errors and everything else is copied;
   - inside a shift sequence, each base64 digit adds 6 bits to the
     accumulator `charsleft` (`bitsleft` counts the pending bits) and
     the DECODE() macro emits a code unit for every 16 bits collected;
     '-' or any non-base64 character ends the sequence.

   The output object is allocated with `size` characters up front and
   shrunk to the number actually written at the end.
   NOTE(review): this assumes the decoder never produces more than
   `size` characters, including replacement characters -- confirm. */
PyObject *PyUnicode_DecodeUTF7(const char *s,
                               int size,
                               const char *errors)
{
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;
    unsigned int bitsleft = 0;
    unsigned long charsleft = 0;
    int surrogate = 0;

    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0)
        return (PyObject *)unicode;

    p = unicode->str;
    e = s + size;

    while (s < e) {
        /* NOTE(review): *s is a plain char; if char is signed, bytes
           above 0x7F are converted from a negative value -- confirm. */
        Py_UNICODE ch = *s;

        if (inShift) {
            if ((ch == '-') || !B64CHAR(ch)) {
                /* End of the shift sequence (explicit '-' or implicit). */
                inShift = 0;
                s++;

                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
                if (bitsleft >= 6) {
                    /* The shift sequence has a partial character in it. If
                       bitsleft < 6 then we could just classify it as padding
                       but that is not the case here */

                    errmsg = "partial character in shift sequence";
                    goto utf7Error;
                }
                /* According to RFC2152 the remaining bits should be zero. We
                   choose to signal an error/insert a replacement character
                   here so indicate the potential of a misencoded character. */

                /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
                if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
                    errmsg = "non-zero padding bits in shift sequence";
                    goto utf7Error;
                }

                if (ch == '-') {
                    if ((s < e) && (*(s) == '-')) {
                        *p++ = '-';
                        inShift = 1;
                    }
                } else if (SPECIAL(ch,0,0)) {
                    errmsg = "unexpected special character";
                    goto utf7Error;
                } else  {
                    /* The terminating character is ordinary data. */
                    *p++ = ch;
                }
            } else {
                /* Base64 digit: collect its 6 bits. */
                charsleft = (charsleft << 6) | UB64(ch);
                bitsleft += 6;
                s++;
                /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
            }
        }
        else if ( ch == '+' ) {
            s++;
            if (s < e && *s == '-') {
                /* "+-" encodes a literal '+'. */
                s++;
                *p++ = '+';
            } else
            {
                inShift = 1;
                bitsleft = 0;
            }
        }
        else if (SPECIAL(ch,0,0)) {
            errmsg = "unexpected special character";
            s++;
            goto utf7Error;
        }
        else {
            *p++ = ch;
            s++;
        }
        continue;
    /* Error exit shared by the branches above and by DECODE(); decoding
       resumes with the next loop iteration unless the handler fails. */
    utf7Error:
      if (utf7_decoding_error(&p, errors, errmsg))
          goto onError;
    }

    if (inShift) {
        if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
            goto onError;
    }

    /* Shrink to the number of characters actually written. */
    if (_PyUnicode_Resize(&unicode, p - unicode->str))
        goto onError;

    return (PyObject *)unicode;

onError:
    Py_DECREF(unicode);
    return NULL;
}
839
840
/* Encode the Py_UNICODE buffer `s` of `size` characters as UTF-7 and
   return the result as a string object (NULL if the string cannot be
   allocated).

   `encodeSetO` and `encodeWhiteSpace` are passed to SPECIAL() and
   decide whether RFC2152 Set O characters and whitespace are base64
   encoded or written directly.

   `errors` is accepted for interface symmetry but is not used by the
   body of this function.

   The output string is allocated with 5 * size bytes and trimmed to
   the number of bytes written at the end. */
PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
                               int size,
                               int encodeSetO,
                               int encodeWhiteSpace,
                               const char *errors)
{
    PyObject *v;
    /* It might be possible to tighten this worst case */
    unsigned int cbAllocated = 5 * size;
    int inShift = 0;
    int i = 0;
    unsigned int bitsleft = 0;
    unsigned long charsleft = 0;
    char * out;
    char * start;

    if (size == 0)
        return PyString_FromStringAndSize(NULL, 0);

    v = PyString_FromStringAndSize(NULL, cbAllocated);
    if (v == NULL)
        return NULL;

    start = out = PyString_AS_STRING(v);
    for (;i < size; ++i) {
        Py_UNICODE ch = s[i];

        if (!inShift) {
            if (ch == '+') {
                /* A literal '+' is written as "+-". */
                *out++ = '+';
                *out++ = '-';
            } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                /* Open a shift sequence and emit the first base64
                   digits of this character. */
                charsleft = ch;
                bitsleft = 16;
                *out++ = '+';
                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
                inShift = bitsleft > 0;
            } else {
                *out++ = (char) ch;
            }
        } else {
            if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
                /* Flush the pending bits, padded to a full digit, and
                   leave the shift sequence. */
                *out++ = B64(charsleft << (6-bitsleft));
                charsleft = 0;
                bitsleft = 0;
                /* Characters not in the BASE64 set implicitly unshift the sequence
                   so no '-' is required, except if the character is itself a '-' */
                if (B64CHAR(ch) || ch == '-') {
                    *out++ = '-';
                }
                inShift = 0;
                *out++ = (char) ch;
            } else {
                bitsleft += 16;
                charsleft = (charsleft << 16) | ch;
                /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);

                /* If the next character is special then we don't need to terminate
                   the shift sequence. If the next character is not a BASE64 character
                   or '-' then the shift sequence will be terminated implicitly and we
                   don't have to insert a '-'. */

                if (bitsleft == 0) {
                    if (i + 1 < size) {
                        Py_UNICODE ch2 = s[i+1];

                        if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {

                        } else if (B64CHAR(ch2) || ch2 == '-') {
                            *out++ = '-';
                            inShift = 0;
                        } else {
                            inShift = 0;
                        }

                    }
                    else {
                        *out++ = '-';
                        inShift = 0;
                    }
                }
            }
        }
    }
    /* Flush bits still pending at the end of the input and close the
       shift sequence. */
    if (bitsleft) {
        *out++= B64(charsleft << (6-bitsleft) );
        *out++ = '-';
    }

    /* Trim to the bytes actually written.  The return value of the
       resize call is not inspected; v is returned as the call left it. */
    _PyString_Resize(&v, out - start);
    return v;
}
933
934#undef SPECIAL
935#undef B64
936#undef B64CHAR
937#undef UB64
938#undef ENCODE
939#undef DECODE
940
Guido van Rossumd57fd912000-03-10 22:53:23 +0000941/* --- UTF-8 Codec -------------------------------------------------------- */
942
/* Sequence-length lookup for UTF-8 decoding.

   The table is indexed by the first byte of an encoded sequence and yields
   the total number of bytes in that sequence.  A value of zero marks a byte
   that is not a legal prefix.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x00 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x10 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x30 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x50 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x70 */

    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0xA0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0xB0 */

    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,    /* 0xC0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,    /* 0xD0 */

    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,    /* 0xE0 */

    /* 0xF0 - 0xFD: four-, five- and six-byte sequences;
       0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0     /* 0xF0 */
};
964
965static
966int utf8_decoding_error(const char **source,
967 Py_UNICODE **dest,
968 const char *errors,
969 const char *details)
970{
971 if ((errors == NULL) ||
972 (strcmp(errors,"strict") == 0)) {
973 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000974 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000975 details);
976 return -1;
977 }
978 else if (strcmp(errors,"ignore") == 0) {
979 (*source)++;
980 return 0;
981 }
982 else if (strcmp(errors,"replace") == 0) {
983 (*source)++;
984 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
985 (*dest)++;
986 return 0;
987 }
988 else {
989 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000990 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000991 errors);
992 return -1;
993 }
994}
995
Guido van Rossumd57fd912000-03-10 22:53:23 +0000996PyObject *PyUnicode_DecodeUTF8(const char *s,
997 int size,
998 const char *errors)
999{
1000 int n;
1001 const char *e;
1002 PyUnicodeObject *unicode;
1003 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001004 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001005
1006 /* Note: size will always be longer than the resulting Unicode
1007 character count */
1008 unicode = _PyUnicode_New(size);
1009 if (!unicode)
1010 return NULL;
1011 if (size == 0)
1012 return (PyObject *)unicode;
1013
1014 /* Unpack UTF-8 encoded data */
1015 p = unicode->str;
1016 e = s + size;
1017
1018 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001019 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001020
1021 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001022 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023 s++;
1024 continue;
1025 }
1026
1027 n = utf8_code_length[ch];
1028
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001029 if (s + n > e) {
1030 errmsg = "unexpected end of data";
1031 goto utf8Error;
1032 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001033
1034 switch (n) {
1035
1036 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001037 errmsg = "unexpected code byte";
1038 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001039
1040 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001041 errmsg = "internal error";
1042 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043
1044 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001045 if ((s[1] & 0xc0) != 0x80) {
1046 errmsg = "invalid data";
1047 goto utf8Error;
1048 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001049 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001050 if (ch < 0x80) {
1051 errmsg = "illegal encoding";
1052 goto utf8Error;
1053 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001054 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001055 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056 break;
1057
1058 case 3:
1059 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001060 (s[2] & 0xc0) != 0x80) {
1061 errmsg = "invalid data";
1062 goto utf8Error;
1063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001064 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001065 if (ch < 0x0800) {
1066 /* Note: UTF-8 encodings of surrogates are considered
1067 legal UTF-8 sequences;
1068
1069 XXX For wide builds (UCS-4) we should probably try
1070 to recombine the surrogates into a single code
1071 unit.
1072 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001073 errmsg = "illegal encoding";
1074 goto utf8Error;
1075 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001076 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001077 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001078 break;
1079
1080 case 4:
1081 if ((s[1] & 0xc0) != 0x80 ||
1082 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001083 (s[3] & 0xc0) != 0x80) {
1084 errmsg = "invalid data";
1085 goto utf8Error;
1086 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001087 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1088 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1089 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001090 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001091 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001092 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001093 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001094 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001095 errmsg = "illegal encoding";
1096 goto utf8Error;
1097 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001098#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001099 *p++ = (Py_UNICODE)ch;
1100#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001101 /* compute and append the two surrogates: */
1102
1103 /* translate from 10000..10FFFF to 0..FFFF */
1104 ch -= 0x10000;
1105
1106 /* high surrogate = top 10 bits added to D800 */
1107 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1108
1109 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001110 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001111#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001112 break;
1113
1114 default:
1115 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001116 errmsg = "unsupported Unicode code range";
1117 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001118 }
1119 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001120 continue;
1121
1122 utf8Error:
1123 if (utf8_decoding_error(&s, &p, errors, errmsg))
1124 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001125 }
1126
1127 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001128 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001129 goto onError;
1130
1131 return (PyObject *)unicode;
1132
1133onError:
1134 Py_DECREF(unicode);
1135 return NULL;
1136}
1137
Tim Peters602f7402002-04-27 18:03:26 +00001138/* Allocation strategy: if the string is short, convert into a stack buffer
1139 and allocate exactly as much space needed at the end. Else allocate the
1140 maximum possible needed (4 result bytes per Unicode character), and return
1141 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001142*/
Tim Peters7e3d9612002-04-21 03:26:37 +00001143PyObject *
1144PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1145 int size,
1146 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001147{
Tim Peters602f7402002-04-27 18:03:26 +00001148#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
Tim Peters0eca65c2002-04-21 17:28:06 +00001149
Tim Peters602f7402002-04-27 18:03:26 +00001150 int i; /* index into s of next input byte */
1151 PyObject *v; /* result string object */
1152 char *p; /* next free byte in output buffer */
1153 int nallocated; /* number of result bytes allocated */
1154 int nneeded; /* number of result bytes needed */
1155 char stackbuf[MAX_SHORT_UNICHARS * 4];
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001156
Tim Peters602f7402002-04-27 18:03:26 +00001157 assert(s != NULL);
1158 assert(size >= 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159
Tim Peters602f7402002-04-27 18:03:26 +00001160 if (size <= MAX_SHORT_UNICHARS) {
1161 /* Write into the stack buffer; nallocated can't overflow.
1162 * At the end, we'll allocate exactly as much heap space as it
1163 * turns out we need.
1164 */
1165 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
1166 v = NULL; /* will allocate after we're done */
1167 p = stackbuf;
1168 }
1169 else {
1170 /* Overallocate on the heap, and give the excess back at the end. */
1171 nallocated = size * 4;
1172 if (nallocated / 4 != size) /* overflow! */
1173 return PyErr_NoMemory();
1174 v = PyString_FromStringAndSize(NULL, nallocated);
1175 if (v == NULL)
1176 return NULL;
1177 p = PyString_AS_STRING(v);
1178 }
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001179
Tim Peters602f7402002-04-27 18:03:26 +00001180 for (i = 0; i < size;) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001181 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001182
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001183 if (ch < 0x80)
Tim Peters602f7402002-04-27 18:03:26 +00001184 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001186
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187 else if (ch < 0x0800) {
Tim Peters602f7402002-04-27 18:03:26 +00001188 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001189 *p++ = (char)(0xc0 | (ch >> 6));
1190 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001191 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001192 else {
Tim Peters602f7402002-04-27 18:03:26 +00001193 /* Encode UCS2 Unicode ordinals */
1194 if (ch < 0x10000) {
1195 /* Special case: check for high surrogate */
1196 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1197 Py_UCS4 ch2 = s[i];
1198 /* Check for low surrogate and combine the two to
1199 form a UCS4 value */
1200 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001201 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
Tim Peters602f7402002-04-27 18:03:26 +00001202 i++;
1203 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001204 }
Tim Peters602f7402002-04-27 18:03:26 +00001205 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001206 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001207 *p++ = (char)(0xe0 | (ch >> 12));
Tim Peters602f7402002-04-27 18:03:26 +00001208 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1209 *p++ = (char)(0x80 | (ch & 0x3f));
1210 continue;
1211 }
1212encodeUCS4:
1213 /* Encode UCS4 Unicode ordinals */
1214 *p++ = (char)(0xf0 | (ch >> 18));
1215 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1216 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1217 *p++ = (char)(0x80 | (ch & 0x3f));
1218 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 }
Tim Peters0eca65c2002-04-21 17:28:06 +00001220
Tim Peters602f7402002-04-27 18:03:26 +00001221 if (v == NULL) {
1222 /* This was stack allocated. */
1223 nneeded = Py_SAFE_DOWNCAST(p - stackbuf, long, int);
1224 assert(nneeded <= nallocated);
1225 v = PyString_FromStringAndSize(stackbuf, nneeded);
1226 }
1227 else {
1228 /* Cut back to size actually needed. */
1229 nneeded = Py_SAFE_DOWNCAST(p - PyString_AS_STRING(v), long, int);
1230 assert(nneeded <= nallocated);
1231 _PyString_Resize(&v, nneeded);
1232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001233 return v;
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00001234
Tim Peters602f7402002-04-27 18:03:26 +00001235#undef MAX_SHORT_UNICHARS
Guido van Rossumd57fd912000-03-10 22:53:23 +00001236}
1237
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1239{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 if (!PyUnicode_Check(unicode)) {
1241 PyErr_BadArgument();
1242 return NULL;
1243 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001244 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1245 PyUnicode_GET_SIZE(unicode),
1246 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247}
1248
1249/* --- UTF-16 Codec ------------------------------------------------------- */
1250
1251static
Tim Peters772747b2001-08-09 22:21:55 +00001252int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 const char *errors,
1254 const char *details)
1255{
1256 if ((errors == NULL) ||
1257 (strcmp(errors,"strict") == 0)) {
1258 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001259 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001260 details);
1261 return -1;
1262 }
1263 else if (strcmp(errors,"ignore") == 0) {
1264 return 0;
1265 }
1266 else if (strcmp(errors,"replace") == 0) {
1267 if (dest) {
1268 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1269 (*dest)++;
1270 }
1271 return 0;
1272 }
1273 else {
1274 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001275 "UTF-16 decoding error; "
1276 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277 errors);
1278 return -1;
1279 }
1280}
1281
Tim Peters772747b2001-08-09 22:21:55 +00001282PyObject *
1283PyUnicode_DecodeUTF16(const char *s,
1284 int size,
1285 const char *errors,
1286 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001287{
1288 PyUnicodeObject *unicode;
1289 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001290 const unsigned char *q, *e;
1291 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001292 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001293 /* Offsets from q for retrieving byte pairs in the right order. */
1294#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1295 int ihi = 1, ilo = 0;
1296#else
1297 int ihi = 0, ilo = 1;
1298#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001299
1300 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001301 if (size & 1) {
1302 if (utf16_decoding_error(NULL, errors, "truncated data"))
1303 return NULL;
1304 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001305 }
1306
1307 /* Note: size will always be longer than the resulting Unicode
1308 character count */
1309 unicode = _PyUnicode_New(size);
1310 if (!unicode)
1311 return NULL;
1312 if (size == 0)
1313 return (PyObject *)unicode;
1314
1315 /* Unpack UTF-16 encoded data */
1316 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001317 q = (unsigned char *)s;
1318 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001319
1320 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001321 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001322
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001323 /* Check for BOM marks (U+FEFF) in the input and adjust current
1324 byte order setting accordingly. In native mode, the leading BOM
1325 mark is skipped, in all other modes, it is copied to the output
1326 stream as-is (giving a ZWNBSP character). */
1327 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001328 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001329#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001330 if (bom == 0xFEFF) {
1331 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001332 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001333 }
1334 else if (bom == 0xFFFE) {
1335 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001336 bo = 1;
1337 }
1338#else
Tim Peters772747b2001-08-09 22:21:55 +00001339 if (bom == 0xFEFF) {
1340 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001341 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001342 }
1343 else if (bom == 0xFFFE) {
1344 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001345 bo = -1;
1346 }
1347#endif
1348 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349
Tim Peters772747b2001-08-09 22:21:55 +00001350 if (bo == -1) {
1351 /* force LE */
1352 ihi = 1;
1353 ilo = 0;
1354 }
1355 else if (bo == 1) {
1356 /* force BE */
1357 ihi = 0;
1358 ilo = 1;
1359 }
1360
1361 while (q < e) {
1362 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1363 q += 2;
1364
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 if (ch < 0xD800 || ch > 0xDFFF) {
1366 *p++ = ch;
1367 continue;
1368 }
1369
1370 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001371 if (q >= e) {
1372 errmsg = "unexpected end of data";
1373 goto utf16Error;
1374 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001375 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001376 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1377 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001378 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001379#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001380 *p++ = ch;
1381 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001382#else
1383 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001384#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001385 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001386 }
1387 else {
1388 errmsg = "illegal UTF-16 surrogate";
1389 goto utf16Error;
1390 }
1391
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001393 errmsg = "illegal encoding";
1394 /* Fall through to report the error */
1395
1396 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001397 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001398 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001399 }
1400
1401 if (byteorder)
1402 *byteorder = bo;
1403
1404 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001405 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001406 goto onError;
1407
1408 return (PyObject *)unicode;
1409
1410onError:
1411 Py_DECREF(unicode);
1412 return NULL;
1413}
1414
Tim Peters772747b2001-08-09 22:21:55 +00001415PyObject *
1416PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1417 int size,
1418 const char *errors,
1419 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001420{
1421 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001422 unsigned char *p;
1423 int i, pairs;
1424 /* Offsets from p for storing byte pairs in the right order. */
1425#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1426 int ihi = 1, ilo = 0;
1427#else
1428 int ihi = 0, ilo = 1;
1429#endif
1430
1431#define STORECHAR(CH) \
1432 do { \
1433 p[ihi] = ((CH) >> 8) & 0xff; \
1434 p[ilo] = (CH) & 0xff; \
1435 p += 2; \
1436 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001437
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001438 for (i = pairs = 0; i < size; i++)
1439 if (s[i] >= 0x10000)
1440 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001442 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001443 if (v == NULL)
1444 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001445
Tim Peters772747b2001-08-09 22:21:55 +00001446 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001448 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001449 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001450 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001451
1452 if (byteorder == -1) {
1453 /* force LE */
1454 ihi = 1;
1455 ilo = 0;
1456 }
1457 else if (byteorder == 1) {
1458 /* force BE */
1459 ihi = 0;
1460 ilo = 1;
1461 }
1462
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001463 while (size-- > 0) {
1464 Py_UNICODE ch = *s++;
1465 Py_UNICODE ch2 = 0;
1466 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001467 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1468 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469 }
Tim Peters772747b2001-08-09 22:21:55 +00001470 STORECHAR(ch);
1471 if (ch2)
1472 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001473 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001474 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001475#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001476}
1477
1478PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1479{
1480 if (!PyUnicode_Check(unicode)) {
1481 PyErr_BadArgument();
1482 return NULL;
1483 }
1484 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1485 PyUnicode_GET_SIZE(unicode),
1486 NULL,
1487 0);
1488}
1489
1490/* --- Unicode Escape Codec ----------------------------------------------- */
1491
1492static
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001493int unicodeescape_decoding_error(Py_UNICODE **x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494 const char *errors,
1495 const char *details)
1496{
1497 if ((errors == NULL) ||
1498 (strcmp(errors,"strict") == 0)) {
1499 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001500 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001501 details);
1502 return -1;
1503 }
1504 else if (strcmp(errors,"ignore") == 0) {
1505 return 0;
1506 }
1507 else if (strcmp(errors,"replace") == 0) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001508 **x = Py_UNICODE_REPLACEMENT_CHARACTER;
1509 (*x)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001510 return 0;
1511 }
1512 else {
1513 PyErr_Format(PyExc_ValueError,
1514 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001515 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001516 errors);
1517 return -1;
1518 }
1519}
1520
Fredrik Lundh06d12682001-01-24 07:59:11 +00001521static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001522
Guido van Rossumd57fd912000-03-10 22:53:23 +00001523PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1524 int size,
1525 const char *errors)
1526{
1527 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001528 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001530 char* message;
1531 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1532
Guido van Rossumd57fd912000-03-10 22:53:23 +00001533 /* Escaped strings will always be longer than the resulting
1534 Unicode string, so we start with size here and then reduce the
1535 length after conversion to the true value. */
1536 v = _PyUnicode_New(size);
1537 if (v == NULL)
1538 goto onError;
1539 if (size == 0)
1540 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001541
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 p = buf = PyUnicode_AS_UNICODE(v);
1543 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001544
Guido van Rossumd57fd912000-03-10 22:53:23 +00001545 while (s < end) {
1546 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001547 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001548 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549
1550 /* Non-escape characters are interpreted as Unicode ordinals */
1551 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001552 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001553 continue;
1554 }
1555
1556 /* \ - Escapes */
1557 s++;
1558 switch (*s++) {
1559
1560 /* \x escapes */
1561 case '\n': break;
1562 case '\\': *p++ = '\\'; break;
1563 case '\'': *p++ = '\''; break;
1564 case '\"': *p++ = '\"'; break;
1565 case 'b': *p++ = '\b'; break;
1566 case 'f': *p++ = '\014'; break; /* FF */
1567 case 't': *p++ = '\t'; break;
1568 case 'n': *p++ = '\n'; break;
1569 case 'r': *p++ = '\r'; break;
1570 case 'v': *p++ = '\013'; break; /* VT */
1571 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1572
1573 /* \OOO (octal) escapes */
1574 case '0': case '1': case '2': case '3':
1575 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001576 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001577 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001578 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001579 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001580 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001581 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001582 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001583 break;
1584
Fredrik Lundhccc74732001-02-18 22:13:49 +00001585 /* hex escapes */
1586 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001587 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001588 digits = 2;
1589 message = "truncated \\xXX escape";
1590 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001591
Fredrik Lundhccc74732001-02-18 22:13:49 +00001592 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001594 digits = 4;
1595 message = "truncated \\uXXXX escape";
1596 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001597
Fredrik Lundhccc74732001-02-18 22:13:49 +00001598 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001599 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001600 digits = 8;
1601 message = "truncated \\UXXXXXXXX escape";
1602 hexescape:
1603 chr = 0;
1604 for (i = 0; i < digits; i++) {
1605 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001606 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001607 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001608 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001609 chr = 0xffffffff;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001610 i++;
1611 break;
1612 }
1613 chr = (chr<<4) & ~0xF;
1614 if (c >= '0' && c <= '9')
1615 chr += c - '0';
1616 else if (c >= 'a' && c <= 'f')
1617 chr += 10 + c - 'a';
1618 else
1619 chr += 10 + c - 'A';
1620 }
1621 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001622 if (chr == 0xffffffff)
1623 /* _decoding_error will have already written into the
1624 target buffer. */
1625 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001626 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001627 /* when we get here, chr is a 32-bit unicode character */
1628 if (chr <= 0xffff)
1629 /* UCS-2 character */
1630 *p++ = (Py_UNICODE) chr;
1631 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001632 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001633 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001634#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001635 *p++ = chr;
1636#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001637 chr -= 0x10000L;
1638 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001639 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001640#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001641 } else {
1642 if (unicodeescape_decoding_error(
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001643 &p, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001644 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001645 )
1646 goto onError;
1647 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001648 break;
1649
1650 /* \N{name} */
1651 case 'N':
1652 message = "malformed \\N character escape";
1653 if (ucnhash_CAPI == NULL) {
1654 /* load the unicode data module */
1655 PyObject *m, *v;
1656 m = PyImport_ImportModule("unicodedata");
1657 if (m == NULL)
1658 goto ucnhashError;
1659 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1660 Py_DECREF(m);
1661 if (v == NULL)
1662 goto ucnhashError;
1663 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1664 Py_DECREF(v);
1665 if (ucnhash_CAPI == NULL)
1666 goto ucnhashError;
1667 }
1668 if (*s == '{') {
1669 const char *start = s+1;
1670 /* look for the closing brace */
1671 while (*s != '}' && s < end)
1672 s++;
1673 if (s > start && s < end && *s == '}') {
1674 /* found a name. look it up in the unicode database */
1675 message = "unknown Unicode character name";
1676 s++;
1677 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1678 goto store;
1679 }
1680 }
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001681 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001682 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001683 break;
1684
1685 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001686 if (s > end) {
1687 if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
1688 goto onError;
1689 }
1690 else {
1691 *p++ = '\\';
1692 *p++ = (unsigned char)s[-1];
1693 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001694 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001695 }
1696 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001697 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Walter Dörwald8c077222002-03-25 11:16:18 +00001698 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001699 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001700
Fredrik Lundhccc74732001-02-18 22:13:49 +00001701ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001702 PyErr_SetString(
1703 PyExc_UnicodeError,
1704 "\\N escapes not supported (can't load unicodedata module)"
1705 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001706 return NULL;
1707
Fredrik Lundhccc74732001-02-18 22:13:49 +00001708onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001709 Py_XDECREF(v);
1710 return NULL;
1711}
1712
1713/* Return a Unicode-Escape string version of the Unicode object.
1714
1715 If quotes is true, the string is enclosed in u"" or u'' quotes as
1716 appropriate.
1717
1718*/
1719
Barry Warsaw51ac5802000-03-20 16:36:48 +00001720static const Py_UNICODE *findchar(const Py_UNICODE *s,
1721 int size,
1722 Py_UNICODE ch);
1723
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724static
1725PyObject *unicodeescape_string(const Py_UNICODE *s,
1726 int size,
1727 int quotes)
1728{
1729 PyObject *repr;
1730 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001732 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733
1734 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1735 if (repr == NULL)
1736 return NULL;
1737
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001738 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739
1740 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741 *p++ = 'u';
1742 *p++ = (findchar(s, size, '\'') &&
1743 !findchar(s, size, '"')) ? '"' : '\'';
1744 }
1745 while (size-- > 0) {
1746 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001747
Guido van Rossumd57fd912000-03-10 22:53:23 +00001748 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001749 if (quotes &&
1750 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 *p++ = '\\';
1752 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001753 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001755
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001756#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001757 /* Map 21-bit characters to '\U00xxxxxx' */
1758 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001759 int offset = p - PyString_AS_STRING(repr);
1760
1761 /* Resize the string if necessary */
1762 if (offset + 12 > PyString_GET_SIZE(repr)) {
1763 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
Tim Peters5de98422002-04-27 18:44:32 +00001764 return NULL;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001765 p = PyString_AS_STRING(repr) + offset;
1766 }
1767
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001768 *p++ = '\\';
1769 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001770 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1771 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1772 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1773 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1774 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1775 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1776 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001777 *p++ = hexdigit[ch & 0x0000000F];
1778 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001779 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001780#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001781 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1782 else if (ch >= 0xD800 && ch < 0xDC00) {
1783 Py_UNICODE ch2;
1784 Py_UCS4 ucs;
1785
1786 ch2 = *s++;
1787 size--;
1788 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1789 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1790 *p++ = '\\';
1791 *p++ = 'U';
1792 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1793 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1794 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1795 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1796 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1797 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1798 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1799 *p++ = hexdigit[ucs & 0x0000000F];
1800 continue;
1801 }
1802 /* Fall through: isolated surrogates are copied as-is */
1803 s--;
1804 size++;
1805 }
1806
Guido van Rossumd57fd912000-03-10 22:53:23 +00001807 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001808 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809 *p++ = '\\';
1810 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001811 *p++ = hexdigit[(ch >> 12) & 0x000F];
1812 *p++ = hexdigit[(ch >> 8) & 0x000F];
1813 *p++ = hexdigit[(ch >> 4) & 0x000F];
1814 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001815 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001816
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001817 /* Map special whitespace to '\t', \n', '\r' */
1818 else if (ch == '\t') {
1819 *p++ = '\\';
1820 *p++ = 't';
1821 }
1822 else if (ch == '\n') {
1823 *p++ = '\\';
1824 *p++ = 'n';
1825 }
1826 else if (ch == '\r') {
1827 *p++ = '\\';
1828 *p++ = 'r';
1829 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001830
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001831 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001832 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001833 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001834 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001835 *p++ = hexdigit[(ch >> 4) & 0x000F];
1836 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001837 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001838
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 /* Copy everything else as-is */
1840 else
1841 *p++ = (char) ch;
1842 }
1843 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001844 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001845
1846 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00001847 _PyString_Resize(&repr, p - PyString_AS_STRING(repr));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 return repr;
1849}
1850
1851PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1852 int size)
1853{
1854 return unicodeescape_string(s, size, 0);
1855}
1856
1857PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1858{
1859 if (!PyUnicode_Check(unicode)) {
1860 PyErr_BadArgument();
1861 return NULL;
1862 }
1863 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1864 PyUnicode_GET_SIZE(unicode));
1865}
1866
1867/* --- Raw Unicode Escape Codec ------------------------------------------- */
1868
1869PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1870 int size,
1871 const char *errors)
1872{
1873 PyUnicodeObject *v;
1874 Py_UNICODE *p, *buf;
1875 const char *end;
1876 const char *bs;
1877
1878 /* Escaped strings will always be longer than the resulting
1879 Unicode string, so we start with size here and then reduce the
1880 length after conversion to the true value. */
1881 v = _PyUnicode_New(size);
1882 if (v == NULL)
1883 goto onError;
1884 if (size == 0)
1885 return (PyObject *)v;
1886 p = buf = PyUnicode_AS_UNICODE(v);
1887 end = s + size;
1888 while (s < end) {
1889 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001890 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001891 int i;
1892
1893 /* Non-escape characters are interpreted as Unicode ordinals */
1894 if (*s != '\\') {
1895 *p++ = (unsigned char)*s++;
1896 continue;
1897 }
1898
1899 /* \u-escapes are only interpreted iff the number of leading
1900 backslashes if odd */
1901 bs = s;
1902 for (;s < end;) {
1903 if (*s != '\\')
1904 break;
1905 *p++ = (unsigned char)*s++;
1906 }
1907 if (((s - bs) & 1) == 0 ||
1908 s >= end ||
1909 *s != 'u') {
1910 continue;
1911 }
1912 p--;
1913 s++;
1914
1915 /* \uXXXX with 4 hex digits */
1916 for (x = 0, i = 0; i < 4; i++) {
1917 c = (unsigned char)s[i];
1918 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001919 if (unicodeescape_decoding_error(&p, errors,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001920 "truncated \\uXXXX"))
1921 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001922 x = 0xffffffff;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001923 i++;
1924 break;
1925 }
1926 x = (x<<4) & ~0xF;
1927 if (c >= '0' && c <= '9')
1928 x += c - '0';
1929 else if (c >= 'a' && c <= 'f')
1930 x += 10 + c - 'a';
1931 else
1932 x += 10 + c - 'A';
1933 }
1934 s += i;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001935 if (x != 0xffffffff)
1936 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001938 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001939 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001940 return (PyObject *)v;
1941
1942 onError:
1943 Py_XDECREF(v);
1944 return NULL;
1945}
1946
1947PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1948 int size)
1949{
1950 PyObject *repr;
1951 char *p;
1952 char *q;
1953
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001954 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955
1956 repr = PyString_FromStringAndSize(NULL, 6 * size);
1957 if (repr == NULL)
1958 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001959 if (size == 0)
1960 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001961
1962 p = q = PyString_AS_STRING(repr);
1963 while (size-- > 0) {
1964 Py_UNICODE ch = *s++;
1965 /* Map 16-bit characters to '\uxxxx' */
1966 if (ch >= 256) {
1967 *p++ = '\\';
1968 *p++ = 'u';
1969 *p++ = hexdigit[(ch >> 12) & 0xf];
1970 *p++ = hexdigit[(ch >> 8) & 0xf];
1971 *p++ = hexdigit[(ch >> 4) & 0xf];
1972 *p++ = hexdigit[ch & 15];
1973 }
1974 /* Copy everything else as-is */
1975 else
1976 *p++ = (char) ch;
1977 }
1978 *p = '\0';
Tim Peters5de98422002-04-27 18:44:32 +00001979 _PyString_Resize(&repr, p - q);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001980 return repr;
1981}
1982
1983PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1984{
1985 if (!PyUnicode_Check(unicode)) {
1986 PyErr_BadArgument();
1987 return NULL;
1988 }
1989 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1990 PyUnicode_GET_SIZE(unicode));
1991}
1992
1993/* --- Latin-1 Codec ------------------------------------------------------ */
1994
1995PyObject *PyUnicode_DecodeLatin1(const char *s,
1996 int size,
1997 const char *errors)
1998{
1999 PyUnicodeObject *v;
2000 Py_UNICODE *p;
2001
2002 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002003 if (size == 1 && *(unsigned char*)s < 256) {
2004 Py_UNICODE r = *(unsigned char*)s;
2005 return PyUnicode_FromUnicode(&r, 1);
2006 }
2007
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008 v = _PyUnicode_New(size);
2009 if (v == NULL)
2010 goto onError;
2011 if (size == 0)
2012 return (PyObject *)v;
2013 p = PyUnicode_AS_UNICODE(v);
2014 while (size-- > 0)
2015 *p++ = (unsigned char)*s++;
2016 return (PyObject *)v;
2017
2018 onError:
2019 Py_XDECREF(v);
2020 return NULL;
2021}
2022
2023static
2024int latin1_encoding_error(const Py_UNICODE **source,
2025 char **dest,
2026 const char *errors,
2027 const char *details)
2028{
2029 if ((errors == NULL) ||
2030 (strcmp(errors,"strict") == 0)) {
2031 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002032 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033 details);
2034 return -1;
2035 }
2036 else if (strcmp(errors,"ignore") == 0) {
2037 return 0;
2038 }
2039 else if (strcmp(errors,"replace") == 0) {
2040 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002041 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002042 return 0;
2043 }
2044 else {
2045 PyErr_Format(PyExc_ValueError,
2046 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002047 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 errors);
2049 return -1;
2050 }
2051}
2052
2053PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2054 int size,
2055 const char *errors)
2056{
2057 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002058 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002059
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 repr = PyString_FromStringAndSize(NULL, size);
2061 if (repr == NULL)
2062 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002063 if (size == 0)
2064 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065
2066 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002067 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 while (size-- > 0) {
2069 Py_UNICODE ch = *p++;
2070 if (ch >= 256) {
2071 if (latin1_encoding_error(&p, &s, errors,
2072 "ordinal not in range(256)"))
2073 goto onError;
2074 }
2075 else
2076 *s++ = (char)ch;
2077 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002078 /* Resize if error handling skipped some characters */
2079 if (s - start < PyString_GET_SIZE(repr))
Tim Peters5de98422002-04-27 18:44:32 +00002080 _PyString_Resize(&repr, s - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002081 return repr;
2082
2083 onError:
2084 Py_DECREF(repr);
2085 return NULL;
2086}
2087
2088PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2089{
2090 if (!PyUnicode_Check(unicode)) {
2091 PyErr_BadArgument();
2092 return NULL;
2093 }
2094 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2095 PyUnicode_GET_SIZE(unicode),
2096 NULL);
2097}
2098
2099/* --- 7-bit ASCII Codec -------------------------------------------------- */
2100
2101static
2102int ascii_decoding_error(const char **source,
2103 Py_UNICODE **dest,
2104 const char *errors,
2105 const char *details)
2106{
2107 if ((errors == NULL) ||
2108 (strcmp(errors,"strict") == 0)) {
2109 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002110 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002111 details);
2112 return -1;
2113 }
2114 else if (strcmp(errors,"ignore") == 0) {
2115 return 0;
2116 }
2117 else if (strcmp(errors,"replace") == 0) {
2118 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2119 (*dest)++;
2120 return 0;
2121 }
2122 else {
2123 PyErr_Format(PyExc_ValueError,
2124 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002125 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002126 errors);
2127 return -1;
2128 }
2129}
2130
2131PyObject *PyUnicode_DecodeASCII(const char *s,
2132 int size,
2133 const char *errors)
2134{
2135 PyUnicodeObject *v;
2136 Py_UNICODE *p;
2137
2138 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002139 if (size == 1 && *(unsigned char*)s < 128) {
2140 Py_UNICODE r = *(unsigned char*)s;
2141 return PyUnicode_FromUnicode(&r, 1);
2142 }
2143
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 v = _PyUnicode_New(size);
2145 if (v == NULL)
2146 goto onError;
2147 if (size == 0)
2148 return (PyObject *)v;
2149 p = PyUnicode_AS_UNICODE(v);
2150 while (size-- > 0) {
2151 register unsigned char c;
2152
2153 c = (unsigned char)*s++;
2154 if (c < 128)
2155 *p++ = c;
2156 else if (ascii_decoding_error(&s, &p, errors,
2157 "ordinal not in range(128)"))
2158 goto onError;
2159 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002160 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002161 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002162 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002163 return (PyObject *)v;
2164
2165 onError:
2166 Py_XDECREF(v);
2167 return NULL;
2168}
2169
2170static
2171int ascii_encoding_error(const Py_UNICODE **source,
2172 char **dest,
2173 const char *errors,
2174 const char *details)
2175{
2176 if ((errors == NULL) ||
2177 (strcmp(errors,"strict") == 0)) {
2178 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002179 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 details);
2181 return -1;
2182 }
2183 else if (strcmp(errors,"ignore") == 0) {
2184 return 0;
2185 }
2186 else if (strcmp(errors,"replace") == 0) {
2187 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002188 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002189 return 0;
2190 }
2191 else {
2192 PyErr_Format(PyExc_ValueError,
2193 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002194 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002195 errors);
2196 return -1;
2197 }
2198}
2199
2200PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2201 int size,
2202 const char *errors)
2203{
2204 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002205 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002206
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 repr = PyString_FromStringAndSize(NULL, size);
2208 if (repr == NULL)
2209 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002210 if (size == 0)
2211 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212
2213 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002214 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002215 while (size-- > 0) {
2216 Py_UNICODE ch = *p++;
2217 if (ch >= 128) {
2218 if (ascii_encoding_error(&p, &s, errors,
2219 "ordinal not in range(128)"))
2220 goto onError;
2221 }
2222 else
2223 *s++ = (char)ch;
2224 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002225 /* Resize if error handling skipped some characters */
2226 if (s - start < PyString_GET_SIZE(repr))
Tim Peters5de98422002-04-27 18:44:32 +00002227 _PyString_Resize(&repr, s - start);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 return repr;
2229
2230 onError:
2231 Py_DECREF(repr);
2232 return NULL;
2233}
2234
2235PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2236{
2237 if (!PyUnicode_Check(unicode)) {
2238 PyErr_BadArgument();
2239 return NULL;
2240 }
2241 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2242 PyUnicode_GET_SIZE(unicode),
2243 NULL);
2244}
2245
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002246#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002247
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002248/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002249
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002250PyObject *PyUnicode_DecodeMBCS(const char *s,
2251 int size,
2252 const char *errors)
2253{
2254 PyUnicodeObject *v;
2255 Py_UNICODE *p;
2256
2257 /* First get the size of the result */
2258 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002259 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002260 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2261
2262 v = _PyUnicode_New(usize);
2263 if (v == NULL)
2264 return NULL;
2265 if (usize == 0)
2266 return (PyObject *)v;
2267 p = PyUnicode_AS_UNICODE(v);
2268 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2269 Py_DECREF(v);
2270 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2271 }
2272
2273 return (PyObject *)v;
2274}
2275
2276PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2277 int size,
2278 const char *errors)
2279{
2280 PyObject *repr;
2281 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002282 DWORD mbcssize;
2283
2284 /* If there are no characters, bail now! */
2285 if (size==0)
2286 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002287
2288 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002289 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002290 if (mbcssize==0)
2291 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2292
2293 repr = PyString_FromStringAndSize(NULL, mbcssize);
2294 if (repr == NULL)
2295 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002296 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002297 return repr;
2298
2299 /* Do the conversion */
2300 s = PyString_AS_STRING(repr);
2301 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2302 Py_DECREF(repr);
2303 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2304 }
2305 return repr;
2306}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002307
Martin v. Löwis6238d2b2002-06-30 15:26:10 +00002308#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002309
Guido van Rossumd57fd912000-03-10 22:53:23 +00002310/* --- Character Mapping Codec -------------------------------------------- */
2311
2312static
2313int charmap_decoding_error(const char **source,
2314 Py_UNICODE **dest,
2315 const char *errors,
2316 const char *details)
2317{
2318 if ((errors == NULL) ||
2319 (strcmp(errors,"strict") == 0)) {
2320 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002321 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002322 details);
2323 return -1;
2324 }
2325 else if (strcmp(errors,"ignore") == 0) {
2326 return 0;
2327 }
2328 else if (strcmp(errors,"replace") == 0) {
2329 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2330 (*dest)++;
2331 return 0;
2332 }
2333 else {
2334 PyErr_Format(PyExc_ValueError,
2335 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002336 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002337 errors);
2338 return -1;
2339 }
2340}
2341
2342PyObject *PyUnicode_DecodeCharmap(const char *s,
2343 int size,
2344 PyObject *mapping,
2345 const char *errors)
2346{
2347 PyUnicodeObject *v;
2348 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002349 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002350
2351 /* Default to Latin-1 */
2352 if (mapping == NULL)
2353 return PyUnicode_DecodeLatin1(s, size, errors);
2354
2355 v = _PyUnicode_New(size);
2356 if (v == NULL)
2357 goto onError;
2358 if (size == 0)
2359 return (PyObject *)v;
2360 p = PyUnicode_AS_UNICODE(v);
2361 while (size-- > 0) {
2362 unsigned char ch = *s++;
2363 PyObject *w, *x;
2364
2365 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2366 w = PyInt_FromLong((long)ch);
2367 if (w == NULL)
2368 goto onError;
2369 x = PyObject_GetItem(mapping, w);
2370 Py_DECREF(w);
2371 if (x == NULL) {
2372 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002373 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002374 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002375 x = Py_None;
2376 Py_INCREF(x);
2377 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002378 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002379 }
2380
2381 /* Apply mapping */
2382 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002383 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002384 if (value < 0 || value > 65535) {
2385 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002386 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002387 Py_DECREF(x);
2388 goto onError;
2389 }
2390 *p++ = (Py_UNICODE)value;
2391 }
2392 else if (x == Py_None) {
2393 /* undefined mapping */
2394 if (charmap_decoding_error(&s, &p, errors,
2395 "character maps to <undefined>")) {
2396 Py_DECREF(x);
2397 goto onError;
2398 }
2399 }
2400 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002401 int targetsize = PyUnicode_GET_SIZE(x);
2402
2403 if (targetsize == 1)
2404 /* 1-1 mapping */
2405 *p++ = *PyUnicode_AS_UNICODE(x);
2406
2407 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002408 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002409 if (targetsize > extrachars) {
2410 /* resize first */
2411 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2412 int needed = (targetsize - extrachars) + \
2413 (targetsize << 2);
2414 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002415 if (_PyUnicode_Resize(&v,
2416 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002417 Py_DECREF(x);
2418 goto onError;
2419 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002420 p = PyUnicode_AS_UNICODE(v) + oldpos;
2421 }
2422 Py_UNICODE_COPY(p,
2423 PyUnicode_AS_UNICODE(x),
2424 targetsize);
2425 p += targetsize;
2426 extrachars -= targetsize;
2427 }
2428 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002429 }
2430 else {
2431 /* wrong return value */
2432 PyErr_SetString(PyExc_TypeError,
2433 "character mapping must return integer, None or unicode");
2434 Py_DECREF(x);
2435 goto onError;
2436 }
2437 Py_DECREF(x);
2438 }
2439 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002440 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002441 goto onError;
2442 return (PyObject *)v;
2443
2444 onError:
2445 Py_XDECREF(v);
2446 return NULL;
2447}
2448
2449static
2450int charmap_encoding_error(const Py_UNICODE **source,
2451 char **dest,
2452 const char *errors,
2453 const char *details)
2454{
2455 if ((errors == NULL) ||
2456 (strcmp(errors,"strict") == 0)) {
2457 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002458 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002459 details);
2460 return -1;
2461 }
2462 else if (strcmp(errors,"ignore") == 0) {
2463 return 0;
2464 }
2465 else if (strcmp(errors,"replace") == 0) {
2466 **dest = '?';
2467 (*dest)++;
2468 return 0;
2469 }
2470 else {
2471 PyErr_Format(PyExc_ValueError,
2472 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002473 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474 errors);
2475 return -1;
2476 }
2477}
2478
2479PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2480 int size,
2481 PyObject *mapping,
2482 const char *errors)
2483{
2484 PyObject *v;
2485 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002486 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002487
2488 /* Default to Latin-1 */
2489 if (mapping == NULL)
2490 return PyUnicode_EncodeLatin1(p, size, errors);
2491
2492 v = PyString_FromStringAndSize(NULL, size);
2493 if (v == NULL)
2494 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002495 if (size == 0)
2496 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002497 s = PyString_AS_STRING(v);
2498 while (size-- > 0) {
2499 Py_UNICODE ch = *p++;
2500 PyObject *w, *x;
2501
2502 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2503 w = PyInt_FromLong((long)ch);
2504 if (w == NULL)
2505 goto onError;
2506 x = PyObject_GetItem(mapping, w);
2507 Py_DECREF(w);
2508 if (x == NULL) {
2509 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002510 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002511 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002512 x = Py_None;
2513 Py_INCREF(x);
2514 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002515 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002516 }
2517
2518 /* Apply mapping */
2519 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002520 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521 if (value < 0 || value > 255) {
2522 PyErr_SetString(PyExc_TypeError,
2523 "character mapping must be in range(256)");
2524 Py_DECREF(x);
2525 goto onError;
2526 }
2527 *s++ = (char)value;
2528 }
2529 else if (x == Py_None) {
2530 /* undefined mapping */
2531 if (charmap_encoding_error(&p, &s, errors,
2532 "character maps to <undefined>")) {
2533 Py_DECREF(x);
2534 goto onError;
2535 }
2536 }
2537 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002538 int targetsize = PyString_GET_SIZE(x);
2539
2540 if (targetsize == 1)
2541 /* 1-1 mapping */
2542 *s++ = *PyString_AS_STRING(x);
2543
2544 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002546 if (targetsize > extrachars) {
2547 /* resize first */
2548 int oldpos = (int)(s - PyString_AS_STRING(v));
2549 int needed = (targetsize - extrachars) + \
2550 (targetsize << 2);
2551 extrachars += needed;
2552 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002553 Py_DECREF(x);
2554 goto onError;
2555 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002556 s = PyString_AS_STRING(v) + oldpos;
2557 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002558 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002559 s += targetsize;
2560 extrachars -= targetsize;
2561 }
2562 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563 }
2564 else {
2565 /* wrong return value */
2566 PyErr_SetString(PyExc_TypeError,
2567 "character mapping must return integer, None or unicode");
2568 Py_DECREF(x);
2569 goto onError;
2570 }
2571 Py_DECREF(x);
2572 }
2573 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
Tim Peters5de98422002-04-27 18:44:32 +00002574 _PyString_Resize(&v, (int)(s - PyString_AS_STRING(v)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575 return v;
2576
2577 onError:
Tim Peters5de98422002-04-27 18:44:32 +00002578 Py_XDECREF(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002579 return NULL;
2580}
2581
2582PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2583 PyObject *mapping)
2584{
2585 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2586 PyErr_BadArgument();
2587 return NULL;
2588 }
2589 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2590 PyUnicode_GET_SIZE(unicode),
2591 mapping,
2592 NULL);
2593}
2594
2595static
2596int translate_error(const Py_UNICODE **source,
2597 Py_UNICODE **dest,
2598 const char *errors,
2599 const char *details)
2600{
2601 if ((errors == NULL) ||
2602 (strcmp(errors,"strict") == 0)) {
2603 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002604 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002605 details);
2606 return -1;
2607 }
2608 else if (strcmp(errors,"ignore") == 0) {
2609 return 0;
2610 }
2611 else if (strcmp(errors,"replace") == 0) {
2612 **dest = '?';
2613 (*dest)++;
2614 return 0;
2615 }
2616 else {
2617 PyErr_Format(PyExc_ValueError,
2618 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002619 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620 errors);
2621 return -1;
2622 }
2623}
2624
2625PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2626 int size,
2627 PyObject *mapping,
2628 const char *errors)
2629{
2630 PyUnicodeObject *v;
2631 Py_UNICODE *p;
2632
2633 if (mapping == NULL) {
2634 PyErr_BadArgument();
2635 return NULL;
2636 }
2637
2638 /* Output will never be longer than input */
2639 v = _PyUnicode_New(size);
2640 if (v == NULL)
2641 goto onError;
2642 if (size == 0)
2643 goto done;
2644 p = PyUnicode_AS_UNICODE(v);
2645 while (size-- > 0) {
2646 Py_UNICODE ch = *s++;
2647 PyObject *w, *x;
2648
2649 /* Get mapping */
2650 w = PyInt_FromLong(ch);
2651 if (w == NULL)
2652 goto onError;
2653 x = PyObject_GetItem(mapping, w);
2654 Py_DECREF(w);
2655 if (x == NULL) {
2656 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2657 /* No mapping found: default to 1-1 mapping */
2658 PyErr_Clear();
2659 *p++ = ch;
2660 continue;
2661 }
2662 goto onError;
2663 }
2664
2665 /* Apply mapping */
2666 if (PyInt_Check(x))
2667 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2668 else if (x == Py_None) {
2669 /* undefined mapping */
2670 if (translate_error(&s, &p, errors,
2671 "character maps to <undefined>")) {
2672 Py_DECREF(x);
2673 goto onError;
2674 }
2675 }
2676 else if (PyUnicode_Check(x)) {
2677 if (PyUnicode_GET_SIZE(x) != 1) {
2678 /* 1-n mapping */
2679 PyErr_SetString(PyExc_NotImplementedError,
2680 "1-n mappings are currently not implemented");
2681 Py_DECREF(x);
2682 goto onError;
2683 }
2684 *p++ = *PyUnicode_AS_UNICODE(x);
2685 }
2686 else {
2687 /* wrong return value */
2688 PyErr_SetString(PyExc_TypeError,
2689 "translate mapping must return integer, None or unicode");
2690 Py_DECREF(x);
2691 goto onError;
2692 }
2693 Py_DECREF(x);
2694 }
2695 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002696 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002697 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002698
2699 done:
2700 return (PyObject *)v;
2701
2702 onError:
2703 Py_XDECREF(v);
2704 return NULL;
2705}
2706
2707PyObject *PyUnicode_Translate(PyObject *str,
2708 PyObject *mapping,
2709 const char *errors)
2710{
2711 PyObject *result;
2712
2713 str = PyUnicode_FromObject(str);
2714 if (str == NULL)
2715 goto onError;
2716 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2717 PyUnicode_GET_SIZE(str),
2718 mapping,
2719 errors);
2720 Py_DECREF(str);
2721 return result;
2722
2723 onError:
2724 Py_XDECREF(str);
2725 return NULL;
2726}
2727
Guido van Rossum9e896b32000-04-05 20:11:21 +00002728/* --- Decimal Encoder ---------------------------------------------------- */
2729
2730int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2731 int length,
2732 char *output,
2733 const char *errors)
2734{
2735 Py_UNICODE *p, *end;
2736
2737 if (output == NULL) {
2738 PyErr_BadArgument();
2739 return -1;
2740 }
2741
2742 p = s;
2743 end = s + length;
2744 while (p < end) {
2745 register Py_UNICODE ch = *p++;
2746 int decimal;
2747
2748 if (Py_UNICODE_ISSPACE(ch)) {
2749 *output++ = ' ';
2750 continue;
2751 }
2752 decimal = Py_UNICODE_TODECIMAL(ch);
2753 if (decimal >= 0) {
2754 *output++ = '0' + decimal;
2755 continue;
2756 }
Guido van Rossumba477042000-04-06 18:18:10 +00002757 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002758 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002759 continue;
2760 }
2761 /* All other characters are considered invalid */
2762 if (errors == NULL || strcmp(errors, "strict") == 0) {
2763 PyErr_SetString(PyExc_ValueError,
2764 "invalid decimal Unicode string");
2765 goto onError;
2766 }
2767 else if (strcmp(errors, "ignore") == 0)
2768 continue;
2769 else if (strcmp(errors, "replace") == 0) {
2770 *output++ = '?';
2771 continue;
2772 }
2773 }
2774 /* 0-terminate the output string */
2775 *output++ = '\0';
2776 return 0;
2777
2778 onError:
2779 return -1;
2780}
2781
Guido van Rossumd57fd912000-03-10 22:53:23 +00002782/* --- Helpers ------------------------------------------------------------ */
2783
2784static
2785int count(PyUnicodeObject *self,
2786 int start,
2787 int end,
2788 PyUnicodeObject *substring)
2789{
2790 int count = 0;
2791
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002792 if (start < 0)
2793 start += self->length;
2794 if (start < 0)
2795 start = 0;
2796 if (end > self->length)
2797 end = self->length;
2798 if (end < 0)
2799 end += self->length;
2800 if (end < 0)
2801 end = 0;
2802
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002803 if (substring->length == 0)
2804 return (end - start + 1);
2805
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806 end -= substring->length;
2807
2808 while (start <= end)
2809 if (Py_UNICODE_MATCH(self, start, substring)) {
2810 count++;
2811 start += substring->length;
2812 } else
2813 start++;
2814
2815 return count;
2816}
2817
2818int PyUnicode_Count(PyObject *str,
2819 PyObject *substr,
2820 int start,
2821 int end)
2822{
2823 int result;
2824
2825 str = PyUnicode_FromObject(str);
2826 if (str == NULL)
2827 return -1;
2828 substr = PyUnicode_FromObject(substr);
2829 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002830 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831 return -1;
2832 }
2833
2834 result = count((PyUnicodeObject *)str,
2835 start, end,
2836 (PyUnicodeObject *)substr);
2837
2838 Py_DECREF(str);
2839 Py_DECREF(substr);
2840 return result;
2841}
2842
2843static
2844int findstring(PyUnicodeObject *self,
2845 PyUnicodeObject *substring,
2846 int start,
2847 int end,
2848 int direction)
2849{
2850 if (start < 0)
2851 start += self->length;
2852 if (start < 0)
2853 start = 0;
2854
2855 if (substring->length == 0)
2856 return start;
2857
2858 if (end > self->length)
2859 end = self->length;
2860 if (end < 0)
2861 end += self->length;
2862 if (end < 0)
2863 end = 0;
2864
2865 end -= substring->length;
2866
2867 if (direction < 0) {
2868 for (; end >= start; end--)
2869 if (Py_UNICODE_MATCH(self, end, substring))
2870 return end;
2871 } else {
2872 for (; start <= end; start++)
2873 if (Py_UNICODE_MATCH(self, start, substring))
2874 return start;
2875 }
2876
2877 return -1;
2878}
2879
2880int PyUnicode_Find(PyObject *str,
2881 PyObject *substr,
2882 int start,
2883 int end,
2884 int direction)
2885{
2886 int result;
2887
2888 str = PyUnicode_FromObject(str);
2889 if (str == NULL)
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00002890 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891 substr = PyUnicode_FromObject(substr);
2892 if (substr == NULL) {
Marc-André Lemburg41644392002-05-29 13:46:29 +00002893 Py_DECREF(str);
Marc-André Lemburg4da6fd62002-05-29 11:33:13 +00002894 return -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895 }
2896
2897 result = findstring((PyUnicodeObject *)str,
2898 (PyUnicodeObject *)substr,
2899 start, end, direction);
2900 Py_DECREF(str);
2901 Py_DECREF(substr);
2902 return result;
2903}
2904
2905static
2906int tailmatch(PyUnicodeObject *self,
2907 PyUnicodeObject *substring,
2908 int start,
2909 int end,
2910 int direction)
2911{
2912 if (start < 0)
2913 start += self->length;
2914 if (start < 0)
2915 start = 0;
2916
2917 if (substring->length == 0)
2918 return 1;
2919
2920 if (end > self->length)
2921 end = self->length;
2922 if (end < 0)
2923 end += self->length;
2924 if (end < 0)
2925 end = 0;
2926
2927 end -= substring->length;
2928 if (end < start)
2929 return 0;
2930
2931 if (direction > 0) {
2932 if (Py_UNICODE_MATCH(self, end, substring))
2933 return 1;
2934 } else {
2935 if (Py_UNICODE_MATCH(self, start, substring))
2936 return 1;
2937 }
2938
2939 return 0;
2940}
2941
2942int PyUnicode_Tailmatch(PyObject *str,
2943 PyObject *substr,
2944 int start,
2945 int end,
2946 int direction)
2947{
2948 int result;
2949
2950 str = PyUnicode_FromObject(str);
2951 if (str == NULL)
2952 return -1;
2953 substr = PyUnicode_FromObject(substr);
2954 if (substr == NULL) {
2955 Py_DECREF(substr);
2956 return -1;
2957 }
2958
2959 result = tailmatch((PyUnicodeObject *)str,
2960 (PyUnicodeObject *)substr,
2961 start, end, direction);
2962 Py_DECREF(str);
2963 Py_DECREF(substr);
2964 return result;
2965}
2966
2967static
2968const Py_UNICODE *findchar(const Py_UNICODE *s,
2969 int size,
2970 Py_UNICODE ch)
2971{
2972 /* like wcschr, but doesn't stop at NULL characters */
2973
2974 while (size-- > 0) {
2975 if (*s == ch)
2976 return s;
2977 s++;
2978 }
2979
2980 return NULL;
2981}
2982
2983/* Apply fixfct filter to the Unicode object self and return a
2984 reference to the modified object */
2985
2986static
2987PyObject *fixup(PyUnicodeObject *self,
2988 int (*fixfct)(PyUnicodeObject *s))
2989{
2990
2991 PyUnicodeObject *u;
2992
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002993 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002994 if (u == NULL)
2995 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002996
2997 Py_UNICODE_COPY(u->str, self->str, self->length);
2998
Tim Peters7a29bd52001-09-12 03:03:31 +00002999 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003000 /* fixfct should return TRUE if it modified the buffer. If
3001 FALSE, return a reference to the original buffer instead
3002 (to save space, not time) */
3003 Py_INCREF(self);
3004 Py_DECREF(u);
3005 return (PyObject*) self;
3006 }
3007 return (PyObject*) u;
3008}
3009
3010static
3011int fixupper(PyUnicodeObject *self)
3012{
3013 int len = self->length;
3014 Py_UNICODE *s = self->str;
3015 int status = 0;
3016
3017 while (len-- > 0) {
3018 register Py_UNICODE ch;
3019
3020 ch = Py_UNICODE_TOUPPER(*s);
3021 if (ch != *s) {
3022 status = 1;
3023 *s = ch;
3024 }
3025 s++;
3026 }
3027
3028 return status;
3029}
3030
3031static
3032int fixlower(PyUnicodeObject *self)
3033{
3034 int len = self->length;
3035 Py_UNICODE *s = self->str;
3036 int status = 0;
3037
3038 while (len-- > 0) {
3039 register Py_UNICODE ch;
3040
3041 ch = Py_UNICODE_TOLOWER(*s);
3042 if (ch != *s) {
3043 status = 1;
3044 *s = ch;
3045 }
3046 s++;
3047 }
3048
3049 return status;
3050}
3051
3052static
3053int fixswapcase(PyUnicodeObject *self)
3054{
3055 int len = self->length;
3056 Py_UNICODE *s = self->str;
3057 int status = 0;
3058
3059 while (len-- > 0) {
3060 if (Py_UNICODE_ISUPPER(*s)) {
3061 *s = Py_UNICODE_TOLOWER(*s);
3062 status = 1;
3063 } else if (Py_UNICODE_ISLOWER(*s)) {
3064 *s = Py_UNICODE_TOUPPER(*s);
3065 status = 1;
3066 }
3067 s++;
3068 }
3069
3070 return status;
3071}
3072
3073static
3074int fixcapitalize(PyUnicodeObject *self)
3075{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003076 int len = self->length;
3077 Py_UNICODE *s = self->str;
3078 int status = 0;
3079
3080 if (len == 0)
3081 return 0;
3082 if (Py_UNICODE_ISLOWER(*s)) {
3083 *s = Py_UNICODE_TOUPPER(*s);
3084 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003086 s++;
3087 while (--len > 0) {
3088 if (Py_UNICODE_ISUPPER(*s)) {
3089 *s = Py_UNICODE_TOLOWER(*s);
3090 status = 1;
3091 }
3092 s++;
3093 }
3094 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003095}
3096
3097static
3098int fixtitle(PyUnicodeObject *self)
3099{
3100 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3101 register Py_UNICODE *e;
3102 int previous_is_cased;
3103
3104 /* Shortcut for single character strings */
3105 if (PyUnicode_GET_SIZE(self) == 1) {
3106 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3107 if (*p != ch) {
3108 *p = ch;
3109 return 1;
3110 }
3111 else
3112 return 0;
3113 }
3114
3115 e = p + PyUnicode_GET_SIZE(self);
3116 previous_is_cased = 0;
3117 for (; p < e; p++) {
3118 register const Py_UNICODE ch = *p;
3119
3120 if (previous_is_cased)
3121 *p = Py_UNICODE_TOLOWER(ch);
3122 else
3123 *p = Py_UNICODE_TOTITLE(ch);
3124
3125 if (Py_UNICODE_ISLOWER(ch) ||
3126 Py_UNICODE_ISUPPER(ch) ||
3127 Py_UNICODE_ISTITLE(ch))
3128 previous_is_cased = 1;
3129 else
3130 previous_is_cased = 0;
3131 }
3132 return 1;
3133}
3134
3135PyObject *PyUnicode_Join(PyObject *separator,
3136 PyObject *seq)
3137{
3138 Py_UNICODE *sep;
3139 int seplen;
3140 PyUnicodeObject *res = NULL;
3141 int reslen = 0;
3142 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003143 int sz = 100;
3144 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003145 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003146
Tim Peters2cfe3682001-05-05 05:36:48 +00003147 it = PyObject_GetIter(seq);
3148 if (it == NULL)
3149 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003150
3151 if (separator == NULL) {
3152 Py_UNICODE blank = ' ';
3153 sep = &blank;
3154 seplen = 1;
3155 }
3156 else {
3157 separator = PyUnicode_FromObject(separator);
3158 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003159 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003160 sep = PyUnicode_AS_UNICODE(separator);
3161 seplen = PyUnicode_GET_SIZE(separator);
3162 }
3163
3164 res = _PyUnicode_New(sz);
3165 if (res == NULL)
3166 goto onError;
3167 p = PyUnicode_AS_UNICODE(res);
3168 reslen = 0;
3169
Tim Peters2cfe3682001-05-05 05:36:48 +00003170 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003171 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003172 PyObject *item = PyIter_Next(it);
3173 if (item == NULL) {
3174 if (PyErr_Occurred())
3175 goto onError;
3176 break;
3177 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003178 if (!PyUnicode_Check(item)) {
3179 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003180 if (!PyString_Check(item)) {
3181 PyErr_Format(PyExc_TypeError,
3182 "sequence item %i: expected string or Unicode,"
3183 " %.80s found",
3184 i, item->ob_type->tp_name);
3185 Py_DECREF(item);
3186 goto onError;
3187 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003188 v = PyUnicode_FromObject(item);
3189 Py_DECREF(item);
3190 item = v;
3191 if (item == NULL)
3192 goto onError;
3193 }
3194 itemlen = PyUnicode_GET_SIZE(item);
3195 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003196 if (_PyUnicode_Resize(&res, sz*2)) {
3197 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003200 sz *= 2;
3201 p = PyUnicode_AS_UNICODE(res) + reslen;
3202 }
3203 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003204 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205 p += seplen;
3206 reslen += seplen;
3207 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003208 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 p += itemlen;
3210 reslen += itemlen;
3211 Py_DECREF(item);
3212 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003213 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 goto onError;
3215
3216 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003217 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 return (PyObject *)res;
3219
3220 onError:
3221 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003222 Py_XDECREF(res);
3223 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224 return NULL;
3225}
3226
3227static
3228PyUnicodeObject *pad(PyUnicodeObject *self,
3229 int left,
3230 int right,
3231 Py_UNICODE fill)
3232{
3233 PyUnicodeObject *u;
3234
3235 if (left < 0)
3236 left = 0;
3237 if (right < 0)
3238 right = 0;
3239
Tim Peters7a29bd52001-09-12 03:03:31 +00003240 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241 Py_INCREF(self);
3242 return self;
3243 }
3244
3245 u = _PyUnicode_New(left + self->length + right);
3246 if (u) {
3247 if (left)
3248 Py_UNICODE_FILL(u->str, fill, left);
3249 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3250 if (right)
3251 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3252 }
3253
3254 return u;
3255}
3256
/* Append the substring data[left:right] to the result list.

   Helper for the split functions below.  The macro relies on names of
   the function it is expanded in: a PyObject* variable "str" used as
   scratch space, the list object "list" and an "onError" label that is
   jumped to if creating the substring or appending it fails.

   The expansion is wrapped in do { } while (0) so that it behaves like
   a single statement (it must be followed by a semicolon), and the
   arguments are parenthesized so that expressions can be passed
   safely. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3267
3268static
3269PyObject *split_whitespace(PyUnicodeObject *self,
3270 PyObject *list,
3271 int maxcount)
3272{
3273 register int i;
3274 register int j;
3275 int len = self->length;
3276 PyObject *str;
3277
3278 for (i = j = 0; i < len; ) {
3279 /* find a token */
3280 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3281 i++;
3282 j = i;
3283 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3284 i++;
3285 if (j < i) {
3286 if (maxcount-- <= 0)
3287 break;
3288 SPLIT_APPEND(self->str, j, i);
3289 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3290 i++;
3291 j = i;
3292 }
3293 }
3294 if (j < len) {
3295 SPLIT_APPEND(self->str, j, len);
3296 }
3297 return list;
3298
3299 onError:
3300 Py_DECREF(list);
3301 return NULL;
3302}
3303
3304PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003305 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003306{
3307 register int i;
3308 register int j;
3309 int len;
3310 PyObject *list;
3311 PyObject *str;
3312 Py_UNICODE *data;
3313
3314 string = PyUnicode_FromObject(string);
3315 if (string == NULL)
3316 return NULL;
3317 data = PyUnicode_AS_UNICODE(string);
3318 len = PyUnicode_GET_SIZE(string);
3319
Guido van Rossumd57fd912000-03-10 22:53:23 +00003320 list = PyList_New(0);
3321 if (!list)
3322 goto onError;
3323
3324 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003325 int eol;
3326
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327 /* Find a line and append it */
3328 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3329 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330
3331 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003332 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003333 if (i < len) {
3334 if (data[i] == '\r' && i + 1 < len &&
3335 data[i+1] == '\n')
3336 i += 2;
3337 else
3338 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003339 if (keepends)
3340 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341 }
Guido van Rossum86662912000-04-11 15:38:46 +00003342 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343 j = i;
3344 }
3345 if (j < len) {
3346 SPLIT_APPEND(data, j, len);
3347 }
3348
3349 Py_DECREF(string);
3350 return list;
3351
3352 onError:
3353 Py_DECREF(list);
3354 Py_DECREF(string);
3355 return NULL;
3356}
3357
3358static
3359PyObject *split_char(PyUnicodeObject *self,
3360 PyObject *list,
3361 Py_UNICODE ch,
3362 int maxcount)
3363{
3364 register int i;
3365 register int j;
3366 int len = self->length;
3367 PyObject *str;
3368
3369 for (i = j = 0; i < len; ) {
3370 if (self->str[i] == ch) {
3371 if (maxcount-- <= 0)
3372 break;
3373 SPLIT_APPEND(self->str, j, i);
3374 i = j = i + 1;
3375 } else
3376 i++;
3377 }
3378 if (j <= len) {
3379 SPLIT_APPEND(self->str, j, len);
3380 }
3381 return list;
3382
3383 onError:
3384 Py_DECREF(list);
3385 return NULL;
3386}
3387
3388static
3389PyObject *split_substring(PyUnicodeObject *self,
3390 PyObject *list,
3391 PyUnicodeObject *substring,
3392 int maxcount)
3393{
3394 register int i;
3395 register int j;
3396 int len = self->length;
3397 int sublen = substring->length;
3398 PyObject *str;
3399
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003400 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003401 if (Py_UNICODE_MATCH(self, i, substring)) {
3402 if (maxcount-- <= 0)
3403 break;
3404 SPLIT_APPEND(self->str, j, i);
3405 i = j = i + sublen;
3406 } else
3407 i++;
3408 }
3409 if (j <= len) {
3410 SPLIT_APPEND(self->str, j, len);
3411 }
3412 return list;
3413
3414 onError:
3415 Py_DECREF(list);
3416 return NULL;
3417}
3418
3419#undef SPLIT_APPEND
3420
3421static
3422PyObject *split(PyUnicodeObject *self,
3423 PyUnicodeObject *substring,
3424 int maxcount)
3425{
3426 PyObject *list;
3427
3428 if (maxcount < 0)
3429 maxcount = INT_MAX;
3430
3431 list = PyList_New(0);
3432 if (!list)
3433 return NULL;
3434
3435 if (substring == NULL)
3436 return split_whitespace(self,list,maxcount);
3437
3438 else if (substring->length == 1)
3439 return split_char(self,list,substring->str[0],maxcount);
3440
3441 else if (substring->length == 0) {
3442 Py_DECREF(list);
3443 PyErr_SetString(PyExc_ValueError, "empty separator");
3444 return NULL;
3445 }
3446 else
3447 return split_substring(self,list,substring,maxcount);
3448}
3449
3450static
Guido van Rossumd57fd912000-03-10 22:53:23 +00003451PyObject *replace(PyUnicodeObject *self,
3452 PyUnicodeObject *str1,
3453 PyUnicodeObject *str2,
3454 int maxcount)
3455{
3456 PyUnicodeObject *u;
3457
3458 if (maxcount < 0)
3459 maxcount = INT_MAX;
3460
3461 if (str1->length == 1 && str2->length == 1) {
3462 int i;
3463
3464 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003465 if (!findchar(self->str, self->length, str1->str[0]) &&
3466 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003467 /* nothing to replace, return original string */
3468 Py_INCREF(self);
3469 u = self;
3470 } else {
3471 Py_UNICODE u1 = str1->str[0];
3472 Py_UNICODE u2 = str2->str[0];
3473
3474 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003475 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003476 self->length
3477 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003478 if (u != NULL) {
3479 Py_UNICODE_COPY(u->str, self->str,
3480 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003481 for (i = 0; i < u->length; i++)
3482 if (u->str[i] == u1) {
3483 if (--maxcount < 0)
3484 break;
3485 u->str[i] = u2;
3486 }
3487 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003488 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003489
3490 } else {
3491 int n, i;
3492 Py_UNICODE *p;
3493
3494 /* replace strings */
3495 n = count(self, 0, self->length, str1);
3496 if (n > maxcount)
3497 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003498 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003499 /* nothing to replace, return original string */
3500 Py_INCREF(self);
3501 u = self;
3502 } else {
3503 u = _PyUnicode_New(
3504 self->length + n * (str2->length - str1->length));
3505 if (u) {
3506 i = 0;
3507 p = u->str;
3508 while (i <= self->length - str1->length)
3509 if (Py_UNICODE_MATCH(self, i, str1)) {
3510 /* replace string segment */
3511 Py_UNICODE_COPY(p, str2->str, str2->length);
3512 p += str2->length;
3513 i += str1->length;
3514 if (--n <= 0) {
3515 /* copy remaining part */
3516 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3517 break;
3518 }
3519 } else
3520 *p++ = self->str[i++];
3521 }
3522 }
3523 }
3524
3525 return (PyObject *) u;
3526}
3527
3528/* --- Unicode Object Methods --------------------------------------------- */
3529
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003530PyDoc_STRVAR(title__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531"S.title() -> unicode\n\
3532\n\
3533Return a titlecased version of S, i.e. words start with title case\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003534characters, all remaining cased characters have lower case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003535
3536static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003537unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003538{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003539 return fixup(self, fixtitle);
3540}
3541
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003542PyDoc_STRVAR(capitalize__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003543"S.capitalize() -> unicode\n\
3544\n\
3545Return a capitalized version of S, i.e. make the first character\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003546have upper case.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003547
3548static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003549unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003550{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551 return fixup(self, fixcapitalize);
3552}
3553
#if 0
/* Disabled: not compiled into the module. */
PyDoc_STRVAR(capwords__doc__,
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').");

/* Method S.capwords(): split self at whitespace, capitalize every word
   and join the words again with single blanks.  Returns NULL on
   error. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    int i;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *old = PyList_GET_ITEM(words, i);

        result = fixup((PyUnicodeObject *)old, fixcapitalize);
        if (result == NULL)
            goto done;
        Py_DECREF(old);
        PyList_SET_ITEM(words, i, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
3591
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003592PyDoc_STRVAR(center__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593"S.center(width) -> unicode\n\
3594\n\
3595Return S centered in a Unicode string of length width. Padding is done\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003596using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597
3598static PyObject *
3599unicode_center(PyUnicodeObject *self, PyObject *args)
3600{
3601 int marg, left;
3602 int width;
3603
3604 if (!PyArg_ParseTuple(args, "i:center", &width))
3605 return NULL;
3606
Tim Peters7a29bd52001-09-12 03:03:31 +00003607 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003608 Py_INCREF(self);
3609 return (PyObject*) self;
3610 }
3611
3612 marg = width - self->length;
3613 left = marg / 2 + (marg & width & 1);
3614
3615 return (PyObject*) pad(self, left, marg - left, ' ');
3616}
3617
Marc-André Lemburge5034372000-08-08 08:04:29 +00003618#if 0
3619
3620/* This code should go into some future Unicode collation support
3621 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003622 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003623
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003624/* speedy UTF-16 code point order comparison */
3625/* gleaned from: */
3626/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3627
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003628static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003629{
3630 0, 0, 0, 0, 0, 0, 0, 0,
3631 0, 0, 0, 0, 0, 0, 0, 0,
3632 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003633 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003634};
3635
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636static int
3637unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3638{
3639 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003640
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641 Py_UNICODE *s1 = str1->str;
3642 Py_UNICODE *s2 = str2->str;
3643
3644 len1 = str1->length;
3645 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003646
Guido van Rossumd57fd912000-03-10 22:53:23 +00003647 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003648 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003649
3650 c1 = *s1++;
3651 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003652
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003653 if (c1 > (1<<11) * 26)
3654 c1 += utf16Fixup[c1>>11];
3655 if (c2 > (1<<11) * 26)
3656 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003657 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003658
3659 if (c1 != c2)
3660 return (c1 < c2) ? -1 : 1;
3661
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003662 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003663 }
3664
3665 return (len1 < len2) ? -1 : (len1 != len2);
3666}
3667
Marc-André Lemburge5034372000-08-08 08:04:29 +00003668#else
3669
3670static int
3671unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3672{
3673 register int len1, len2;
3674
3675 Py_UNICODE *s1 = str1->str;
3676 Py_UNICODE *s2 = str2->str;
3677
3678 len1 = str1->length;
3679 len2 = str2->length;
3680
3681 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003682 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003683
Fredrik Lundh45714e92001-06-26 16:39:36 +00003684 c1 = *s1++;
3685 c2 = *s2++;
3686
3687 if (c1 != c2)
3688 return (c1 < c2) ? -1 : 1;
3689
Marc-André Lemburge5034372000-08-08 08:04:29 +00003690 len1--; len2--;
3691 }
3692
3693 return (len1 < len2) ? -1 : (len1 != len2);
3694}
3695
3696#endif
3697
Guido van Rossumd57fd912000-03-10 22:53:23 +00003698int PyUnicode_Compare(PyObject *left,
3699 PyObject *right)
3700{
3701 PyUnicodeObject *u = NULL, *v = NULL;
3702 int result;
3703
3704 /* Coerce the two arguments */
3705 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3706 if (u == NULL)
3707 goto onError;
3708 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3709 if (v == NULL)
3710 goto onError;
3711
Thomas Wouters7e474022000-07-16 12:04:32 +00003712 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 if (v == u) {
3714 Py_DECREF(u);
3715 Py_DECREF(v);
3716 return 0;
3717 }
3718
3719 result = unicode_compare(u, v);
3720
3721 Py_DECREF(u);
3722 Py_DECREF(v);
3723 return result;
3724
3725onError:
3726 Py_XDECREF(u);
3727 Py_XDECREF(v);
3728 return -1;
3729}
3730
Guido van Rossum403d68b2000-03-13 15:55:09 +00003731int PyUnicode_Contains(PyObject *container,
3732 PyObject *element)
3733{
3734 PyUnicodeObject *u = NULL, *v = NULL;
Barry Warsaw817918c2002-08-06 16:58:21 +00003735 int result, size;
3736 register const Py_UNICODE *lhs, *end, *rhs;
Guido van Rossum403d68b2000-03-13 15:55:09 +00003737
3738 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003739 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003740 if (v == NULL) {
3741 PyErr_SetString(PyExc_TypeError,
Barry Warsaw817918c2002-08-06 16:58:21 +00003742 "'in <string>' requires string as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003743 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003744 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003745 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3746 if (u == NULL) {
3747 Py_DECREF(v);
3748 goto onError;
3749 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003750
Barry Warsaw817918c2002-08-06 16:58:21 +00003751 size = PyUnicode_GET_SIZE(v);
3752 rhs = PyUnicode_AS_UNICODE(v);
3753 lhs = PyUnicode_AS_UNICODE(u);
3754
Guido van Rossum403d68b2000-03-13 15:55:09 +00003755 result = 0;
Barry Warsaw817918c2002-08-06 16:58:21 +00003756 if (size == 1) {
3757 end = lhs + PyUnicode_GET_SIZE(u);
3758 while (lhs < end) {
3759 if (*lhs++ == *rhs) {
3760 result = 1;
3761 break;
3762 }
3763 }
3764 }
3765 else {
3766 end = lhs + (PyUnicode_GET_SIZE(u) - size);
3767 while (lhs <= end) {
Barry Warsaw6a043f32002-08-06 19:03:17 +00003768 if (memcmp(lhs++, rhs, size * sizeof(Py_UNICODE)) == 0) {
Barry Warsaw817918c2002-08-06 16:58:21 +00003769 result = 1;
3770 break;
3771 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003772 }
3773 }
3774
3775 Py_DECREF(u);
3776 Py_DECREF(v);
3777 return result;
3778
3779onError:
3780 Py_XDECREF(u);
3781 Py_XDECREF(v);
3782 return -1;
3783}
3784
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785/* Concat to string or Unicode object giving a new Unicode object. */
3786
3787PyObject *PyUnicode_Concat(PyObject *left,
3788 PyObject *right)
3789{
3790 PyUnicodeObject *u = NULL, *v = NULL, *w;
3791
3792 /* Coerce the two arguments */
3793 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3794 if (u == NULL)
3795 goto onError;
3796 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3797 if (v == NULL)
3798 goto onError;
3799
3800 /* Shortcuts */
3801 if (v == unicode_empty) {
3802 Py_DECREF(v);
3803 return (PyObject *)u;
3804 }
3805 if (u == unicode_empty) {
3806 Py_DECREF(u);
3807 return (PyObject *)v;
3808 }
3809
3810 /* Concat the two Unicode strings */
3811 w = _PyUnicode_New(u->length + v->length);
3812 if (w == NULL)
3813 goto onError;
3814 Py_UNICODE_COPY(w->str, u->str, u->length);
3815 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3816
3817 Py_DECREF(u);
3818 Py_DECREF(v);
3819 return (PyObject *)w;
3820
3821onError:
3822 Py_XDECREF(u);
3823 Py_XDECREF(v);
3824 return NULL;
3825}
3826
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003827PyDoc_STRVAR(count__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828"S.count(sub[, start[, end]]) -> int\n\
3829\n\
3830Return the number of occurrences of substring sub in Unicode string\n\
3831S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003832interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003833
3834static PyObject *
3835unicode_count(PyUnicodeObject *self, PyObject *args)
3836{
3837 PyUnicodeObject *substring;
3838 int start = 0;
3839 int end = INT_MAX;
3840 PyObject *result;
3841
Guido van Rossumb8872e62000-05-09 14:14:27 +00003842 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3843 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844 return NULL;
3845
3846 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3847 (PyObject *)substring);
3848 if (substring == NULL)
3849 return NULL;
3850
Guido van Rossumd57fd912000-03-10 22:53:23 +00003851 if (start < 0)
3852 start += self->length;
3853 if (start < 0)
3854 start = 0;
3855 if (end > self->length)
3856 end = self->length;
3857 if (end < 0)
3858 end += self->length;
3859 if (end < 0)
3860 end = 0;
3861
3862 result = PyInt_FromLong((long) count(self, start, end, substring));
3863
3864 Py_DECREF(substring);
3865 return result;
3866}
3867
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003868PyDoc_STRVAR(encode__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869"S.encode([encoding[,errors]]) -> string\n\
3870\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003871Return an encoded string version of S. Default encoding is the current\n\
3872default string encoding. errors may be given to set a different error\n\
3873handling scheme. Default is 'strict' meaning that encoding errors raise\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003874a ValueError. Other possible values are 'ignore' and 'replace'.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003875
3876static PyObject *
3877unicode_encode(PyUnicodeObject *self, PyObject *args)
3878{
3879 char *encoding = NULL;
3880 char *errors = NULL;
3881 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3882 return NULL;
3883 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3884}
3885
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003886PyDoc_STRVAR(expandtabs__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003887"S.expandtabs([tabsize]) -> unicode\n\
3888\n\
3889Return a copy of S where all tab characters are expanded using spaces.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003890If tabsize is not given, a tab size of 8 characters is assumed.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891
3892static PyObject*
3893unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3894{
3895 Py_UNICODE *e;
3896 Py_UNICODE *p;
3897 Py_UNICODE *q;
3898 int i, j;
3899 PyUnicodeObject *u;
3900 int tabsize = 8;
3901
3902 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3903 return NULL;
3904
Thomas Wouters7e474022000-07-16 12:04:32 +00003905 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003906 i = j = 0;
3907 e = self->str + self->length;
3908 for (p = self->str; p < e; p++)
3909 if (*p == '\t') {
3910 if (tabsize > 0)
3911 j += tabsize - (j % tabsize);
3912 }
3913 else {
3914 j++;
3915 if (*p == '\n' || *p == '\r') {
3916 i += j;
3917 j = 0;
3918 }
3919 }
3920
3921 /* Second pass: create output string and fill it */
3922 u = _PyUnicode_New(i + j);
3923 if (!u)
3924 return NULL;
3925
3926 j = 0;
3927 q = u->str;
3928
3929 for (p = self->str; p < e; p++)
3930 if (*p == '\t') {
3931 if (tabsize > 0) {
3932 i = tabsize - (j % tabsize);
3933 j += i;
3934 while (i--)
3935 *q++ = ' ';
3936 }
3937 }
3938 else {
3939 j++;
3940 *q++ = *p;
3941 if (*p == '\n' || *p == '\r')
3942 j = 0;
3943 }
3944
3945 return (PyObject*) u;
3946}
3947
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003948PyDoc_STRVAR(find__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003949"S.find(sub [,start [,end]]) -> int\n\
3950\n\
3951Return the lowest index in S where substring sub is found,\n\
3952such that sub is contained within s[start,end]. Optional\n\
3953arguments start and end are interpreted as in slice notation.\n\
3954\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00003955Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00003956
3957static PyObject *
3958unicode_find(PyUnicodeObject *self, PyObject *args)
3959{
3960 PyUnicodeObject *substring;
3961 int start = 0;
3962 int end = INT_MAX;
3963 PyObject *result;
3964
Guido van Rossumb8872e62000-05-09 14:14:27 +00003965 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3966 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967 return NULL;
3968 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3969 (PyObject *)substring);
3970 if (substring == NULL)
3971 return NULL;
3972
3973 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3974
3975 Py_DECREF(substring);
3976 return result;
3977}
3978
3979static PyObject *
3980unicode_getitem(PyUnicodeObject *self, int index)
3981{
3982 if (index < 0 || index >= self->length) {
3983 PyErr_SetString(PyExc_IndexError, "string index out of range");
3984 return NULL;
3985 }
3986
3987 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3988}
3989
3990static long
3991unicode_hash(PyUnicodeObject *self)
3992{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003993 /* Since Unicode objects compare equal to their ASCII string
3994 counterparts, they should use the individual character values
3995 as basis for their hash value. This is needed to assure that
3996 strings and Unicode objects behave in the same way as
3997 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003998
Fredrik Lundhdde61642000-07-10 18:27:47 +00003999 register int len;
4000 register Py_UNICODE *p;
4001 register long x;
4002
Guido van Rossumd57fd912000-03-10 22:53:23 +00004003 if (self->hash != -1)
4004 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004005 len = PyUnicode_GET_SIZE(self);
4006 p = PyUnicode_AS_UNICODE(self);
4007 x = *p << 7;
4008 while (--len >= 0)
4009 x = (1000003*x) ^ *p++;
4010 x ^= PyUnicode_GET_SIZE(self);
4011 if (x == -1)
4012 x = -2;
4013 self->hash = x;
4014 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004015}
4016
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004017PyDoc_STRVAR(index__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004018"S.index(sub [,start [,end]]) -> int\n\
4019\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004020Like S.find() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021
4022static PyObject *
4023unicode_index(PyUnicodeObject *self, PyObject *args)
4024{
4025 int result;
4026 PyUnicodeObject *substring;
4027 int start = 0;
4028 int end = INT_MAX;
4029
Guido van Rossumb8872e62000-05-09 14:14:27 +00004030 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4031 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004032 return NULL;
4033
4034 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4035 (PyObject *)substring);
4036 if (substring == NULL)
4037 return NULL;
4038
4039 result = findstring(self, substring, start, end, 1);
4040
4041 Py_DECREF(substring);
4042 if (result < 0) {
4043 PyErr_SetString(PyExc_ValueError, "substring not found");
4044 return NULL;
4045 }
4046 return PyInt_FromLong(result);
4047}
4048
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004049PyDoc_STRVAR(islower__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004050"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004051\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004052Return True if all cased characters in S are lowercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004053at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004054
4055static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004056unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057{
4058 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4059 register const Py_UNICODE *e;
4060 int cased;
4061
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062 /* Shortcut for single character strings */
4063 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004064 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004066 /* Special case for empty strings */
4067 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004068 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004069
Guido van Rossumd57fd912000-03-10 22:53:23 +00004070 e = p + PyUnicode_GET_SIZE(self);
4071 cased = 0;
4072 for (; p < e; p++) {
4073 register const Py_UNICODE ch = *p;
4074
4075 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004076 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077 else if (!cased && Py_UNICODE_ISLOWER(ch))
4078 cased = 1;
4079 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004080 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004081}
4082
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004083PyDoc_STRVAR(isupper__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004084"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004085\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004086Return True if all cased characters in S are uppercase and there is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004087at least one cased character in S, False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004088
4089static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004090unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091{
4092 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4093 register const Py_UNICODE *e;
4094 int cased;
4095
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096 /* Shortcut for single character strings */
4097 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004098 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004099
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004100 /* Special case for empty strings */
4101 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004102 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004103
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104 e = p + PyUnicode_GET_SIZE(self);
4105 cased = 0;
4106 for (; p < e; p++) {
4107 register const Py_UNICODE ch = *p;
4108
4109 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004110 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004111 else if (!cased && Py_UNICODE_ISUPPER(ch))
4112 cased = 1;
4113 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004114 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115}
4116
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004117PyDoc_STRVAR(istitle__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004118"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004119\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004120Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4121characters may only follow uncased characters and lowercase characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004122only cased ones. Return False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004123
4124static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004125unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126{
4127 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4128 register const Py_UNICODE *e;
4129 int cased, previous_is_cased;
4130
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131 /* Shortcut for single character strings */
4132 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004133 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4134 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004136 /* Special case for empty strings */
4137 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004138 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004139
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 e = p + PyUnicode_GET_SIZE(self);
4141 cased = 0;
4142 previous_is_cased = 0;
4143 for (; p < e; p++) {
4144 register const Py_UNICODE ch = *p;
4145
4146 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4147 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004148 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004149 previous_is_cased = 1;
4150 cased = 1;
4151 }
4152 else if (Py_UNICODE_ISLOWER(ch)) {
4153 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004154 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155 previous_is_cased = 1;
4156 cased = 1;
4157 }
4158 else
4159 previous_is_cased = 0;
4160 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004161 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004162}
4163
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004164PyDoc_STRVAR(isspace__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004165"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004167Return True if there are only whitespace characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004168False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169
4170static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004171unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004172{
4173 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4174 register const Py_UNICODE *e;
4175
Guido van Rossumd57fd912000-03-10 22:53:23 +00004176 /* Shortcut for single character strings */
4177 if (PyUnicode_GET_SIZE(self) == 1 &&
4178 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004179 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004180
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004181 /* Special case for empty strings */
4182 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004183 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004184
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185 e = p + PyUnicode_GET_SIZE(self);
4186 for (; p < e; p++) {
4187 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004188 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004189 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004190 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004191}
4192
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004193PyDoc_STRVAR(isalpha__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004194"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004195\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004196Return True if all characters in S are alphabetic\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004197and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004198
4199static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004200unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004201{
4202 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4203 register const Py_UNICODE *e;
4204
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004205 /* Shortcut for single character strings */
4206 if (PyUnicode_GET_SIZE(self) == 1 &&
4207 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004208 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004209
4210 /* Special case for empty strings */
4211 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004212 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004213
4214 e = p + PyUnicode_GET_SIZE(self);
4215 for (; p < e; p++) {
4216 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004217 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004218 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004219 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004220}
4221
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004222PyDoc_STRVAR(isalnum__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004223"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004224\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004225Return True if all characters in S are alphanumeric\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004226and there is at least one character in S, False otherwise.");
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004227
4228static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004229unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004230{
4231 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4232 register const Py_UNICODE *e;
4233
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004234 /* Shortcut for single character strings */
4235 if (PyUnicode_GET_SIZE(self) == 1 &&
4236 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004237 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004238
4239 /* Special case for empty strings */
4240 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004241 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004242
4243 e = p + PyUnicode_GET_SIZE(self);
4244 for (; p < e; p++) {
4245 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004246 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004247 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004248 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004249}
4250
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004251PyDoc_STRVAR(isdecimal__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004252"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004253\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004254Return True if there are only decimal characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004255False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004256
4257static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004258unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004259{
4260 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4261 register const Py_UNICODE *e;
4262
Guido van Rossumd57fd912000-03-10 22:53:23 +00004263 /* Shortcut for single character strings */
4264 if (PyUnicode_GET_SIZE(self) == 1 &&
4265 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004266 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004267
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004268 /* Special case for empty strings */
4269 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004270 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004271
Guido van Rossumd57fd912000-03-10 22:53:23 +00004272 e = p + PyUnicode_GET_SIZE(self);
4273 for (; p < e; p++) {
4274 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004275 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004276 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004277 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004278}
4279
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004280PyDoc_STRVAR(isdigit__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004281"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004282\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004283Return True if there are only digit characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004284False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004285
4286static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004287unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288{
4289 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4290 register const Py_UNICODE *e;
4291
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292 /* Shortcut for single character strings */
4293 if (PyUnicode_GET_SIZE(self) == 1 &&
4294 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004295 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004296
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004297 /* Special case for empty strings */
4298 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004299 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004300
Guido van Rossumd57fd912000-03-10 22:53:23 +00004301 e = p + PyUnicode_GET_SIZE(self);
4302 for (; p < e; p++) {
4303 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004304 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004305 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004306 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004307}
4308
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004309PyDoc_STRVAR(isnumeric__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004310"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004312Return True if there are only numeric characters in S,\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004313False otherwise.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004314
4315static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004316unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004317{
4318 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4319 register const Py_UNICODE *e;
4320
Guido van Rossumd57fd912000-03-10 22:53:23 +00004321 /* Shortcut for single character strings */
4322 if (PyUnicode_GET_SIZE(self) == 1 &&
4323 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004324 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004326 /* Special case for empty strings */
4327 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004328 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004329
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330 e = p + PyUnicode_GET_SIZE(self);
4331 for (; p < e; p++) {
4332 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004333 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004335 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336}
4337
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004338PyDoc_STRVAR(join__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339"S.join(sequence) -> unicode\n\
4340\n\
4341Return a string which is the concatenation of the strings in the\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004342sequence. The separator between elements is S.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343
4344static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004345unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004346{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004347 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004348}
4349
4350static int
4351unicode_length(PyUnicodeObject *self)
4352{
4353 return self->length;
4354}
4355
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004356PyDoc_STRVAR(ljust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357"S.ljust(width) -> unicode\n\
4358\n\
4359Return S left justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004360done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361
4362static PyObject *
4363unicode_ljust(PyUnicodeObject *self, PyObject *args)
4364{
4365 int width;
4366 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4367 return NULL;
4368
Tim Peters7a29bd52001-09-12 03:03:31 +00004369 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004370 Py_INCREF(self);
4371 return (PyObject*) self;
4372 }
4373
4374 return (PyObject*) pad(self, 0, width - self->length, ' ');
4375}
4376
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004377PyDoc_STRVAR(lower__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004378"S.lower() -> unicode\n\
4379\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004380Return a copy of the string S converted to lowercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004381
4382static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004383unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004384{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385 return fixup(self, fixlower);
4386}
4387
/* Selectors for the strip family: which end(s) of the string to trim */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple() format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for selector i: the format string minus its "|O:" prefix */
#define STRIPNAME(i) (stripformat[(i)] + 3)
4396
4397static const Py_UNICODE *
4398unicode_memchr(const Py_UNICODE *s, Py_UNICODE c, size_t n)
4399{
Tim Peters030a5ce2002-04-22 19:00:10 +00004400 size_t i;
4401 for (i = 0; i < n; ++i)
4402 if (s[i] == c)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004403 return s+i;
4404 return NULL;
4405}
4406
4407/* externally visible for str.strip(unicode) */
4408PyObject *
4409_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
4410{
4411 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
4412 int len = PyUnicode_GET_SIZE(self);
4413 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
4414 int seplen = PyUnicode_GET_SIZE(sepobj);
4415 int i, j;
4416
4417 i = 0;
4418 if (striptype != RIGHTSTRIP) {
4419 while (i < len && unicode_memchr(sep, s[i], seplen)) {
4420 i++;
4421 }
4422 }
4423
4424 j = len;
4425 if (striptype != LEFTSTRIP) {
4426 do {
4427 j--;
4428 } while (j >= i && unicode_memchr(sep, s[j], seplen));
4429 j++;
4430 }
4431
4432 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
4433 Py_INCREF(self);
4434 return (PyObject*)self;
4435 }
4436 else
4437 return PyUnicode_FromUnicode(s+i, j-i);
4438}
4439
Guido van Rossumd57fd912000-03-10 22:53:23 +00004440
4441static PyObject *
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004442do_strip(PyUnicodeObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443{
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004444 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
4445 int len = PyUnicode_GET_SIZE(self), i, j;
4446
4447 i = 0;
4448 if (striptype != RIGHTSTRIP) {
4449 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
4450 i++;
4451 }
4452 }
4453
4454 j = len;
4455 if (striptype != LEFTSTRIP) {
4456 do {
4457 j--;
4458 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
4459 j++;
4460 }
4461
4462 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
4463 Py_INCREF(self);
4464 return (PyObject*)self;
4465 }
4466 else
4467 return PyUnicode_FromUnicode(s+i, j-i);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004468}
4469
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004470
4471static PyObject *
4472do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
4473{
4474 PyObject *sep = NULL;
4475
4476 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
4477 return NULL;
4478
4479 if (sep != NULL && sep != Py_None) {
4480 if (PyUnicode_Check(sep))
4481 return _PyUnicode_XStrip(self, striptype, sep);
4482 else if (PyString_Check(sep)) {
4483 PyObject *res;
4484 sep = PyUnicode_FromObject(sep);
4485 if (sep==NULL)
4486 return NULL;
4487 res = _PyUnicode_XStrip(self, striptype, sep);
4488 Py_DECREF(sep);
4489 return res;
4490 }
4491 else {
4492 PyErr_Format(PyExc_TypeError,
4493 "%s arg must be None, unicode or str",
4494 STRIPNAME(striptype));
4495 return NULL;
4496 }
4497 }
4498
4499 return do_strip(self, striptype);
4500}
4501
4502
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004503PyDoc_STRVAR(strip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004504"S.strip([sep]) -> unicode\n\
4505\n\
4506Return a copy of the string S with leading and trailing\n\
4507whitespace removed.\n\
4508If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004509If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004510
4511static PyObject *
4512unicode_strip(PyUnicodeObject *self, PyObject *args)
4513{
4514 if (PyTuple_GET_SIZE(args) == 0)
4515 return do_strip(self, BOTHSTRIP); /* Common case */
4516 else
4517 return do_argstrip(self, BOTHSTRIP, args);
4518}
4519
4520
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004521PyDoc_STRVAR(lstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004522"S.lstrip([sep]) -> unicode\n\
4523\n\
4524Return a copy of the string S with leading whitespace removed.\n\
4525If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004526If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004527
4528static PyObject *
4529unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4530{
4531 if (PyTuple_GET_SIZE(args) == 0)
4532 return do_strip(self, LEFTSTRIP); /* Common case */
4533 else
4534 return do_argstrip(self, LEFTSTRIP, args);
4535}
4536
4537
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004538PyDoc_STRVAR(rstrip__doc__,
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004539"S.rstrip([sep]) -> unicode\n\
4540\n\
4541Return a copy of the string S with trailing whitespace removed.\n\
4542If sep is given and not None, remove characters in sep instead.\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004543If sep is a str, it will be converted to unicode before stripping");
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00004544
4545static PyObject *
4546unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4547{
4548 if (PyTuple_GET_SIZE(args) == 0)
4549 return do_strip(self, RIGHTSTRIP); /* Common case */
4550 else
4551 return do_argstrip(self, RIGHTSTRIP, args);
4552}
4553
4554
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555static PyObject*
4556unicode_repeat(PyUnicodeObject *str, int len)
4557{
4558 PyUnicodeObject *u;
4559 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004560 int nchars;
4561 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004562
4563 if (len < 0)
4564 len = 0;
4565
Tim Peters7a29bd52001-09-12 03:03:31 +00004566 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567 /* no repeat, return original string */
4568 Py_INCREF(str);
4569 return (PyObject*) str;
4570 }
Tim Peters8f422462000-09-09 06:13:41 +00004571
4572 /* ensure # of chars needed doesn't overflow int and # of bytes
4573 * needed doesn't overflow size_t
4574 */
4575 nchars = len * str->length;
4576 if (len && nchars / len != str->length) {
4577 PyErr_SetString(PyExc_OverflowError,
4578 "repeated string is too long");
4579 return NULL;
4580 }
4581 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4582 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4583 PyErr_SetString(PyExc_OverflowError,
4584 "repeated string is too long");
4585 return NULL;
4586 }
4587 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004588 if (!u)
4589 return NULL;
4590
4591 p = u->str;
4592
4593 while (len-- > 0) {
4594 Py_UNICODE_COPY(p, str->str, str->length);
4595 p += str->length;
4596 }
4597
4598 return (PyObject*) u;
4599}
4600
4601PyObject *PyUnicode_Replace(PyObject *obj,
4602 PyObject *subobj,
4603 PyObject *replobj,
4604 int maxcount)
4605{
4606 PyObject *self;
4607 PyObject *str1;
4608 PyObject *str2;
4609 PyObject *result;
4610
4611 self = PyUnicode_FromObject(obj);
4612 if (self == NULL)
4613 return NULL;
4614 str1 = PyUnicode_FromObject(subobj);
4615 if (str1 == NULL) {
4616 Py_DECREF(self);
4617 return NULL;
4618 }
4619 str2 = PyUnicode_FromObject(replobj);
4620 if (str2 == NULL) {
4621 Py_DECREF(self);
4622 Py_DECREF(str1);
4623 return NULL;
4624 }
4625 result = replace((PyUnicodeObject *)self,
4626 (PyUnicodeObject *)str1,
4627 (PyUnicodeObject *)str2,
4628 maxcount);
4629 Py_DECREF(self);
4630 Py_DECREF(str1);
4631 Py_DECREF(str2);
4632 return result;
4633}
4634
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004635PyDoc_STRVAR(replace__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004636"S.replace (old, new[, maxsplit]) -> unicode\n\
4637\n\
4638Return a copy of S with all occurrences of substring\n\
4639old replaced by new. If the optional argument maxsplit is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004640given, only the first maxsplit occurrences are replaced.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004641
4642static PyObject*
4643unicode_replace(PyUnicodeObject *self, PyObject *args)
4644{
4645 PyUnicodeObject *str1;
4646 PyUnicodeObject *str2;
4647 int maxcount = -1;
4648 PyObject *result;
4649
4650 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4651 return NULL;
4652 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4653 if (str1 == NULL)
4654 return NULL;
4655 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4656 if (str2 == NULL)
4657 return NULL;
4658
4659 result = replace(self, str1, str2, maxcount);
4660
4661 Py_DECREF(str1);
4662 Py_DECREF(str2);
4663 return result;
4664}
4665
4666static
4667PyObject *unicode_repr(PyObject *unicode)
4668{
4669 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4670 PyUnicode_GET_SIZE(unicode),
4671 1);
4672}
4673
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004674PyDoc_STRVAR(rfind__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004675"S.rfind(sub [,start [,end]]) -> int\n\
4676\n\
4677Return the highest index in S where substring sub is found,\n\
4678such that sub is contained within s[start,end]. Optional\n\
4679arguments start and end are interpreted as in slice notation.\n\
4680\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004681Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682
4683static PyObject *
4684unicode_rfind(PyUnicodeObject *self, PyObject *args)
4685{
4686 PyUnicodeObject *substring;
4687 int start = 0;
4688 int end = INT_MAX;
4689 PyObject *result;
4690
Guido van Rossumb8872e62000-05-09 14:14:27 +00004691 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4692 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693 return NULL;
4694 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4695 (PyObject *)substring);
4696 if (substring == NULL)
4697 return NULL;
4698
4699 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4700
4701 Py_DECREF(substring);
4702 return result;
4703}
4704
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004705PyDoc_STRVAR(rindex__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706"S.rindex(sub [,start [,end]]) -> int\n\
4707\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004708Like S.rfind() but raise ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004709
4710static PyObject *
4711unicode_rindex(PyUnicodeObject *self, PyObject *args)
4712{
4713 int result;
4714 PyUnicodeObject *substring;
4715 int start = 0;
4716 int end = INT_MAX;
4717
Guido van Rossumb8872e62000-05-09 14:14:27 +00004718 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4719 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004720 return NULL;
4721 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4722 (PyObject *)substring);
4723 if (substring == NULL)
4724 return NULL;
4725
4726 result = findstring(self, substring, start, end, -1);
4727
4728 Py_DECREF(substring);
4729 if (result < 0) {
4730 PyErr_SetString(PyExc_ValueError, "substring not found");
4731 return NULL;
4732 }
4733 return PyInt_FromLong(result);
4734}
4735
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004736PyDoc_STRVAR(rjust__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737"S.rjust(width) -> unicode\n\
4738\n\
4739Return S right justified in a Unicode string of length width. Padding is\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004740done using spaces.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741
4742static PyObject *
4743unicode_rjust(PyUnicodeObject *self, PyObject *args)
4744{
4745 int width;
4746 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4747 return NULL;
4748
Tim Peters7a29bd52001-09-12 03:03:31 +00004749 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750 Py_INCREF(self);
4751 return (PyObject*) self;
4752 }
4753
4754 return (PyObject*) pad(self, width - self->length, 0, ' ');
4755}
4756
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757static PyObject*
4758unicode_slice(PyUnicodeObject *self, int start, int end)
4759{
4760 /* standard clamping */
4761 if (start < 0)
4762 start = 0;
4763 if (end < 0)
4764 end = 0;
4765 if (end > self->length)
4766 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004767 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768 /* full slice, return original string */
4769 Py_INCREF(self);
4770 return (PyObject*) self;
4771 }
4772 if (start > end)
4773 start = end;
4774 /* copy slice */
4775 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4776 end - start);
4777}
4778
4779PyObject *PyUnicode_Split(PyObject *s,
4780 PyObject *sep,
4781 int maxsplit)
4782{
4783 PyObject *result;
4784
4785 s = PyUnicode_FromObject(s);
4786 if (s == NULL)
4787 return NULL;
4788 if (sep != NULL) {
4789 sep = PyUnicode_FromObject(sep);
4790 if (sep == NULL) {
4791 Py_DECREF(s);
4792 return NULL;
4793 }
4794 }
4795
4796 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4797
4798 Py_DECREF(s);
4799 Py_XDECREF(sep);
4800 return result;
4801}
4802
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004803PyDoc_STRVAR(split__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004804"S.split([sep [,maxsplit]]) -> list of strings\n\
4805\n\
4806Return a list of the words in S, using sep as the\n\
4807delimiter string. If maxsplit is given, at most maxsplit\n\
4808splits are done. If sep is not specified, any whitespace string\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004809is a separator.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810
4811static PyObject*
4812unicode_split(PyUnicodeObject *self, PyObject *args)
4813{
4814 PyObject *substring = Py_None;
4815 int maxcount = -1;
4816
4817 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4818 return NULL;
4819
4820 if (substring == Py_None)
4821 return split(self, NULL, maxcount);
4822 else if (PyUnicode_Check(substring))
4823 return split(self, (PyUnicodeObject *)substring, maxcount);
4824 else
4825 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4826}
4827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004828PyDoc_STRVAR(splitlines__doc__,
Guido van Rossum86662912000-04-11 15:38:46 +00004829"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004830\n\
4831Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004832Line breaks are not included in the resulting list unless keepends\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004833is given and true.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834
4835static PyObject*
4836unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4837{
Guido van Rossum86662912000-04-11 15:38:46 +00004838 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004839
Guido van Rossum86662912000-04-11 15:38:46 +00004840 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 return NULL;
4842
Guido van Rossum86662912000-04-11 15:38:46 +00004843 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844}
4845
4846static
4847PyObject *unicode_str(PyUnicodeObject *self)
4848{
Fred Drakee4315f52000-05-09 19:53:39 +00004849 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004850}
4851
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004852PyDoc_STRVAR(swapcase__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853"S.swapcase() -> unicode\n\
4854\n\
4855Return a copy of S with uppercase characters converted to lowercase\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004856and vice versa.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857
4858static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004859unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004861 return fixup(self, fixswapcase);
4862}
4863
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004864PyDoc_STRVAR(translate__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004865"S.translate(table) -> unicode\n\
4866\n\
4867Return a copy of the string S, where all characters have been mapped\n\
4868through the given translation table, which must be a mapping of\n\
4869Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004870are left untouched. Characters mapped to None are deleted.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871
4872static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004873unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004874{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004875 return PyUnicode_TranslateCharmap(self->str,
4876 self->length,
4877 table,
4878 "ignore");
4879}
4880
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004881PyDoc_STRVAR(upper__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882"S.upper() -> unicode\n\
4883\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004884Return a copy of S converted to uppercase.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885
4886static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004887unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004889 return fixup(self, fixupper);
4890}
4891
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004892PyDoc_STRVAR(zfill__doc__,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893"S.zfill(width) -> unicode\n\
4894\n\
4895Pad a numeric string x with zeros on the left, to fill a field\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004896of the specified width. The string x is never truncated.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897
4898static PyObject *
4899unicode_zfill(PyUnicodeObject *self, PyObject *args)
4900{
4901 int fill;
4902 PyUnicodeObject *u;
4903
4904 int width;
4905 if (!PyArg_ParseTuple(args, "i:zfill", &width))
4906 return NULL;
4907
4908 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00004909 if (PyUnicode_CheckExact(self)) {
4910 Py_INCREF(self);
4911 return (PyObject*) self;
4912 }
4913 else
4914 return PyUnicode_FromUnicode(
4915 PyUnicode_AS_UNICODE(self),
4916 PyUnicode_GET_SIZE(self)
4917 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918 }
4919
4920 fill = width - self->length;
4921
4922 u = pad(self, fill, 0, '0');
4923
Walter Dörwald068325e2002-04-15 13:36:47 +00004924 if (u == NULL)
4925 return NULL;
4926
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927 if (u->str[fill] == '+' || u->str[fill] == '-') {
4928 /* move sign to beginning of string */
4929 u->str[0] = u->str[fill];
4930 u->str[fill] = '0';
4931 }
4932
4933 return (PyObject*) u;
4934}
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935
#if 0
/* Compiled out: returns the value of unicode_freelist_size as a Python
   int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    const long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
4943
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004944PyDoc_STRVAR(startswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004945"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004946\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004947Return True if S starts with the specified prefix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004948optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004949comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950
4951static PyObject *
4952unicode_startswith(PyUnicodeObject *self,
4953 PyObject *args)
4954{
4955 PyUnicodeObject *substring;
4956 int start = 0;
4957 int end = INT_MAX;
4958 PyObject *result;
4959
Guido van Rossumb8872e62000-05-09 14:14:27 +00004960 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4961 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004962 return NULL;
4963 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4964 (PyObject *)substring);
4965 if (substring == NULL)
4966 return NULL;
4967
Guido van Rossum77f6a652002-04-03 22:41:51 +00004968 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004969
4970 Py_DECREF(substring);
4971 return result;
4972}
4973
4974
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004975PyDoc_STRVAR(endswith__doc__,
Guido van Rossum77f6a652002-04-03 22:41:51 +00004976"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004978Return True if S ends with the specified suffix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979optional start, test S beginning at that position. With optional end, stop\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +00004980comparing S at that position.");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004981
4982static PyObject *
4983unicode_endswith(PyUnicodeObject *self,
4984 PyObject *args)
4985{
4986 PyUnicodeObject *substring;
4987 int start = 0;
4988 int end = INT_MAX;
4989 PyObject *result;
4990
Guido van Rossumb8872e62000-05-09 14:14:27 +00004991 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4992 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993 return NULL;
4994 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4995 (PyObject *)substring);
4996 if (substring == NULL)
4997 return NULL;
4998
Guido van Rossum77f6a652002-04-03 22:41:51 +00004999 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000
5001 Py_DECREF(substring);
5002 return result;
5003}
5004
5005
5006static PyMethodDef unicode_methods[] = {
5007
5008 /* Order is according to common usage: often used methods should
5009 appear first, since lookup is done sequentially. */
5010
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005011 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
5012 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
5013 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
5014 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
5015 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
5016 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
5017 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
5018 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
5019 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
5020 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
5021 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
5022 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
5023 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005024 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005025/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
5026 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
5027 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
5028 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005029 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005030 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
Walter Dörwaldde02bcb2002-04-22 17:42:37 +00005031 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005032 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
5033 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
5034 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
5035 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
5036 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
5037 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
5038 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
5039 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
5040 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
5041 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
5042 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
5043 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
5044 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
5045 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005046 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00005047#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005048 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049#endif
5050
5051#if 0
5052 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00005053 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00005054#endif
5055
5056 {NULL, NULL}
5057};
5058
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059static PySequenceMethods unicode_as_sequence = {
5060 (inquiry) unicode_length, /* sq_length */
5061 (binaryfunc) PyUnicode_Concat, /* sq_concat */
5062 (intargfunc) unicode_repeat, /* sq_repeat */
5063 (intargfunc) unicode_getitem, /* sq_item */
5064 (intintargfunc) unicode_slice, /* sq_slice */
5065 0, /* sq_ass_item */
5066 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00005067 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005068};
5069
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005070static PyObject*
5071unicode_subscript(PyUnicodeObject* self, PyObject* item)
5072{
5073 if (PyInt_Check(item)) {
5074 long i = PyInt_AS_LONG(item);
5075 if (i < 0)
5076 i += PyString_GET_SIZE(self);
5077 return unicode_getitem(self, i);
5078 } else if (PyLong_Check(item)) {
5079 long i = PyLong_AsLong(item);
5080 if (i == -1 && PyErr_Occurred())
5081 return NULL;
5082 if (i < 0)
5083 i += PyString_GET_SIZE(self);
5084 return unicode_getitem(self, i);
5085 } else if (PySlice_Check(item)) {
5086 int start, stop, step, slicelength, cur, i;
5087 Py_UNICODE* source_buf;
5088 Py_UNICODE* result_buf;
5089 PyObject* result;
5090
5091 if (PySlice_GetIndicesEx((PySliceObject*)item, PyString_GET_SIZE(self),
5092 &start, &stop, &step, &slicelength) < 0) {
5093 return NULL;
5094 }
5095
5096 if (slicelength <= 0) {
5097 return PyUnicode_FromUnicode(NULL, 0);
5098 } else {
5099 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
5100 result_buf = PyMem_MALLOC(slicelength*sizeof(Py_UNICODE));
5101
5102 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
5103 result_buf[i] = source_buf[cur];
5104 }
5105
5106 result = PyUnicode_FromUnicode(result_buf, slicelength);
5107 PyMem_FREE(result_buf);
5108 return result;
5109 }
5110 } else {
5111 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
5112 return NULL;
5113 }
5114}
5115
5116static PyMappingMethods unicode_as_mapping = {
5117 (inquiry)unicode_length, /* mp_length */
5118 (binaryfunc)unicode_subscript, /* mp_subscript */
5119 (objobjargproc)0, /* mp_ass_subscript */
5120};
5121
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122static int
5123unicode_buffer_getreadbuf(PyUnicodeObject *self,
5124 int index,
5125 const void **ptr)
5126{
5127 if (index != 0) {
5128 PyErr_SetString(PyExc_SystemError,
5129 "accessing non-existent unicode segment");
5130 return -1;
5131 }
5132 *ptr = (void *) self->str;
5133 return PyUnicode_GET_DATA_SIZE(self);
5134}
5135
5136static int
5137unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5138 const void **ptr)
5139{
5140 PyErr_SetString(PyExc_TypeError,
Neal Norwitz20e72132002-06-13 21:25:17 +00005141 "cannot use unicode as modifiable buffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142 return -1;
5143}
5144
5145static int
5146unicode_buffer_getsegcount(PyUnicodeObject *self,
5147 int *lenp)
5148{
5149 if (lenp)
5150 *lenp = PyUnicode_GET_DATA_SIZE(self);
5151 return 1;
5152}
5153
5154static int
5155unicode_buffer_getcharbuf(PyUnicodeObject *self,
5156 int index,
5157 const void **ptr)
5158{
5159 PyObject *str;
5160
5161 if (index != 0) {
5162 PyErr_SetString(PyExc_SystemError,
5163 "accessing non-existent unicode segment");
5164 return -1;
5165 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005166 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167 if (str == NULL)
5168 return -1;
5169 *ptr = (void *) PyString_AS_STRING(str);
5170 return PyString_GET_SIZE(str);
5171}
5172
5173/* Helpers for PyUnicode_Format() */
5174
5175static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005176getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005177{
5178 int argidx = *p_argidx;
5179 if (argidx < arglen) {
5180 (*p_argidx)++;
5181 if (arglen < 0)
5182 return args;
5183 else
5184 return PyTuple_GetItem(args, argidx);
5185 }
5186 PyErr_SetString(PyExc_TypeError,
5187 "not enough arguments for format string");
5188 return NULL;
5189}
5190
5191#define F_LJUST (1<<0)
5192#define F_SIGN (1<<1)
5193#define F_BLANK (1<<2)
5194#define F_ALT (1<<3)
5195#define F_ZERO (1<<4)
5196
5197static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199{
5200 register int i;
5201 int len;
5202 va_list va;
5203 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205
5206 /* First, format the string as char array, then expand to Py_UNICODE
5207 array. */
5208 charbuffer = (char *)buffer;
5209 len = vsprintf(charbuffer, format, va);
5210 for (i = len - 1; i >= 0; i--)
5211 buffer[i] = (Py_UNICODE) charbuffer[i];
5212
5213 va_end(va);
5214 return len;
5215}
5216
5217static int
5218formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005219 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220 int flags,
5221 int prec,
5222 int type,
5223 PyObject *v)
5224{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005225 /* fmt = '%#.' + `prec` + `type`
5226 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227 char fmt[20];
5228 double x;
5229
5230 x = PyFloat_AsDouble(v);
5231 if (x == -1.0 && PyErr_Occurred())
5232 return -1;
5233 if (prec < 0)
5234 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5236 type = 'g';
Barry Warsawe5c492d2001-11-28 21:00:41 +00005237 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5238 (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005239 /* worst case length calc to ensure no buffer overrun:
5240 fmt = %#.<prec>g
5241 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5242 for any double rep.)
5243 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5244 If prec=0 the effective precision is 1 (the leading digit is
5245 always given), therefore increase by one to 10+prec. */
5246 if (buflen <= (size_t)10 + (size_t)prec) {
5247 PyErr_SetString(PyExc_OverflowError,
5248 "formatted float is too long (precision too long?)");
5249 return -1;
5250 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005251 return usprintf(buf, fmt, x);
5252}
5253
Tim Peters38fd5b62000-09-21 05:43:11 +00005254static PyObject*
5255formatlong(PyObject *val, int flags, int prec, int type)
5256{
5257 char *buf;
5258 int i, len;
5259 PyObject *str; /* temporary string object. */
5260 PyUnicodeObject *result;
5261
5262 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5263 if (!str)
5264 return NULL;
5265 result = _PyUnicode_New(len);
5266 for (i = 0; i < len; i++)
5267 result->str[i] = buf[i];
5268 result->str[len] = 0;
5269 Py_DECREF(str);
5270 return (PyObject*)result;
5271}
5272
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273static int
5274formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005275 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276 int flags,
5277 int prec,
5278 int type,
5279 PyObject *v)
5280{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005281 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005282 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5283 * + 1 + 1
5284 * = 24
5285 */
Tim Peters38fd5b62000-09-21 05:43:11 +00005286 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287 long x;
5288
5289 x = PyInt_AsLong(v);
5290 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005291 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005293 prec = 1;
5294
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005295 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005296 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
5297 */
5298 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005299 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005300 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005301 return -1;
5302 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005303
5304 if ((flags & F_ALT) &&
5305 (type == 'x' || type == 'X')) {
5306 /* When converting under %#x or %#X, there are a number
5307 * of issues that cause pain:
5308 * - when 0 is being converted, the C standard leaves off
5309 * the '0x' or '0X', which is inconsistent with other
5310 * %#x/%#X conversions and inconsistent with Python's
5311 * hex() function
5312 * - there are platforms that violate the standard and
5313 * convert 0 with the '0x' or '0X'
5314 * (Metrowerks, Compaq Tru64)
5315 * - there are platforms that give '0x' when converting
5316 * under %#X, but convert 0 in accordance with the
5317 * standard (OS/2 EMX)
5318 *
5319 * We can achieve the desired consistency by inserting our
5320 * own '0x' or '0X' prefix, and substituting %x/%X in place
5321 * of %#x/%#X.
5322 *
5323 * Note that this is the same approach as used in
5324 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005325 */
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005326 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
5327 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005328 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005329 else {
5330 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5331 (flags&F_ALT) ? "#" : "",
5332 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005333 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005334 return usprintf(buf, fmt, x);
5335}
5336
5337static int
5338formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005339 size_t buflen,
5340 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005341{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005342 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005343 if (PyUnicode_Check(v)) {
5344 if (PyUnicode_GET_SIZE(v) != 1)
5345 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005346 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005347 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005349 else if (PyString_Check(v)) {
5350 if (PyString_GET_SIZE(v) != 1)
5351 goto onError;
5352 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5353 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354
5355 else {
5356 /* Integer input truncated to a character */
5357 long x;
5358 x = PyInt_AsLong(v);
5359 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005360 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005361 buf[0] = (char) x;
5362 }
5363 buf[1] = '\0';
5364 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005365
5366 onError:
5367 PyErr_SetString(PyExc_TypeError,
5368 "%c requires int or char");
5369 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370}
5371
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, counted in Py_UNICODE elements, of the
   scratch buffer in which floats, ints and chars are formatted.
   XXX This is a magic number.  Each formatting routine does bounds
   checking to ensure no overflow, but a better solution may be to
   malloc a buffer of appropriate size for each format.  For now, the
   current solution is sufficient.
*/
#define FORMATBUFLEN (size_t)120
5381
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382PyObject *PyUnicode_Format(PyObject *format,
5383 PyObject *args)
5384{
5385 Py_UNICODE *fmt, *res;
5386 int fmtcnt, rescnt, reslen, arglen, argidx;
5387 int args_owned = 0;
5388 PyUnicodeObject *result = NULL;
5389 PyObject *dict = NULL;
5390 PyObject *uformat;
5391
5392 if (format == NULL || args == NULL) {
5393 PyErr_BadInternalCall();
5394 return NULL;
5395 }
5396 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005397 if (uformat == NULL)
5398 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005399 fmt = PyUnicode_AS_UNICODE(uformat);
5400 fmtcnt = PyUnicode_GET_SIZE(uformat);
5401
5402 reslen = rescnt = fmtcnt + 100;
5403 result = _PyUnicode_New(reslen);
5404 if (result == NULL)
5405 goto onError;
5406 res = PyUnicode_AS_UNICODE(result);
5407
5408 if (PyTuple_Check(args)) {
5409 arglen = PyTuple_Size(args);
5410 argidx = 0;
5411 }
5412 else {
5413 arglen = -1;
5414 argidx = -2;
5415 }
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +00005416 if (args->ob_type->tp_as_mapping && !PyTuple_Check(args))
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 dict = args;
5418
5419 while (--fmtcnt >= 0) {
5420 if (*fmt != '%') {
5421 if (--rescnt < 0) {
5422 rescnt = fmtcnt + 100;
5423 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005424 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425 return NULL;
5426 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5427 --rescnt;
5428 }
5429 *res++ = *fmt++;
5430 }
5431 else {
5432 /* Got a format specifier */
5433 int flags = 0;
5434 int width = -1;
5435 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005436 Py_UNICODE c = '\0';
5437 Py_UNICODE fill;
5438 PyObject *v = NULL;
5439 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005440 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005441 Py_UNICODE sign;
5442 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005443 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444
5445 fmt++;
5446 if (*fmt == '(') {
5447 Py_UNICODE *keystart;
5448 int keylen;
5449 PyObject *key;
5450 int pcount = 1;
5451
5452 if (dict == NULL) {
5453 PyErr_SetString(PyExc_TypeError,
5454 "format requires a mapping");
5455 goto onError;
5456 }
5457 ++fmt;
5458 --fmtcnt;
5459 keystart = fmt;
5460 /* Skip over balanced parentheses */
5461 while (pcount > 0 && --fmtcnt >= 0) {
5462 if (*fmt == ')')
5463 --pcount;
5464 else if (*fmt == '(')
5465 ++pcount;
5466 fmt++;
5467 }
5468 keylen = fmt - keystart - 1;
5469 if (fmtcnt < 0 || pcount > 0) {
5470 PyErr_SetString(PyExc_ValueError,
5471 "incomplete format key");
5472 goto onError;
5473 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005474#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00005475 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005476 then looked up since Python uses strings to hold
5477 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005478 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005479 key = PyUnicode_EncodeUTF8(keystart,
5480 keylen,
5481 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005482#else
5483 key = PyUnicode_FromUnicode(keystart, keylen);
5484#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485 if (key == NULL)
5486 goto onError;
5487 if (args_owned) {
5488 Py_DECREF(args);
5489 args_owned = 0;
5490 }
5491 args = PyObject_GetItem(dict, key);
5492 Py_DECREF(key);
5493 if (args == NULL) {
5494 goto onError;
5495 }
5496 args_owned = 1;
5497 arglen = -1;
5498 argidx = -2;
5499 }
5500 while (--fmtcnt >= 0) {
5501 switch (c = *fmt++) {
5502 case '-': flags |= F_LJUST; continue;
5503 case '+': flags |= F_SIGN; continue;
5504 case ' ': flags |= F_BLANK; continue;
5505 case '#': flags |= F_ALT; continue;
5506 case '0': flags |= F_ZERO; continue;
5507 }
5508 break;
5509 }
5510 if (c == '*') {
5511 v = getnextarg(args, arglen, &argidx);
5512 if (v == NULL)
5513 goto onError;
5514 if (!PyInt_Check(v)) {
5515 PyErr_SetString(PyExc_TypeError,
5516 "* wants int");
5517 goto onError;
5518 }
5519 width = PyInt_AsLong(v);
5520 if (width < 0) {
5521 flags |= F_LJUST;
5522 width = -width;
5523 }
5524 if (--fmtcnt >= 0)
5525 c = *fmt++;
5526 }
5527 else if (c >= '0' && c <= '9') {
5528 width = c - '0';
5529 while (--fmtcnt >= 0) {
5530 c = *fmt++;
5531 if (c < '0' || c > '9')
5532 break;
5533 if ((width*10) / 10 != width) {
5534 PyErr_SetString(PyExc_ValueError,
5535 "width too big");
5536 goto onError;
5537 }
5538 width = width*10 + (c - '0');
5539 }
5540 }
5541 if (c == '.') {
5542 prec = 0;
5543 if (--fmtcnt >= 0)
5544 c = *fmt++;
5545 if (c == '*') {
5546 v = getnextarg(args, arglen, &argidx);
5547 if (v == NULL)
5548 goto onError;
5549 if (!PyInt_Check(v)) {
5550 PyErr_SetString(PyExc_TypeError,
5551 "* wants int");
5552 goto onError;
5553 }
5554 prec = PyInt_AsLong(v);
5555 if (prec < 0)
5556 prec = 0;
5557 if (--fmtcnt >= 0)
5558 c = *fmt++;
5559 }
5560 else if (c >= '0' && c <= '9') {
5561 prec = c - '0';
5562 while (--fmtcnt >= 0) {
5563 c = Py_CHARMASK(*fmt++);
5564 if (c < '0' || c > '9')
5565 break;
5566 if ((prec*10) / 10 != prec) {
5567 PyErr_SetString(PyExc_ValueError,
5568 "prec too big");
5569 goto onError;
5570 }
5571 prec = prec*10 + (c - '0');
5572 }
5573 }
5574 } /* prec */
5575 if (fmtcnt >= 0) {
5576 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 if (--fmtcnt >= 0)
5578 c = *fmt++;
5579 }
5580 }
5581 if (fmtcnt < 0) {
5582 PyErr_SetString(PyExc_ValueError,
5583 "incomplete format");
5584 goto onError;
5585 }
5586 if (c != '%') {
5587 v = getnextarg(args, arglen, &argidx);
5588 if (v == NULL)
5589 goto onError;
5590 }
5591 sign = 0;
5592 fill = ' ';
5593 switch (c) {
5594
5595 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005596 pbuf = formatbuf;
5597 /* presume that buffer length is at least 1 */
5598 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005599 len = 1;
5600 break;
5601
5602 case 's':
5603 case 'r':
5604 if (PyUnicode_Check(v) && c == 's') {
5605 temp = v;
5606 Py_INCREF(temp);
5607 }
5608 else {
5609 PyObject *unicode;
5610 if (c == 's')
5611 temp = PyObject_Str(v);
5612 else
5613 temp = PyObject_Repr(v);
5614 if (temp == NULL)
5615 goto onError;
5616 if (!PyString_Check(temp)) {
5617 /* XXX Note: this should never happen, since
5618 PyObject_Repr() and PyObject_Str() assure
5619 this */
5620 Py_DECREF(temp);
5621 PyErr_SetString(PyExc_TypeError,
5622 "%s argument has non-string str()");
5623 goto onError;
5624 }
Fred Drakee4315f52000-05-09 19:53:39 +00005625 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005626 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005627 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005628 "strict");
5629 Py_DECREF(temp);
5630 temp = unicode;
5631 if (temp == NULL)
5632 goto onError;
5633 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005634 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005635 len = PyUnicode_GET_SIZE(temp);
5636 if (prec >= 0 && len > prec)
5637 len = prec;
5638 break;
5639
5640 case 'i':
5641 case 'd':
5642 case 'u':
5643 case 'o':
5644 case 'x':
5645 case 'X':
5646 if (c == 'i')
5647 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005648 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005649 temp = formatlong(v, flags, prec, c);
5650 if (!temp)
5651 goto onError;
5652 pbuf = PyUnicode_AS_UNICODE(temp);
5653 len = PyUnicode_GET_SIZE(temp);
5654 /* unbounded ints can always produce
5655 a sign character! */
5656 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005657 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005658 else {
5659 pbuf = formatbuf;
5660 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5661 flags, prec, c, v);
5662 if (len < 0)
5663 goto onError;
5664 /* only d conversion is signed */
5665 sign = c == 'd';
5666 }
5667 if (flags & F_ZERO)
5668 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005669 break;
5670
5671 case 'e':
5672 case 'E':
5673 case 'f':
5674 case 'g':
5675 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005676 pbuf = formatbuf;
5677 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5678 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005679 if (len < 0)
5680 goto onError;
5681 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005682 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005683 fill = '0';
5684 break;
5685
5686 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005687 pbuf = formatbuf;
5688 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005689 if (len < 0)
5690 goto onError;
5691 break;
5692
5693 default:
5694 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005695 "unsupported format character '%c' (0x%x) "
5696 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005697 (31<=c && c<=126) ? c : '?',
5698 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 goto onError;
5700 }
5701 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005702 if (*pbuf == '-' || *pbuf == '+') {
5703 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704 len--;
5705 }
5706 else if (flags & F_SIGN)
5707 sign = '+';
5708 else if (flags & F_BLANK)
5709 sign = ' ';
5710 else
5711 sign = 0;
5712 }
5713 if (width < len)
5714 width = len;
5715 if (rescnt < width + (sign != 0)) {
5716 reslen -= rescnt;
5717 rescnt = width + fmtcnt + 100;
5718 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005719 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005720 return NULL;
5721 res = PyUnicode_AS_UNICODE(result)
5722 + reslen - rescnt;
5723 }
5724 if (sign) {
5725 if (fill != ' ')
5726 *res++ = sign;
5727 rescnt--;
5728 if (width > len)
5729 width--;
5730 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005731 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5732 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005733 assert(pbuf[1] == c);
5734 if (fill != ' ') {
5735 *res++ = *pbuf++;
5736 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005737 }
Tim Petersfff53252001-04-12 18:38:48 +00005738 rescnt -= 2;
5739 width -= 2;
5740 if (width < 0)
5741 width = 0;
5742 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005743 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744 if (width > len && !(flags & F_LJUST)) {
5745 do {
5746 --rescnt;
5747 *res++ = fill;
5748 } while (--width > len);
5749 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005750 if (fill == ' ') {
5751 if (sign)
5752 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005753 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005754 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005755 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005756 *res++ = *pbuf++;
5757 *res++ = *pbuf++;
5758 }
5759 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005760 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 res += len;
5762 rescnt -= len;
5763 while (--width >= len) {
5764 --rescnt;
5765 *res++ = ' ';
5766 }
5767 if (dict && (argidx < arglen) && c != '%') {
5768 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00005769 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005770 goto onError;
5771 }
5772 Py_XDECREF(temp);
5773 } /* '%' */
5774 } /* until end */
5775 if (argidx < arglen && !dict) {
5776 PyErr_SetString(PyExc_TypeError,
Raymond Hettinger0ebac972002-05-21 15:14:57 +00005777 "not all arguments converted during string formatting");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778 goto onError;
5779 }
5780
5781 if (args_owned) {
5782 Py_DECREF(args);
5783 }
5784 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005785 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005786 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005787 return (PyObject *)result;
5788
5789 onError:
5790 Py_XDECREF(result);
5791 Py_DECREF(uformat);
5792 if (args_owned) {
5793 Py_DECREF(args);
5794 }
5795 return NULL;
5796}
5797
/* Buffer interface slot table for unicode objects; installed as
   tp_as_buffer in PyUnicode_Type. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
5804
/* Forward declaration: unicode_new() calls this before its definition. */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5807
Tim Peters6d6c1a32001-08-02 04:15:00 +00005808static PyObject *
5809unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5810{
5811 PyObject *x = NULL;
5812 static char *kwlist[] = {"string", "encoding", "errors", 0};
5813 char *encoding = NULL;
5814 char *errors = NULL;
5815
Guido van Rossume023fe02001-08-30 03:12:59 +00005816 if (type != &PyUnicode_Type)
5817 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005818 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5819 kwlist, &x, &encoding, &errors))
5820 return NULL;
5821 if (x == NULL)
5822 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00005823 if (encoding == NULL && errors == NULL)
5824 return PyObject_Unicode(x);
5825 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00005826 return PyUnicode_FromEncodedObject(x, encoding, errors);
5827}
5828
Guido van Rossume023fe02001-08-30 03:12:59 +00005829static PyObject *
5830unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5831{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005832 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005833 int n;
5834
5835 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5836 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5837 if (tmp == NULL)
5838 return NULL;
5839 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005840 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5841 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005842 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005843 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5844 if (pnew->str == NULL) {
5845 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005846 PyObject_Del(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005847 return NULL;
5848 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005849 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5850 pnew->length = n;
5851 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005852 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005853 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005854}
5855
/* Docstring of the unicode type; installed as tp_doc in PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +00005862
/* Type object of the "unicode" type.  It derives from PyBaseString_Type,
   may itself be subclassed (Py_TPFLAGS_BASETYPE), creates instances
   through unicode_new() and frees them with PyObject_Del. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,        /* tp_dealloc */
    0,                                  /* tp_print */
    0,                                  /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    0,                                  /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    &unicode_as_mapping,                /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash */
    0,                                  /* tp_call */
    (reprfunc) unicode_str,             /* tp_str */
    PyObject_GenericGetAttr,            /* tp_getattro */
    0,                                  /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
    unicode_doc,                        /* tp_doc */
    0,                                  /* tp_traverse */
    0,                                  /* tp_clear */
    0,                                  /* tp_richcompare */
    0,                                  /* tp_weaklistoffset */
    0,                                  /* tp_iter */
    0,                                  /* tp_iternext */
    unicode_methods,                    /* tp_methods */
    0,                                  /* tp_members */
    0,                                  /* tp_getset */
    &PyBaseString_Type,                 /* tp_base */
    0,                                  /* tp_dict */
    0,                                  /* tp_descr_get */
    0,                                  /* tp_descr_set */
    0,                                  /* tp_dictoffset */
    0,                                  /* tp_init */
    0,                                  /* tp_alloc */
    unicode_new,                        /* tp_new */
    PyObject_Del,                       /* tp_free */
};
5906
5907/* Initialize the Unicode implementation */
5908
Thomas Wouters78890102000-07-22 19:25:51 +00005909void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005911 int i;
5912
Fred Drakee4315f52000-05-09 19:53:39 +00005913 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005914 unicode_freelist = NULL;
5915 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005916 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005917 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005918 for (i = 0; i < 256; i++)
5919 unicode_latin1[i] = NULL;
Guido van Rossumcacfc072002-05-24 19:01:59 +00005920 if (PyType_Ready(&PyUnicode_Type) < 0)
5921 Py_FatalError("Can't initialize 'unicode'");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005922}
5923
5924/* Finalize the Unicode implementation */
5925
5926void
Thomas Wouters78890102000-07-22 19:25:51 +00005927_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005929 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005930 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005931
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005932 Py_XDECREF(unicode_empty);
5933 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005934
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005935 for (i = 0; i < 256; i++) {
5936 if (unicode_latin1[i]) {
5937 Py_DECREF(unicode_latin1[i]);
5938 unicode_latin1[i] = NULL;
5939 }
5940 }
5941
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005942 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005943 PyUnicodeObject *v = u;
5944 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005945 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005946 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005947 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005948 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005950 unicode_freelist = NULL;
5951 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005952}