blob: a4d455ad72e40f188abd8a2097150c9256e11e7b [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep alive" optimization.

   An object placed on the free list keeps its character buffer as long
   as its length is below this limit; longer buffers are released right
   away.  Reusing the small buffers saves malloc() calls for short
   strings.

   Worst case, this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + malloc()-overhead)
   bytes of memory parked on the free list.

   A limit of 0 disables the optimization.

   Note: this is an experimental feature.  If Unicode objects cause
   core dumps, disable it.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order switches; little endian is assumed unless the build
   defines WORDS_BIGENDIAN. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000222 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 *unicode = (PyObject *)w;
280 return 0;
281 }
282
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v, length);
286}
287
/* File-private convenience wrapper around PyUnicode_Resize(): accepts
   the address of a variable of any Unicode object pointer type and
   casts it to the PyObject ** the public API expects.
   For use in unicodeobject.c only ! */
#define _PyUnicode_Resize(objptr, newlen) \
        PyUnicode_Resize(((PyObject **)(objptr)), newlen)
291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293 int size)
294{
295 PyUnicodeObject *unicode;
296
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
299 if (u != NULL) {
300
301 /* Optimization for empty strings */
302 if (size == 0 && unicode_empty != NULL) {
303 Py_INCREF(unicode_empty);
304 return (PyObject *)unicode_empty;
305 }
306
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size == 1 && *u < 256) {
310 unicode = unicode_latin1[*u];
311 if (!unicode) {
312 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000313 if (!unicode)
314 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000315 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 unicode_latin1[*u] = unicode;
317 }
318 Py_INCREF(unicode);
319 return (PyObject *)unicode;
320 }
321 }
322
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 unicode = _PyUnicode_New(size);
324 if (!unicode)
325 return NULL;
326
327 /* Copy the Unicode data into the new object */
328 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330
331 return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337 int size)
338{
339 PyUnicodeObject *unicode;
340
341 if (w == NULL) {
342 PyErr_BadInternalCall();
343 return NULL;
344 }
345
346 unicode = _PyUnicode_New(size);
347 if (!unicode)
348 return NULL;
349
350 /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354 {
355 register Py_UNICODE *u;
356 register int i;
357 u = PyUnicode_AS_UNICODE(unicode);
358 for (i = size; i >= 0; i--)
359 *u++ = *w++;
360 }
361#endif
362
363 return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367 register wchar_t *w,
368 int size)
369{
370 if (unicode == NULL) {
371 PyErr_BadInternalCall();
372 return -1;
373 }
374 if (size > PyUnicode_GET_SIZE(unicode))
375 size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379 {
380 register Py_UNICODE *u;
381 register int i;
382 u = PyUnicode_AS_UNICODE(unicode);
383 for (i = size; i >= 0; i--)
384 *w++ = *u++;
385 }
386#endif
387
388 return size;
389}
390
391#endif
392
393PyObject *PyUnicode_FromObject(register PyObject *obj)
394{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000395 /* XXX Perhaps we should make this API an alias of
396 PyObject_Unicode() instead ?! */
397 if (PyUnicode_CheckExact(obj)) {
398 Py_INCREF(obj);
399 return obj;
400 }
401 if (PyUnicode_Check(obj)) {
402 /* For a Unicode subtype that's not a Unicode object,
403 return a true Unicode object with the same data. */
404 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
405 PyUnicode_GET_SIZE(obj));
406 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000407 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
408}
409
410PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
411 const char *encoding,
412 const char *errors)
413{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000414 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000416 int owned = 0;
417 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418
419 if (obj == NULL) {
420 PyErr_BadInternalCall();
421 return NULL;
422 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000423
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000424#if 0
425 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000426 that no encodings is given and then redirect to
427 PyObject_Unicode() which then applies the additional logic for
428 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000429
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000430 NOTE: This API should really only be used for object which
431 represent *encoded* Unicode !
432
433 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000434 if (PyUnicode_Check(obj)) {
435 if (encoding) {
436 PyErr_SetString(PyExc_TypeError,
437 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000438 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000439 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000440 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000441 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000442#else
443 if (PyUnicode_Check(obj)) {
444 PyErr_SetString(PyExc_TypeError,
445 "decoding Unicode is not supported");
446 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000448#endif
449
450 /* Coerce object */
451 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000452 s = PyString_AS_STRING(obj);
453 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000454 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000455 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
456 /* Overwrite the error message with something more useful in
457 case of a TypeError. */
458 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000459 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000460 "coercing to Unicode: need string or buffer, "
461 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000462 obj->ob_type->tp_name);
463 goto onError;
464 }
465
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000466 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 if (len == 0) {
468 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000469 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000471 else
472 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000473
Greg Steinaf36a3a2000-07-17 09:04:43 +0000474 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000475 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000476 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000477 return v;
478
479 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000480 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000481 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000482 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484}
485
486PyObject *PyUnicode_Decode(const char *s,
487 int size,
488 const char *encoding,
489 const char *errors)
490{
491 PyObject *buffer = NULL, *unicode;
492
Fred Drakee4315f52000-05-09 19:53:39 +0000493 if (encoding == NULL)
494 encoding = PyUnicode_GetDefaultEncoding();
495
496 /* Shortcuts for common default encodings */
497 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000499 else if (strcmp(encoding, "latin-1") == 0)
500 return PyUnicode_DecodeLatin1(s, size, errors);
501 else if (strcmp(encoding, "ascii") == 0)
502 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503
504 /* Decode via the codec registry */
505 buffer = PyBuffer_FromMemory((void *)s, size);
506 if (buffer == NULL)
507 goto onError;
508 unicode = PyCodec_Decode(buffer, encoding, errors);
509 if (unicode == NULL)
510 goto onError;
511 if (!PyUnicode_Check(unicode)) {
512 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000513 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 unicode->ob_type->tp_name);
515 Py_DECREF(unicode);
516 goto onError;
517 }
518 Py_DECREF(buffer);
519 return unicode;
520
521 onError:
522 Py_XDECREF(buffer);
523 return NULL;
524}
525
526PyObject *PyUnicode_Encode(const Py_UNICODE *s,
527 int size,
528 const char *encoding,
529 const char *errors)
530{
531 PyObject *v, *unicode;
532
533 unicode = PyUnicode_FromUnicode(s, size);
534 if (unicode == NULL)
535 return NULL;
536 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
537 Py_DECREF(unicode);
538 return v;
539}
540
541PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
542 const char *encoding,
543 const char *errors)
544{
545 PyObject *v;
546
547 if (!PyUnicode_Check(unicode)) {
548 PyErr_BadArgument();
549 goto onError;
550 }
Fred Drakee4315f52000-05-09 19:53:39 +0000551
552 if (encoding == NULL)
553 encoding = PyUnicode_GetDefaultEncoding();
554
555 /* Shortcuts for common default encodings */
556 if (errors == NULL) {
557 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000558 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000559 else if (strcmp(encoding, "latin-1") == 0)
560 return PyUnicode_AsLatin1String(unicode);
561 else if (strcmp(encoding, "ascii") == 0)
562 return PyUnicode_AsASCIIString(unicode);
563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000564
565 /* Encode via the codec registry */
566 v = PyCodec_Encode(unicode, encoding, errors);
567 if (v == NULL)
568 goto onError;
569 /* XXX Should we really enforce this ? */
570 if (!PyString_Check(v)) {
571 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000572 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573 v->ob_type->tp_name);
574 Py_DECREF(v);
575 goto onError;
576 }
577 return v;
578
579 onError:
580 return NULL;
581}
582
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000583PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
584 const char *errors)
585{
586 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
587
588 if (v)
589 return v;
590 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
591 if (v && errors == NULL)
592 ((PyUnicodeObject *)unicode)->defenc = v;
593 return v;
594}
595
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
597{
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
601 }
602 return PyUnicode_AS_UNICODE(unicode);
603
604 onError:
605 return NULL;
606}
607
608int PyUnicode_GetSize(PyObject *unicode)
609{
610 if (!PyUnicode_Check(unicode)) {
611 PyErr_BadArgument();
612 goto onError;
613 }
614 return PyUnicode_GET_SIZE(unicode);
615
616 onError:
617 return -1;
618}
619
Thomas Wouters78890102000-07-22 19:25:51 +0000620const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000621{
622 return unicode_default_encoding;
623}
624
625int PyUnicode_SetDefaultEncoding(const char *encoding)
626{
627 PyObject *v;
628
629 /* Make sure the encoding is valid. As side effect, this also
630 loads the encoding into the codec registry cache. */
631 v = _PyCodec_Lookup(encoding);
632 if (v == NULL)
633 goto onError;
634 Py_DECREF(v);
635 strncpy(unicode_default_encoding,
636 encoding,
637 sizeof(unicode_default_encoding));
638 return 0;
639
640 onError:
641 return -1;
642}
643
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000644/* --- UTF-7 Codec -------------------------------------------------------- */
645
646/* see RFC2152 for details */
647
/* Classification of the 7-bit characters for the UTF-7 codec: tells
   whether a character is special, i.e. cannot be directly encoded.

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   One row per 16 code points; the row's first code point is given in
   the trailing comment. */
static
char utf7_special[128] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,   /* 0x00 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,   /* 0x20 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,   /* 0x30 */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x40 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,   /* 0x50 */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x60 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,   /* 0x70 */
};
666
/* True if character c must be encoded in a shift sequence: anything
   above 127, anything classified 1 in utf7_special, plus the optional
   classes (2 = whitespace, 3 = Set O) when the matching flag is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to the corresponding base64 digit. */
#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a base64 digit.  isalnum() is only defined for values
   representable as unsigned char, and the base64 alphabet is pure
   ASCII, so everything outside 0..127 is rejected before the call. */
#define B64CHAR(c) \
    (((unsigned long)(c) < 128 && isalnum((int)(c))) || \
     (c) == '+' || (c) == '/')

/* Map a base64 digit back to its 6-bit value.  Only meaningful for
   characters accepted by B64CHAR(). */
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 digits to `out` for as long as at least 6 of the `bits`
   pending bits in `ch` remain; `out` and `bits` are updated in place. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits)-6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* Emit one Py_UNICODE to `out` for every complete group of 16 pending
   bits in `ch`; `out`, `bits` and `surrogate` are updated in place.

   The expansion site must provide a variable `errmsg` and a label
   `utf7Error`, which are used to report surrogates.

   NOTE(review): the range tested below (0xDC00..0xDFFF) is the low
   surrogate range, while the comments speak of the high surrogate --
   confirm which one is intended before changing it. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high \
                   surrogate so let's not bother seeing if the low \
                   surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't \
                   represent it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)
701
702static
703int utf7_decoding_error(Py_UNICODE **dest,
704 const char *errors,
705 const char *details)
706{
707 if ((errors == NULL) ||
708 (strcmp(errors,"strict") == 0)) {
709 PyErr_Format(PyExc_UnicodeError,
710 "UTF-7 decoding error: %.400s",
711 details);
712 return -1;
713 }
714 else if (strcmp(errors,"ignore") == 0) {
715 return 0;
716 }
717 else if (strcmp(errors,"replace") == 0) {
718 if (dest != NULL) {
719 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
720 (*dest)++;
721 }
722 return 0;
723 }
724 else {
725 PyErr_Format(PyExc_ValueError,
726 "UTF-7 decoding error; unknown error handling code: %.400s",
727 errors);
728 return -1;
729 }
730}
731
732PyObject *PyUnicode_DecodeUTF7(const char *s,
733 int size,
734 const char *errors)
735{
736 const char *e;
737 PyUnicodeObject *unicode;
738 Py_UNICODE *p;
739 const char *errmsg = "";
740 int inShift = 0;
741 unsigned int bitsleft = 0;
742 unsigned long charsleft = 0;
743 int surrogate = 0;
744
745 unicode = _PyUnicode_New(size);
746 if (!unicode)
747 return NULL;
748 if (size == 0)
749 return (PyObject *)unicode;
750
751 p = unicode->str;
752 e = s + size;
753
754 while (s < e) {
755 Py_UNICODE ch = *s;
756
757 if (inShift) {
758 if ((ch == '-') || !B64CHAR(ch)) {
759 inShift = 0;
760 s++;
761
762 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
763 if (bitsleft >= 6) {
764 /* The shift sequence has a partial character in it. If
765 bitsleft < 6 then we could just classify it as padding
766 but that is not the case here */
767
768 errmsg = "partial character in shift sequence";
769 goto utf7Error;
770 }
771 /* According to RFC2152 the remaining bits should be zero. We
772 choose to signal an error/insert a replacement character
773 here so indicate the potential of a misencoded character. */
774
775 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
776 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
777 errmsg = "non-zero padding bits in shift sequence";
778 goto utf7Error;
779 }
780
781 if (ch == '-') {
782 if ((s < e) && (*(s) == '-')) {
783 *p++ = '-';
784 inShift = 1;
785 }
786 } else if (SPECIAL(ch,0,0)) {
787 errmsg = "unexpected special character";
788 goto utf7Error;
789 } else {
790 *p++ = ch;
791 }
792 } else {
793 charsleft = (charsleft << 6) | UB64(ch);
794 bitsleft += 6;
795 s++;
796 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
797 }
798 }
799 else if ( ch == '+' ) {
800 s++;
801 if (s < e && *s == '-') {
802 s++;
803 *p++ = '+';
804 } else
805 {
806 inShift = 1;
807 bitsleft = 0;
808 }
809 }
810 else if (SPECIAL(ch,0,0)) {
811 errmsg = "unexpected special character";
812 s++;
813 goto utf7Error;
814 }
815 else {
816 *p++ = ch;
817 s++;
818 }
819 continue;
820 utf7Error:
821 if (utf7_decoding_error(&p, errors, errmsg))
822 goto onError;
823 }
824
825 if (inShift) {
826 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
827 goto onError;
828 }
829
830 if (_PyUnicode_Resize(&unicode, p - unicode->str))
831 goto onError;
832
833 return (PyObject *)unicode;
834
835onError:
836 Py_DECREF(unicode);
837 return NULL;
838}
839
840
841PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
842 int size,
843 int encodeSetO,
844 int encodeWhiteSpace,
845 const char *errors)
846{
847 PyObject *v;
848 /* It might be possible to tighten this worst case */
849 unsigned int cbAllocated = 5 * size;
850 int inShift = 0;
851 int i = 0;
852 unsigned int bitsleft = 0;
853 unsigned long charsleft = 0;
854 char * out;
855 char * start;
856
857 if (size == 0)
858 return PyString_FromStringAndSize(NULL, 0);
859
860 v = PyString_FromStringAndSize(NULL, cbAllocated);
861 if (v == NULL)
862 return NULL;
863
864 start = out = PyString_AS_STRING(v);
865 for (;i < size; ++i) {
866 Py_UNICODE ch = s[i];
867
868 if (!inShift) {
869 if (ch == '+') {
870 *out++ = '+';
871 *out++ = '-';
872 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
873 charsleft = ch;
874 bitsleft = 16;
875 *out++ = '+';
876 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
877 inShift = bitsleft > 0;
878 } else {
879 *out++ = (char) ch;
880 }
881 } else {
882 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
883 *out++ = B64(charsleft << (6-bitsleft));
884 charsleft = 0;
885 bitsleft = 0;
886 /* Characters not in the BASE64 set implicitly unshift the sequence
887 so no '-' is required, except if the character is itself a '-' */
888 if (B64CHAR(ch) || ch == '-') {
889 *out++ = '-';
890 }
891 inShift = 0;
892 *out++ = (char) ch;
893 } else {
894 bitsleft += 16;
895 charsleft = (charsleft << 16) | ch;
896 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
897
898 /* If the next character is special then we dont' need to terminate
899 the shift sequence. If the next character is not a BASE64 character
900 or '-' then the shift sequence will be terminated implicitly and we
901 don't have to insert a '-'. */
902
903 if (bitsleft == 0) {
904 if (i + 1 < size) {
905 Py_UNICODE ch2 = s[i+1];
906
907 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
908
909 } else if (B64CHAR(ch2) || ch2 == '-') {
910 *out++ = '-';
911 inShift = 0;
912 } else {
913 inShift = 0;
914 }
915
916 }
917 else {
918 *out++ = '-';
919 inShift = 0;
920 }
921 }
922 }
923 }
924 }
925 if (bitsleft) {
926 *out++= B64(charsleft << (6-bitsleft) );
927 *out++ = '-';
928 }
929
930 if (_PyString_Resize(&v, out - start)) {
931 Py_DECREF(v);
932 return NULL;
933 }
934 return v;
935}
936
937#undef SPECIAL
938#undef B64
939#undef B64CHAR
940#undef UB64
941#undef ENCODE
942#undef DECODE
943
Guido van Rossumd57fd912000-03-10 22:53:23 +0000944/* --- UTF-8 Codec -------------------------------------------------------- */
945
/* Map a UTF-8 lead byte to the total length of the sequence it starts;
   zero means the byte cannot start a sequence (continuation bytes
   0x80..0xBF and the never-valid 0xFE/0xFF).  See RFC 2279 for details.

   The table is purely lexical: 0xC0/0xC1 map to 2 and 0xF8..0xFD map
   to 5/6 even though no valid text uses them, so the decoder has to
   reject overlong and out-of-range sequences itself.

   The table is only ever read, hence const. */
static const
char utf8_code_length[256] = {
    /* 0x00..0x7F: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xF7: four, 0xF8..0xFB: five, 0xFC..0xFD: six, 0xFE/0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
967
968static
969int utf8_decoding_error(const char **source,
970 Py_UNICODE **dest,
971 const char *errors,
972 const char *details)
973{
974 if ((errors == NULL) ||
975 (strcmp(errors,"strict") == 0)) {
976 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000977 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000978 details);
979 return -1;
980 }
981 else if (strcmp(errors,"ignore") == 0) {
982 (*source)++;
983 return 0;
984 }
985 else if (strcmp(errors,"replace") == 0) {
986 (*source)++;
987 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
988 (*dest)++;
989 return 0;
990 }
991 else {
992 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000993 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000994 errors);
995 return -1;
996 }
997}
998
Guido van Rossumd57fd912000-03-10 22:53:23 +0000999PyObject *PyUnicode_DecodeUTF8(const char *s,
1000 int size,
1001 const char *errors)
1002{
1003 int n;
1004 const char *e;
1005 PyUnicodeObject *unicode;
1006 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001007 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008
1009 /* Note: size will always be longer than the resulting Unicode
1010 character count */
1011 unicode = _PyUnicode_New(size);
1012 if (!unicode)
1013 return NULL;
1014 if (size == 0)
1015 return (PyObject *)unicode;
1016
1017 /* Unpack UTF-8 encoded data */
1018 p = unicode->str;
1019 e = s + size;
1020
1021 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001022 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023
1024 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001025 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026 s++;
1027 continue;
1028 }
1029
1030 n = utf8_code_length[ch];
1031
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001032 if (s + n > e) {
1033 errmsg = "unexpected end of data";
1034 goto utf8Error;
1035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036
1037 switch (n) {
1038
1039 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001040 errmsg = "unexpected code byte";
1041 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042
1043 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001044 errmsg = "internal error";
1045 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046
1047 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001048 if ((s[1] & 0xc0) != 0x80) {
1049 errmsg = "invalid data";
1050 goto utf8Error;
1051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001052 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001053 if (ch < 0x80) {
1054 errmsg = "illegal encoding";
1055 goto utf8Error;
1056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001058 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059 break;
1060
1061 case 3:
1062 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001063 (s[2] & 0xc0) != 0x80) {
1064 errmsg = "invalid data";
1065 goto utf8Error;
1066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001068 if (ch < 0x0800) {
1069 /* Note: UTF-8 encodings of surrogates are considered
1070 legal UTF-8 sequences;
1071
1072 XXX For wide builds (UCS-4) we should probably try
1073 to recombine the surrogates into a single code
1074 unit.
1075 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001076 errmsg = "illegal encoding";
1077 goto utf8Error;
1078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001080 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001081 break;
1082
1083 case 4:
1084 if ((s[1] & 0xc0) != 0x80 ||
1085 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001086 (s[3] & 0xc0) != 0x80) {
1087 errmsg = "invalid data";
1088 goto utf8Error;
1089 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001090 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1091 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1092 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001093 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001094 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001095 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001096 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001097 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001098 errmsg = "illegal encoding";
1099 goto utf8Error;
1100 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001101#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001102 *p++ = (Py_UNICODE)ch;
1103#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001104 /* compute and append the two surrogates: */
1105
1106 /* translate from 10000..10FFFF to 0..FFFF */
1107 ch -= 0x10000;
1108
1109 /* high surrogate = top 10 bits added to D800 */
1110 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1111
1112 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001113 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001114#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 break;
1116
1117 default:
1118 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001119 errmsg = "unsupported Unicode code range";
1120 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 }
1122 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001123 continue;
1124
1125 utf8Error:
1126 if (utf8_decoding_error(&s, &p, errors, errmsg))
1127 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128 }
1129
1130 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001131 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132 goto onError;
1133
1134 return (PyObject *)unicode;
1135
1136onError:
1137 Py_DECREF(unicode);
1138 return NULL;
1139}
1140
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001141/* Not used anymore, now that the encoder supports UTF-16
1142 surrogates. */
Greg Steinaf36a3a2000-07-17 09:04:43 +00001143#if 0
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144static
1145int utf8_encoding_error(const Py_UNICODE **source,
1146 char **dest,
1147 const char *errors,
1148 const char *details)
1149{
1150 if ((errors == NULL) ||
1151 (strcmp(errors,"strict") == 0)) {
1152 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001153 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001154 details);
1155 return -1;
1156 }
1157 else if (strcmp(errors,"ignore") == 0) {
1158 return 0;
1159 }
1160 else if (strcmp(errors,"replace") == 0) {
1161 **dest = '?';
1162 (*dest)++;
1163 return 0;
1164 }
1165 else {
1166 PyErr_Format(PyExc_ValueError,
1167 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001168 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169 errors);
1170 return -1;
1171 }
1172}
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001173#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174
Tim Peters7e3d9612002-04-21 03:26:37 +00001175PyObject *
1176PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1177 int size,
1178 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179{
1180 PyObject *v;
1181 char *p;
Tim Peters7e3d9612002-04-21 03:26:37 +00001182 int allocated = 0;
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001183 int i;
1184
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001185 /* Short-cut for emtpy strings */
1186 if (size == 0)
1187 return PyString_FromStringAndSize(NULL, 0);
1188
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001189 for (i = 0; i < size; ) {
1190 Py_UCS4 ch = s[i++];
1191 if (ch < 0x80)
1192 allocated += 1;
1193 else if (ch < 0x0800)
1194 allocated += 2;
1195 else if (ch < 0x10000) {
1196 /* Check for high surrogate */
1197 if (0xD800 <= ch && ch <= 0xDBFF &&
1198 i != size &&
1199 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
1200 allocated += 1;
1201 i++;
1202 }
1203 allocated += 3;
1204 } else
1205 allocated += 4;
1206 }
1207
1208 v = PyString_FromStringAndSize(NULL, allocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209 if (v == NULL)
1210 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001212 p = PyString_AS_STRING(v);
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001213 for (i = 0; i < size; ) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001214 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001215
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001216 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217 *p++ = (char) ch;
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001218 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001219
Guido van Rossumd57fd912000-03-10 22:53:23 +00001220 else if (ch < 0x0800) {
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001221 *p++ = (char)(0xc0 | (ch >> 6));
1222 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001223 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001224
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001225 else {
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001226
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001227 if (ch < 0x10000) {
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001228 /* Check for high surrogate */
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001229 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1230 Py_UCS4 ch2 = s[i];
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001231 /* Check for low surrogate */
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001232 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001233 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
1234 *p++ = (char)((ch >> 18) | 0xf0);
1235 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1236 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1237 *p++ = (char)(0x80 | (ch & 0x3f));
1238 i++;
1239 continue;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001240 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001241 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001242 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001243 *p++ = (char)(0xe0 | (ch >> 12));
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001244 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1245 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001246
1247 } else {
1248 *p++ = (char)(0xf0 | (ch>>18));
1249 *p++ = (char)(0x80 | ((ch>>12) & 0x3f));
1250 *p++ = (char)(0x80 | ((ch>>6) & 0x3f));
1251 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001252 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001253 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001254 }
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001255 assert(p - PyString_AS_STRING(v) == allocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001257}
1258
Guido van Rossumd57fd912000-03-10 22:53:23 +00001259PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1260{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001261 if (!PyUnicode_Check(unicode)) {
1262 PyErr_BadArgument();
1263 return NULL;
1264 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001265 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1266 PyUnicode_GET_SIZE(unicode),
1267 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001268}
1269
1270/* --- UTF-16 Codec ------------------------------------------------------- */
1271
1272static
Tim Peters772747b2001-08-09 22:21:55 +00001273int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001274 const char *errors,
1275 const char *details)
1276{
1277 if ((errors == NULL) ||
1278 (strcmp(errors,"strict") == 0)) {
1279 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001280 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281 details);
1282 return -1;
1283 }
1284 else if (strcmp(errors,"ignore") == 0) {
1285 return 0;
1286 }
1287 else if (strcmp(errors,"replace") == 0) {
1288 if (dest) {
1289 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1290 (*dest)++;
1291 }
1292 return 0;
1293 }
1294 else {
1295 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001296 "UTF-16 decoding error; "
1297 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001298 errors);
1299 return -1;
1300 }
1301}
1302
Tim Peters772747b2001-08-09 22:21:55 +00001303PyObject *
1304PyUnicode_DecodeUTF16(const char *s,
1305 int size,
1306 const char *errors,
1307 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308{
1309 PyUnicodeObject *unicode;
1310 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001311 const unsigned char *q, *e;
1312 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001313 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001314 /* Offsets from q for retrieving byte pairs in the right order. */
1315#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1316 int ihi = 1, ilo = 0;
1317#else
1318 int ihi = 0, ilo = 1;
1319#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001320
1321 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001322 if (size & 1) {
1323 if (utf16_decoding_error(NULL, errors, "truncated data"))
1324 return NULL;
1325 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001326 }
1327
1328 /* Note: size will always be longer than the resulting Unicode
1329 character count */
1330 unicode = _PyUnicode_New(size);
1331 if (!unicode)
1332 return NULL;
1333 if (size == 0)
1334 return (PyObject *)unicode;
1335
1336 /* Unpack UTF-16 encoded data */
1337 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001338 q = (unsigned char *)s;
1339 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001340
1341 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001342 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001343
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001344 /* Check for BOM marks (U+FEFF) in the input and adjust current
1345 byte order setting accordingly. In native mode, the leading BOM
1346 mark is skipped, in all other modes, it is copied to the output
1347 stream as-is (giving a ZWNBSP character). */
1348 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001349 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001350#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001351 if (bom == 0xFEFF) {
1352 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001353 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001354 }
1355 else if (bom == 0xFFFE) {
1356 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001357 bo = 1;
1358 }
1359#else
Tim Peters772747b2001-08-09 22:21:55 +00001360 if (bom == 0xFEFF) {
1361 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001362 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001363 }
1364 else if (bom == 0xFFFE) {
1365 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001366 bo = -1;
1367 }
1368#endif
1369 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370
Tim Peters772747b2001-08-09 22:21:55 +00001371 if (bo == -1) {
1372 /* force LE */
1373 ihi = 1;
1374 ilo = 0;
1375 }
1376 else if (bo == 1) {
1377 /* force BE */
1378 ihi = 0;
1379 ilo = 1;
1380 }
1381
1382 while (q < e) {
1383 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1384 q += 2;
1385
Guido van Rossumd57fd912000-03-10 22:53:23 +00001386 if (ch < 0xD800 || ch > 0xDFFF) {
1387 *p++ = ch;
1388 continue;
1389 }
1390
1391 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001392 if (q >= e) {
1393 errmsg = "unexpected end of data";
1394 goto utf16Error;
1395 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001396 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001397 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1398 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001399 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001400#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001401 *p++ = ch;
1402 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001403#else
1404 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001405#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001406 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001407 }
1408 else {
1409 errmsg = "illegal UTF-16 surrogate";
1410 goto utf16Error;
1411 }
1412
Guido van Rossumd57fd912000-03-10 22:53:23 +00001413 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001414 errmsg = "illegal encoding";
1415 /* Fall through to report the error */
1416
1417 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001418 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001419 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001420 }
1421
1422 if (byteorder)
1423 *byteorder = bo;
1424
1425 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001426 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001427 goto onError;
1428
1429 return (PyObject *)unicode;
1430
1431onError:
1432 Py_DECREF(unicode);
1433 return NULL;
1434}
1435
Tim Peters772747b2001-08-09 22:21:55 +00001436PyObject *
1437PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1438 int size,
1439 const char *errors,
1440 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441{
1442 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001443 unsigned char *p;
1444 int i, pairs;
1445 /* Offsets from p for storing byte pairs in the right order. */
1446#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1447 int ihi = 1, ilo = 0;
1448#else
1449 int ihi = 0, ilo = 1;
1450#endif
1451
1452#define STORECHAR(CH) \
1453 do { \
1454 p[ihi] = ((CH) >> 8) & 0xff; \
1455 p[ilo] = (CH) & 0xff; \
1456 p += 2; \
1457 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001458
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001459 for (i = pairs = 0; i < size; i++)
1460 if (s[i] >= 0x10000)
1461 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001462 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001463 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001464 if (v == NULL)
1465 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001466
Tim Peters772747b2001-08-09 22:21:55 +00001467 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001468 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001469 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001470 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001471 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001472
1473 if (byteorder == -1) {
1474 /* force LE */
1475 ihi = 1;
1476 ilo = 0;
1477 }
1478 else if (byteorder == 1) {
1479 /* force BE */
1480 ihi = 0;
1481 ilo = 1;
1482 }
1483
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001484 while (size-- > 0) {
1485 Py_UNICODE ch = *s++;
1486 Py_UNICODE ch2 = 0;
1487 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001488 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1489 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001490 }
Tim Peters772747b2001-08-09 22:21:55 +00001491 STORECHAR(ch);
1492 if (ch2)
1493 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001494 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001495 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001496#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001497}
1498
1499PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1500{
1501 if (!PyUnicode_Check(unicode)) {
1502 PyErr_BadArgument();
1503 return NULL;
1504 }
1505 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1506 PyUnicode_GET_SIZE(unicode),
1507 NULL,
1508 0);
1509}
1510
1511/* --- Unicode Escape Codec ----------------------------------------------- */
1512
1513static
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001514int unicodeescape_decoding_error(Py_UNICODE **x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515 const char *errors,
1516 const char *details)
1517{
1518 if ((errors == NULL) ||
1519 (strcmp(errors,"strict") == 0)) {
1520 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001521 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001522 details);
1523 return -1;
1524 }
1525 else if (strcmp(errors,"ignore") == 0) {
1526 return 0;
1527 }
1528 else if (strcmp(errors,"replace") == 0) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001529 **x = Py_UNICODE_REPLACEMENT_CHARACTER;
1530 (*x)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001531 return 0;
1532 }
1533 else {
1534 PyErr_Format(PyExc_ValueError,
1535 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001536 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001537 errors);
1538 return -1;
1539 }
1540}
1541
Fredrik Lundh06d12682001-01-24 07:59:11 +00001542static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001543
Guido van Rossumd57fd912000-03-10 22:53:23 +00001544PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1545 int size,
1546 const char *errors)
1547{
1548 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001549 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001550 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001551 char* message;
1552 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1553
Guido van Rossumd57fd912000-03-10 22:53:23 +00001554 /* Escaped strings will always be longer than the resulting
1555 Unicode string, so we start with size here and then reduce the
1556 length after conversion to the true value. */
1557 v = _PyUnicode_New(size);
1558 if (v == NULL)
1559 goto onError;
1560 if (size == 0)
1561 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001562
Guido van Rossumd57fd912000-03-10 22:53:23 +00001563 p = buf = PyUnicode_AS_UNICODE(v);
1564 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001565
Guido van Rossumd57fd912000-03-10 22:53:23 +00001566 while (s < end) {
1567 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001568 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001569 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001570
1571 /* Non-escape characters are interpreted as Unicode ordinals */
1572 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001573 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001574 continue;
1575 }
1576
1577 /* \ - Escapes */
1578 s++;
1579 switch (*s++) {
1580
1581 /* \x escapes */
1582 case '\n': break;
1583 case '\\': *p++ = '\\'; break;
1584 case '\'': *p++ = '\''; break;
1585 case '\"': *p++ = '\"'; break;
1586 case 'b': *p++ = '\b'; break;
1587 case 'f': *p++ = '\014'; break; /* FF */
1588 case 't': *p++ = '\t'; break;
1589 case 'n': *p++ = '\n'; break;
1590 case 'r': *p++ = '\r'; break;
1591 case 'v': *p++ = '\013'; break; /* VT */
1592 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1593
1594 /* \OOO (octal) escapes */
1595 case '0': case '1': case '2': case '3':
1596 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001597 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001598 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001599 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001600 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001601 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001602 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001603 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001604 break;
1605
Fredrik Lundhccc74732001-02-18 22:13:49 +00001606 /* hex escapes */
1607 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001609 digits = 2;
1610 message = "truncated \\xXX escape";
1611 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001612
Fredrik Lundhccc74732001-02-18 22:13:49 +00001613 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001614 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001615 digits = 4;
1616 message = "truncated \\uXXXX escape";
1617 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618
Fredrik Lundhccc74732001-02-18 22:13:49 +00001619 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001620 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001621 digits = 8;
1622 message = "truncated \\UXXXXXXXX escape";
1623 hexescape:
1624 chr = 0;
1625 for (i = 0; i < digits; i++) {
1626 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001627 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001628 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001629 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001630 chr = 0xffffffff;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001631 i++;
1632 break;
1633 }
1634 chr = (chr<<4) & ~0xF;
1635 if (c >= '0' && c <= '9')
1636 chr += c - '0';
1637 else if (c >= 'a' && c <= 'f')
1638 chr += 10 + c - 'a';
1639 else
1640 chr += 10 + c - 'A';
1641 }
1642 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001643 if (chr == 0xffffffff)
1644 /* _decoding_error will have already written into the
1645 target buffer. */
1646 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001647 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001648 /* when we get here, chr is a 32-bit unicode character */
1649 if (chr <= 0xffff)
1650 /* UCS-2 character */
1651 *p++ = (Py_UNICODE) chr;
1652 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001653 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001654 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001655#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001656 *p++ = chr;
1657#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001658 chr -= 0x10000L;
1659 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001660 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001661#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001662 } else {
1663 if (unicodeescape_decoding_error(
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001664 &p, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001665 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001666 )
1667 goto onError;
1668 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001669 break;
1670
1671 /* \N{name} */
1672 case 'N':
1673 message = "malformed \\N character escape";
1674 if (ucnhash_CAPI == NULL) {
1675 /* load the unicode data module */
1676 PyObject *m, *v;
1677 m = PyImport_ImportModule("unicodedata");
1678 if (m == NULL)
1679 goto ucnhashError;
1680 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1681 Py_DECREF(m);
1682 if (v == NULL)
1683 goto ucnhashError;
1684 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1685 Py_DECREF(v);
1686 if (ucnhash_CAPI == NULL)
1687 goto ucnhashError;
1688 }
1689 if (*s == '{') {
1690 const char *start = s+1;
1691 /* look for the closing brace */
1692 while (*s != '}' && s < end)
1693 s++;
1694 if (s > start && s < end && *s == '}') {
1695 /* found a name. look it up in the unicode database */
1696 message = "unknown Unicode character name";
1697 s++;
1698 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1699 goto store;
1700 }
1701 }
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001702 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001703 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001704 break;
1705
1706 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001707 if (s > end) {
1708 if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
1709 goto onError;
1710 }
1711 else {
1712 *p++ = '\\';
1713 *p++ = (unsigned char)s[-1];
1714 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001715 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001716 }
1717 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001718 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Walter Dörwald8c077222002-03-25 11:16:18 +00001719 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001721
Fredrik Lundhccc74732001-02-18 22:13:49 +00001722ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001723 PyErr_SetString(
1724 PyExc_UnicodeError,
1725 "\\N escapes not supported (can't load unicodedata module)"
1726 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001727 return NULL;
1728
Fredrik Lundhccc74732001-02-18 22:13:49 +00001729onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730 Py_XDECREF(v);
1731 return NULL;
1732}
1733
1734/* Return a Unicode-Escape string version of the Unicode object.
1735
1736 If quotes is true, the string is enclosed in u"" or u'' quotes as
1737 appropriate.
1738
1739*/
1740
Barry Warsaw51ac5802000-03-20 16:36:48 +00001741static const Py_UNICODE *findchar(const Py_UNICODE *s,
1742 int size,
1743 Py_UNICODE ch);
1744
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745static
1746PyObject *unicodeescape_string(const Py_UNICODE *s,
1747 int size,
1748 int quotes)
1749{
1750 PyObject *repr;
1751 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001752
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001753 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754
1755 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1756 if (repr == NULL)
1757 return NULL;
1758
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001759 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001760
1761 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001762 *p++ = 'u';
1763 *p++ = (findchar(s, size, '\'') &&
1764 !findchar(s, size, '"')) ? '"' : '\'';
1765 }
1766 while (size-- > 0) {
1767 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001768
Guido van Rossumd57fd912000-03-10 22:53:23 +00001769 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001770 if (quotes &&
1771 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772 *p++ = '\\';
1773 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001774 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001775 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001776
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001777#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001778 /* Map 21-bit characters to '\U00xxxxxx' */
1779 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001780 int offset = p - PyString_AS_STRING(repr);
1781
1782 /* Resize the string if necessary */
1783 if (offset + 12 > PyString_GET_SIZE(repr)) {
1784 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1785 goto onError;
1786 p = PyString_AS_STRING(repr) + offset;
1787 }
1788
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001789 *p++ = '\\';
1790 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001791 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1792 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1793 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1794 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1795 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1796 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1797 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001798 *p++ = hexdigit[ch & 0x0000000F];
1799 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001800 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001801#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001802 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1803 else if (ch >= 0xD800 && ch < 0xDC00) {
1804 Py_UNICODE ch2;
1805 Py_UCS4 ucs;
1806
1807 ch2 = *s++;
1808 size--;
1809 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1810 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1811 *p++ = '\\';
1812 *p++ = 'U';
1813 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1814 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1815 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1816 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1817 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1818 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1819 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1820 *p++ = hexdigit[ucs & 0x0000000F];
1821 continue;
1822 }
1823 /* Fall through: isolated surrogates are copied as-is */
1824 s--;
1825 size++;
1826 }
1827
Guido van Rossumd57fd912000-03-10 22:53:23 +00001828 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001829 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001830 *p++ = '\\';
1831 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001832 *p++ = hexdigit[(ch >> 12) & 0x000F];
1833 *p++ = hexdigit[(ch >> 8) & 0x000F];
1834 *p++ = hexdigit[(ch >> 4) & 0x000F];
1835 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001836 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001837
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001838 /* Map special whitespace to '\t', \n', '\r' */
1839 else if (ch == '\t') {
1840 *p++ = '\\';
1841 *p++ = 't';
1842 }
1843 else if (ch == '\n') {
1844 *p++ = '\\';
1845 *p++ = 'n';
1846 }
1847 else if (ch == '\r') {
1848 *p++ = '\\';
1849 *p++ = 'r';
1850 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001851
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001852 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001853 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001855 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001856 *p++ = hexdigit[(ch >> 4) & 0x000F];
1857 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001859
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860 /* Copy everything else as-is */
1861 else
1862 *p++ = (char) ch;
1863 }
1864 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001865 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001866
1867 *p = '\0';
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001868 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001869 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001870
1871 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001872
1873 onError:
1874 Py_DECREF(repr);
1875 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001876}
1877
1878PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1879 int size)
1880{
1881 return unicodeescape_string(s, size, 0);
1882}
1883
1884PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1885{
1886 if (!PyUnicode_Check(unicode)) {
1887 PyErr_BadArgument();
1888 return NULL;
1889 }
1890 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1891 PyUnicode_GET_SIZE(unicode));
1892}
1893
1894/* --- Raw Unicode Escape Codec ------------------------------------------- */
1895
1896PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1897 int size,
1898 const char *errors)
1899{
1900 PyUnicodeObject *v;
1901 Py_UNICODE *p, *buf;
1902 const char *end;
1903 const char *bs;
1904
1905 /* Escaped strings will always be longer than the resulting
1906 Unicode string, so we start with size here and then reduce the
1907 length after conversion to the true value. */
1908 v = _PyUnicode_New(size);
1909 if (v == NULL)
1910 goto onError;
1911 if (size == 0)
1912 return (PyObject *)v;
1913 p = buf = PyUnicode_AS_UNICODE(v);
1914 end = s + size;
1915 while (s < end) {
1916 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001917 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001918 int i;
1919
1920 /* Non-escape characters are interpreted as Unicode ordinals */
1921 if (*s != '\\') {
1922 *p++ = (unsigned char)*s++;
1923 continue;
1924 }
1925
1926 /* \u-escapes are only interpreted iff the number of leading
1927 backslashes if odd */
1928 bs = s;
1929 for (;s < end;) {
1930 if (*s != '\\')
1931 break;
1932 *p++ = (unsigned char)*s++;
1933 }
1934 if (((s - bs) & 1) == 0 ||
1935 s >= end ||
1936 *s != 'u') {
1937 continue;
1938 }
1939 p--;
1940 s++;
1941
1942 /* \uXXXX with 4 hex digits */
1943 for (x = 0, i = 0; i < 4; i++) {
1944 c = (unsigned char)s[i];
1945 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001946 if (unicodeescape_decoding_error(&p, errors,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947 "truncated \\uXXXX"))
1948 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001949 x = 0xffffffff;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001950 i++;
1951 break;
1952 }
1953 x = (x<<4) & ~0xF;
1954 if (c >= '0' && c <= '9')
1955 x += c - '0';
1956 else if (c >= 'a' && c <= 'f')
1957 x += 10 + c - 'a';
1958 else
1959 x += 10 + c - 'A';
1960 }
1961 s += i;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001962 if (x != 0xffffffff)
1963 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001964 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001965 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001966 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967 return (PyObject *)v;
1968
1969 onError:
1970 Py_XDECREF(v);
1971 return NULL;
1972}
1973
1974PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1975 int size)
1976{
1977 PyObject *repr;
1978 char *p;
1979 char *q;
1980
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001981 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001982
1983 repr = PyString_FromStringAndSize(NULL, 6 * size);
1984 if (repr == NULL)
1985 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001986 if (size == 0)
1987 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988
1989 p = q = PyString_AS_STRING(repr);
1990 while (size-- > 0) {
1991 Py_UNICODE ch = *s++;
1992 /* Map 16-bit characters to '\uxxxx' */
1993 if (ch >= 256) {
1994 *p++ = '\\';
1995 *p++ = 'u';
1996 *p++ = hexdigit[(ch >> 12) & 0xf];
1997 *p++ = hexdigit[(ch >> 8) & 0xf];
1998 *p++ = hexdigit[(ch >> 4) & 0xf];
1999 *p++ = hexdigit[ch & 15];
2000 }
2001 /* Copy everything else as-is */
2002 else
2003 *p++ = (char) ch;
2004 }
2005 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002006 if (_PyString_Resize(&repr, p - q))
2007 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002008
2009 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002010
2011 onError:
2012 Py_DECREF(repr);
2013 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002014}
2015
2016PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2017{
2018 if (!PyUnicode_Check(unicode)) {
2019 PyErr_BadArgument();
2020 return NULL;
2021 }
2022 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2023 PyUnicode_GET_SIZE(unicode));
2024}
2025
2026/* --- Latin-1 Codec ------------------------------------------------------ */
2027
2028PyObject *PyUnicode_DecodeLatin1(const char *s,
2029 int size,
2030 const char *errors)
2031{
2032 PyUnicodeObject *v;
2033 Py_UNICODE *p;
2034
2035 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002036 if (size == 1 && *(unsigned char*)s < 256) {
2037 Py_UNICODE r = *(unsigned char*)s;
2038 return PyUnicode_FromUnicode(&r, 1);
2039 }
2040
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 v = _PyUnicode_New(size);
2042 if (v == NULL)
2043 goto onError;
2044 if (size == 0)
2045 return (PyObject *)v;
2046 p = PyUnicode_AS_UNICODE(v);
2047 while (size-- > 0)
2048 *p++ = (unsigned char)*s++;
2049 return (PyObject *)v;
2050
2051 onError:
2052 Py_XDECREF(v);
2053 return NULL;
2054}
2055
2056static
2057int latin1_encoding_error(const Py_UNICODE **source,
2058 char **dest,
2059 const char *errors,
2060 const char *details)
2061{
2062 if ((errors == NULL) ||
2063 (strcmp(errors,"strict") == 0)) {
2064 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002065 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002066 details);
2067 return -1;
2068 }
2069 else if (strcmp(errors,"ignore") == 0) {
2070 return 0;
2071 }
2072 else if (strcmp(errors,"replace") == 0) {
2073 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002074 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075 return 0;
2076 }
2077 else {
2078 PyErr_Format(PyExc_ValueError,
2079 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002080 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002081 errors);
2082 return -1;
2083 }
2084}
2085
2086PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2087 int size,
2088 const char *errors)
2089{
2090 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002091 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002092
Guido van Rossumd57fd912000-03-10 22:53:23 +00002093 repr = PyString_FromStringAndSize(NULL, size);
2094 if (repr == NULL)
2095 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002096 if (size == 0)
2097 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002098
2099 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002100 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002101 while (size-- > 0) {
2102 Py_UNICODE ch = *p++;
2103 if (ch >= 256) {
2104 if (latin1_encoding_error(&p, &s, errors,
2105 "ordinal not in range(256)"))
2106 goto onError;
2107 }
2108 else
2109 *s++ = (char)ch;
2110 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002111 /* Resize if error handling skipped some characters */
2112 if (s - start < PyString_GET_SIZE(repr))
2113 if (_PyString_Resize(&repr, s - start))
2114 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002115 return repr;
2116
2117 onError:
2118 Py_DECREF(repr);
2119 return NULL;
2120}
2121
2122PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2123{
2124 if (!PyUnicode_Check(unicode)) {
2125 PyErr_BadArgument();
2126 return NULL;
2127 }
2128 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2129 PyUnicode_GET_SIZE(unicode),
2130 NULL);
2131}
2132
2133/* --- 7-bit ASCII Codec -------------------------------------------------- */
2134
2135static
2136int ascii_decoding_error(const char **source,
2137 Py_UNICODE **dest,
2138 const char *errors,
2139 const char *details)
2140{
2141 if ((errors == NULL) ||
2142 (strcmp(errors,"strict") == 0)) {
2143 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002144 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002145 details);
2146 return -1;
2147 }
2148 else if (strcmp(errors,"ignore") == 0) {
2149 return 0;
2150 }
2151 else if (strcmp(errors,"replace") == 0) {
2152 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2153 (*dest)++;
2154 return 0;
2155 }
2156 else {
2157 PyErr_Format(PyExc_ValueError,
2158 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002159 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002160 errors);
2161 return -1;
2162 }
2163}
2164
2165PyObject *PyUnicode_DecodeASCII(const char *s,
2166 int size,
2167 const char *errors)
2168{
2169 PyUnicodeObject *v;
2170 Py_UNICODE *p;
2171
2172 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002173 if (size == 1 && *(unsigned char*)s < 128) {
2174 Py_UNICODE r = *(unsigned char*)s;
2175 return PyUnicode_FromUnicode(&r, 1);
2176 }
2177
Guido van Rossumd57fd912000-03-10 22:53:23 +00002178 v = _PyUnicode_New(size);
2179 if (v == NULL)
2180 goto onError;
2181 if (size == 0)
2182 return (PyObject *)v;
2183 p = PyUnicode_AS_UNICODE(v);
2184 while (size-- > 0) {
2185 register unsigned char c;
2186
2187 c = (unsigned char)*s++;
2188 if (c < 128)
2189 *p++ = c;
2190 else if (ascii_decoding_error(&s, &p, errors,
2191 "ordinal not in range(128)"))
2192 goto onError;
2193 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002194 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002195 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002196 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002197 return (PyObject *)v;
2198
2199 onError:
2200 Py_XDECREF(v);
2201 return NULL;
2202}
2203
2204static
2205int ascii_encoding_error(const Py_UNICODE **source,
2206 char **dest,
2207 const char *errors,
2208 const char *details)
2209{
2210 if ((errors == NULL) ||
2211 (strcmp(errors,"strict") == 0)) {
2212 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002213 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214 details);
2215 return -1;
2216 }
2217 else if (strcmp(errors,"ignore") == 0) {
2218 return 0;
2219 }
2220 else if (strcmp(errors,"replace") == 0) {
2221 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002222 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002223 return 0;
2224 }
2225 else {
2226 PyErr_Format(PyExc_ValueError,
2227 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002228 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002229 errors);
2230 return -1;
2231 }
2232}
2233
2234PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2235 int size,
2236 const char *errors)
2237{
2238 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002239 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002240
Guido van Rossumd57fd912000-03-10 22:53:23 +00002241 repr = PyString_FromStringAndSize(NULL, size);
2242 if (repr == NULL)
2243 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002244 if (size == 0)
2245 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002246
2247 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002248 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002249 while (size-- > 0) {
2250 Py_UNICODE ch = *p++;
2251 if (ch >= 128) {
2252 if (ascii_encoding_error(&p, &s, errors,
2253 "ordinal not in range(128)"))
2254 goto onError;
2255 }
2256 else
2257 *s++ = (char)ch;
2258 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002259 /* Resize if error handling skipped some characters */
2260 if (s - start < PyString_GET_SIZE(repr))
2261 if (_PyString_Resize(&repr, s - start))
2262 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002263 return repr;
2264
2265 onError:
2266 Py_DECREF(repr);
2267 return NULL;
2268}
2269
2270PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2271{
2272 if (!PyUnicode_Check(unicode)) {
2273 PyErr_BadArgument();
2274 return NULL;
2275 }
2276 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2277 PyUnicode_GET_SIZE(unicode),
2278 NULL);
2279}
2280
Fredrik Lundh30831632001-06-26 15:11:00 +00002281#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002282
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002283/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002284
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002285PyObject *PyUnicode_DecodeMBCS(const char *s,
2286 int size,
2287 const char *errors)
2288{
2289 PyUnicodeObject *v;
2290 Py_UNICODE *p;
2291
2292 /* First get the size of the result */
2293 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002294 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002295 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2296
2297 v = _PyUnicode_New(usize);
2298 if (v == NULL)
2299 return NULL;
2300 if (usize == 0)
2301 return (PyObject *)v;
2302 p = PyUnicode_AS_UNICODE(v);
2303 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2304 Py_DECREF(v);
2305 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2306 }
2307
2308 return (PyObject *)v;
2309}
2310
2311PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2312 int size,
2313 const char *errors)
2314{
2315 PyObject *repr;
2316 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002317 DWORD mbcssize;
2318
2319 /* If there are no characters, bail now! */
2320 if (size==0)
2321 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002322
2323 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002324 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002325 if (mbcssize==0)
2326 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2327
2328 repr = PyString_FromStringAndSize(NULL, mbcssize);
2329 if (repr == NULL)
2330 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002331 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002332 return repr;
2333
2334 /* Do the conversion */
2335 s = PyString_AS_STRING(repr);
2336 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2337 Py_DECREF(repr);
2338 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2339 }
2340 return repr;
2341}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002342
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002343#endif /* MS_WIN32 */
2344
Guido van Rossumd57fd912000-03-10 22:53:23 +00002345/* --- Character Mapping Codec -------------------------------------------- */
2346
2347static
2348int charmap_decoding_error(const char **source,
2349 Py_UNICODE **dest,
2350 const char *errors,
2351 const char *details)
2352{
2353 if ((errors == NULL) ||
2354 (strcmp(errors,"strict") == 0)) {
2355 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002356 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002357 details);
2358 return -1;
2359 }
2360 else if (strcmp(errors,"ignore") == 0) {
2361 return 0;
2362 }
2363 else if (strcmp(errors,"replace") == 0) {
2364 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2365 (*dest)++;
2366 return 0;
2367 }
2368 else {
2369 PyErr_Format(PyExc_ValueError,
2370 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002371 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002372 errors);
2373 return -1;
2374 }
2375}
2376
2377PyObject *PyUnicode_DecodeCharmap(const char *s,
2378 int size,
2379 PyObject *mapping,
2380 const char *errors)
2381{
2382 PyUnicodeObject *v;
2383 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002384 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002385
2386 /* Default to Latin-1 */
2387 if (mapping == NULL)
2388 return PyUnicode_DecodeLatin1(s, size, errors);
2389
2390 v = _PyUnicode_New(size);
2391 if (v == NULL)
2392 goto onError;
2393 if (size == 0)
2394 return (PyObject *)v;
2395 p = PyUnicode_AS_UNICODE(v);
2396 while (size-- > 0) {
2397 unsigned char ch = *s++;
2398 PyObject *w, *x;
2399
2400 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2401 w = PyInt_FromLong((long)ch);
2402 if (w == NULL)
2403 goto onError;
2404 x = PyObject_GetItem(mapping, w);
2405 Py_DECREF(w);
2406 if (x == NULL) {
2407 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002408 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002409 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002410 x = Py_None;
2411 Py_INCREF(x);
2412 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002413 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002414 }
2415
2416 /* Apply mapping */
2417 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002418 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002419 if (value < 0 || value > 65535) {
2420 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002421 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002422 Py_DECREF(x);
2423 goto onError;
2424 }
2425 *p++ = (Py_UNICODE)value;
2426 }
2427 else if (x == Py_None) {
2428 /* undefined mapping */
2429 if (charmap_decoding_error(&s, &p, errors,
2430 "character maps to <undefined>")) {
2431 Py_DECREF(x);
2432 goto onError;
2433 }
2434 }
2435 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002436 int targetsize = PyUnicode_GET_SIZE(x);
2437
2438 if (targetsize == 1)
2439 /* 1-1 mapping */
2440 *p++ = *PyUnicode_AS_UNICODE(x);
2441
2442 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002443 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002444 if (targetsize > extrachars) {
2445 /* resize first */
2446 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2447 int needed = (targetsize - extrachars) + \
2448 (targetsize << 2);
2449 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002450 if (_PyUnicode_Resize(&v,
2451 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002452 Py_DECREF(x);
2453 goto onError;
2454 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002455 p = PyUnicode_AS_UNICODE(v) + oldpos;
2456 }
2457 Py_UNICODE_COPY(p,
2458 PyUnicode_AS_UNICODE(x),
2459 targetsize);
2460 p += targetsize;
2461 extrachars -= targetsize;
2462 }
2463 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002464 }
2465 else {
2466 /* wrong return value */
2467 PyErr_SetString(PyExc_TypeError,
2468 "character mapping must return integer, None or unicode");
2469 Py_DECREF(x);
2470 goto onError;
2471 }
2472 Py_DECREF(x);
2473 }
2474 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002475 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002476 goto onError;
2477 return (PyObject *)v;
2478
2479 onError:
2480 Py_XDECREF(v);
2481 return NULL;
2482}
2483
2484static
2485int charmap_encoding_error(const Py_UNICODE **source,
2486 char **dest,
2487 const char *errors,
2488 const char *details)
2489{
2490 if ((errors == NULL) ||
2491 (strcmp(errors,"strict") == 0)) {
2492 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002493 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002494 details);
2495 return -1;
2496 }
2497 else if (strcmp(errors,"ignore") == 0) {
2498 return 0;
2499 }
2500 else if (strcmp(errors,"replace") == 0) {
2501 **dest = '?';
2502 (*dest)++;
2503 return 0;
2504 }
2505 else {
2506 PyErr_Format(PyExc_ValueError,
2507 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002508 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002509 errors);
2510 return -1;
2511 }
2512}
2513
2514PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2515 int size,
2516 PyObject *mapping,
2517 const char *errors)
2518{
2519 PyObject *v;
2520 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002521 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002522
2523 /* Default to Latin-1 */
2524 if (mapping == NULL)
2525 return PyUnicode_EncodeLatin1(p, size, errors);
2526
2527 v = PyString_FromStringAndSize(NULL, size);
2528 if (v == NULL)
2529 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002530 if (size == 0)
2531 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532 s = PyString_AS_STRING(v);
2533 while (size-- > 0) {
2534 Py_UNICODE ch = *p++;
2535 PyObject *w, *x;
2536
2537 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2538 w = PyInt_FromLong((long)ch);
2539 if (w == NULL)
2540 goto onError;
2541 x = PyObject_GetItem(mapping, w);
2542 Py_DECREF(w);
2543 if (x == NULL) {
2544 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002545 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002546 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002547 x = Py_None;
2548 Py_INCREF(x);
2549 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002550 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551 }
2552
2553 /* Apply mapping */
2554 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002555 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556 if (value < 0 || value > 255) {
2557 PyErr_SetString(PyExc_TypeError,
2558 "character mapping must be in range(256)");
2559 Py_DECREF(x);
2560 goto onError;
2561 }
2562 *s++ = (char)value;
2563 }
2564 else if (x == Py_None) {
2565 /* undefined mapping */
2566 if (charmap_encoding_error(&p, &s, errors,
2567 "character maps to <undefined>")) {
2568 Py_DECREF(x);
2569 goto onError;
2570 }
2571 }
2572 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002573 int targetsize = PyString_GET_SIZE(x);
2574
2575 if (targetsize == 1)
2576 /* 1-1 mapping */
2577 *s++ = *PyString_AS_STRING(x);
2578
2579 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002581 if (targetsize > extrachars) {
2582 /* resize first */
2583 int oldpos = (int)(s - PyString_AS_STRING(v));
2584 int needed = (targetsize - extrachars) + \
2585 (targetsize << 2);
2586 extrachars += needed;
2587 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002588 Py_DECREF(x);
2589 goto onError;
2590 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002591 s = PyString_AS_STRING(v) + oldpos;
2592 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002593 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002594 s += targetsize;
2595 extrachars -= targetsize;
2596 }
2597 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002598 }
2599 else {
2600 /* wrong return value */
2601 PyErr_SetString(PyExc_TypeError,
2602 "character mapping must return integer, None or unicode");
2603 Py_DECREF(x);
2604 goto onError;
2605 }
2606 Py_DECREF(x);
2607 }
2608 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2609 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2610 goto onError;
2611 return v;
2612
2613 onError:
2614 Py_DECREF(v);
2615 return NULL;
2616}
2617
2618PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2619 PyObject *mapping)
2620{
2621 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2622 PyErr_BadArgument();
2623 return NULL;
2624 }
2625 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2626 PyUnicode_GET_SIZE(unicode),
2627 mapping,
2628 NULL);
2629}
2630
2631static
2632int translate_error(const Py_UNICODE **source,
2633 Py_UNICODE **dest,
2634 const char *errors,
2635 const char *details)
2636{
2637 if ((errors == NULL) ||
2638 (strcmp(errors,"strict") == 0)) {
2639 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002640 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002641 details);
2642 return -1;
2643 }
2644 else if (strcmp(errors,"ignore") == 0) {
2645 return 0;
2646 }
2647 else if (strcmp(errors,"replace") == 0) {
2648 **dest = '?';
2649 (*dest)++;
2650 return 0;
2651 }
2652 else {
2653 PyErr_Format(PyExc_ValueError,
2654 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002655 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002656 errors);
2657 return -1;
2658 }
2659}
2660
2661PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2662 int size,
2663 PyObject *mapping,
2664 const char *errors)
2665{
2666 PyUnicodeObject *v;
2667 Py_UNICODE *p;
2668
2669 if (mapping == NULL) {
2670 PyErr_BadArgument();
2671 return NULL;
2672 }
2673
2674 /* Output will never be longer than input */
2675 v = _PyUnicode_New(size);
2676 if (v == NULL)
2677 goto onError;
2678 if (size == 0)
2679 goto done;
2680 p = PyUnicode_AS_UNICODE(v);
2681 while (size-- > 0) {
2682 Py_UNICODE ch = *s++;
2683 PyObject *w, *x;
2684
2685 /* Get mapping */
2686 w = PyInt_FromLong(ch);
2687 if (w == NULL)
2688 goto onError;
2689 x = PyObject_GetItem(mapping, w);
2690 Py_DECREF(w);
2691 if (x == NULL) {
2692 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2693 /* No mapping found: default to 1-1 mapping */
2694 PyErr_Clear();
2695 *p++ = ch;
2696 continue;
2697 }
2698 goto onError;
2699 }
2700
2701 /* Apply mapping */
2702 if (PyInt_Check(x))
2703 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2704 else if (x == Py_None) {
2705 /* undefined mapping */
2706 if (translate_error(&s, &p, errors,
2707 "character maps to <undefined>")) {
2708 Py_DECREF(x);
2709 goto onError;
2710 }
2711 }
2712 else if (PyUnicode_Check(x)) {
2713 if (PyUnicode_GET_SIZE(x) != 1) {
2714 /* 1-n mapping */
2715 PyErr_SetString(PyExc_NotImplementedError,
2716 "1-n mappings are currently not implemented");
2717 Py_DECREF(x);
2718 goto onError;
2719 }
2720 *p++ = *PyUnicode_AS_UNICODE(x);
2721 }
2722 else {
2723 /* wrong return value */
2724 PyErr_SetString(PyExc_TypeError,
2725 "translate mapping must return integer, None or unicode");
2726 Py_DECREF(x);
2727 goto onError;
2728 }
2729 Py_DECREF(x);
2730 }
2731 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002732 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002733 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002734
2735 done:
2736 return (PyObject *)v;
2737
2738 onError:
2739 Py_XDECREF(v);
2740 return NULL;
2741}
2742
2743PyObject *PyUnicode_Translate(PyObject *str,
2744 PyObject *mapping,
2745 const char *errors)
2746{
2747 PyObject *result;
2748
2749 str = PyUnicode_FromObject(str);
2750 if (str == NULL)
2751 goto onError;
2752 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2753 PyUnicode_GET_SIZE(str),
2754 mapping,
2755 errors);
2756 Py_DECREF(str);
2757 return result;
2758
2759 onError:
2760 Py_XDECREF(str);
2761 return NULL;
2762}
2763
Guido van Rossum9e896b32000-04-05 20:11:21 +00002764/* --- Decimal Encoder ---------------------------------------------------- */
2765
2766int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2767 int length,
2768 char *output,
2769 const char *errors)
2770{
2771 Py_UNICODE *p, *end;
2772
2773 if (output == NULL) {
2774 PyErr_BadArgument();
2775 return -1;
2776 }
2777
2778 p = s;
2779 end = s + length;
2780 while (p < end) {
2781 register Py_UNICODE ch = *p++;
2782 int decimal;
2783
2784 if (Py_UNICODE_ISSPACE(ch)) {
2785 *output++ = ' ';
2786 continue;
2787 }
2788 decimal = Py_UNICODE_TODECIMAL(ch);
2789 if (decimal >= 0) {
2790 *output++ = '0' + decimal;
2791 continue;
2792 }
Guido van Rossumba477042000-04-06 18:18:10 +00002793 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002794 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002795 continue;
2796 }
2797 /* All other characters are considered invalid */
2798 if (errors == NULL || strcmp(errors, "strict") == 0) {
2799 PyErr_SetString(PyExc_ValueError,
2800 "invalid decimal Unicode string");
2801 goto onError;
2802 }
2803 else if (strcmp(errors, "ignore") == 0)
2804 continue;
2805 else if (strcmp(errors, "replace") == 0) {
2806 *output++ = '?';
2807 continue;
2808 }
2809 }
2810 /* 0-terminate the output string */
2811 *output++ = '\0';
2812 return 0;
2813
2814 onError:
2815 return -1;
2816}
2817
Guido van Rossumd57fd912000-03-10 22:53:23 +00002818/* --- Helpers ------------------------------------------------------------ */
2819
2820static
2821int count(PyUnicodeObject *self,
2822 int start,
2823 int end,
2824 PyUnicodeObject *substring)
2825{
2826 int count = 0;
2827
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002828 if (start < 0)
2829 start += self->length;
2830 if (start < 0)
2831 start = 0;
2832 if (end > self->length)
2833 end = self->length;
2834 if (end < 0)
2835 end += self->length;
2836 if (end < 0)
2837 end = 0;
2838
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002839 if (substring->length == 0)
2840 return (end - start + 1);
2841
Guido van Rossumd57fd912000-03-10 22:53:23 +00002842 end -= substring->length;
2843
2844 while (start <= end)
2845 if (Py_UNICODE_MATCH(self, start, substring)) {
2846 count++;
2847 start += substring->length;
2848 } else
2849 start++;
2850
2851 return count;
2852}
2853
2854int PyUnicode_Count(PyObject *str,
2855 PyObject *substr,
2856 int start,
2857 int end)
2858{
2859 int result;
2860
2861 str = PyUnicode_FromObject(str);
2862 if (str == NULL)
2863 return -1;
2864 substr = PyUnicode_FromObject(substr);
2865 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002866 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002867 return -1;
2868 }
2869
2870 result = count((PyUnicodeObject *)str,
2871 start, end,
2872 (PyUnicodeObject *)substr);
2873
2874 Py_DECREF(str);
2875 Py_DECREF(substr);
2876 return result;
2877}
2878
2879static
2880int findstring(PyUnicodeObject *self,
2881 PyUnicodeObject *substring,
2882 int start,
2883 int end,
2884 int direction)
2885{
2886 if (start < 0)
2887 start += self->length;
2888 if (start < 0)
2889 start = 0;
2890
2891 if (substring->length == 0)
2892 return start;
2893
2894 if (end > self->length)
2895 end = self->length;
2896 if (end < 0)
2897 end += self->length;
2898 if (end < 0)
2899 end = 0;
2900
2901 end -= substring->length;
2902
2903 if (direction < 0) {
2904 for (; end >= start; end--)
2905 if (Py_UNICODE_MATCH(self, end, substring))
2906 return end;
2907 } else {
2908 for (; start <= end; start++)
2909 if (Py_UNICODE_MATCH(self, start, substring))
2910 return start;
2911 }
2912
2913 return -1;
2914}
2915
2916int PyUnicode_Find(PyObject *str,
2917 PyObject *substr,
2918 int start,
2919 int end,
2920 int direction)
2921{
2922 int result;
2923
2924 str = PyUnicode_FromObject(str);
2925 if (str == NULL)
2926 return -1;
2927 substr = PyUnicode_FromObject(substr);
2928 if (substr == NULL) {
2929 Py_DECREF(substr);
2930 return -1;
2931 }
2932
2933 result = findstring((PyUnicodeObject *)str,
2934 (PyUnicodeObject *)substr,
2935 start, end, direction);
2936 Py_DECREF(str);
2937 Py_DECREF(substr);
2938 return result;
2939}
2940
2941static
2942int tailmatch(PyUnicodeObject *self,
2943 PyUnicodeObject *substring,
2944 int start,
2945 int end,
2946 int direction)
2947{
2948 if (start < 0)
2949 start += self->length;
2950 if (start < 0)
2951 start = 0;
2952
2953 if (substring->length == 0)
2954 return 1;
2955
2956 if (end > self->length)
2957 end = self->length;
2958 if (end < 0)
2959 end += self->length;
2960 if (end < 0)
2961 end = 0;
2962
2963 end -= substring->length;
2964 if (end < start)
2965 return 0;
2966
2967 if (direction > 0) {
2968 if (Py_UNICODE_MATCH(self, end, substring))
2969 return 1;
2970 } else {
2971 if (Py_UNICODE_MATCH(self, start, substring))
2972 return 1;
2973 }
2974
2975 return 0;
2976}
2977
2978int PyUnicode_Tailmatch(PyObject *str,
2979 PyObject *substr,
2980 int start,
2981 int end,
2982 int direction)
2983{
2984 int result;
2985
2986 str = PyUnicode_FromObject(str);
2987 if (str == NULL)
2988 return -1;
2989 substr = PyUnicode_FromObject(substr);
2990 if (substr == NULL) {
2991 Py_DECREF(substr);
2992 return -1;
2993 }
2994
2995 result = tailmatch((PyUnicodeObject *)str,
2996 (PyUnicodeObject *)substr,
2997 start, end, direction);
2998 Py_DECREF(str);
2999 Py_DECREF(substr);
3000 return result;
3001}
3002
3003static
3004const Py_UNICODE *findchar(const Py_UNICODE *s,
3005 int size,
3006 Py_UNICODE ch)
3007{
3008 /* like wcschr, but doesn't stop at NULL characters */
3009
3010 while (size-- > 0) {
3011 if (*s == ch)
3012 return s;
3013 s++;
3014 }
3015
3016 return NULL;
3017}
3018
3019/* Apply fixfct filter to the Unicode object self and return a
3020 reference to the modified object */
3021
3022static
3023PyObject *fixup(PyUnicodeObject *self,
3024 int (*fixfct)(PyUnicodeObject *s))
3025{
3026
3027 PyUnicodeObject *u;
3028
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003029 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003030 if (u == NULL)
3031 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003032
3033 Py_UNICODE_COPY(u->str, self->str, self->length);
3034
Tim Peters7a29bd52001-09-12 03:03:31 +00003035 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003036 /* fixfct should return TRUE if it modified the buffer. If
3037 FALSE, return a reference to the original buffer instead
3038 (to save space, not time) */
3039 Py_INCREF(self);
3040 Py_DECREF(u);
3041 return (PyObject*) self;
3042 }
3043 return (PyObject*) u;
3044}
3045
3046static
3047int fixupper(PyUnicodeObject *self)
3048{
3049 int len = self->length;
3050 Py_UNICODE *s = self->str;
3051 int status = 0;
3052
3053 while (len-- > 0) {
3054 register Py_UNICODE ch;
3055
3056 ch = Py_UNICODE_TOUPPER(*s);
3057 if (ch != *s) {
3058 status = 1;
3059 *s = ch;
3060 }
3061 s++;
3062 }
3063
3064 return status;
3065}
3066
3067static
3068int fixlower(PyUnicodeObject *self)
3069{
3070 int len = self->length;
3071 Py_UNICODE *s = self->str;
3072 int status = 0;
3073
3074 while (len-- > 0) {
3075 register Py_UNICODE ch;
3076
3077 ch = Py_UNICODE_TOLOWER(*s);
3078 if (ch != *s) {
3079 status = 1;
3080 *s = ch;
3081 }
3082 s++;
3083 }
3084
3085 return status;
3086}
3087
3088static
3089int fixswapcase(PyUnicodeObject *self)
3090{
3091 int len = self->length;
3092 Py_UNICODE *s = self->str;
3093 int status = 0;
3094
3095 while (len-- > 0) {
3096 if (Py_UNICODE_ISUPPER(*s)) {
3097 *s = Py_UNICODE_TOLOWER(*s);
3098 status = 1;
3099 } else if (Py_UNICODE_ISLOWER(*s)) {
3100 *s = Py_UNICODE_TOUPPER(*s);
3101 status = 1;
3102 }
3103 s++;
3104 }
3105
3106 return status;
3107}
3108
3109static
3110int fixcapitalize(PyUnicodeObject *self)
3111{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003112 int len = self->length;
3113 Py_UNICODE *s = self->str;
3114 int status = 0;
3115
3116 if (len == 0)
3117 return 0;
3118 if (Py_UNICODE_ISLOWER(*s)) {
3119 *s = Py_UNICODE_TOUPPER(*s);
3120 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003121 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003122 s++;
3123 while (--len > 0) {
3124 if (Py_UNICODE_ISUPPER(*s)) {
3125 *s = Py_UNICODE_TOLOWER(*s);
3126 status = 1;
3127 }
3128 s++;
3129 }
3130 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003131}
3132
3133static
3134int fixtitle(PyUnicodeObject *self)
3135{
3136 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3137 register Py_UNICODE *e;
3138 int previous_is_cased;
3139
3140 /* Shortcut for single character strings */
3141 if (PyUnicode_GET_SIZE(self) == 1) {
3142 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3143 if (*p != ch) {
3144 *p = ch;
3145 return 1;
3146 }
3147 else
3148 return 0;
3149 }
3150
3151 e = p + PyUnicode_GET_SIZE(self);
3152 previous_is_cased = 0;
3153 for (; p < e; p++) {
3154 register const Py_UNICODE ch = *p;
3155
3156 if (previous_is_cased)
3157 *p = Py_UNICODE_TOLOWER(ch);
3158 else
3159 *p = Py_UNICODE_TOTITLE(ch);
3160
3161 if (Py_UNICODE_ISLOWER(ch) ||
3162 Py_UNICODE_ISUPPER(ch) ||
3163 Py_UNICODE_ISTITLE(ch))
3164 previous_is_cased = 1;
3165 else
3166 previous_is_cased = 0;
3167 }
3168 return 1;
3169}
3170
3171PyObject *PyUnicode_Join(PyObject *separator,
3172 PyObject *seq)
3173{
3174 Py_UNICODE *sep;
3175 int seplen;
3176 PyUnicodeObject *res = NULL;
3177 int reslen = 0;
3178 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003179 int sz = 100;
3180 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003181 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003182
Tim Peters2cfe3682001-05-05 05:36:48 +00003183 it = PyObject_GetIter(seq);
3184 if (it == NULL)
3185 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003186
3187 if (separator == NULL) {
3188 Py_UNICODE blank = ' ';
3189 sep = &blank;
3190 seplen = 1;
3191 }
3192 else {
3193 separator = PyUnicode_FromObject(separator);
3194 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003196 sep = PyUnicode_AS_UNICODE(separator);
3197 seplen = PyUnicode_GET_SIZE(separator);
3198 }
3199
3200 res = _PyUnicode_New(sz);
3201 if (res == NULL)
3202 goto onError;
3203 p = PyUnicode_AS_UNICODE(res);
3204 reslen = 0;
3205
Tim Peters2cfe3682001-05-05 05:36:48 +00003206 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003207 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003208 PyObject *item = PyIter_Next(it);
3209 if (item == NULL) {
3210 if (PyErr_Occurred())
3211 goto onError;
3212 break;
3213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 if (!PyUnicode_Check(item)) {
3215 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003216 if (!PyString_Check(item)) {
3217 PyErr_Format(PyExc_TypeError,
3218 "sequence item %i: expected string or Unicode,"
3219 " %.80s found",
3220 i, item->ob_type->tp_name);
3221 Py_DECREF(item);
3222 goto onError;
3223 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224 v = PyUnicode_FromObject(item);
3225 Py_DECREF(item);
3226 item = v;
3227 if (item == NULL)
3228 goto onError;
3229 }
3230 itemlen = PyUnicode_GET_SIZE(item);
3231 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003232 if (_PyUnicode_Resize(&res, sz*2)) {
3233 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003235 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236 sz *= 2;
3237 p = PyUnicode_AS_UNICODE(res) + reslen;
3238 }
3239 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003240 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003241 p += seplen;
3242 reslen += seplen;
3243 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003244 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245 p += itemlen;
3246 reslen += itemlen;
3247 Py_DECREF(item);
3248 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003249 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003250 goto onError;
3251
3252 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003253 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254 return (PyObject *)res;
3255
3256 onError:
3257 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003258 Py_XDECREF(res);
3259 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 return NULL;
3261}
3262
3263static
3264PyUnicodeObject *pad(PyUnicodeObject *self,
3265 int left,
3266 int right,
3267 Py_UNICODE fill)
3268{
3269 PyUnicodeObject *u;
3270
3271 if (left < 0)
3272 left = 0;
3273 if (right < 0)
3274 right = 0;
3275
Tim Peters7a29bd52001-09-12 03:03:31 +00003276 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003277 Py_INCREF(self);
3278 return self;
3279 }
3280
3281 u = _PyUnicode_New(left + self->length + right);
3282 if (u) {
3283 if (left)
3284 Py_UNICODE_FILL(u->str, fill, left);
3285 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3286 if (right)
3287 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3288 }
3289
3290 return u;
3291}
3292
/* Append data[left:right] to `list` as a new Unicode object.

   Relies on the enclosing function providing a `PyObject *str` scratch
   variable, a `list` object and an `onError` label, which is jumped to
   when the object cannot be created or appended.  Wrapped in
   do { } while (0) so that an invocation followed by ';' is exactly one
   statement; the arguments are parenthesized so expressions can be
   passed safely. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3303
3304static
3305PyObject *split_whitespace(PyUnicodeObject *self,
3306 PyObject *list,
3307 int maxcount)
3308{
3309 register int i;
3310 register int j;
3311 int len = self->length;
3312 PyObject *str;
3313
3314 for (i = j = 0; i < len; ) {
3315 /* find a token */
3316 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3317 i++;
3318 j = i;
3319 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3320 i++;
3321 if (j < i) {
3322 if (maxcount-- <= 0)
3323 break;
3324 SPLIT_APPEND(self->str, j, i);
3325 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3326 i++;
3327 j = i;
3328 }
3329 }
3330 if (j < len) {
3331 SPLIT_APPEND(self->str, j, len);
3332 }
3333 return list;
3334
3335 onError:
3336 Py_DECREF(list);
3337 return NULL;
3338}
3339
3340PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003341 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003342{
3343 register int i;
3344 register int j;
3345 int len;
3346 PyObject *list;
3347 PyObject *str;
3348 Py_UNICODE *data;
3349
3350 string = PyUnicode_FromObject(string);
3351 if (string == NULL)
3352 return NULL;
3353 data = PyUnicode_AS_UNICODE(string);
3354 len = PyUnicode_GET_SIZE(string);
3355
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356 list = PyList_New(0);
3357 if (!list)
3358 goto onError;
3359
3360 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003361 int eol;
3362
Guido van Rossumd57fd912000-03-10 22:53:23 +00003363 /* Find a line and append it */
3364 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3365 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366
3367 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003368 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003369 if (i < len) {
3370 if (data[i] == '\r' && i + 1 < len &&
3371 data[i+1] == '\n')
3372 i += 2;
3373 else
3374 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003375 if (keepends)
3376 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377 }
Guido van Rossum86662912000-04-11 15:38:46 +00003378 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003379 j = i;
3380 }
3381 if (j < len) {
3382 SPLIT_APPEND(data, j, len);
3383 }
3384
3385 Py_DECREF(string);
3386 return list;
3387
3388 onError:
3389 Py_DECREF(list);
3390 Py_DECREF(string);
3391 return NULL;
3392}
3393
3394static
3395PyObject *split_char(PyUnicodeObject *self,
3396 PyObject *list,
3397 Py_UNICODE ch,
3398 int maxcount)
3399{
3400 register int i;
3401 register int j;
3402 int len = self->length;
3403 PyObject *str;
3404
3405 for (i = j = 0; i < len; ) {
3406 if (self->str[i] == ch) {
3407 if (maxcount-- <= 0)
3408 break;
3409 SPLIT_APPEND(self->str, j, i);
3410 i = j = i + 1;
3411 } else
3412 i++;
3413 }
3414 if (j <= len) {
3415 SPLIT_APPEND(self->str, j, len);
3416 }
3417 return list;
3418
3419 onError:
3420 Py_DECREF(list);
3421 return NULL;
3422}
3423
3424static
3425PyObject *split_substring(PyUnicodeObject *self,
3426 PyObject *list,
3427 PyUnicodeObject *substring,
3428 int maxcount)
3429{
3430 register int i;
3431 register int j;
3432 int len = self->length;
3433 int sublen = substring->length;
3434 PyObject *str;
3435
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003436 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437 if (Py_UNICODE_MATCH(self, i, substring)) {
3438 if (maxcount-- <= 0)
3439 break;
3440 SPLIT_APPEND(self->str, j, i);
3441 i = j = i + sublen;
3442 } else
3443 i++;
3444 }
3445 if (j <= len) {
3446 SPLIT_APPEND(self->str, j, len);
3447 }
3448 return list;
3449
3450 onError:
3451 Py_DECREF(list);
3452 return NULL;
3453}
3454
3455#undef SPLIT_APPEND
3456
3457static
3458PyObject *split(PyUnicodeObject *self,
3459 PyUnicodeObject *substring,
3460 int maxcount)
3461{
3462 PyObject *list;
3463
3464 if (maxcount < 0)
3465 maxcount = INT_MAX;
3466
3467 list = PyList_New(0);
3468 if (!list)
3469 return NULL;
3470
3471 if (substring == NULL)
3472 return split_whitespace(self,list,maxcount);
3473
3474 else if (substring->length == 1)
3475 return split_char(self,list,substring->str[0],maxcount);
3476
3477 else if (substring->length == 0) {
3478 Py_DECREF(list);
3479 PyErr_SetString(PyExc_ValueError, "empty separator");
3480 return NULL;
3481 }
3482 else
3483 return split_substring(self,list,substring,maxcount);
3484}
3485
3486static
3487PyObject *strip(PyUnicodeObject *self,
3488 int left,
3489 int right)
3490{
3491 Py_UNICODE *p = self->str;
3492 int start = 0;
3493 int end = self->length;
3494
3495 if (left)
3496 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3497 start++;
3498
3499 if (right)
3500 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3501 end--;
3502
Tim Peters7a29bd52001-09-12 03:03:31 +00003503 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003504 /* couldn't strip anything off, return original string */
3505 Py_INCREF(self);
3506 return (PyObject*) self;
3507 }
3508
3509 return (PyObject*) PyUnicode_FromUnicode(
3510 self->str + start,
3511 end - start
3512 );
3513}
3514
3515static
3516PyObject *replace(PyUnicodeObject *self,
3517 PyUnicodeObject *str1,
3518 PyUnicodeObject *str2,
3519 int maxcount)
3520{
3521 PyUnicodeObject *u;
3522
3523 if (maxcount < 0)
3524 maxcount = INT_MAX;
3525
3526 if (str1->length == 1 && str2->length == 1) {
3527 int i;
3528
3529 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003530 if (!findchar(self->str, self->length, str1->str[0]) &&
3531 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003532 /* nothing to replace, return original string */
3533 Py_INCREF(self);
3534 u = self;
3535 } else {
3536 Py_UNICODE u1 = str1->str[0];
3537 Py_UNICODE u2 = str2->str[0];
3538
3539 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003540 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003541 self->length
3542 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003543 if (u != NULL) {
3544 Py_UNICODE_COPY(u->str, self->str,
3545 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003546 for (i = 0; i < u->length; i++)
3547 if (u->str[i] == u1) {
3548 if (--maxcount < 0)
3549 break;
3550 u->str[i] = u2;
3551 }
3552 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003553 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554
3555 } else {
3556 int n, i;
3557 Py_UNICODE *p;
3558
3559 /* replace strings */
3560 n = count(self, 0, self->length, str1);
3561 if (n > maxcount)
3562 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003563 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564 /* nothing to replace, return original string */
3565 Py_INCREF(self);
3566 u = self;
3567 } else {
3568 u = _PyUnicode_New(
3569 self->length + n * (str2->length - str1->length));
3570 if (u) {
3571 i = 0;
3572 p = u->str;
3573 while (i <= self->length - str1->length)
3574 if (Py_UNICODE_MATCH(self, i, str1)) {
3575 /* replace string segment */
3576 Py_UNICODE_COPY(p, str2->str, str2->length);
3577 p += str2->length;
3578 i += str1->length;
3579 if (--n <= 0) {
3580 /* copy remaining part */
3581 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3582 break;
3583 }
3584 } else
3585 *p++ = self->str[i++];
3586 }
3587 }
3588 }
3589
3590 return (PyObject *) u;
3591}
3592
3593/* --- Unicode Object Methods --------------------------------------------- */
3594
3595static char title__doc__[] =
3596"S.title() -> unicode\n\
3597\n\
3598Return a titlecased version of S, i.e. words start with title case\n\
3599characters, all remaining cased characters have lower case.";
3600
3601static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003602unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003603{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003604 return fixup(self, fixtitle);
3605}
3606
3607static char capitalize__doc__[] =
3608"S.capitalize() -> unicode\n\
3609\n\
3610Return a capitalized version of S, i.e. make the first character\n\
3611have upper case.";
3612
3613static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003614unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003615{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003616 return fixup(self, fixcapitalize);
3617}
3618
#if 0
static char capwords__doc__[] =
    "S.capwords() -> unicode\n"
    "\n"
    "Apply .capitalize() to all words in S and return the result with\n"
    "normalized whitespace (all whitespace strings are replaced by ' ').";

/* Disabled method S.capwords(): split at whitespace, capitalize every
   word and join the words again with single spaces. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capped = fixup((PyUnicodeObject *)word, fixcapitalize);

        if (capped == NULL)
            goto done;
        /* Swap the list slot over to the capitalized word */
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return joined;
}
#endif
3656
3657static char center__doc__[] =
3658"S.center(width) -> unicode\n\
3659\n\
3660Return S centered in a Unicode string of length width. Padding is done\n\
3661using spaces.";
3662
3663static PyObject *
3664unicode_center(PyUnicodeObject *self, PyObject *args)
3665{
3666 int marg, left;
3667 int width;
3668
3669 if (!PyArg_ParseTuple(args, "i:center", &width))
3670 return NULL;
3671
Tim Peters7a29bd52001-09-12 03:03:31 +00003672 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003673 Py_INCREF(self);
3674 return (PyObject*) self;
3675 }
3676
3677 marg = width - self->length;
3678 left = marg / 2 + (marg & width & 1);
3679
3680 return (PyObject*) pad(self, left, marg - left, ' ');
3681}
3682
Marc-André Lemburge5034372000-08-08 08:04:29 +00003683#if 0
3684
3685/* This code should go into some future Unicode collation support
3686 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003687 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003688
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003689/* speedy UTF-16 code point order comparison */
3690/* gleaned from: */
3691/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3692
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003693static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003694{
3695 0, 0, 0, 0, 0, 0, 0, 0,
3696 0, 0, 0, 0, 0, 0, 0, 0,
3697 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003698 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003699};
3700
Guido van Rossumd57fd912000-03-10 22:53:23 +00003701static int
3702unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3703{
3704 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003705
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706 Py_UNICODE *s1 = str1->str;
3707 Py_UNICODE *s2 = str2->str;
3708
3709 len1 = str1->length;
3710 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003711
Guido van Rossumd57fd912000-03-10 22:53:23 +00003712 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003713 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003714
3715 c1 = *s1++;
3716 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003717
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003718 if (c1 > (1<<11) * 26)
3719 c1 += utf16Fixup[c1>>11];
3720 if (c2 > (1<<11) * 26)
3721 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003722 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003723
3724 if (c1 != c2)
3725 return (c1 < c2) ? -1 : 1;
3726
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003727 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003728 }
3729
3730 return (len1 < len2) ? -1 : (len1 != len2);
3731}
3732
Marc-André Lemburge5034372000-08-08 08:04:29 +00003733#else
3734
3735static int
3736unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3737{
3738 register int len1, len2;
3739
3740 Py_UNICODE *s1 = str1->str;
3741 Py_UNICODE *s2 = str2->str;
3742
3743 len1 = str1->length;
3744 len2 = str2->length;
3745
3746 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003747 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003748
Fredrik Lundh45714e92001-06-26 16:39:36 +00003749 c1 = *s1++;
3750 c2 = *s2++;
3751
3752 if (c1 != c2)
3753 return (c1 < c2) ? -1 : 1;
3754
Marc-André Lemburge5034372000-08-08 08:04:29 +00003755 len1--; len2--;
3756 }
3757
3758 return (len1 < len2) ? -1 : (len1 != len2);
3759}
3760
3761#endif
3762
Guido van Rossumd57fd912000-03-10 22:53:23 +00003763int PyUnicode_Compare(PyObject *left,
3764 PyObject *right)
3765{
3766 PyUnicodeObject *u = NULL, *v = NULL;
3767 int result;
3768
3769 /* Coerce the two arguments */
3770 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3771 if (u == NULL)
3772 goto onError;
3773 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3774 if (v == NULL)
3775 goto onError;
3776
Thomas Wouters7e474022000-07-16 12:04:32 +00003777 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003778 if (v == u) {
3779 Py_DECREF(u);
3780 Py_DECREF(v);
3781 return 0;
3782 }
3783
3784 result = unicode_compare(u, v);
3785
3786 Py_DECREF(u);
3787 Py_DECREF(v);
3788 return result;
3789
3790onError:
3791 Py_XDECREF(u);
3792 Py_XDECREF(v);
3793 return -1;
3794}
3795
Guido van Rossum403d68b2000-03-13 15:55:09 +00003796int PyUnicode_Contains(PyObject *container,
3797 PyObject *element)
3798{
3799 PyUnicodeObject *u = NULL, *v = NULL;
3800 int result;
3801 register const Py_UNICODE *p, *e;
3802 register Py_UNICODE ch;
3803
3804 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003805 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003806 if (v == NULL) {
3807 PyErr_SetString(PyExc_TypeError,
3808 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003809 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003810 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003811 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3812 if (u == NULL) {
3813 Py_DECREF(v);
3814 goto onError;
3815 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003816
3817 /* Check v in u */
3818 if (PyUnicode_GET_SIZE(v) != 1) {
3819 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003820 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003821 goto onError;
3822 }
3823 ch = *PyUnicode_AS_UNICODE(v);
3824 p = PyUnicode_AS_UNICODE(u);
3825 e = p + PyUnicode_GET_SIZE(u);
3826 result = 0;
3827 while (p < e) {
3828 if (*p++ == ch) {
3829 result = 1;
3830 break;
3831 }
3832 }
3833
3834 Py_DECREF(u);
3835 Py_DECREF(v);
3836 return result;
3837
3838onError:
3839 Py_XDECREF(u);
3840 Py_XDECREF(v);
3841 return -1;
3842}
3843
Guido van Rossumd57fd912000-03-10 22:53:23 +00003844/* Concat to string or Unicode object giving a new Unicode object. */
3845
3846PyObject *PyUnicode_Concat(PyObject *left,
3847 PyObject *right)
3848{
3849 PyUnicodeObject *u = NULL, *v = NULL, *w;
3850
3851 /* Coerce the two arguments */
3852 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3853 if (u == NULL)
3854 goto onError;
3855 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3856 if (v == NULL)
3857 goto onError;
3858
3859 /* Shortcuts */
3860 if (v == unicode_empty) {
3861 Py_DECREF(v);
3862 return (PyObject *)u;
3863 }
3864 if (u == unicode_empty) {
3865 Py_DECREF(u);
3866 return (PyObject *)v;
3867 }
3868
3869 /* Concat the two Unicode strings */
3870 w = _PyUnicode_New(u->length + v->length);
3871 if (w == NULL)
3872 goto onError;
3873 Py_UNICODE_COPY(w->str, u->str, u->length);
3874 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3875
3876 Py_DECREF(u);
3877 Py_DECREF(v);
3878 return (PyObject *)w;
3879
3880onError:
3881 Py_XDECREF(u);
3882 Py_XDECREF(v);
3883 return NULL;
3884}
3885
3886static char count__doc__[] =
3887"S.count(sub[, start[, end]]) -> int\n\
3888\n\
3889Return the number of occurrences of substring sub in Unicode string\n\
3890S[start:end]. Optional arguments start and end are\n\
3891interpreted as in slice notation.";
3892
3893static PyObject *
3894unicode_count(PyUnicodeObject *self, PyObject *args)
3895{
3896 PyUnicodeObject *substring;
3897 int start = 0;
3898 int end = INT_MAX;
3899 PyObject *result;
3900
Guido van Rossumb8872e62000-05-09 14:14:27 +00003901 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3902 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003903 return NULL;
3904
3905 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3906 (PyObject *)substring);
3907 if (substring == NULL)
3908 return NULL;
3909
Guido van Rossumd57fd912000-03-10 22:53:23 +00003910 if (start < 0)
3911 start += self->length;
3912 if (start < 0)
3913 start = 0;
3914 if (end > self->length)
3915 end = self->length;
3916 if (end < 0)
3917 end += self->length;
3918 if (end < 0)
3919 end = 0;
3920
3921 result = PyInt_FromLong((long) count(self, start, end, substring));
3922
3923 Py_DECREF(substring);
3924 return result;
3925}
3926
3927static char encode__doc__[] =
3928"S.encode([encoding[,errors]]) -> string\n\
3929\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003930Return an encoded string version of S. Default encoding is the current\n\
3931default string encoding. errors may be given to set a different error\n\
3932handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3933a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003934
3935static PyObject *
3936unicode_encode(PyUnicodeObject *self, PyObject *args)
3937{
3938 char *encoding = NULL;
3939 char *errors = NULL;
3940 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3941 return NULL;
3942 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3943}
3944
3945static char expandtabs__doc__[] =
3946"S.expandtabs([tabsize]) -> unicode\n\
3947\n\
3948Return a copy of S where all tab characters are expanded using spaces.\n\
3949If tabsize is not given, a tab size of 8 characters is assumed.";
3950
3951static PyObject*
3952unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3953{
3954 Py_UNICODE *e;
3955 Py_UNICODE *p;
3956 Py_UNICODE *q;
3957 int i, j;
3958 PyUnicodeObject *u;
3959 int tabsize = 8;
3960
3961 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3962 return NULL;
3963
Thomas Wouters7e474022000-07-16 12:04:32 +00003964 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003965 i = j = 0;
3966 e = self->str + self->length;
3967 for (p = self->str; p < e; p++)
3968 if (*p == '\t') {
3969 if (tabsize > 0)
3970 j += tabsize - (j % tabsize);
3971 }
3972 else {
3973 j++;
3974 if (*p == '\n' || *p == '\r') {
3975 i += j;
3976 j = 0;
3977 }
3978 }
3979
3980 /* Second pass: create output string and fill it */
3981 u = _PyUnicode_New(i + j);
3982 if (!u)
3983 return NULL;
3984
3985 j = 0;
3986 q = u->str;
3987
3988 for (p = self->str; p < e; p++)
3989 if (*p == '\t') {
3990 if (tabsize > 0) {
3991 i = tabsize - (j % tabsize);
3992 j += i;
3993 while (i--)
3994 *q++ = ' ';
3995 }
3996 }
3997 else {
3998 j++;
3999 *q++ = *p;
4000 if (*p == '\n' || *p == '\r')
4001 j = 0;
4002 }
4003
4004 return (PyObject*) u;
4005}
4006
4007static char find__doc__[] =
4008"S.find(sub [,start [,end]]) -> int\n\
4009\n\
4010Return the lowest index in S where substring sub is found,\n\
4011such that sub is contained within s[start,end]. Optional\n\
4012arguments start and end are interpreted as in slice notation.\n\
4013\n\
4014Return -1 on failure.";
4015
4016static PyObject *
4017unicode_find(PyUnicodeObject *self, PyObject *args)
4018{
4019 PyUnicodeObject *substring;
4020 int start = 0;
4021 int end = INT_MAX;
4022 PyObject *result;
4023
Guido van Rossumb8872e62000-05-09 14:14:27 +00004024 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4025 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026 return NULL;
4027 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4028 (PyObject *)substring);
4029 if (substring == NULL)
4030 return NULL;
4031
4032 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4033
4034 Py_DECREF(substring);
4035 return result;
4036}
4037
4038static PyObject *
4039unicode_getitem(PyUnicodeObject *self, int index)
4040{
4041 if (index < 0 || index >= self->length) {
4042 PyErr_SetString(PyExc_IndexError, "string index out of range");
4043 return NULL;
4044 }
4045
4046 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4047}
4048
4049static long
4050unicode_hash(PyUnicodeObject *self)
4051{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004052 /* Since Unicode objects compare equal to their ASCII string
4053 counterparts, they should use the individual character values
4054 as basis for their hash value. This is needed to assure that
4055 strings and Unicode objects behave in the same way as
4056 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057
Fredrik Lundhdde61642000-07-10 18:27:47 +00004058 register int len;
4059 register Py_UNICODE *p;
4060 register long x;
4061
Guido van Rossumd57fd912000-03-10 22:53:23 +00004062 if (self->hash != -1)
4063 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004064 len = PyUnicode_GET_SIZE(self);
4065 p = PyUnicode_AS_UNICODE(self);
4066 x = *p << 7;
4067 while (--len >= 0)
4068 x = (1000003*x) ^ *p++;
4069 x ^= PyUnicode_GET_SIZE(self);
4070 if (x == -1)
4071 x = -2;
4072 self->hash = x;
4073 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004074}
4075
4076static char index__doc__[] =
4077"S.index(sub [,start [,end]]) -> int\n\
4078\n\
4079Like S.find() but raise ValueError when the substring is not found.";
4080
4081static PyObject *
4082unicode_index(PyUnicodeObject *self, PyObject *args)
4083{
4084 int result;
4085 PyUnicodeObject *substring;
4086 int start = 0;
4087 int end = INT_MAX;
4088
Guido van Rossumb8872e62000-05-09 14:14:27 +00004089 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4090 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004091 return NULL;
4092
4093 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4094 (PyObject *)substring);
4095 if (substring == NULL)
4096 return NULL;
4097
4098 result = findstring(self, substring, start, end, 1);
4099
4100 Py_DECREF(substring);
4101 if (result < 0) {
4102 PyErr_SetString(PyExc_ValueError, "substring not found");
4103 return NULL;
4104 }
4105 return PyInt_FromLong(result);
4106}
4107
4108static char islower__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004109"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004110\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004111Return True if all cased characters in S are lowercase and there is\n\
4112at least one cased character in S, False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113
4114static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004115unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116{
4117 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4118 register const Py_UNICODE *e;
4119 int cased;
4120
Guido van Rossumd57fd912000-03-10 22:53:23 +00004121 /* Shortcut for single character strings */
4122 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004123 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004125 /* Special case for empty strings */
4126 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004127 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004128
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129 e = p + PyUnicode_GET_SIZE(self);
4130 cased = 0;
4131 for (; p < e; p++) {
4132 register const Py_UNICODE ch = *p;
4133
4134 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004135 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004136 else if (!cased && Py_UNICODE_ISLOWER(ch))
4137 cased = 1;
4138 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004139 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140}
4141
4142static char isupper__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004143"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004144\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004145Return True if all cased characters in S are uppercase and there is\n\
4146at least one cased character in S, False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147
4148static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004149unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150{
4151 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4152 register const Py_UNICODE *e;
4153 int cased;
4154
Guido van Rossumd57fd912000-03-10 22:53:23 +00004155 /* Shortcut for single character strings */
4156 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004157 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004158
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004159 /* Special case for empty strings */
4160 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004161 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004162
Guido van Rossumd57fd912000-03-10 22:53:23 +00004163 e = p + PyUnicode_GET_SIZE(self);
4164 cased = 0;
4165 for (; p < e; p++) {
4166 register const Py_UNICODE ch = *p;
4167
4168 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004169 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170 else if (!cased && Py_UNICODE_ISUPPER(ch))
4171 cased = 1;
4172 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004173 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004174}
4175
4176static char istitle__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004177"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004178\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004179Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4180characters may only follow uncased characters and lowercase characters\n\
4181only cased ones. Return False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004182
4183static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004184unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185{
4186 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4187 register const Py_UNICODE *e;
4188 int cased, previous_is_cased;
4189
Guido van Rossumd57fd912000-03-10 22:53:23 +00004190 /* Shortcut for single character strings */
4191 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004192 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4193 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004194
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004195 /* Special case for empty strings */
4196 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004197 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004198
Guido van Rossumd57fd912000-03-10 22:53:23 +00004199 e = p + PyUnicode_GET_SIZE(self);
4200 cased = 0;
4201 previous_is_cased = 0;
4202 for (; p < e; p++) {
4203 register const Py_UNICODE ch = *p;
4204
4205 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4206 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004207 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004208 previous_is_cased = 1;
4209 cased = 1;
4210 }
4211 else if (Py_UNICODE_ISLOWER(ch)) {
4212 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004213 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004214 previous_is_cased = 1;
4215 cased = 1;
4216 }
4217 else
4218 previous_is_cased = 0;
4219 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004220 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004221}
4222
4223static char isspace__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004224"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004225\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004226Return True if there are only whitespace characters in S,\n\
4227False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228
4229static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004230unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004231{
4232 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4233 register const Py_UNICODE *e;
4234
Guido van Rossumd57fd912000-03-10 22:53:23 +00004235 /* Shortcut for single character strings */
4236 if (PyUnicode_GET_SIZE(self) == 1 &&
4237 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004238 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004239
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004240 /* Special case for empty strings */
4241 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004242 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004243
Guido van Rossumd57fd912000-03-10 22:53:23 +00004244 e = p + PyUnicode_GET_SIZE(self);
4245 for (; p < e; p++) {
4246 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004247 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004248 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004249 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004250}
4251
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004252static char isalpha__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004253"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004254\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004255Return True if all characters in S are alphabetic\n\
4256and there is at least one character in S, False otherwise.";
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004257
4258static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004259unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004260{
4261 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4262 register const Py_UNICODE *e;
4263
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004264 /* Shortcut for single character strings */
4265 if (PyUnicode_GET_SIZE(self) == 1 &&
4266 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004267 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004268
4269 /* Special case for empty strings */
4270 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004271 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004272
4273 e = p + PyUnicode_GET_SIZE(self);
4274 for (; p < e; p++) {
4275 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004276 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004277 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004278 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004279}
4280
4281static char isalnum__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004282"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004283\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004284Return True if all characters in S are alphanumeric\n\
4285and there is at least one character in S, False otherwise.";
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004286
4287static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004288unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004289{
4290 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4291 register const Py_UNICODE *e;
4292
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004293 /* Shortcut for single character strings */
4294 if (PyUnicode_GET_SIZE(self) == 1 &&
4295 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004296 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004297
4298 /* Special case for empty strings */
4299 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004300 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004301
4302 e = p + PyUnicode_GET_SIZE(self);
4303 for (; p < e; p++) {
4304 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004305 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004306 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004307 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004308}
4309
Guido van Rossumd57fd912000-03-10 22:53:23 +00004310static char isdecimal__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004311"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004312\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004313Return True if there are only decimal characters in S,\n\
4314False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315
4316static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004317unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004318{
4319 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4320 register const Py_UNICODE *e;
4321
Guido van Rossumd57fd912000-03-10 22:53:23 +00004322 /* Shortcut for single character strings */
4323 if (PyUnicode_GET_SIZE(self) == 1 &&
4324 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004325 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004326
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004327 /* Special case for empty strings */
4328 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004329 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004330
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331 e = p + PyUnicode_GET_SIZE(self);
4332 for (; p < e; p++) {
4333 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004334 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004335 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004336 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004337}
4338
4339static char isdigit__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004340"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004342Return True if there are only digit characters in S,\n\
4343False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004344
4345static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004346unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004347{
4348 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4349 register const Py_UNICODE *e;
4350
Guido van Rossumd57fd912000-03-10 22:53:23 +00004351 /* Shortcut for single character strings */
4352 if (PyUnicode_GET_SIZE(self) == 1 &&
4353 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004354 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004355
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004356 /* Special case for empty strings */
4357 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004358 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004359
Guido van Rossumd57fd912000-03-10 22:53:23 +00004360 e = p + PyUnicode_GET_SIZE(self);
4361 for (; p < e; p++) {
4362 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004363 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004365 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004366}
4367
4368static char isnumeric__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004369"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004370\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004371Return True if there are only numeric characters in S,\n\
4372False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004373
4374static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004375unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376{
4377 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4378 register const Py_UNICODE *e;
4379
Guido van Rossumd57fd912000-03-10 22:53:23 +00004380 /* Shortcut for single character strings */
4381 if (PyUnicode_GET_SIZE(self) == 1 &&
4382 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004383 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004384
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004385 /* Special case for empty strings */
4386 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004387 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004388
Guido van Rossumd57fd912000-03-10 22:53:23 +00004389 e = p + PyUnicode_GET_SIZE(self);
4390 for (; p < e; p++) {
4391 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004392 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004394 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004395}
4396
4397static char join__doc__[] =
4398"S.join(sequence) -> unicode\n\
4399\n\
4400Return a string which is the concatenation of the strings in the\n\
4401sequence. The separator between elements is S.";
4402
4403static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004404unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004406 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407}
4408
4409static int
4410unicode_length(PyUnicodeObject *self)
4411{
4412 return self->length;
4413}
4414
4415static char ljust__doc__[] =
4416"S.ljust(width) -> unicode\n\
4417\n\
4418Return S left justified in a Unicode string of length width. Padding is\n\
4419done using spaces.";
4420
4421static PyObject *
4422unicode_ljust(PyUnicodeObject *self, PyObject *args)
4423{
4424 int width;
4425 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4426 return NULL;
4427
Tim Peters7a29bd52001-09-12 03:03:31 +00004428 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429 Py_INCREF(self);
4430 return (PyObject*) self;
4431 }
4432
4433 return (PyObject*) pad(self, 0, width - self->length, ' ');
4434}
4435
4436static char lower__doc__[] =
4437"S.lower() -> unicode\n\
4438\n\
4439Return a copy of the string S converted to lowercase.";
4440
4441static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004442unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444 return fixup(self, fixlower);
4445}
4446
4447static char lstrip__doc__[] =
4448"S.lstrip() -> unicode\n\
4449\n\
4450Return a copy of the string S with leading whitespace removed.";
4451
4452static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004453unicode_lstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004455 return strip(self, 1, 0);
4456}
4457
4458static PyObject*
4459unicode_repeat(PyUnicodeObject *str, int len)
4460{
4461 PyUnicodeObject *u;
4462 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004463 int nchars;
4464 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465
4466 if (len < 0)
4467 len = 0;
4468
Tim Peters7a29bd52001-09-12 03:03:31 +00004469 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004470 /* no repeat, return original string */
4471 Py_INCREF(str);
4472 return (PyObject*) str;
4473 }
Tim Peters8f422462000-09-09 06:13:41 +00004474
4475 /* ensure # of chars needed doesn't overflow int and # of bytes
4476 * needed doesn't overflow size_t
4477 */
4478 nchars = len * str->length;
4479 if (len && nchars / len != str->length) {
4480 PyErr_SetString(PyExc_OverflowError,
4481 "repeated string is too long");
4482 return NULL;
4483 }
4484 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4485 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4486 PyErr_SetString(PyExc_OverflowError,
4487 "repeated string is too long");
4488 return NULL;
4489 }
4490 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004491 if (!u)
4492 return NULL;
4493
4494 p = u->str;
4495
4496 while (len-- > 0) {
4497 Py_UNICODE_COPY(p, str->str, str->length);
4498 p += str->length;
4499 }
4500
4501 return (PyObject*) u;
4502}
4503
4504PyObject *PyUnicode_Replace(PyObject *obj,
4505 PyObject *subobj,
4506 PyObject *replobj,
4507 int maxcount)
4508{
4509 PyObject *self;
4510 PyObject *str1;
4511 PyObject *str2;
4512 PyObject *result;
4513
4514 self = PyUnicode_FromObject(obj);
4515 if (self == NULL)
4516 return NULL;
4517 str1 = PyUnicode_FromObject(subobj);
4518 if (str1 == NULL) {
4519 Py_DECREF(self);
4520 return NULL;
4521 }
4522 str2 = PyUnicode_FromObject(replobj);
4523 if (str2 == NULL) {
4524 Py_DECREF(self);
4525 Py_DECREF(str1);
4526 return NULL;
4527 }
4528 result = replace((PyUnicodeObject *)self,
4529 (PyUnicodeObject *)str1,
4530 (PyUnicodeObject *)str2,
4531 maxcount);
4532 Py_DECREF(self);
4533 Py_DECREF(str1);
4534 Py_DECREF(str2);
4535 return result;
4536}
4537
4538static char replace__doc__[] =
4539"S.replace (old, new[, maxsplit]) -> unicode\n\
4540\n\
4541Return a copy of S with all occurrences of substring\n\
4542old replaced by new. If the optional argument maxsplit is\n\
4543given, only the first maxsplit occurrences are replaced.";
4544
4545static PyObject*
4546unicode_replace(PyUnicodeObject *self, PyObject *args)
4547{
4548 PyUnicodeObject *str1;
4549 PyUnicodeObject *str2;
4550 int maxcount = -1;
4551 PyObject *result;
4552
4553 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4554 return NULL;
4555 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4556 if (str1 == NULL)
4557 return NULL;
4558 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4559 if (str2 == NULL)
4560 return NULL;
4561
4562 result = replace(self, str1, str2, maxcount);
4563
4564 Py_DECREF(str1);
4565 Py_DECREF(str2);
4566 return result;
4567}
4568
4569static
4570PyObject *unicode_repr(PyObject *unicode)
4571{
4572 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4573 PyUnicode_GET_SIZE(unicode),
4574 1);
4575}
4576
4577static char rfind__doc__[] =
4578"S.rfind(sub [,start [,end]]) -> int\n\
4579\n\
4580Return the highest index in S where substring sub is found,\n\
4581such that sub is contained within s[start,end]. Optional\n\
4582arguments start and end are interpreted as in slice notation.\n\
4583\n\
4584Return -1 on failure.";
4585
4586static PyObject *
4587unicode_rfind(PyUnicodeObject *self, PyObject *args)
4588{
4589 PyUnicodeObject *substring;
4590 int start = 0;
4591 int end = INT_MAX;
4592 PyObject *result;
4593
Guido van Rossumb8872e62000-05-09 14:14:27 +00004594 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4595 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004596 return NULL;
4597 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4598 (PyObject *)substring);
4599 if (substring == NULL)
4600 return NULL;
4601
4602 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4603
4604 Py_DECREF(substring);
4605 return result;
4606}
4607
4608static char rindex__doc__[] =
4609"S.rindex(sub [,start [,end]]) -> int\n\
4610\n\
4611Like S.rfind() but raise ValueError when the substring is not found.";
4612
4613static PyObject *
4614unicode_rindex(PyUnicodeObject *self, PyObject *args)
4615{
4616 int result;
4617 PyUnicodeObject *substring;
4618 int start = 0;
4619 int end = INT_MAX;
4620
Guido van Rossumb8872e62000-05-09 14:14:27 +00004621 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4622 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623 return NULL;
4624 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4625 (PyObject *)substring);
4626 if (substring == NULL)
4627 return NULL;
4628
4629 result = findstring(self, substring, start, end, -1);
4630
4631 Py_DECREF(substring);
4632 if (result < 0) {
4633 PyErr_SetString(PyExc_ValueError, "substring not found");
4634 return NULL;
4635 }
4636 return PyInt_FromLong(result);
4637}
4638
4639static char rjust__doc__[] =
4640"S.rjust(width) -> unicode\n\
4641\n\
4642Return S right justified in a Unicode string of length width. Padding is\n\
4643done using spaces.";
4644
4645static PyObject *
4646unicode_rjust(PyUnicodeObject *self, PyObject *args)
4647{
4648 int width;
4649 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4650 return NULL;
4651
Tim Peters7a29bd52001-09-12 03:03:31 +00004652 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653 Py_INCREF(self);
4654 return (PyObject*) self;
4655 }
4656
4657 return (PyObject*) pad(self, width - self->length, 0, ' ');
4658}
4659
4660static char rstrip__doc__[] =
4661"S.rstrip() -> unicode\n\
4662\n\
4663Return a copy of the string S with trailing whitespace removed.";
4664
4665static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004666unicode_rstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004668 return strip(self, 0, 1);
4669}
4670
4671static PyObject*
4672unicode_slice(PyUnicodeObject *self, int start, int end)
4673{
4674 /* standard clamping */
4675 if (start < 0)
4676 start = 0;
4677 if (end < 0)
4678 end = 0;
4679 if (end > self->length)
4680 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004681 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682 /* full slice, return original string */
4683 Py_INCREF(self);
4684 return (PyObject*) self;
4685 }
4686 if (start > end)
4687 start = end;
4688 /* copy slice */
4689 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4690 end - start);
4691}
4692
4693PyObject *PyUnicode_Split(PyObject *s,
4694 PyObject *sep,
4695 int maxsplit)
4696{
4697 PyObject *result;
4698
4699 s = PyUnicode_FromObject(s);
4700 if (s == NULL)
4701 return NULL;
4702 if (sep != NULL) {
4703 sep = PyUnicode_FromObject(sep);
4704 if (sep == NULL) {
4705 Py_DECREF(s);
4706 return NULL;
4707 }
4708 }
4709
4710 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4711
4712 Py_DECREF(s);
4713 Py_XDECREF(sep);
4714 return result;
4715}
4716
4717static char split__doc__[] =
4718"S.split([sep [,maxsplit]]) -> list of strings\n\
4719\n\
4720Return a list of the words in S, using sep as the\n\
4721delimiter string. If maxsplit is given, at most maxsplit\n\
4722splits are done. If sep is not specified, any whitespace string\n\
4723is a separator.";
4724
4725static PyObject*
4726unicode_split(PyUnicodeObject *self, PyObject *args)
4727{
4728 PyObject *substring = Py_None;
4729 int maxcount = -1;
4730
4731 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4732 return NULL;
4733
4734 if (substring == Py_None)
4735 return split(self, NULL, maxcount);
4736 else if (PyUnicode_Check(substring))
4737 return split(self, (PyUnicodeObject *)substring, maxcount);
4738 else
4739 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4740}
4741
4742static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004743"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744\n\
4745Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004746Line breaks are not included in the resulting list unless keepends\n\
4747is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748
4749static PyObject*
4750unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4751{
Guido van Rossum86662912000-04-11 15:38:46 +00004752 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753
Guido van Rossum86662912000-04-11 15:38:46 +00004754 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755 return NULL;
4756
Guido van Rossum86662912000-04-11 15:38:46 +00004757 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758}
4759
4760static
4761PyObject *unicode_str(PyUnicodeObject *self)
4762{
Fred Drakee4315f52000-05-09 19:53:39 +00004763 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764}
4765
4766static char strip__doc__[] =
4767"S.strip() -> unicode\n\
4768\n\
4769Return a copy of S with leading and trailing whitespace removed.";
4770
4771static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004772unicode_strip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774 return strip(self, 1, 1);
4775}
4776
4777static char swapcase__doc__[] =
4778"S.swapcase() -> unicode\n\
4779\n\
4780Return a copy of S with uppercase characters converted to lowercase\n\
4781and vice versa.";
4782
4783static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004784unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786 return fixup(self, fixswapcase);
4787}
4788
4789static char translate__doc__[] =
4790"S.translate(table) -> unicode\n\
4791\n\
4792Return a copy of the string S, where all characters have been mapped\n\
4793through the given translation table, which must be a mapping of\n\
4794Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4795are left untouched. Characters mapped to None are deleted.";
4796
4797static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004798unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800 return PyUnicode_TranslateCharmap(self->str,
4801 self->length,
4802 table,
4803 "ignore");
4804}
4805
4806static char upper__doc__[] =
4807"S.upper() -> unicode\n\
4808\n\
4809Return a copy of S converted to uppercase.";
4810
4811static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004812unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814 return fixup(self, fixupper);
4815}
4816
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817static char zfill__doc__[] =
4818"S.zfill(width) -> unicode\n\
4819\n\
4820Pad a numeric string x with zeros on the left, to fill a field\n\
4821of the specified width. The string x is never truncated.";
4822
4823static PyObject *
4824unicode_zfill(PyUnicodeObject *self, PyObject *args)
4825{
4826 int fill;
4827 PyUnicodeObject *u;
4828
4829 int width;
4830 if (!PyArg_ParseTuple(args, "i:zfill", &width))
4831 return NULL;
4832
4833 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00004834 if (PyUnicode_CheckExact(self)) {
4835 Py_INCREF(self);
4836 return (PyObject*) self;
4837 }
4838 else
4839 return PyUnicode_FromUnicode(
4840 PyUnicode_AS_UNICODE(self),
4841 PyUnicode_GET_SIZE(self)
4842 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 }
4844
4845 fill = width - self->length;
4846
4847 u = pad(self, fill, 0, '0');
4848
Walter Dörwald068325e2002-04-15 13:36:47 +00004849 if (u == NULL)
4850 return NULL;
4851
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852 if (u->str[fill] == '+' || u->str[fill] == '-') {
4853 /* move sign to beginning of string */
4854 u->str[0] = u->str[fill];
4855 u->str[fill] = '0';
4856 }
4857
4858 return (PyObject*) u;
4859}
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860
#if 0
/* Debug-only accessor (compiled out): reports unicode_freelist_size as
   a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
4868
4869static char startswith__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004870"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004872Return True if S starts with the specified prefix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004873optional start, test S beginning at that position. With optional end, stop\n\
4874comparing S at that position.";
4875
4876static PyObject *
4877unicode_startswith(PyUnicodeObject *self,
4878 PyObject *args)
4879{
4880 PyUnicodeObject *substring;
4881 int start = 0;
4882 int end = INT_MAX;
4883 PyObject *result;
4884
Guido van Rossumb8872e62000-05-09 14:14:27 +00004885 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4886 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004887 return NULL;
4888 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4889 (PyObject *)substring);
4890 if (substring == NULL)
4891 return NULL;
4892
Guido van Rossum77f6a652002-04-03 22:41:51 +00004893 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894
4895 Py_DECREF(substring);
4896 return result;
4897}
4898
4899
4900static char endswith__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004901"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004903Return True if S ends with the specified suffix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904optional start, test S beginning at that position. With optional end, stop\n\
4905comparing S at that position.";
4906
4907static PyObject *
4908unicode_endswith(PyUnicodeObject *self,
4909 PyObject *args)
4910{
4911 PyUnicodeObject *substring;
4912 int start = 0;
4913 int end = INT_MAX;
4914 PyObject *result;
4915
Guido van Rossumb8872e62000-05-09 14:14:27 +00004916 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4917 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004918 return NULL;
4919 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4920 (PyObject *)substring);
4921 if (substring == NULL)
4922 return NULL;
4923
Guido van Rossum77f6a652002-04-03 22:41:51 +00004924 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925
4926 Py_DECREF(substring);
4927 return result;
4928}
4929
4930
4931static PyMethodDef unicode_methods[] = {
4932
4933 /* Order is according to common usage: often used methods should
4934 appear first, since lookup is done sequentially. */
4935
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004936 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4937 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4938 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4939 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4940 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4941 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4942 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4943 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4944 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4945 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4946 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4947 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4948 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4949 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4950/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4951 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4952 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4953 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4954 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4955 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4956 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4957 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4958 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4959 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4960 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4961 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4962 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4963 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4964 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4965 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4966 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4967 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4968 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4969 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4970 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004971 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00004972#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004973 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004974#endif
4975
4976#if 0
4977 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004978 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979#endif
4980
4981 {NULL, NULL}
4982};
4983
Guido van Rossumd57fd912000-03-10 22:53:23 +00004984static PySequenceMethods unicode_as_sequence = {
4985 (inquiry) unicode_length, /* sq_length */
4986 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4987 (intargfunc) unicode_repeat, /* sq_repeat */
4988 (intargfunc) unicode_getitem, /* sq_item */
4989 (intintargfunc) unicode_slice, /* sq_slice */
4990 0, /* sq_ass_item */
4991 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004992 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993};
4994
4995static int
4996unicode_buffer_getreadbuf(PyUnicodeObject *self,
4997 int index,
4998 const void **ptr)
4999{
5000 if (index != 0) {
5001 PyErr_SetString(PyExc_SystemError,
5002 "accessing non-existent unicode segment");
5003 return -1;
5004 }
5005 *ptr = (void *) self->str;
5006 return PyUnicode_GET_DATA_SIZE(self);
5007}
5008
5009static int
5010unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5011 const void **ptr)
5012{
5013 PyErr_SetString(PyExc_TypeError,
5014 "cannot use unicode as modifyable buffer");
5015 return -1;
5016}
5017
5018static int
5019unicode_buffer_getsegcount(PyUnicodeObject *self,
5020 int *lenp)
5021{
5022 if (lenp)
5023 *lenp = PyUnicode_GET_DATA_SIZE(self);
5024 return 1;
5025}
5026
5027static int
5028unicode_buffer_getcharbuf(PyUnicodeObject *self,
5029 int index,
5030 const void **ptr)
5031{
5032 PyObject *str;
5033
5034 if (index != 0) {
5035 PyErr_SetString(PyExc_SystemError,
5036 "accessing non-existent unicode segment");
5037 return -1;
5038 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005039 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005040 if (str == NULL)
5041 return -1;
5042 *ptr = (void *) PyString_AS_STRING(str);
5043 return PyString_GET_SIZE(str);
5044}
5045
5046/* Helpers for PyUnicode_Format() */
5047
5048static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005049getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050{
5051 int argidx = *p_argidx;
5052 if (argidx < arglen) {
5053 (*p_argidx)++;
5054 if (arglen < 0)
5055 return args;
5056 else
5057 return PyTuple_GetItem(args, argidx);
5058 }
5059 PyErr_SetString(PyExc_TypeError,
5060 "not enough arguments for format string");
5061 return NULL;
5062}
5063
/* Bit flags collected while parsing a format specifier; each is set by
   the flag character noted on its line. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
5069
5070static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072{
5073 register int i;
5074 int len;
5075 va_list va;
5076 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005077 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005078
5079 /* First, format the string as char array, then expand to Py_UNICODE
5080 array. */
5081 charbuffer = (char *)buffer;
5082 len = vsprintf(charbuffer, format, va);
5083 for (i = len - 1; i >= 0; i--)
5084 buffer[i] = (Py_UNICODE) charbuffer[i];
5085
5086 va_end(va);
5087 return len;
5088}
5089
5090static int
5091formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005092 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005093 int flags,
5094 int prec,
5095 int type,
5096 PyObject *v)
5097{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005098 /* fmt = '%#.' + `prec` + `type`
5099 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005100 char fmt[20];
5101 double x;
5102
5103 x = PyFloat_AsDouble(v);
5104 if (x == -1.0 && PyErr_Occurred())
5105 return -1;
5106 if (prec < 0)
5107 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005108 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5109 type = 'g';
Barry Warsawe5c492d2001-11-28 21:00:41 +00005110 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5111 (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005112 /* worst case length calc to ensure no buffer overrun:
5113 fmt = %#.<prec>g
5114 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5115 for any double rep.)
5116 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5117 If prec=0 the effective precision is 1 (the leading digit is
5118 always given), therefore increase by one to 10+prec. */
5119 if (buflen <= (size_t)10 + (size_t)prec) {
5120 PyErr_SetString(PyExc_OverflowError,
5121 "formatted float is too long (precision too long?)");
5122 return -1;
5123 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124 return usprintf(buf, fmt, x);
5125}
5126
Tim Peters38fd5b62000-09-21 05:43:11 +00005127static PyObject*
5128formatlong(PyObject *val, int flags, int prec, int type)
5129{
5130 char *buf;
5131 int i, len;
5132 PyObject *str; /* temporary string object. */
5133 PyUnicodeObject *result;
5134
5135 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5136 if (!str)
5137 return NULL;
5138 result = _PyUnicode_New(len);
5139 for (i = 0; i < len; i++)
5140 result->str[i] = buf[i];
5141 result->str[len] = 0;
5142 Py_DECREF(str);
5143 return (PyObject*)result;
5144}
5145
Guido van Rossumd57fd912000-03-10 22:53:23 +00005146static int
5147formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005148 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 int flags,
5150 int prec,
5151 int type,
5152 PyObject *v)
5153{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005154 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005155 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5156 * + 1 + 1
5157 * = 24
5158 */
Tim Peters38fd5b62000-09-21 05:43:11 +00005159 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005160 long x;
5161
5162 x = PyInt_AsLong(v);
5163 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005164 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005166 prec = 1;
5167
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005168 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005169 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
5170 */
5171 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005172 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005173 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005174 return -1;
5175 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005176
5177 if ((flags & F_ALT) &&
5178 (type == 'x' || type == 'X')) {
5179 /* When converting under %#x or %#X, there are a number
5180 * of issues that cause pain:
5181 * - when 0 is being converted, the C standard leaves off
5182 * the '0x' or '0X', which is inconsistent with other
5183 * %#x/%#X conversions and inconsistent with Python's
5184 * hex() function
5185 * - there are platforms that violate the standard and
5186 * convert 0 with the '0x' or '0X'
5187 * (Metrowerks, Compaq Tru64)
5188 * - there are platforms that give '0x' when converting
5189 * under %#X, but convert 0 in accordance with the
5190 * standard (OS/2 EMX)
5191 *
5192 * We can achieve the desired consistency by inserting our
5193 * own '0x' or '0X' prefix, and substituting %x/%X in place
5194 * of %#x/%#X.
5195 *
5196 * Note that this is the same approach as used in
5197 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005198 */
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005199 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
5200 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005201 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005202 else {
5203 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5204 (flags&F_ALT) ? "#" : "",
5205 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005206 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207 return usprintf(buf, fmt, x);
5208}
5209
5210static int
5211formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005212 size_t buflen,
5213 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005215 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005216 if (PyUnicode_Check(v)) {
5217 if (PyUnicode_GET_SIZE(v) != 1)
5218 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005220 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005222 else if (PyString_Check(v)) {
5223 if (PyString_GET_SIZE(v) != 1)
5224 goto onError;
5225 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227
5228 else {
5229 /* Integer input truncated to a character */
5230 long x;
5231 x = PyInt_AsLong(v);
5232 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005233 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234 buf[0] = (char) x;
5235 }
5236 buf[1] = '\0';
5237 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005238
5239 onError:
5240 PyErr_SetString(PyExc_TypeError,
5241 "%c requires int or char");
5242 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005243}
5244
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005245/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
5246
5247 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
5248 chars are formatted. XXX This is a magic number. Each formatting
5249 routine does bounds checking to ensure no overflow, but a better
5250 solution may be to malloc a buffer of appropriate size for each
5251 format. For now, the current solution is sufficient.
5252*/
5253#define FORMATBUFLEN (size_t)120
5254
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255PyObject *PyUnicode_Format(PyObject *format,
5256 PyObject *args)
5257{
5258 Py_UNICODE *fmt, *res;
5259 int fmtcnt, rescnt, reslen, arglen, argidx;
5260 int args_owned = 0;
5261 PyUnicodeObject *result = NULL;
5262 PyObject *dict = NULL;
5263 PyObject *uformat;
5264
5265 if (format == NULL || args == NULL) {
5266 PyErr_BadInternalCall();
5267 return NULL;
5268 }
5269 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005270 if (uformat == NULL)
5271 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272 fmt = PyUnicode_AS_UNICODE(uformat);
5273 fmtcnt = PyUnicode_GET_SIZE(uformat);
5274
5275 reslen = rescnt = fmtcnt + 100;
5276 result = _PyUnicode_New(reslen);
5277 if (result == NULL)
5278 goto onError;
5279 res = PyUnicode_AS_UNICODE(result);
5280
5281 if (PyTuple_Check(args)) {
5282 arglen = PyTuple_Size(args);
5283 argidx = 0;
5284 }
5285 else {
5286 arglen = -1;
5287 argidx = -2;
5288 }
5289 if (args->ob_type->tp_as_mapping)
5290 dict = args;
5291
5292 while (--fmtcnt >= 0) {
5293 if (*fmt != '%') {
5294 if (--rescnt < 0) {
5295 rescnt = fmtcnt + 100;
5296 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005297 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 return NULL;
5299 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5300 --rescnt;
5301 }
5302 *res++ = *fmt++;
5303 }
5304 else {
5305 /* Got a format specifier */
5306 int flags = 0;
5307 int width = -1;
5308 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005309 Py_UNICODE c = '\0';
5310 Py_UNICODE fill;
5311 PyObject *v = NULL;
5312 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005313 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005314 Py_UNICODE sign;
5315 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005316 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317
5318 fmt++;
5319 if (*fmt == '(') {
5320 Py_UNICODE *keystart;
5321 int keylen;
5322 PyObject *key;
5323 int pcount = 1;
5324
5325 if (dict == NULL) {
5326 PyErr_SetString(PyExc_TypeError,
5327 "format requires a mapping");
5328 goto onError;
5329 }
5330 ++fmt;
5331 --fmtcnt;
5332 keystart = fmt;
5333 /* Skip over balanced parentheses */
5334 while (pcount > 0 && --fmtcnt >= 0) {
5335 if (*fmt == ')')
5336 --pcount;
5337 else if (*fmt == '(')
5338 ++pcount;
5339 fmt++;
5340 }
5341 keylen = fmt - keystart - 1;
5342 if (fmtcnt < 0 || pcount > 0) {
5343 PyErr_SetString(PyExc_ValueError,
5344 "incomplete format key");
5345 goto onError;
5346 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005347#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00005348 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005349 then looked up since Python uses strings to hold
5350 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005351 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352 key = PyUnicode_EncodeUTF8(keystart,
5353 keylen,
5354 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005355#else
5356 key = PyUnicode_FromUnicode(keystart, keylen);
5357#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 if (key == NULL)
5359 goto onError;
5360 if (args_owned) {
5361 Py_DECREF(args);
5362 args_owned = 0;
5363 }
5364 args = PyObject_GetItem(dict, key);
5365 Py_DECREF(key);
5366 if (args == NULL) {
5367 goto onError;
5368 }
5369 args_owned = 1;
5370 arglen = -1;
5371 argidx = -2;
5372 }
5373 while (--fmtcnt >= 0) {
5374 switch (c = *fmt++) {
5375 case '-': flags |= F_LJUST; continue;
5376 case '+': flags |= F_SIGN; continue;
5377 case ' ': flags |= F_BLANK; continue;
5378 case '#': flags |= F_ALT; continue;
5379 case '0': flags |= F_ZERO; continue;
5380 }
5381 break;
5382 }
5383 if (c == '*') {
5384 v = getnextarg(args, arglen, &argidx);
5385 if (v == NULL)
5386 goto onError;
5387 if (!PyInt_Check(v)) {
5388 PyErr_SetString(PyExc_TypeError,
5389 "* wants int");
5390 goto onError;
5391 }
5392 width = PyInt_AsLong(v);
5393 if (width < 0) {
5394 flags |= F_LJUST;
5395 width = -width;
5396 }
5397 if (--fmtcnt >= 0)
5398 c = *fmt++;
5399 }
5400 else if (c >= '0' && c <= '9') {
5401 width = c - '0';
5402 while (--fmtcnt >= 0) {
5403 c = *fmt++;
5404 if (c < '0' || c > '9')
5405 break;
5406 if ((width*10) / 10 != width) {
5407 PyErr_SetString(PyExc_ValueError,
5408 "width too big");
5409 goto onError;
5410 }
5411 width = width*10 + (c - '0');
5412 }
5413 }
5414 if (c == '.') {
5415 prec = 0;
5416 if (--fmtcnt >= 0)
5417 c = *fmt++;
5418 if (c == '*') {
5419 v = getnextarg(args, arglen, &argidx);
5420 if (v == NULL)
5421 goto onError;
5422 if (!PyInt_Check(v)) {
5423 PyErr_SetString(PyExc_TypeError,
5424 "* wants int");
5425 goto onError;
5426 }
5427 prec = PyInt_AsLong(v);
5428 if (prec < 0)
5429 prec = 0;
5430 if (--fmtcnt >= 0)
5431 c = *fmt++;
5432 }
5433 else if (c >= '0' && c <= '9') {
5434 prec = c - '0';
5435 while (--fmtcnt >= 0) {
5436 c = Py_CHARMASK(*fmt++);
5437 if (c < '0' || c > '9')
5438 break;
5439 if ((prec*10) / 10 != prec) {
5440 PyErr_SetString(PyExc_ValueError,
5441 "prec too big");
5442 goto onError;
5443 }
5444 prec = prec*10 + (c - '0');
5445 }
5446 }
5447 } /* prec */
5448 if (fmtcnt >= 0) {
5449 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 if (--fmtcnt >= 0)
5451 c = *fmt++;
5452 }
5453 }
5454 if (fmtcnt < 0) {
5455 PyErr_SetString(PyExc_ValueError,
5456 "incomplete format");
5457 goto onError;
5458 }
5459 if (c != '%') {
5460 v = getnextarg(args, arglen, &argidx);
5461 if (v == NULL)
5462 goto onError;
5463 }
5464 sign = 0;
5465 fill = ' ';
5466 switch (c) {
5467
5468 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005469 pbuf = formatbuf;
5470 /* presume that buffer length is at least 1 */
5471 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005472 len = 1;
5473 break;
5474
5475 case 's':
5476 case 'r':
5477 if (PyUnicode_Check(v) && c == 's') {
5478 temp = v;
5479 Py_INCREF(temp);
5480 }
5481 else {
5482 PyObject *unicode;
5483 if (c == 's')
5484 temp = PyObject_Str(v);
5485 else
5486 temp = PyObject_Repr(v);
5487 if (temp == NULL)
5488 goto onError;
5489 if (!PyString_Check(temp)) {
5490 /* XXX Note: this should never happen, since
5491 PyObject_Repr() and PyObject_Str() assure
5492 this */
5493 Py_DECREF(temp);
5494 PyErr_SetString(PyExc_TypeError,
5495 "%s argument has non-string str()");
5496 goto onError;
5497 }
Fred Drakee4315f52000-05-09 19:53:39 +00005498 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005499 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005500 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501 "strict");
5502 Py_DECREF(temp);
5503 temp = unicode;
5504 if (temp == NULL)
5505 goto onError;
5506 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005507 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005508 len = PyUnicode_GET_SIZE(temp);
5509 if (prec >= 0 && len > prec)
5510 len = prec;
5511 break;
5512
5513 case 'i':
5514 case 'd':
5515 case 'u':
5516 case 'o':
5517 case 'x':
5518 case 'X':
5519 if (c == 'i')
5520 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005521 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005522 temp = formatlong(v, flags, prec, c);
5523 if (!temp)
5524 goto onError;
5525 pbuf = PyUnicode_AS_UNICODE(temp);
5526 len = PyUnicode_GET_SIZE(temp);
5527 /* unbounded ints can always produce
5528 a sign character! */
5529 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005530 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005531 else {
5532 pbuf = formatbuf;
5533 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5534 flags, prec, c, v);
5535 if (len < 0)
5536 goto onError;
5537 /* only d conversion is signed */
5538 sign = c == 'd';
5539 }
5540 if (flags & F_ZERO)
5541 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005542 break;
5543
5544 case 'e':
5545 case 'E':
5546 case 'f':
5547 case 'g':
5548 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005549 pbuf = formatbuf;
5550 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5551 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 if (len < 0)
5553 goto onError;
5554 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005555 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005556 fill = '0';
5557 break;
5558
5559 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005560 pbuf = formatbuf;
5561 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562 if (len < 0)
5563 goto onError;
5564 break;
5565
5566 default:
5567 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005568 "unsupported format character '%c' (0x%x) "
5569 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005570 (31<=c && c<=126) ? c : '?',
5571 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 goto onError;
5573 }
5574 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005575 if (*pbuf == '-' || *pbuf == '+') {
5576 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005577 len--;
5578 }
5579 else if (flags & F_SIGN)
5580 sign = '+';
5581 else if (flags & F_BLANK)
5582 sign = ' ';
5583 else
5584 sign = 0;
5585 }
5586 if (width < len)
5587 width = len;
5588 if (rescnt < width + (sign != 0)) {
5589 reslen -= rescnt;
5590 rescnt = width + fmtcnt + 100;
5591 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005592 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005593 return NULL;
5594 res = PyUnicode_AS_UNICODE(result)
5595 + reslen - rescnt;
5596 }
5597 if (sign) {
5598 if (fill != ' ')
5599 *res++ = sign;
5600 rescnt--;
5601 if (width > len)
5602 width--;
5603 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005604 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5605 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005606 assert(pbuf[1] == c);
5607 if (fill != ' ') {
5608 *res++ = *pbuf++;
5609 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005610 }
Tim Petersfff53252001-04-12 18:38:48 +00005611 rescnt -= 2;
5612 width -= 2;
5613 if (width < 0)
5614 width = 0;
5615 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005616 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005617 if (width > len && !(flags & F_LJUST)) {
5618 do {
5619 --rescnt;
5620 *res++ = fill;
5621 } while (--width > len);
5622 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005623 if (fill == ' ') {
5624 if (sign)
5625 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005626 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005627 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005628 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005629 *res++ = *pbuf++;
5630 *res++ = *pbuf++;
5631 }
5632 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005633 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005634 res += len;
5635 rescnt -= len;
5636 while (--width >= len) {
5637 --rescnt;
5638 *res++ = ' ';
5639 }
5640 if (dict && (argidx < arglen) && c != '%') {
5641 PyErr_SetString(PyExc_TypeError,
5642 "not all arguments converted");
5643 goto onError;
5644 }
5645 Py_XDECREF(temp);
5646 } /* '%' */
5647 } /* until end */
5648 if (argidx < arglen && !dict) {
5649 PyErr_SetString(PyExc_TypeError,
5650 "not all arguments converted");
5651 goto onError;
5652 }
5653
5654 if (args_owned) {
5655 Py_DECREF(args);
5656 }
5657 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005658 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005659 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005660 return (PyObject *)result;
5661
5662 onError:
5663 Py_XDECREF(result);
5664 Py_DECREF(uformat);
5665 if (args_owned) {
5666 Py_DECREF(args);
5667 }
5668 return NULL;
5669}
5670
5671static PyBufferProcs unicode_as_buffer = {
5672 (getreadbufferproc) unicode_buffer_getreadbuf,
5673 (getwritebufferproc) unicode_buffer_getwritebuf,
5674 (getsegcountproc) unicode_buffer_getsegcount,
5675 (getcharbufferproc) unicode_buffer_getcharbuf,
5676};
5677
Guido van Rossume023fe02001-08-30 03:12:59 +00005678staticforward PyObject *
5679unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5680
Tim Peters6d6c1a32001-08-02 04:15:00 +00005681static PyObject *
5682unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5683{
5684 PyObject *x = NULL;
5685 static char *kwlist[] = {"string", "encoding", "errors", 0};
5686 char *encoding = NULL;
5687 char *errors = NULL;
5688
Guido van Rossume023fe02001-08-30 03:12:59 +00005689 if (type != &PyUnicode_Type)
5690 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005691 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5692 kwlist, &x, &encoding, &errors))
5693 return NULL;
5694 if (x == NULL)
5695 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00005696 if (encoding == NULL && errors == NULL)
5697 return PyObject_Unicode(x);
5698 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00005699 return PyUnicode_FromEncodedObject(x, encoding, errors);
5700}
5701
Guido van Rossume023fe02001-08-30 03:12:59 +00005702static PyObject *
5703unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5704{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005705 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005706 int n;
5707
5708 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5709 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5710 if (tmp == NULL)
5711 return NULL;
5712 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005713 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5714 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005715 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005716 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5717 if (pnew->str == NULL) {
5718 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005719 PyObject_Del(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005720 return NULL;
5721 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005722 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5723 pnew->length = n;
5724 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005725 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005726 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005727}
5728
/* Docstring for the unicode type; installed as the tp_doc entry of
   PyUnicode_Type. */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5735
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736PyTypeObject PyUnicode_Type = {
5737 PyObject_HEAD_INIT(&PyType_Type)
5738 0, /* ob_size */
5739 "unicode", /* tp_name */
5740 sizeof(PyUnicodeObject), /* tp_size */
5741 0, /* tp_itemsize */
5742 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00005743 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005745 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746 0, /* tp_setattr */
5747 (cmpfunc) unicode_compare, /* tp_compare */
5748 (reprfunc) unicode_repr, /* tp_repr */
5749 0, /* tp_as_number */
5750 &unicode_as_sequence, /* tp_as_sequence */
5751 0, /* tp_as_mapping */
5752 (hashfunc) unicode_hash, /* tp_hash*/
5753 0, /* tp_call*/
5754 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005755 PyObject_GenericGetAttr, /* tp_getattro */
5756 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005758 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005759 unicode_doc, /* tp_doc */
5760 0, /* tp_traverse */
5761 0, /* tp_clear */
5762 0, /* tp_richcompare */
5763 0, /* tp_weaklistoffset */
5764 0, /* tp_iter */
5765 0, /* tp_iternext */
5766 unicode_methods, /* tp_methods */
5767 0, /* tp_members */
5768 0, /* tp_getset */
5769 0, /* tp_base */
5770 0, /* tp_dict */
5771 0, /* tp_descr_get */
5772 0, /* tp_descr_set */
5773 0, /* tp_dictoffset */
5774 0, /* tp_init */
5775 0, /* tp_alloc */
5776 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005777 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005778};
5779
5780/* Initialize the Unicode implementation */
5781
Thomas Wouters78890102000-07-22 19:25:51 +00005782void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005784 int i;
5785
Fred Drakee4315f52000-05-09 19:53:39 +00005786 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005787 unicode_freelist = NULL;
5788 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005789 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005790 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005791 for (i = 0; i < 256; i++)
5792 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793}
5794
5795/* Finalize the Unicode implementation */
5796
5797void
Thomas Wouters78890102000-07-22 19:25:51 +00005798_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005800 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005801 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005802
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005803 Py_XDECREF(unicode_empty);
5804 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005805
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005806 for (i = 0; i < 256; i++) {
5807 if (unicode_latin1[i]) {
5808 Py_DECREF(unicode_latin1[i]);
5809 unicode_latin1[i] = NULL;
5810 }
5811 }
5812
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005813 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814 PyUnicodeObject *v = u;
5815 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005816 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005817 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005818 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005819 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005820 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005821 unicode_freelist = NULL;
5822 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823}