blob: abaa67c5c5f07ab127ed82863ce63d8889bb38cc [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
/* We allocate one more Py_UNICODE element (not merely one byte) to
   make sure the string is U+0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000222 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 *unicode = (PyObject *)w;
280 return 0;
281 }
282
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v, length);
286}
287
288/* Internal API for use in unicodeobject.c only ! */
/* Calls PyUnicode_Resize() on a variable of any Unicode object pointer
   type by casting its address to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293 int size)
294{
295 PyUnicodeObject *unicode;
296
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
299 if (u != NULL) {
300
301 /* Optimization for empty strings */
302 if (size == 0 && unicode_empty != NULL) {
303 Py_INCREF(unicode_empty);
304 return (PyObject *)unicode_empty;
305 }
306
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size == 1 && *u < 256) {
310 unicode = unicode_latin1[*u];
311 if (!unicode) {
312 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000313 if (!unicode)
314 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000315 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 unicode_latin1[*u] = unicode;
317 }
318 Py_INCREF(unicode);
319 return (PyObject *)unicode;
320 }
321 }
322
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 unicode = _PyUnicode_New(size);
324 if (!unicode)
325 return NULL;
326
327 /* Copy the Unicode data into the new object */
328 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330
331 return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337 int size)
338{
339 PyUnicodeObject *unicode;
340
341 if (w == NULL) {
342 PyErr_BadInternalCall();
343 return NULL;
344 }
345
346 unicode = _PyUnicode_New(size);
347 if (!unicode)
348 return NULL;
349
350 /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354 {
355 register Py_UNICODE *u;
356 register int i;
357 u = PyUnicode_AS_UNICODE(unicode);
358 for (i = size; i >= 0; i--)
359 *u++ = *w++;
360 }
361#endif
362
363 return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367 register wchar_t *w,
368 int size)
369{
370 if (unicode == NULL) {
371 PyErr_BadInternalCall();
372 return -1;
373 }
374 if (size > PyUnicode_GET_SIZE(unicode))
375 size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379 {
380 register Py_UNICODE *u;
381 register int i;
382 u = PyUnicode_AS_UNICODE(unicode);
383 for (i = size; i >= 0; i--)
384 *w++ = *u++;
385 }
386#endif
387
388 return size;
389}
390
391#endif
392
393PyObject *PyUnicode_FromObject(register PyObject *obj)
394{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000395 /* XXX Perhaps we should make this API an alias of
396 PyObject_Unicode() instead ?! */
397 if (PyUnicode_CheckExact(obj)) {
398 Py_INCREF(obj);
399 return obj;
400 }
401 if (PyUnicode_Check(obj)) {
402 /* For a Unicode subtype that's not a Unicode object,
403 return a true Unicode object with the same data. */
404 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
405 PyUnicode_GET_SIZE(obj));
406 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000407 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
408}
409
410PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
411 const char *encoding,
412 const char *errors)
413{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000414 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000416 int owned = 0;
417 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418
419 if (obj == NULL) {
420 PyErr_BadInternalCall();
421 return NULL;
422 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000423
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000424#if 0
425 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000426 that no encodings is given and then redirect to
427 PyObject_Unicode() which then applies the additional logic for
428 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000429
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000430 NOTE: This API should really only be used for object which
431 represent *encoded* Unicode !
432
433 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000434 if (PyUnicode_Check(obj)) {
435 if (encoding) {
436 PyErr_SetString(PyExc_TypeError,
437 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000438 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000439 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000440 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000441 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000442#else
443 if (PyUnicode_Check(obj)) {
444 PyErr_SetString(PyExc_TypeError,
445 "decoding Unicode is not supported");
446 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000448#endif
449
450 /* Coerce object */
451 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000452 s = PyString_AS_STRING(obj);
453 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000454 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000455 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
456 /* Overwrite the error message with something more useful in
457 case of a TypeError. */
458 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000459 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000460 "coercing to Unicode: need string or buffer, "
461 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000462 obj->ob_type->tp_name);
463 goto onError;
464 }
465
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000466 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 if (len == 0) {
468 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000469 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000471 else
472 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000473
Greg Steinaf36a3a2000-07-17 09:04:43 +0000474 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000475 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000476 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000477 return v;
478
479 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000480 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000481 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000482 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484}
485
486PyObject *PyUnicode_Decode(const char *s,
487 int size,
488 const char *encoding,
489 const char *errors)
490{
491 PyObject *buffer = NULL, *unicode;
492
Fred Drakee4315f52000-05-09 19:53:39 +0000493 if (encoding == NULL)
494 encoding = PyUnicode_GetDefaultEncoding();
495
496 /* Shortcuts for common default encodings */
497 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000499 else if (strcmp(encoding, "latin-1") == 0)
500 return PyUnicode_DecodeLatin1(s, size, errors);
501 else if (strcmp(encoding, "ascii") == 0)
502 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503
504 /* Decode via the codec registry */
505 buffer = PyBuffer_FromMemory((void *)s, size);
506 if (buffer == NULL)
507 goto onError;
508 unicode = PyCodec_Decode(buffer, encoding, errors);
509 if (unicode == NULL)
510 goto onError;
511 if (!PyUnicode_Check(unicode)) {
512 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000513 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 unicode->ob_type->tp_name);
515 Py_DECREF(unicode);
516 goto onError;
517 }
518 Py_DECREF(buffer);
519 return unicode;
520
521 onError:
522 Py_XDECREF(buffer);
523 return NULL;
524}
525
526PyObject *PyUnicode_Encode(const Py_UNICODE *s,
527 int size,
528 const char *encoding,
529 const char *errors)
530{
531 PyObject *v, *unicode;
532
533 unicode = PyUnicode_FromUnicode(s, size);
534 if (unicode == NULL)
535 return NULL;
536 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
537 Py_DECREF(unicode);
538 return v;
539}
540
541PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
542 const char *encoding,
543 const char *errors)
544{
545 PyObject *v;
546
547 if (!PyUnicode_Check(unicode)) {
548 PyErr_BadArgument();
549 goto onError;
550 }
Fred Drakee4315f52000-05-09 19:53:39 +0000551
552 if (encoding == NULL)
553 encoding = PyUnicode_GetDefaultEncoding();
554
555 /* Shortcuts for common default encodings */
556 if (errors == NULL) {
557 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000558 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000559 else if (strcmp(encoding, "latin-1") == 0)
560 return PyUnicode_AsLatin1String(unicode);
561 else if (strcmp(encoding, "ascii") == 0)
562 return PyUnicode_AsASCIIString(unicode);
563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000564
565 /* Encode via the codec registry */
566 v = PyCodec_Encode(unicode, encoding, errors);
567 if (v == NULL)
568 goto onError;
569 /* XXX Should we really enforce this ? */
570 if (!PyString_Check(v)) {
571 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000572 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573 v->ob_type->tp_name);
574 Py_DECREF(v);
575 goto onError;
576 }
577 return v;
578
579 onError:
580 return NULL;
581}
582
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000583PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
584 const char *errors)
585{
586 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
587
588 if (v)
589 return v;
590 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
591 if (v && errors == NULL)
592 ((PyUnicodeObject *)unicode)->defenc = v;
593 return v;
594}
595
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
597{
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
601 }
602 return PyUnicode_AS_UNICODE(unicode);
603
604 onError:
605 return NULL;
606}
607
608int PyUnicode_GetSize(PyObject *unicode)
609{
610 if (!PyUnicode_Check(unicode)) {
611 PyErr_BadArgument();
612 goto onError;
613 }
614 return PyUnicode_GET_SIZE(unicode);
615
616 onError:
617 return -1;
618}
619
Thomas Wouters78890102000-07-22 19:25:51 +0000620const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000621{
622 return unicode_default_encoding;
623}
624
625int PyUnicode_SetDefaultEncoding(const char *encoding)
626{
627 PyObject *v;
628
629 /* Make sure the encoding is valid. As side effect, this also
630 loads the encoding into the codec registry cache. */
631 v = _PyCodec_Lookup(encoding);
632 if (v == NULL)
633 goto onError;
634 Py_DECREF(v);
635 strncpy(unicode_default_encoding,
636 encoding,
637 sizeof(unicode_default_encoding));
638 return 0;
639
640 onError:
641 return -1;
642}
643
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000644/* --- UTF-7 Codec -------------------------------------------------------- */
645
646/* see RFC2152 for details */
647
/* Classification of the 128 ASCII code points for the UTF-7 codec.
   Each entry tells whether the character is special, i.e. cannot be
   directly encoded:

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   The table is indexed by character code; each row covers sixteen
   consecutive codes. */
static char utf7_special[128] = {
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
666
/* SPECIAL(c, encodeO, encodeWS): true when character `c` must not be
   written directly in UTF-7: it is above 127, is marked 1 in
   utf7_special, or is marked as whitespace / Set O while the matching
   flag asks for those to be encoded as well.  `c` is evaluated more
   than once. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of `n`. */
#define B64(n)  ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when `c` is one of the 64 base64 digits.  The test
   is spelled out with explicit ASCII ranges instead of isalnum(),
   which is undefined for values that do not fit an unsigned char
   (Py_UNICODE values can be larger) and whose answer depends on the
   current locale. */
#define B64CHAR(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')

/* UB64(c): numeric value (0..63) of base64 digit `c`.  Only
   meaningful when B64CHAR(c) is true. */
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits): while at least six bits are pending in `ch`,
   emit the topmost six as a base64 digit through `out` and lower
   `bits` accordingly.  `out` and `bits` are modified in place. */
#define ENCODE(out, ch, bits) \
    while ((bits) >= 6) { \
        *(out)++ = B64((ch) >> ((bits)-6)); \
        (bits) -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in `ch`, take the topmost sixteen as one character and
   store it through `out`.  Values in 0xDC00..0xDFFF are reported via
   the `errmsg` variable and the `utf7Error` label, both of which must
   exist at the point of use. */
#define DECODE(out, ch, bits, surrogate) \
    while ((bits) >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits)-16)) & 0xffff); \
        (bits) -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            (surrogate) = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            (surrogate) = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *(out)++ = outCh; \
        } \
    }
701
702static
703int utf7_decoding_error(Py_UNICODE **dest,
704 const char *errors,
705 const char *details)
706{
707 if ((errors == NULL) ||
708 (strcmp(errors,"strict") == 0)) {
709 PyErr_Format(PyExc_UnicodeError,
710 "UTF-7 decoding error: %.400s",
711 details);
712 return -1;
713 }
714 else if (strcmp(errors,"ignore") == 0) {
715 return 0;
716 }
717 else if (strcmp(errors,"replace") == 0) {
718 if (dest != NULL) {
719 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
720 (*dest)++;
721 }
722 return 0;
723 }
724 else {
725 PyErr_Format(PyExc_ValueError,
726 "UTF-7 decoding error; unknown error handling code: %.400s",
727 errors);
728 return -1;
729 }
730}
731
732PyObject *PyUnicode_DecodeUTF7(const char *s,
733 int size,
734 const char *errors)
735{
736 const char *e;
737 PyUnicodeObject *unicode;
738 Py_UNICODE *p;
739 const char *errmsg = "";
740 int inShift = 0;
741 unsigned int bitsleft = 0;
742 unsigned long charsleft = 0;
743 int surrogate = 0;
744
745 unicode = _PyUnicode_New(size);
746 if (!unicode)
747 return NULL;
748 if (size == 0)
749 return (PyObject *)unicode;
750
751 p = unicode->str;
752 e = s + size;
753
754 while (s < e) {
755 Py_UNICODE ch = *s;
756
757 if (inShift) {
758 if ((ch == '-') || !B64CHAR(ch)) {
759 inShift = 0;
760 s++;
761
762 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
763 if (bitsleft >= 6) {
764 /* The shift sequence has a partial character in it. If
765 bitsleft < 6 then we could just classify it as padding
766 but that is not the case here */
767
768 errmsg = "partial character in shift sequence";
769 goto utf7Error;
770 }
771 /* According to RFC2152 the remaining bits should be zero. We
772 choose to signal an error/insert a replacement character
773 here so indicate the potential of a misencoded character. */
774
775 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
776 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
777 errmsg = "non-zero padding bits in shift sequence";
778 goto utf7Error;
779 }
780
781 if (ch == '-') {
782 if ((s < e) && (*(s) == '-')) {
783 *p++ = '-';
784 inShift = 1;
785 }
786 } else if (SPECIAL(ch,0,0)) {
787 errmsg = "unexpected special character";
788 goto utf7Error;
789 } else {
790 *p++ = ch;
791 }
792 } else {
793 charsleft = (charsleft << 6) | UB64(ch);
794 bitsleft += 6;
795 s++;
796 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
797 }
798 }
799 else if ( ch == '+' ) {
800 s++;
801 if (s < e && *s == '-') {
802 s++;
803 *p++ = '+';
804 } else
805 {
806 inShift = 1;
807 bitsleft = 0;
808 }
809 }
810 else if (SPECIAL(ch,0,0)) {
811 errmsg = "unexpected special character";
812 s++;
813 goto utf7Error;
814 }
815 else {
816 *p++ = ch;
817 s++;
818 }
819 continue;
820 utf7Error:
821 if (utf7_decoding_error(&p, errors, errmsg))
822 goto onError;
823 }
824
825 if (inShift) {
826 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
827 goto onError;
828 }
829
830 if (_PyUnicode_Resize(&unicode, p - unicode->str))
831 goto onError;
832
833 return (PyObject *)unicode;
834
835onError:
836 Py_DECREF(unicode);
837 return NULL;
838}
839
840
841PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
842 int size,
843 int encodeSetO,
844 int encodeWhiteSpace,
845 const char *errors)
846{
847 PyObject *v;
848 /* It might be possible to tighten this worst case */
849 unsigned int cbAllocated = 5 * size;
850 int inShift = 0;
851 int i = 0;
852 unsigned int bitsleft = 0;
853 unsigned long charsleft = 0;
854 char * out;
855 char * start;
856
857 if (size == 0)
858 return PyString_FromStringAndSize(NULL, 0);
859
860 v = PyString_FromStringAndSize(NULL, cbAllocated);
861 if (v == NULL)
862 return NULL;
863
864 start = out = PyString_AS_STRING(v);
865 for (;i < size; ++i) {
866 Py_UNICODE ch = s[i];
867
868 if (!inShift) {
869 if (ch == '+') {
870 *out++ = '+';
871 *out++ = '-';
872 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
873 charsleft = ch;
874 bitsleft = 16;
875 *out++ = '+';
876 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
877 inShift = bitsleft > 0;
878 } else {
879 *out++ = (char) ch;
880 }
881 } else {
882 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
883 *out++ = B64(charsleft << (6-bitsleft));
884 charsleft = 0;
885 bitsleft = 0;
886 /* Characters not in the BASE64 set implicitly unshift the sequence
887 so no '-' is required, except if the character is itself a '-' */
888 if (B64CHAR(ch) || ch == '-') {
889 *out++ = '-';
890 }
891 inShift = 0;
892 *out++ = (char) ch;
893 } else {
894 bitsleft += 16;
895 charsleft = (charsleft << 16) | ch;
896 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
897
898 /* If the next character is special then we dont' need to terminate
899 the shift sequence. If the next character is not a BASE64 character
900 or '-' then the shift sequence will be terminated implicitly and we
901 don't have to insert a '-'. */
902
903 if (bitsleft == 0) {
904 if (i + 1 < size) {
905 Py_UNICODE ch2 = s[i+1];
906
907 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
908
909 } else if (B64CHAR(ch2) || ch2 == '-') {
910 *out++ = '-';
911 inShift = 0;
912 } else {
913 inShift = 0;
914 }
915
916 }
917 else {
918 *out++ = '-';
919 inShift = 0;
920 }
921 }
922 }
923 }
924 }
925 if (bitsleft) {
926 *out++= B64(charsleft << (6-bitsleft) );
927 *out++ = '-';
928 }
929
930 if (_PyString_Resize(&v, out - start)) {
931 Py_DECREF(v);
932 return NULL;
933 }
934 return v;
935}
936
937#undef SPECIAL
938#undef B64
939#undef B64CHAR
940#undef UB64
941#undef ENCODE
942#undef DECODE
943
Guido van Rossumd57fd912000-03-10 22:53:23 +0000944/* --- UTF-8 Codec -------------------------------------------------------- */
945
/* Map a UTF-8 lead (prefix) byte to the total number of bytes in the
   sequence it introduces; see RFC 2279 for details.

   A value of zero marks a byte that cannot start a sequence: the
   continuation bytes 0x80..0xBF and the bytes 0xFE/0xFF.  Lengths 5 and
   6 are listed for completeness of the RFC 2279 scheme.

   The table is only ever indexed for reading, hence the const
   qualifier. */
static
const char utf8_code_length[256] = {
    /* 0x00..0x7F: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0..0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0..0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0..0xFF: four, five and six byte sequences; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
967
968static
969int utf8_decoding_error(const char **source,
970 Py_UNICODE **dest,
971 const char *errors,
972 const char *details)
973{
974 if ((errors == NULL) ||
975 (strcmp(errors,"strict") == 0)) {
976 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000977 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000978 details);
979 return -1;
980 }
981 else if (strcmp(errors,"ignore") == 0) {
982 (*source)++;
983 return 0;
984 }
985 else if (strcmp(errors,"replace") == 0) {
986 (*source)++;
987 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
988 (*dest)++;
989 return 0;
990 }
991 else {
992 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000993 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000994 errors);
995 return -1;
996 }
997}
998
Guido van Rossumd57fd912000-03-10 22:53:23 +0000999PyObject *PyUnicode_DecodeUTF8(const char *s,
1000 int size,
1001 const char *errors)
1002{
1003 int n;
1004 const char *e;
1005 PyUnicodeObject *unicode;
1006 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001007 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008
1009 /* Note: size will always be longer than the resulting Unicode
1010 character count */
1011 unicode = _PyUnicode_New(size);
1012 if (!unicode)
1013 return NULL;
1014 if (size == 0)
1015 return (PyObject *)unicode;
1016
1017 /* Unpack UTF-8 encoded data */
1018 p = unicode->str;
1019 e = s + size;
1020
1021 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001022 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023
1024 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001025 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026 s++;
1027 continue;
1028 }
1029
1030 n = utf8_code_length[ch];
1031
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001032 if (s + n > e) {
1033 errmsg = "unexpected end of data";
1034 goto utf8Error;
1035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036
1037 switch (n) {
1038
1039 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001040 errmsg = "unexpected code byte";
1041 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042
1043 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001044 errmsg = "internal error";
1045 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046
1047 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001048 if ((s[1] & 0xc0) != 0x80) {
1049 errmsg = "invalid data";
1050 goto utf8Error;
1051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001052 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001053 if (ch < 0x80) {
1054 errmsg = "illegal encoding";
1055 goto utf8Error;
1056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001058 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059 break;
1060
1061 case 3:
1062 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001063 (s[2] & 0xc0) != 0x80) {
1064 errmsg = "invalid data";
1065 goto utf8Error;
1066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001068 if (ch < 0x0800) {
1069 /* Note: UTF-8 encodings of surrogates are considered
1070 legal UTF-8 sequences;
1071
1072 XXX For wide builds (UCS-4) we should probably try
1073 to recombine the surrogates into a single code
1074 unit.
1075 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001076 errmsg = "illegal encoding";
1077 goto utf8Error;
1078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001080 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001081 break;
1082
1083 case 4:
1084 if ((s[1] & 0xc0) != 0x80 ||
1085 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001086 (s[3] & 0xc0) != 0x80) {
1087 errmsg = "invalid data";
1088 goto utf8Error;
1089 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001090 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1091 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1092 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001093 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001094 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001095 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001096 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001097 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001098 errmsg = "illegal encoding";
1099 goto utf8Error;
1100 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001101#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001102 *p++ = (Py_UNICODE)ch;
1103#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001104 /* compute and append the two surrogates: */
1105
1106 /* translate from 10000..10FFFF to 0..FFFF */
1107 ch -= 0x10000;
1108
1109 /* high surrogate = top 10 bits added to D800 */
1110 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1111
1112 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001113 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001114#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 break;
1116
1117 default:
1118 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001119 errmsg = "unsupported Unicode code range";
1120 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 }
1122 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001123 continue;
1124
1125 utf8Error:
1126 if (utf8_decoding_error(&s, &p, errors, errmsg))
1127 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128 }
1129
1130 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001131 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132 goto onError;
1133
1134 return (PyObject *)unicode;
1135
1136onError:
1137 Py_DECREF(unicode);
1138 return NULL;
1139}
1140
/* Not used anymore, now that the encoder supports UTF-16
   surrogates.  Kept disabled for reference. */
#if 0
/* Apply the error handling policy named by `errors` to a UTF-8
   encoding problem described by `details`.

   Returns -1 with an exception set for "strict" (or NULL) and for an
   unknown policy; returns 0 for "ignore" and for "replace", the latter
   after storing '?' at **dest and advancing *dest. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || !strcmp(errors, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (!strcmp(errors, "ignore"))
        return 0;
    if (!strcmp(errors, "replace")) {
        **dest = '?';
        *dest += 1;
        return 0;
    }
    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174
1175PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1176 int size,
1177 const char *errors)
1178{
1179 PyObject *v;
1180 char *p;
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001181 unsigned int allocated = 0;
1182 int i;
1183
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001184 /* Short-cut for emtpy strings */
1185 if (size == 0)
1186 return PyString_FromStringAndSize(NULL, 0);
1187
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001188 for (i = 0; i < size; ) {
1189 Py_UCS4 ch = s[i++];
1190 if (ch < 0x80)
1191 allocated += 1;
1192 else if (ch < 0x0800)
1193 allocated += 2;
1194 else if (ch < 0x10000) {
1195 /* Check for high surrogate */
1196 if (0xD800 <= ch && ch <= 0xDBFF &&
1197 i != size &&
1198 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
1199 allocated += 1;
1200 i++;
1201 }
1202 allocated += 3;
1203 } else
1204 allocated += 4;
1205 }
1206
1207 v = PyString_FromStringAndSize(NULL, allocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208 if (v == NULL)
1209 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001210
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001211 p = PyString_AS_STRING(v);
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001212 for (i = 0; i < size; ) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001213 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001214
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001215 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001216 *p++ = (char) ch;
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001217 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001218
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 else if (ch < 0x0800) {
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001220 *p++ = (char)(0xc0 | (ch >> 6));
1221 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001222 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001223
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001224 else {
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001225
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001226 if (ch < 0x10000) {
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001227 /* Check for high surrogate */
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001228 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1229 Py_UCS4 ch2 = s[i];
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001230 /* Check for low surrogate */
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001231 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001232 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
1233 *p++ = (char)((ch >> 18) | 0xf0);
1234 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1235 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1236 *p++ = (char)(0x80 | (ch & 0x3f));
1237 i++;
1238 continue;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001239 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001240 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001241 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001242 *p++ = (char)(0xe0 | (ch >> 12));
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001243 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1244 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001245
1246 } else {
1247 *p++ = (char)(0xf0 | (ch>>18));
1248 *p++ = (char)(0x80 | ((ch>>12) & 0x3f));
1249 *p++ = (char)(0x80 | ((ch>>6) & 0x3f));
1250 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001251 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001252 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 }
Martin v. Löwisa4eb14b2002-04-20 13:44:01 +00001254 assert(p - PyString_AS_STRING(v) == allocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001255 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256}
1257
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1259{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001260 if (!PyUnicode_Check(unicode)) {
1261 PyErr_BadArgument();
1262 return NULL;
1263 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001264 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1265 PyUnicode_GET_SIZE(unicode),
1266 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001267}
1268
1269/* --- UTF-16 Codec ------------------------------------------------------- */
1270
1271static
Tim Peters772747b2001-08-09 22:21:55 +00001272int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001273 const char *errors,
1274 const char *details)
1275{
1276 if ((errors == NULL) ||
1277 (strcmp(errors,"strict") == 0)) {
1278 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001279 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001280 details);
1281 return -1;
1282 }
1283 else if (strcmp(errors,"ignore") == 0) {
1284 return 0;
1285 }
1286 else if (strcmp(errors,"replace") == 0) {
1287 if (dest) {
1288 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1289 (*dest)++;
1290 }
1291 return 0;
1292 }
1293 else {
1294 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001295 "UTF-16 decoding error; "
1296 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001297 errors);
1298 return -1;
1299 }
1300}
1301
Tim Peters772747b2001-08-09 22:21:55 +00001302PyObject *
1303PyUnicode_DecodeUTF16(const char *s,
1304 int size,
1305 const char *errors,
1306 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001307{
1308 PyUnicodeObject *unicode;
1309 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001310 const unsigned char *q, *e;
1311 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001312 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001313 /* Offsets from q for retrieving byte pairs in the right order. */
1314#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1315 int ihi = 1, ilo = 0;
1316#else
1317 int ihi = 0, ilo = 1;
1318#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001319
1320 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001321 if (size & 1) {
1322 if (utf16_decoding_error(NULL, errors, "truncated data"))
1323 return NULL;
1324 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001325 }
1326
1327 /* Note: size will always be longer than the resulting Unicode
1328 character count */
1329 unicode = _PyUnicode_New(size);
1330 if (!unicode)
1331 return NULL;
1332 if (size == 0)
1333 return (PyObject *)unicode;
1334
1335 /* Unpack UTF-16 encoded data */
1336 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001337 q = (unsigned char *)s;
1338 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001339
1340 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001341 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001342
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001343 /* Check for BOM marks (U+FEFF) in the input and adjust current
1344 byte order setting accordingly. In native mode, the leading BOM
1345 mark is skipped, in all other modes, it is copied to the output
1346 stream as-is (giving a ZWNBSP character). */
1347 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001348 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001349#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001350 if (bom == 0xFEFF) {
1351 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001352 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001353 }
1354 else if (bom == 0xFFFE) {
1355 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001356 bo = 1;
1357 }
1358#else
Tim Peters772747b2001-08-09 22:21:55 +00001359 if (bom == 0xFEFF) {
1360 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001361 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001362 }
1363 else if (bom == 0xFFFE) {
1364 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001365 bo = -1;
1366 }
1367#endif
1368 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001369
Tim Peters772747b2001-08-09 22:21:55 +00001370 if (bo == -1) {
1371 /* force LE */
1372 ihi = 1;
1373 ilo = 0;
1374 }
1375 else if (bo == 1) {
1376 /* force BE */
1377 ihi = 0;
1378 ilo = 1;
1379 }
1380
1381 while (q < e) {
1382 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1383 q += 2;
1384
Guido van Rossumd57fd912000-03-10 22:53:23 +00001385 if (ch < 0xD800 || ch > 0xDFFF) {
1386 *p++ = ch;
1387 continue;
1388 }
1389
1390 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001391 if (q >= e) {
1392 errmsg = "unexpected end of data";
1393 goto utf16Error;
1394 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001395 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001396 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1397 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001398 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001399#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001400 *p++ = ch;
1401 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001402#else
1403 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001404#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001405 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001406 }
1407 else {
1408 errmsg = "illegal UTF-16 surrogate";
1409 goto utf16Error;
1410 }
1411
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001413 errmsg = "illegal encoding";
1414 /* Fall through to report the error */
1415
1416 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001417 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001418 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001419 }
1420
1421 if (byteorder)
1422 *byteorder = bo;
1423
1424 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001425 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001426 goto onError;
1427
1428 return (PyObject *)unicode;
1429
1430onError:
1431 Py_DECREF(unicode);
1432 return NULL;
1433}
1434
Tim Peters772747b2001-08-09 22:21:55 +00001435PyObject *
1436PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1437 int size,
1438 const char *errors,
1439 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001440{
1441 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001442 unsigned char *p;
1443 int i, pairs;
1444 /* Offsets from p for storing byte pairs in the right order. */
1445#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1446 int ihi = 1, ilo = 0;
1447#else
1448 int ihi = 0, ilo = 1;
1449#endif
1450
1451#define STORECHAR(CH) \
1452 do { \
1453 p[ihi] = ((CH) >> 8) & 0xff; \
1454 p[ilo] = (CH) & 0xff; \
1455 p += 2; \
1456 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001457
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001458 for (i = pairs = 0; i < size; i++)
1459 if (s[i] >= 0x10000)
1460 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001461 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001462 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001463 if (v == NULL)
1464 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001465
Tim Peters772747b2001-08-09 22:21:55 +00001466 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001467 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001468 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001469 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001470 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001471
1472 if (byteorder == -1) {
1473 /* force LE */
1474 ihi = 1;
1475 ilo = 0;
1476 }
1477 else if (byteorder == 1) {
1478 /* force BE */
1479 ihi = 0;
1480 ilo = 1;
1481 }
1482
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001483 while (size-- > 0) {
1484 Py_UNICODE ch = *s++;
1485 Py_UNICODE ch2 = 0;
1486 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001487 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1488 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001489 }
Tim Peters772747b2001-08-09 22:21:55 +00001490 STORECHAR(ch);
1491 if (ch2)
1492 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001493 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001495#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496}
1497
1498PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1499{
1500 if (!PyUnicode_Check(unicode)) {
1501 PyErr_BadArgument();
1502 return NULL;
1503 }
1504 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1505 PyUnicode_GET_SIZE(unicode),
1506 NULL,
1507 0);
1508}
1509
1510/* --- Unicode Escape Codec ----------------------------------------------- */
1511
1512static
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001513int unicodeescape_decoding_error(Py_UNICODE **x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001514 const char *errors,
1515 const char *details)
1516{
1517 if ((errors == NULL) ||
1518 (strcmp(errors,"strict") == 0)) {
1519 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001520 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001521 details);
1522 return -1;
1523 }
1524 else if (strcmp(errors,"ignore") == 0) {
1525 return 0;
1526 }
1527 else if (strcmp(errors,"replace") == 0) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001528 **x = Py_UNICODE_REPLACEMENT_CHARACTER;
1529 (*x)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001530 return 0;
1531 }
1532 else {
1533 PyErr_Format(PyExc_ValueError,
1534 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001535 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001536 errors);
1537 return -1;
1538 }
1539}
1540
Fredrik Lundh06d12682001-01-24 07:59:11 +00001541static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001542
Guido van Rossumd57fd912000-03-10 22:53:23 +00001543PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1544 int size,
1545 const char *errors)
1546{
1547 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001548 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001550 char* message;
1551 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1552
Guido van Rossumd57fd912000-03-10 22:53:23 +00001553 /* Escaped strings will always be longer than the resulting
1554 Unicode string, so we start with size here and then reduce the
1555 length after conversion to the true value. */
1556 v = _PyUnicode_New(size);
1557 if (v == NULL)
1558 goto onError;
1559 if (size == 0)
1560 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001561
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562 p = buf = PyUnicode_AS_UNICODE(v);
1563 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001564
Guido van Rossumd57fd912000-03-10 22:53:23 +00001565 while (s < end) {
1566 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001567 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001568 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569
1570 /* Non-escape characters are interpreted as Unicode ordinals */
1571 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001572 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001573 continue;
1574 }
1575
1576 /* \ - Escapes */
1577 s++;
1578 switch (*s++) {
1579
1580 /* \x escapes */
1581 case '\n': break;
1582 case '\\': *p++ = '\\'; break;
1583 case '\'': *p++ = '\''; break;
1584 case '\"': *p++ = '\"'; break;
1585 case 'b': *p++ = '\b'; break;
1586 case 'f': *p++ = '\014'; break; /* FF */
1587 case 't': *p++ = '\t'; break;
1588 case 'n': *p++ = '\n'; break;
1589 case 'r': *p++ = '\r'; break;
1590 case 'v': *p++ = '\013'; break; /* VT */
1591 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1592
1593 /* \OOO (octal) escapes */
1594 case '0': case '1': case '2': case '3':
1595 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001596 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001597 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001598 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001599 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001600 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001601 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001602 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001603 break;
1604
Fredrik Lundhccc74732001-02-18 22:13:49 +00001605 /* hex escapes */
1606 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001607 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001608 digits = 2;
1609 message = "truncated \\xXX escape";
1610 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611
Fredrik Lundhccc74732001-02-18 22:13:49 +00001612 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001613 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001614 digits = 4;
1615 message = "truncated \\uXXXX escape";
1616 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001617
Fredrik Lundhccc74732001-02-18 22:13:49 +00001618 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001619 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001620 digits = 8;
1621 message = "truncated \\UXXXXXXXX escape";
1622 hexescape:
1623 chr = 0;
1624 for (i = 0; i < digits; i++) {
1625 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001626 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001627 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001628 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001629 chr = 0xffffffff;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001630 i++;
1631 break;
1632 }
1633 chr = (chr<<4) & ~0xF;
1634 if (c >= '0' && c <= '9')
1635 chr += c - '0';
1636 else if (c >= 'a' && c <= 'f')
1637 chr += 10 + c - 'a';
1638 else
1639 chr += 10 + c - 'A';
1640 }
1641 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001642 if (chr == 0xffffffff)
1643 /* _decoding_error will have already written into the
1644 target buffer. */
1645 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001646 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001647 /* when we get here, chr is a 32-bit unicode character */
1648 if (chr <= 0xffff)
1649 /* UCS-2 character */
1650 *p++ = (Py_UNICODE) chr;
1651 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001652 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001653 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001654#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001655 *p++ = chr;
1656#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001657 chr -= 0x10000L;
1658 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001659 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001660#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001661 } else {
1662 if (unicodeescape_decoding_error(
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001663 &p, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001664 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001665 )
1666 goto onError;
1667 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001668 break;
1669
1670 /* \N{name} */
1671 case 'N':
1672 message = "malformed \\N character escape";
1673 if (ucnhash_CAPI == NULL) {
1674 /* load the unicode data module */
1675 PyObject *m, *v;
1676 m = PyImport_ImportModule("unicodedata");
1677 if (m == NULL)
1678 goto ucnhashError;
1679 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1680 Py_DECREF(m);
1681 if (v == NULL)
1682 goto ucnhashError;
1683 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1684 Py_DECREF(v);
1685 if (ucnhash_CAPI == NULL)
1686 goto ucnhashError;
1687 }
1688 if (*s == '{') {
1689 const char *start = s+1;
1690 /* look for the closing brace */
1691 while (*s != '}' && s < end)
1692 s++;
1693 if (s > start && s < end && *s == '}') {
1694 /* found a name. look it up in the unicode database */
1695 message = "unknown Unicode character name";
1696 s++;
1697 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1698 goto store;
1699 }
1700 }
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001701 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001702 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001703 break;
1704
1705 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001706 if (s > end) {
1707 if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
1708 goto onError;
1709 }
1710 else {
1711 *p++ = '\\';
1712 *p++ = (unsigned char)s[-1];
1713 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001714 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001715 }
1716 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001717 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Walter Dörwald8c077222002-03-25 11:16:18 +00001718 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001719 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001720
Fredrik Lundhccc74732001-02-18 22:13:49 +00001721ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001722 PyErr_SetString(
1723 PyExc_UnicodeError,
1724 "\\N escapes not supported (can't load unicodedata module)"
1725 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001726 return NULL;
1727
Fredrik Lundhccc74732001-02-18 22:13:49 +00001728onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001729 Py_XDECREF(v);
1730 return NULL;
1731}
1732
1733/* Return a Unicode-Escape string version of the Unicode object.
1734
1735 If quotes is true, the string is enclosed in u"" or u'' quotes as
1736 appropriate.
1737
1738*/
1739
Barry Warsaw51ac5802000-03-20 16:36:48 +00001740static const Py_UNICODE *findchar(const Py_UNICODE *s,
1741 int size,
1742 Py_UNICODE ch);
1743
Guido van Rossumd57fd912000-03-10 22:53:23 +00001744static
1745PyObject *unicodeescape_string(const Py_UNICODE *s,
1746 int size,
1747 int quotes)
1748{
1749 PyObject *repr;
1750 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001752 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001753
1754 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1755 if (repr == NULL)
1756 return NULL;
1757
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001758 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759
1760 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001761 *p++ = 'u';
1762 *p++ = (findchar(s, size, '\'') &&
1763 !findchar(s, size, '"')) ? '"' : '\'';
1764 }
1765 while (size-- > 0) {
1766 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001767
Guido van Rossumd57fd912000-03-10 22:53:23 +00001768 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001769 if (quotes &&
1770 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771 *p++ = '\\';
1772 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001773 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001775
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001776#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001777 /* Map 21-bit characters to '\U00xxxxxx' */
1778 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001779 int offset = p - PyString_AS_STRING(repr);
1780
1781 /* Resize the string if necessary */
1782 if (offset + 12 > PyString_GET_SIZE(repr)) {
1783 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1784 goto onError;
1785 p = PyString_AS_STRING(repr) + offset;
1786 }
1787
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001788 *p++ = '\\';
1789 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001790 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1791 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1792 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1793 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1794 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1795 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1796 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001797 *p++ = hexdigit[ch & 0x0000000F];
1798 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001799 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001800#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001801 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1802 else if (ch >= 0xD800 && ch < 0xDC00) {
1803 Py_UNICODE ch2;
1804 Py_UCS4 ucs;
1805
1806 ch2 = *s++;
1807 size--;
1808 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1809 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1810 *p++ = '\\';
1811 *p++ = 'U';
1812 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1813 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1814 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1815 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1816 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1817 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1818 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1819 *p++ = hexdigit[ucs & 0x0000000F];
1820 continue;
1821 }
1822 /* Fall through: isolated surrogates are copied as-is */
1823 s--;
1824 size++;
1825 }
1826
Guido van Rossumd57fd912000-03-10 22:53:23 +00001827 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001828 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001829 *p++ = '\\';
1830 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001831 *p++ = hexdigit[(ch >> 12) & 0x000F];
1832 *p++ = hexdigit[(ch >> 8) & 0x000F];
1833 *p++ = hexdigit[(ch >> 4) & 0x000F];
1834 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001836
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001837 /* Map special whitespace to '\t', \n', '\r' */
1838 else if (ch == '\t') {
1839 *p++ = '\\';
1840 *p++ = 't';
1841 }
1842 else if (ch == '\n') {
1843 *p++ = '\\';
1844 *p++ = 'n';
1845 }
1846 else if (ch == '\r') {
1847 *p++ = '\\';
1848 *p++ = 'r';
1849 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001850
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001851 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001852 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001853 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001854 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001855 *p++ = hexdigit[(ch >> 4) & 0x000F];
1856 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001857 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001858
Guido van Rossumd57fd912000-03-10 22:53:23 +00001859 /* Copy everything else as-is */
1860 else
1861 *p++ = (char) ch;
1862 }
1863 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001864 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001865
1866 *p = '\0';
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001867 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001868 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001869
1870 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001871
1872 onError:
1873 Py_DECREF(repr);
1874 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875}
1876
1877PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1878 int size)
1879{
1880 return unicodeescape_string(s, size, 0);
1881}
1882
1883PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1884{
1885 if (!PyUnicode_Check(unicode)) {
1886 PyErr_BadArgument();
1887 return NULL;
1888 }
1889 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1890 PyUnicode_GET_SIZE(unicode));
1891}
1892
1893/* --- Raw Unicode Escape Codec ------------------------------------------- */
1894
1895PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1896 int size,
1897 const char *errors)
1898{
1899 PyUnicodeObject *v;
1900 Py_UNICODE *p, *buf;
1901 const char *end;
1902 const char *bs;
1903
1904 /* Escaped strings will always be longer than the resulting
1905 Unicode string, so we start with size here and then reduce the
1906 length after conversion to the true value. */
1907 v = _PyUnicode_New(size);
1908 if (v == NULL)
1909 goto onError;
1910 if (size == 0)
1911 return (PyObject *)v;
1912 p = buf = PyUnicode_AS_UNICODE(v);
1913 end = s + size;
1914 while (s < end) {
1915 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001916 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917 int i;
1918
1919 /* Non-escape characters are interpreted as Unicode ordinals */
1920 if (*s != '\\') {
1921 *p++ = (unsigned char)*s++;
1922 continue;
1923 }
1924
1925 /* \u-escapes are only interpreted iff the number of leading
1926 backslashes if odd */
1927 bs = s;
1928 for (;s < end;) {
1929 if (*s != '\\')
1930 break;
1931 *p++ = (unsigned char)*s++;
1932 }
1933 if (((s - bs) & 1) == 0 ||
1934 s >= end ||
1935 *s != 'u') {
1936 continue;
1937 }
1938 p--;
1939 s++;
1940
1941 /* \uXXXX with 4 hex digits */
1942 for (x = 0, i = 0; i < 4; i++) {
1943 c = (unsigned char)s[i];
1944 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001945 if (unicodeescape_decoding_error(&p, errors,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001946 "truncated \\uXXXX"))
1947 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001948 x = 0xffffffff;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001949 i++;
1950 break;
1951 }
1952 x = (x<<4) & ~0xF;
1953 if (c >= '0' && c <= '9')
1954 x += c - '0';
1955 else if (c >= 'a' && c <= 'f')
1956 x += 10 + c - 'a';
1957 else
1958 x += 10 + c - 'A';
1959 }
1960 s += i;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001961 if (x != 0xffffffff)
1962 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001963 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001964 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001965 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966 return (PyObject *)v;
1967
1968 onError:
1969 Py_XDECREF(v);
1970 return NULL;
1971}
1972
1973PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1974 int size)
1975{
1976 PyObject *repr;
1977 char *p;
1978 char *q;
1979
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001980 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001981
1982 repr = PyString_FromStringAndSize(NULL, 6 * size);
1983 if (repr == NULL)
1984 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001985 if (size == 0)
1986 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987
1988 p = q = PyString_AS_STRING(repr);
1989 while (size-- > 0) {
1990 Py_UNICODE ch = *s++;
1991 /* Map 16-bit characters to '\uxxxx' */
1992 if (ch >= 256) {
1993 *p++ = '\\';
1994 *p++ = 'u';
1995 *p++ = hexdigit[(ch >> 12) & 0xf];
1996 *p++ = hexdigit[(ch >> 8) & 0xf];
1997 *p++ = hexdigit[(ch >> 4) & 0xf];
1998 *p++ = hexdigit[ch & 15];
1999 }
2000 /* Copy everything else as-is */
2001 else
2002 *p++ = (char) ch;
2003 }
2004 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002005 if (_PyString_Resize(&repr, p - q))
2006 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002007
2008 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002009
2010 onError:
2011 Py_DECREF(repr);
2012 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013}
2014
2015PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2016{
2017 if (!PyUnicode_Check(unicode)) {
2018 PyErr_BadArgument();
2019 return NULL;
2020 }
2021 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2022 PyUnicode_GET_SIZE(unicode));
2023}
2024
2025/* --- Latin-1 Codec ------------------------------------------------------ */
2026
2027PyObject *PyUnicode_DecodeLatin1(const char *s,
2028 int size,
2029 const char *errors)
2030{
2031 PyUnicodeObject *v;
2032 Py_UNICODE *p;
2033
2034 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002035 if (size == 1 && *(unsigned char*)s < 256) {
2036 Py_UNICODE r = *(unsigned char*)s;
2037 return PyUnicode_FromUnicode(&r, 1);
2038 }
2039
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040 v = _PyUnicode_New(size);
2041 if (v == NULL)
2042 goto onError;
2043 if (size == 0)
2044 return (PyObject *)v;
2045 p = PyUnicode_AS_UNICODE(v);
2046 while (size-- > 0)
2047 *p++ = (unsigned char)*s++;
2048 return (PyObject *)v;
2049
2050 onError:
2051 Py_XDECREF(v);
2052 return NULL;
2053}
2054
2055static
2056int latin1_encoding_error(const Py_UNICODE **source,
2057 char **dest,
2058 const char *errors,
2059 const char *details)
2060{
2061 if ((errors == NULL) ||
2062 (strcmp(errors,"strict") == 0)) {
2063 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002064 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 details);
2066 return -1;
2067 }
2068 else if (strcmp(errors,"ignore") == 0) {
2069 return 0;
2070 }
2071 else if (strcmp(errors,"replace") == 0) {
2072 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002073 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002074 return 0;
2075 }
2076 else {
2077 PyErr_Format(PyExc_ValueError,
2078 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002079 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002080 errors);
2081 return -1;
2082 }
2083}
2084
2085PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2086 int size,
2087 const char *errors)
2088{
2089 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002090 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002091
Guido van Rossumd57fd912000-03-10 22:53:23 +00002092 repr = PyString_FromStringAndSize(NULL, size);
2093 if (repr == NULL)
2094 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002095 if (size == 0)
2096 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002097
2098 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002099 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100 while (size-- > 0) {
2101 Py_UNICODE ch = *p++;
2102 if (ch >= 256) {
2103 if (latin1_encoding_error(&p, &s, errors,
2104 "ordinal not in range(256)"))
2105 goto onError;
2106 }
2107 else
2108 *s++ = (char)ch;
2109 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002110 /* Resize if error handling skipped some characters */
2111 if (s - start < PyString_GET_SIZE(repr))
2112 if (_PyString_Resize(&repr, s - start))
2113 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002114 return repr;
2115
2116 onError:
2117 Py_DECREF(repr);
2118 return NULL;
2119}
2120
2121PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2122{
2123 if (!PyUnicode_Check(unicode)) {
2124 PyErr_BadArgument();
2125 return NULL;
2126 }
2127 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2128 PyUnicode_GET_SIZE(unicode),
2129 NULL);
2130}
2131
2132/* --- 7-bit ASCII Codec -------------------------------------------------- */
2133
2134static
2135int ascii_decoding_error(const char **source,
2136 Py_UNICODE **dest,
2137 const char *errors,
2138 const char *details)
2139{
2140 if ((errors == NULL) ||
2141 (strcmp(errors,"strict") == 0)) {
2142 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002143 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 details);
2145 return -1;
2146 }
2147 else if (strcmp(errors,"ignore") == 0) {
2148 return 0;
2149 }
2150 else if (strcmp(errors,"replace") == 0) {
2151 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2152 (*dest)++;
2153 return 0;
2154 }
2155 else {
2156 PyErr_Format(PyExc_ValueError,
2157 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002158 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002159 errors);
2160 return -1;
2161 }
2162}
2163
2164PyObject *PyUnicode_DecodeASCII(const char *s,
2165 int size,
2166 const char *errors)
2167{
2168 PyUnicodeObject *v;
2169 Py_UNICODE *p;
2170
2171 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002172 if (size == 1 && *(unsigned char*)s < 128) {
2173 Py_UNICODE r = *(unsigned char*)s;
2174 return PyUnicode_FromUnicode(&r, 1);
2175 }
2176
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177 v = _PyUnicode_New(size);
2178 if (v == NULL)
2179 goto onError;
2180 if (size == 0)
2181 return (PyObject *)v;
2182 p = PyUnicode_AS_UNICODE(v);
2183 while (size-- > 0) {
2184 register unsigned char c;
2185
2186 c = (unsigned char)*s++;
2187 if (c < 128)
2188 *p++ = c;
2189 else if (ascii_decoding_error(&s, &p, errors,
2190 "ordinal not in range(128)"))
2191 goto onError;
2192 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002193 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002194 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002196 return (PyObject *)v;
2197
2198 onError:
2199 Py_XDECREF(v);
2200 return NULL;
2201}
2202
2203static
2204int ascii_encoding_error(const Py_UNICODE **source,
2205 char **dest,
2206 const char *errors,
2207 const char *details)
2208{
2209 if ((errors == NULL) ||
2210 (strcmp(errors,"strict") == 0)) {
2211 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002212 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213 details);
2214 return -1;
2215 }
2216 else if (strcmp(errors,"ignore") == 0) {
2217 return 0;
2218 }
2219 else if (strcmp(errors,"replace") == 0) {
2220 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002221 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 return 0;
2223 }
2224 else {
2225 PyErr_Format(PyExc_ValueError,
2226 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002227 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002228 errors);
2229 return -1;
2230 }
2231}
2232
2233PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2234 int size,
2235 const char *errors)
2236{
2237 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002238 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002239
Guido van Rossumd57fd912000-03-10 22:53:23 +00002240 repr = PyString_FromStringAndSize(NULL, size);
2241 if (repr == NULL)
2242 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002243 if (size == 0)
2244 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245
2246 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002247 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002248 while (size-- > 0) {
2249 Py_UNICODE ch = *p++;
2250 if (ch >= 128) {
2251 if (ascii_encoding_error(&p, &s, errors,
2252 "ordinal not in range(128)"))
2253 goto onError;
2254 }
2255 else
2256 *s++ = (char)ch;
2257 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002258 /* Resize if error handling skipped some characters */
2259 if (s - start < PyString_GET_SIZE(repr))
2260 if (_PyString_Resize(&repr, s - start))
2261 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002262 return repr;
2263
2264 onError:
2265 Py_DECREF(repr);
2266 return NULL;
2267}
2268
2269PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2270{
2271 if (!PyUnicode_Check(unicode)) {
2272 PyErr_BadArgument();
2273 return NULL;
2274 }
2275 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2276 PyUnicode_GET_SIZE(unicode),
2277 NULL);
2278}
2279
Fredrik Lundh30831632001-06-26 15:11:00 +00002280#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002281
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002282/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002283
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002284PyObject *PyUnicode_DecodeMBCS(const char *s,
2285 int size,
2286 const char *errors)
2287{
2288 PyUnicodeObject *v;
2289 Py_UNICODE *p;
2290
2291 /* First get the size of the result */
2292 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002293 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002294 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2295
2296 v = _PyUnicode_New(usize);
2297 if (v == NULL)
2298 return NULL;
2299 if (usize == 0)
2300 return (PyObject *)v;
2301 p = PyUnicode_AS_UNICODE(v);
2302 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2303 Py_DECREF(v);
2304 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2305 }
2306
2307 return (PyObject *)v;
2308}
2309
2310PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2311 int size,
2312 const char *errors)
2313{
2314 PyObject *repr;
2315 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002316 DWORD mbcssize;
2317
2318 /* If there are no characters, bail now! */
2319 if (size==0)
2320 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002321
2322 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002323 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002324 if (mbcssize==0)
2325 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2326
2327 repr = PyString_FromStringAndSize(NULL, mbcssize);
2328 if (repr == NULL)
2329 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002330 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002331 return repr;
2332
2333 /* Do the conversion */
2334 s = PyString_AS_STRING(repr);
2335 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2336 Py_DECREF(repr);
2337 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2338 }
2339 return repr;
2340}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002341
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002342#endif /* MS_WIN32 */
2343
Guido van Rossumd57fd912000-03-10 22:53:23 +00002344/* --- Character Mapping Codec -------------------------------------------- */
2345
2346static
2347int charmap_decoding_error(const char **source,
2348 Py_UNICODE **dest,
2349 const char *errors,
2350 const char *details)
2351{
2352 if ((errors == NULL) ||
2353 (strcmp(errors,"strict") == 0)) {
2354 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002355 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002356 details);
2357 return -1;
2358 }
2359 else if (strcmp(errors,"ignore") == 0) {
2360 return 0;
2361 }
2362 else if (strcmp(errors,"replace") == 0) {
2363 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2364 (*dest)++;
2365 return 0;
2366 }
2367 else {
2368 PyErr_Format(PyExc_ValueError,
2369 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002370 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002371 errors);
2372 return -1;
2373 }
2374}
2375
2376PyObject *PyUnicode_DecodeCharmap(const char *s,
2377 int size,
2378 PyObject *mapping,
2379 const char *errors)
2380{
2381 PyUnicodeObject *v;
2382 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002383 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002384
2385 /* Default to Latin-1 */
2386 if (mapping == NULL)
2387 return PyUnicode_DecodeLatin1(s, size, errors);
2388
2389 v = _PyUnicode_New(size);
2390 if (v == NULL)
2391 goto onError;
2392 if (size == 0)
2393 return (PyObject *)v;
2394 p = PyUnicode_AS_UNICODE(v);
2395 while (size-- > 0) {
2396 unsigned char ch = *s++;
2397 PyObject *w, *x;
2398
2399 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2400 w = PyInt_FromLong((long)ch);
2401 if (w == NULL)
2402 goto onError;
2403 x = PyObject_GetItem(mapping, w);
2404 Py_DECREF(w);
2405 if (x == NULL) {
2406 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002407 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002408 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002409 x = Py_None;
2410 Py_INCREF(x);
2411 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002412 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002413 }
2414
2415 /* Apply mapping */
2416 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002417 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002418 if (value < 0 || value > 65535) {
2419 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002420 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002421 Py_DECREF(x);
2422 goto onError;
2423 }
2424 *p++ = (Py_UNICODE)value;
2425 }
2426 else if (x == Py_None) {
2427 /* undefined mapping */
2428 if (charmap_decoding_error(&s, &p, errors,
2429 "character maps to <undefined>")) {
2430 Py_DECREF(x);
2431 goto onError;
2432 }
2433 }
2434 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002435 int targetsize = PyUnicode_GET_SIZE(x);
2436
2437 if (targetsize == 1)
2438 /* 1-1 mapping */
2439 *p++ = *PyUnicode_AS_UNICODE(x);
2440
2441 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002442 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002443 if (targetsize > extrachars) {
2444 /* resize first */
2445 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2446 int needed = (targetsize - extrachars) + \
2447 (targetsize << 2);
2448 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002449 if (_PyUnicode_Resize(&v,
2450 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002451 Py_DECREF(x);
2452 goto onError;
2453 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002454 p = PyUnicode_AS_UNICODE(v) + oldpos;
2455 }
2456 Py_UNICODE_COPY(p,
2457 PyUnicode_AS_UNICODE(x),
2458 targetsize);
2459 p += targetsize;
2460 extrachars -= targetsize;
2461 }
2462 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002463 }
2464 else {
2465 /* wrong return value */
2466 PyErr_SetString(PyExc_TypeError,
2467 "character mapping must return integer, None or unicode");
2468 Py_DECREF(x);
2469 goto onError;
2470 }
2471 Py_DECREF(x);
2472 }
2473 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002474 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002475 goto onError;
2476 return (PyObject *)v;
2477
2478 onError:
2479 Py_XDECREF(v);
2480 return NULL;
2481}
2482
2483static
2484int charmap_encoding_error(const Py_UNICODE **source,
2485 char **dest,
2486 const char *errors,
2487 const char *details)
2488{
2489 if ((errors == NULL) ||
2490 (strcmp(errors,"strict") == 0)) {
2491 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002492 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002493 details);
2494 return -1;
2495 }
2496 else if (strcmp(errors,"ignore") == 0) {
2497 return 0;
2498 }
2499 else if (strcmp(errors,"replace") == 0) {
2500 **dest = '?';
2501 (*dest)++;
2502 return 0;
2503 }
2504 else {
2505 PyErr_Format(PyExc_ValueError,
2506 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002507 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002508 errors);
2509 return -1;
2510 }
2511}
2512
2513PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2514 int size,
2515 PyObject *mapping,
2516 const char *errors)
2517{
2518 PyObject *v;
2519 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002520 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521
2522 /* Default to Latin-1 */
2523 if (mapping == NULL)
2524 return PyUnicode_EncodeLatin1(p, size, errors);
2525
2526 v = PyString_FromStringAndSize(NULL, size);
2527 if (v == NULL)
2528 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002529 if (size == 0)
2530 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531 s = PyString_AS_STRING(v);
2532 while (size-- > 0) {
2533 Py_UNICODE ch = *p++;
2534 PyObject *w, *x;
2535
2536 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2537 w = PyInt_FromLong((long)ch);
2538 if (w == NULL)
2539 goto onError;
2540 x = PyObject_GetItem(mapping, w);
2541 Py_DECREF(w);
2542 if (x == NULL) {
2543 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002544 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002546 x = Py_None;
2547 Py_INCREF(x);
2548 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002549 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550 }
2551
2552 /* Apply mapping */
2553 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002554 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555 if (value < 0 || value > 255) {
2556 PyErr_SetString(PyExc_TypeError,
2557 "character mapping must be in range(256)");
2558 Py_DECREF(x);
2559 goto onError;
2560 }
2561 *s++ = (char)value;
2562 }
2563 else if (x == Py_None) {
2564 /* undefined mapping */
2565 if (charmap_encoding_error(&p, &s, errors,
2566 "character maps to <undefined>")) {
2567 Py_DECREF(x);
2568 goto onError;
2569 }
2570 }
2571 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002572 int targetsize = PyString_GET_SIZE(x);
2573
2574 if (targetsize == 1)
2575 /* 1-1 mapping */
2576 *s++ = *PyString_AS_STRING(x);
2577
2578 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002579 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002580 if (targetsize > extrachars) {
2581 /* resize first */
2582 int oldpos = (int)(s - PyString_AS_STRING(v));
2583 int needed = (targetsize - extrachars) + \
2584 (targetsize << 2);
2585 extrachars += needed;
2586 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002587 Py_DECREF(x);
2588 goto onError;
2589 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002590 s = PyString_AS_STRING(v) + oldpos;
2591 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002592 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002593 s += targetsize;
2594 extrachars -= targetsize;
2595 }
2596 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002597 }
2598 else {
2599 /* wrong return value */
2600 PyErr_SetString(PyExc_TypeError,
2601 "character mapping must return integer, None or unicode");
2602 Py_DECREF(x);
2603 goto onError;
2604 }
2605 Py_DECREF(x);
2606 }
2607 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2608 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2609 goto onError;
2610 return v;
2611
2612 onError:
2613 Py_DECREF(v);
2614 return NULL;
2615}
2616
2617PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2618 PyObject *mapping)
2619{
2620 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2621 PyErr_BadArgument();
2622 return NULL;
2623 }
2624 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2625 PyUnicode_GET_SIZE(unicode),
2626 mapping,
2627 NULL);
2628}
2629
2630static
2631int translate_error(const Py_UNICODE **source,
2632 Py_UNICODE **dest,
2633 const char *errors,
2634 const char *details)
2635{
2636 if ((errors == NULL) ||
2637 (strcmp(errors,"strict") == 0)) {
2638 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002639 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002640 details);
2641 return -1;
2642 }
2643 else if (strcmp(errors,"ignore") == 0) {
2644 return 0;
2645 }
2646 else if (strcmp(errors,"replace") == 0) {
2647 **dest = '?';
2648 (*dest)++;
2649 return 0;
2650 }
2651 else {
2652 PyErr_Format(PyExc_ValueError,
2653 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002654 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002655 errors);
2656 return -1;
2657 }
2658}
2659
2660PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2661 int size,
2662 PyObject *mapping,
2663 const char *errors)
2664{
2665 PyUnicodeObject *v;
2666 Py_UNICODE *p;
2667
2668 if (mapping == NULL) {
2669 PyErr_BadArgument();
2670 return NULL;
2671 }
2672
2673 /* Output will never be longer than input */
2674 v = _PyUnicode_New(size);
2675 if (v == NULL)
2676 goto onError;
2677 if (size == 0)
2678 goto done;
2679 p = PyUnicode_AS_UNICODE(v);
2680 while (size-- > 0) {
2681 Py_UNICODE ch = *s++;
2682 PyObject *w, *x;
2683
2684 /* Get mapping */
2685 w = PyInt_FromLong(ch);
2686 if (w == NULL)
2687 goto onError;
2688 x = PyObject_GetItem(mapping, w);
2689 Py_DECREF(w);
2690 if (x == NULL) {
2691 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2692 /* No mapping found: default to 1-1 mapping */
2693 PyErr_Clear();
2694 *p++ = ch;
2695 continue;
2696 }
2697 goto onError;
2698 }
2699
2700 /* Apply mapping */
2701 if (PyInt_Check(x))
2702 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2703 else if (x == Py_None) {
2704 /* undefined mapping */
2705 if (translate_error(&s, &p, errors,
2706 "character maps to <undefined>")) {
2707 Py_DECREF(x);
2708 goto onError;
2709 }
2710 }
2711 else if (PyUnicode_Check(x)) {
2712 if (PyUnicode_GET_SIZE(x) != 1) {
2713 /* 1-n mapping */
2714 PyErr_SetString(PyExc_NotImplementedError,
2715 "1-n mappings are currently not implemented");
2716 Py_DECREF(x);
2717 goto onError;
2718 }
2719 *p++ = *PyUnicode_AS_UNICODE(x);
2720 }
2721 else {
2722 /* wrong return value */
2723 PyErr_SetString(PyExc_TypeError,
2724 "translate mapping must return integer, None or unicode");
2725 Py_DECREF(x);
2726 goto onError;
2727 }
2728 Py_DECREF(x);
2729 }
2730 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002731 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002732 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002733
2734 done:
2735 return (PyObject *)v;
2736
2737 onError:
2738 Py_XDECREF(v);
2739 return NULL;
2740}
2741
2742PyObject *PyUnicode_Translate(PyObject *str,
2743 PyObject *mapping,
2744 const char *errors)
2745{
2746 PyObject *result;
2747
2748 str = PyUnicode_FromObject(str);
2749 if (str == NULL)
2750 goto onError;
2751 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2752 PyUnicode_GET_SIZE(str),
2753 mapping,
2754 errors);
2755 Py_DECREF(str);
2756 return result;
2757
2758 onError:
2759 Py_XDECREF(str);
2760 return NULL;
2761}
2762
Guido van Rossum9e896b32000-04-05 20:11:21 +00002763/* --- Decimal Encoder ---------------------------------------------------- */
2764
2765int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2766 int length,
2767 char *output,
2768 const char *errors)
2769{
2770 Py_UNICODE *p, *end;
2771
2772 if (output == NULL) {
2773 PyErr_BadArgument();
2774 return -1;
2775 }
2776
2777 p = s;
2778 end = s + length;
2779 while (p < end) {
2780 register Py_UNICODE ch = *p++;
2781 int decimal;
2782
2783 if (Py_UNICODE_ISSPACE(ch)) {
2784 *output++ = ' ';
2785 continue;
2786 }
2787 decimal = Py_UNICODE_TODECIMAL(ch);
2788 if (decimal >= 0) {
2789 *output++ = '0' + decimal;
2790 continue;
2791 }
Guido van Rossumba477042000-04-06 18:18:10 +00002792 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002793 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002794 continue;
2795 }
2796 /* All other characters are considered invalid */
2797 if (errors == NULL || strcmp(errors, "strict") == 0) {
2798 PyErr_SetString(PyExc_ValueError,
2799 "invalid decimal Unicode string");
2800 goto onError;
2801 }
2802 else if (strcmp(errors, "ignore") == 0)
2803 continue;
2804 else if (strcmp(errors, "replace") == 0) {
2805 *output++ = '?';
2806 continue;
2807 }
2808 }
2809 /* 0-terminate the output string */
2810 *output++ = '\0';
2811 return 0;
2812
2813 onError:
2814 return -1;
2815}
2816
Guido van Rossumd57fd912000-03-10 22:53:23 +00002817/* --- Helpers ------------------------------------------------------------ */
2818
2819static
2820int count(PyUnicodeObject *self,
2821 int start,
2822 int end,
2823 PyUnicodeObject *substring)
2824{
2825 int count = 0;
2826
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002827 if (start < 0)
2828 start += self->length;
2829 if (start < 0)
2830 start = 0;
2831 if (end > self->length)
2832 end = self->length;
2833 if (end < 0)
2834 end += self->length;
2835 if (end < 0)
2836 end = 0;
2837
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002838 if (substring->length == 0)
2839 return (end - start + 1);
2840
Guido van Rossumd57fd912000-03-10 22:53:23 +00002841 end -= substring->length;
2842
2843 while (start <= end)
2844 if (Py_UNICODE_MATCH(self, start, substring)) {
2845 count++;
2846 start += substring->length;
2847 } else
2848 start++;
2849
2850 return count;
2851}
2852
2853int PyUnicode_Count(PyObject *str,
2854 PyObject *substr,
2855 int start,
2856 int end)
2857{
2858 int result;
2859
2860 str = PyUnicode_FromObject(str);
2861 if (str == NULL)
2862 return -1;
2863 substr = PyUnicode_FromObject(substr);
2864 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002865 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866 return -1;
2867 }
2868
2869 result = count((PyUnicodeObject *)str,
2870 start, end,
2871 (PyUnicodeObject *)substr);
2872
2873 Py_DECREF(str);
2874 Py_DECREF(substr);
2875 return result;
2876}
2877
2878static
2879int findstring(PyUnicodeObject *self,
2880 PyUnicodeObject *substring,
2881 int start,
2882 int end,
2883 int direction)
2884{
2885 if (start < 0)
2886 start += self->length;
2887 if (start < 0)
2888 start = 0;
2889
2890 if (substring->length == 0)
2891 return start;
2892
2893 if (end > self->length)
2894 end = self->length;
2895 if (end < 0)
2896 end += self->length;
2897 if (end < 0)
2898 end = 0;
2899
2900 end -= substring->length;
2901
2902 if (direction < 0) {
2903 for (; end >= start; end--)
2904 if (Py_UNICODE_MATCH(self, end, substring))
2905 return end;
2906 } else {
2907 for (; start <= end; start++)
2908 if (Py_UNICODE_MATCH(self, start, substring))
2909 return start;
2910 }
2911
2912 return -1;
2913}
2914
2915int PyUnicode_Find(PyObject *str,
2916 PyObject *substr,
2917 int start,
2918 int end,
2919 int direction)
2920{
2921 int result;
2922
2923 str = PyUnicode_FromObject(str);
2924 if (str == NULL)
2925 return -1;
2926 substr = PyUnicode_FromObject(substr);
2927 if (substr == NULL) {
2928 Py_DECREF(substr);
2929 return -1;
2930 }
2931
2932 result = findstring((PyUnicodeObject *)str,
2933 (PyUnicodeObject *)substr,
2934 start, end, direction);
2935 Py_DECREF(str);
2936 Py_DECREF(substr);
2937 return result;
2938}
2939
2940static
2941int tailmatch(PyUnicodeObject *self,
2942 PyUnicodeObject *substring,
2943 int start,
2944 int end,
2945 int direction)
2946{
2947 if (start < 0)
2948 start += self->length;
2949 if (start < 0)
2950 start = 0;
2951
2952 if (substring->length == 0)
2953 return 1;
2954
2955 if (end > self->length)
2956 end = self->length;
2957 if (end < 0)
2958 end += self->length;
2959 if (end < 0)
2960 end = 0;
2961
2962 end -= substring->length;
2963 if (end < start)
2964 return 0;
2965
2966 if (direction > 0) {
2967 if (Py_UNICODE_MATCH(self, end, substring))
2968 return 1;
2969 } else {
2970 if (Py_UNICODE_MATCH(self, start, substring))
2971 return 1;
2972 }
2973
2974 return 0;
2975}
2976
2977int PyUnicode_Tailmatch(PyObject *str,
2978 PyObject *substr,
2979 int start,
2980 int end,
2981 int direction)
2982{
2983 int result;
2984
2985 str = PyUnicode_FromObject(str);
2986 if (str == NULL)
2987 return -1;
2988 substr = PyUnicode_FromObject(substr);
2989 if (substr == NULL) {
2990 Py_DECREF(substr);
2991 return -1;
2992 }
2993
2994 result = tailmatch((PyUnicodeObject *)str,
2995 (PyUnicodeObject *)substr,
2996 start, end, direction);
2997 Py_DECREF(str);
2998 Py_DECREF(substr);
2999 return result;
3000}
3001
3002static
3003const Py_UNICODE *findchar(const Py_UNICODE *s,
3004 int size,
3005 Py_UNICODE ch)
3006{
3007 /* like wcschr, but doesn't stop at NULL characters */
3008
3009 while (size-- > 0) {
3010 if (*s == ch)
3011 return s;
3012 s++;
3013 }
3014
3015 return NULL;
3016}
3017
3018/* Apply fixfct filter to the Unicode object self and return a
3019 reference to the modified object */
3020
3021static
3022PyObject *fixup(PyUnicodeObject *self,
3023 int (*fixfct)(PyUnicodeObject *s))
3024{
3025
3026 PyUnicodeObject *u;
3027
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003028 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003029 if (u == NULL)
3030 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003031
3032 Py_UNICODE_COPY(u->str, self->str, self->length);
3033
Tim Peters7a29bd52001-09-12 03:03:31 +00003034 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003035 /* fixfct should return TRUE if it modified the buffer. If
3036 FALSE, return a reference to the original buffer instead
3037 (to save space, not time) */
3038 Py_INCREF(self);
3039 Py_DECREF(u);
3040 return (PyObject*) self;
3041 }
3042 return (PyObject*) u;
3043}
3044
3045static
3046int fixupper(PyUnicodeObject *self)
3047{
3048 int len = self->length;
3049 Py_UNICODE *s = self->str;
3050 int status = 0;
3051
3052 while (len-- > 0) {
3053 register Py_UNICODE ch;
3054
3055 ch = Py_UNICODE_TOUPPER(*s);
3056 if (ch != *s) {
3057 status = 1;
3058 *s = ch;
3059 }
3060 s++;
3061 }
3062
3063 return status;
3064}
3065
3066static
3067int fixlower(PyUnicodeObject *self)
3068{
3069 int len = self->length;
3070 Py_UNICODE *s = self->str;
3071 int status = 0;
3072
3073 while (len-- > 0) {
3074 register Py_UNICODE ch;
3075
3076 ch = Py_UNICODE_TOLOWER(*s);
3077 if (ch != *s) {
3078 status = 1;
3079 *s = ch;
3080 }
3081 s++;
3082 }
3083
3084 return status;
3085}
3086
3087static
3088int fixswapcase(PyUnicodeObject *self)
3089{
3090 int len = self->length;
3091 Py_UNICODE *s = self->str;
3092 int status = 0;
3093
3094 while (len-- > 0) {
3095 if (Py_UNICODE_ISUPPER(*s)) {
3096 *s = Py_UNICODE_TOLOWER(*s);
3097 status = 1;
3098 } else if (Py_UNICODE_ISLOWER(*s)) {
3099 *s = Py_UNICODE_TOUPPER(*s);
3100 status = 1;
3101 }
3102 s++;
3103 }
3104
3105 return status;
3106}
3107
3108static
3109int fixcapitalize(PyUnicodeObject *self)
3110{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003111 int len = self->length;
3112 Py_UNICODE *s = self->str;
3113 int status = 0;
3114
3115 if (len == 0)
3116 return 0;
3117 if (Py_UNICODE_ISLOWER(*s)) {
3118 *s = Py_UNICODE_TOUPPER(*s);
3119 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003120 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003121 s++;
3122 while (--len > 0) {
3123 if (Py_UNICODE_ISUPPER(*s)) {
3124 *s = Py_UNICODE_TOLOWER(*s);
3125 status = 1;
3126 }
3127 s++;
3128 }
3129 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003130}
3131
3132static
3133int fixtitle(PyUnicodeObject *self)
3134{
3135 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3136 register Py_UNICODE *e;
3137 int previous_is_cased;
3138
3139 /* Shortcut for single character strings */
3140 if (PyUnicode_GET_SIZE(self) == 1) {
3141 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3142 if (*p != ch) {
3143 *p = ch;
3144 return 1;
3145 }
3146 else
3147 return 0;
3148 }
3149
3150 e = p + PyUnicode_GET_SIZE(self);
3151 previous_is_cased = 0;
3152 for (; p < e; p++) {
3153 register const Py_UNICODE ch = *p;
3154
3155 if (previous_is_cased)
3156 *p = Py_UNICODE_TOLOWER(ch);
3157 else
3158 *p = Py_UNICODE_TOTITLE(ch);
3159
3160 if (Py_UNICODE_ISLOWER(ch) ||
3161 Py_UNICODE_ISUPPER(ch) ||
3162 Py_UNICODE_ISTITLE(ch))
3163 previous_is_cased = 1;
3164 else
3165 previous_is_cased = 0;
3166 }
3167 return 1;
3168}
3169
3170PyObject *PyUnicode_Join(PyObject *separator,
3171 PyObject *seq)
3172{
3173 Py_UNICODE *sep;
3174 int seplen;
3175 PyUnicodeObject *res = NULL;
3176 int reslen = 0;
3177 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003178 int sz = 100;
3179 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003180 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003181
Tim Peters2cfe3682001-05-05 05:36:48 +00003182 it = PyObject_GetIter(seq);
3183 if (it == NULL)
3184 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185
3186 if (separator == NULL) {
3187 Py_UNICODE blank = ' ';
3188 sep = &blank;
3189 seplen = 1;
3190 }
3191 else {
3192 separator = PyUnicode_FromObject(separator);
3193 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003194 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003195 sep = PyUnicode_AS_UNICODE(separator);
3196 seplen = PyUnicode_GET_SIZE(separator);
3197 }
3198
3199 res = _PyUnicode_New(sz);
3200 if (res == NULL)
3201 goto onError;
3202 p = PyUnicode_AS_UNICODE(res);
3203 reslen = 0;
3204
Tim Peters2cfe3682001-05-05 05:36:48 +00003205 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003207 PyObject *item = PyIter_Next(it);
3208 if (item == NULL) {
3209 if (PyErr_Occurred())
3210 goto onError;
3211 break;
3212 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003213 if (!PyUnicode_Check(item)) {
3214 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003215 if (!PyString_Check(item)) {
3216 PyErr_Format(PyExc_TypeError,
3217 "sequence item %i: expected string or Unicode,"
3218 " %.80s found",
3219 i, item->ob_type->tp_name);
3220 Py_DECREF(item);
3221 goto onError;
3222 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003223 v = PyUnicode_FromObject(item);
3224 Py_DECREF(item);
3225 item = v;
3226 if (item == NULL)
3227 goto onError;
3228 }
3229 itemlen = PyUnicode_GET_SIZE(item);
3230 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003231 if (_PyUnicode_Resize(&res, sz*2)) {
3232 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003233 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003234 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003235 sz *= 2;
3236 p = PyUnicode_AS_UNICODE(res) + reslen;
3237 }
3238 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003239 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240 p += seplen;
3241 reslen += seplen;
3242 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003243 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 p += itemlen;
3245 reslen += itemlen;
3246 Py_DECREF(item);
3247 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003248 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249 goto onError;
3250
3251 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003252 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253 return (PyObject *)res;
3254
3255 onError:
3256 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003257 Py_XDECREF(res);
3258 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003259 return NULL;
3260}
3261
3262static
3263PyUnicodeObject *pad(PyUnicodeObject *self,
3264 int left,
3265 int right,
3266 Py_UNICODE fill)
3267{
3268 PyUnicodeObject *u;
3269
3270 if (left < 0)
3271 left = 0;
3272 if (right < 0)
3273 right = 0;
3274
Tim Peters7a29bd52001-09-12 03:03:31 +00003275 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003276 Py_INCREF(self);
3277 return self;
3278 }
3279
3280 u = _PyUnicode_New(left + self->length + right);
3281 if (u) {
3282 if (left)
3283 Py_UNICODE_FILL(u->str, fill, left);
3284 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3285 if (right)
3286 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3287 }
3288
3289 return u;
3290}
3291
/* Append the slice data[left:right] to the result list as a new
   Unicode object.

   The macro relies on names from the expanding function: the variables
   `str' (PyObject *, scratch) and `list' (the result list), and a label
   `onError' that is jumped to when creating or appending the slice
   fails.  It expands to a single statement and must be followed by a
   semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3302
3303static
3304PyObject *split_whitespace(PyUnicodeObject *self,
3305 PyObject *list,
3306 int maxcount)
3307{
3308 register int i;
3309 register int j;
3310 int len = self->length;
3311 PyObject *str;
3312
3313 for (i = j = 0; i < len; ) {
3314 /* find a token */
3315 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3316 i++;
3317 j = i;
3318 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3319 i++;
3320 if (j < i) {
3321 if (maxcount-- <= 0)
3322 break;
3323 SPLIT_APPEND(self->str, j, i);
3324 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3325 i++;
3326 j = i;
3327 }
3328 }
3329 if (j < len) {
3330 SPLIT_APPEND(self->str, j, len);
3331 }
3332 return list;
3333
3334 onError:
3335 Py_DECREF(list);
3336 return NULL;
3337}
3338
3339PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003340 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003341{
3342 register int i;
3343 register int j;
3344 int len;
3345 PyObject *list;
3346 PyObject *str;
3347 Py_UNICODE *data;
3348
3349 string = PyUnicode_FromObject(string);
3350 if (string == NULL)
3351 return NULL;
3352 data = PyUnicode_AS_UNICODE(string);
3353 len = PyUnicode_GET_SIZE(string);
3354
Guido van Rossumd57fd912000-03-10 22:53:23 +00003355 list = PyList_New(0);
3356 if (!list)
3357 goto onError;
3358
3359 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003360 int eol;
3361
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362 /* Find a line and append it */
3363 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3364 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003365
3366 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003367 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003368 if (i < len) {
3369 if (data[i] == '\r' && i + 1 < len &&
3370 data[i+1] == '\n')
3371 i += 2;
3372 else
3373 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003374 if (keepends)
3375 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003376 }
Guido van Rossum86662912000-04-11 15:38:46 +00003377 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378 j = i;
3379 }
3380 if (j < len) {
3381 SPLIT_APPEND(data, j, len);
3382 }
3383
3384 Py_DECREF(string);
3385 return list;
3386
3387 onError:
3388 Py_DECREF(list);
3389 Py_DECREF(string);
3390 return NULL;
3391}
3392
3393static
3394PyObject *split_char(PyUnicodeObject *self,
3395 PyObject *list,
3396 Py_UNICODE ch,
3397 int maxcount)
3398{
3399 register int i;
3400 register int j;
3401 int len = self->length;
3402 PyObject *str;
3403
3404 for (i = j = 0; i < len; ) {
3405 if (self->str[i] == ch) {
3406 if (maxcount-- <= 0)
3407 break;
3408 SPLIT_APPEND(self->str, j, i);
3409 i = j = i + 1;
3410 } else
3411 i++;
3412 }
3413 if (j <= len) {
3414 SPLIT_APPEND(self->str, j, len);
3415 }
3416 return list;
3417
3418 onError:
3419 Py_DECREF(list);
3420 return NULL;
3421}
3422
3423static
3424PyObject *split_substring(PyUnicodeObject *self,
3425 PyObject *list,
3426 PyUnicodeObject *substring,
3427 int maxcount)
3428{
3429 register int i;
3430 register int j;
3431 int len = self->length;
3432 int sublen = substring->length;
3433 PyObject *str;
3434
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003435 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003436 if (Py_UNICODE_MATCH(self, i, substring)) {
3437 if (maxcount-- <= 0)
3438 break;
3439 SPLIT_APPEND(self->str, j, i);
3440 i = j = i + sublen;
3441 } else
3442 i++;
3443 }
3444 if (j <= len) {
3445 SPLIT_APPEND(self->str, j, len);
3446 }
3447 return list;
3448
3449 onError:
3450 Py_DECREF(list);
3451 return NULL;
3452}
3453
3454#undef SPLIT_APPEND
3455
3456static
3457PyObject *split(PyUnicodeObject *self,
3458 PyUnicodeObject *substring,
3459 int maxcount)
3460{
3461 PyObject *list;
3462
3463 if (maxcount < 0)
3464 maxcount = INT_MAX;
3465
3466 list = PyList_New(0);
3467 if (!list)
3468 return NULL;
3469
3470 if (substring == NULL)
3471 return split_whitespace(self,list,maxcount);
3472
3473 else if (substring->length == 1)
3474 return split_char(self,list,substring->str[0],maxcount);
3475
3476 else if (substring->length == 0) {
3477 Py_DECREF(list);
3478 PyErr_SetString(PyExc_ValueError, "empty separator");
3479 return NULL;
3480 }
3481 else
3482 return split_substring(self,list,substring,maxcount);
3483}
3484
3485static
3486PyObject *strip(PyUnicodeObject *self,
3487 int left,
3488 int right)
3489{
3490 Py_UNICODE *p = self->str;
3491 int start = 0;
3492 int end = self->length;
3493
3494 if (left)
3495 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3496 start++;
3497
3498 if (right)
3499 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3500 end--;
3501
Tim Peters7a29bd52001-09-12 03:03:31 +00003502 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003503 /* couldn't strip anything off, return original string */
3504 Py_INCREF(self);
3505 return (PyObject*) self;
3506 }
3507
3508 return (PyObject*) PyUnicode_FromUnicode(
3509 self->str + start,
3510 end - start
3511 );
3512}
3513
3514static
3515PyObject *replace(PyUnicodeObject *self,
3516 PyUnicodeObject *str1,
3517 PyUnicodeObject *str2,
3518 int maxcount)
3519{
3520 PyUnicodeObject *u;
3521
3522 if (maxcount < 0)
3523 maxcount = INT_MAX;
3524
3525 if (str1->length == 1 && str2->length == 1) {
3526 int i;
3527
3528 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003529 if (!findchar(self->str, self->length, str1->str[0]) &&
3530 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003531 /* nothing to replace, return original string */
3532 Py_INCREF(self);
3533 u = self;
3534 } else {
3535 Py_UNICODE u1 = str1->str[0];
3536 Py_UNICODE u2 = str2->str[0];
3537
3538 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003539 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003540 self->length
3541 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003542 if (u != NULL) {
3543 Py_UNICODE_COPY(u->str, self->str,
3544 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003545 for (i = 0; i < u->length; i++)
3546 if (u->str[i] == u1) {
3547 if (--maxcount < 0)
3548 break;
3549 u->str[i] = u2;
3550 }
3551 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003552 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003553
3554 } else {
3555 int n, i;
3556 Py_UNICODE *p;
3557
3558 /* replace strings */
3559 n = count(self, 0, self->length, str1);
3560 if (n > maxcount)
3561 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003562 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003563 /* nothing to replace, return original string */
3564 Py_INCREF(self);
3565 u = self;
3566 } else {
3567 u = _PyUnicode_New(
3568 self->length + n * (str2->length - str1->length));
3569 if (u) {
3570 i = 0;
3571 p = u->str;
3572 while (i <= self->length - str1->length)
3573 if (Py_UNICODE_MATCH(self, i, str1)) {
3574 /* replace string segment */
3575 Py_UNICODE_COPY(p, str2->str, str2->length);
3576 p += str2->length;
3577 i += str1->length;
3578 if (--n <= 0) {
3579 /* copy remaining part */
3580 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3581 break;
3582 }
3583 } else
3584 *p++ = self->str[i++];
3585 }
3586 }
3587 }
3588
3589 return (PyObject *) u;
3590}
3591
3592/* --- Unicode Object Methods --------------------------------------------- */
3593
3594static char title__doc__[] =
3595"S.title() -> unicode\n\
3596\n\
3597Return a titlecased version of S, i.e. words start with title case\n\
3598characters, all remaining cased characters have lower case.";
3599
3600static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003601unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003602{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003603 return fixup(self, fixtitle);
3604}
3605
3606static char capitalize__doc__[] =
3607"S.capitalize() -> unicode\n\
3608\n\
3609Return a capitalized version of S, i.e. make the first character\n\
3610have upper case.";
3611
3612static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003613unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003615 return fixup(self, fixcapitalize);
3616}
3617
#if 0
/* Disabled: this section is compiled out. */
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords(): split self at whitespace, capitalize every word and
   join the words again with single blanks.  Returns NULL on error. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
3655
3656static char center__doc__[] =
3657"S.center(width) -> unicode\n\
3658\n\
3659Return S centered in a Unicode string of length width. Padding is done\n\
3660using spaces.";
3661
3662static PyObject *
3663unicode_center(PyUnicodeObject *self, PyObject *args)
3664{
3665 int marg, left;
3666 int width;
3667
3668 if (!PyArg_ParseTuple(args, "i:center", &width))
3669 return NULL;
3670
Tim Peters7a29bd52001-09-12 03:03:31 +00003671 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003672 Py_INCREF(self);
3673 return (PyObject*) self;
3674 }
3675
3676 marg = width - self->length;
3677 left = marg / 2 + (marg & width & 1);
3678
3679 return (PyObject*) pad(self, left, marg - left, ' ');
3680}
3681
Marc-André Lemburge5034372000-08-08 08:04:29 +00003682#if 0
3683
3684/* This code should go into some future Unicode collation support
3685 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003686 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003687
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003688/* speedy UTF-16 code point order comparison */
3689/* gleaned from: */
3690/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3691
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003692static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003693{
3694 0, 0, 0, 0, 0, 0, 0, 0,
3695 0, 0, 0, 0, 0, 0, 0, 0,
3696 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003697 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003698};
3699
Guido van Rossumd57fd912000-03-10 22:53:23 +00003700static int
3701unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3702{
3703 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003704
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705 Py_UNICODE *s1 = str1->str;
3706 Py_UNICODE *s2 = str2->str;
3707
3708 len1 = str1->length;
3709 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003710
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003712 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003713
3714 c1 = *s1++;
3715 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003716
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003717 if (c1 > (1<<11) * 26)
3718 c1 += utf16Fixup[c1>>11];
3719 if (c2 > (1<<11) * 26)
3720 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003721 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003722
3723 if (c1 != c2)
3724 return (c1 < c2) ? -1 : 1;
3725
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003726 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003727 }
3728
3729 return (len1 < len2) ? -1 : (len1 != len2);
3730}
3731
Marc-André Lemburge5034372000-08-08 08:04:29 +00003732#else
3733
3734static int
3735unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3736{
3737 register int len1, len2;
3738
3739 Py_UNICODE *s1 = str1->str;
3740 Py_UNICODE *s2 = str2->str;
3741
3742 len1 = str1->length;
3743 len2 = str2->length;
3744
3745 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003746 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003747
Fredrik Lundh45714e92001-06-26 16:39:36 +00003748 c1 = *s1++;
3749 c2 = *s2++;
3750
3751 if (c1 != c2)
3752 return (c1 < c2) ? -1 : 1;
3753
Marc-André Lemburge5034372000-08-08 08:04:29 +00003754 len1--; len2--;
3755 }
3756
3757 return (len1 < len2) ? -1 : (len1 != len2);
3758}
3759
3760#endif
3761
Guido van Rossumd57fd912000-03-10 22:53:23 +00003762int PyUnicode_Compare(PyObject *left,
3763 PyObject *right)
3764{
3765 PyUnicodeObject *u = NULL, *v = NULL;
3766 int result;
3767
3768 /* Coerce the two arguments */
3769 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3770 if (u == NULL)
3771 goto onError;
3772 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3773 if (v == NULL)
3774 goto onError;
3775
Thomas Wouters7e474022000-07-16 12:04:32 +00003776 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777 if (v == u) {
3778 Py_DECREF(u);
3779 Py_DECREF(v);
3780 return 0;
3781 }
3782
3783 result = unicode_compare(u, v);
3784
3785 Py_DECREF(u);
3786 Py_DECREF(v);
3787 return result;
3788
3789onError:
3790 Py_XDECREF(u);
3791 Py_XDECREF(v);
3792 return -1;
3793}
3794
Guido van Rossum403d68b2000-03-13 15:55:09 +00003795int PyUnicode_Contains(PyObject *container,
3796 PyObject *element)
3797{
3798 PyUnicodeObject *u = NULL, *v = NULL;
3799 int result;
3800 register const Py_UNICODE *p, *e;
3801 register Py_UNICODE ch;
3802
3803 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003804 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003805 if (v == NULL) {
3806 PyErr_SetString(PyExc_TypeError,
3807 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003808 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003809 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003810 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3811 if (u == NULL) {
3812 Py_DECREF(v);
3813 goto onError;
3814 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003815
3816 /* Check v in u */
3817 if (PyUnicode_GET_SIZE(v) != 1) {
3818 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003819 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003820 goto onError;
3821 }
3822 ch = *PyUnicode_AS_UNICODE(v);
3823 p = PyUnicode_AS_UNICODE(u);
3824 e = p + PyUnicode_GET_SIZE(u);
3825 result = 0;
3826 while (p < e) {
3827 if (*p++ == ch) {
3828 result = 1;
3829 break;
3830 }
3831 }
3832
3833 Py_DECREF(u);
3834 Py_DECREF(v);
3835 return result;
3836
3837onError:
3838 Py_XDECREF(u);
3839 Py_XDECREF(v);
3840 return -1;
3841}
3842
Guido van Rossumd57fd912000-03-10 22:53:23 +00003843/* Concat to string or Unicode object giving a new Unicode object. */
3844
3845PyObject *PyUnicode_Concat(PyObject *left,
3846 PyObject *right)
3847{
3848 PyUnicodeObject *u = NULL, *v = NULL, *w;
3849
3850 /* Coerce the two arguments */
3851 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3852 if (u == NULL)
3853 goto onError;
3854 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3855 if (v == NULL)
3856 goto onError;
3857
3858 /* Shortcuts */
3859 if (v == unicode_empty) {
3860 Py_DECREF(v);
3861 return (PyObject *)u;
3862 }
3863 if (u == unicode_empty) {
3864 Py_DECREF(u);
3865 return (PyObject *)v;
3866 }
3867
3868 /* Concat the two Unicode strings */
3869 w = _PyUnicode_New(u->length + v->length);
3870 if (w == NULL)
3871 goto onError;
3872 Py_UNICODE_COPY(w->str, u->str, u->length);
3873 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3874
3875 Py_DECREF(u);
3876 Py_DECREF(v);
3877 return (PyObject *)w;
3878
3879onError:
3880 Py_XDECREF(u);
3881 Py_XDECREF(v);
3882 return NULL;
3883}
3884
3885static char count__doc__[] =
3886"S.count(sub[, start[, end]]) -> int\n\
3887\n\
3888Return the number of occurrences of substring sub in Unicode string\n\
3889S[start:end]. Optional arguments start and end are\n\
3890interpreted as in slice notation.";
3891
3892static PyObject *
3893unicode_count(PyUnicodeObject *self, PyObject *args)
3894{
3895 PyUnicodeObject *substring;
3896 int start = 0;
3897 int end = INT_MAX;
3898 PyObject *result;
3899
Guido van Rossumb8872e62000-05-09 14:14:27 +00003900 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3901 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003902 return NULL;
3903
3904 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3905 (PyObject *)substring);
3906 if (substring == NULL)
3907 return NULL;
3908
Guido van Rossumd57fd912000-03-10 22:53:23 +00003909 if (start < 0)
3910 start += self->length;
3911 if (start < 0)
3912 start = 0;
3913 if (end > self->length)
3914 end = self->length;
3915 if (end < 0)
3916 end += self->length;
3917 if (end < 0)
3918 end = 0;
3919
3920 result = PyInt_FromLong((long) count(self, start, end, substring));
3921
3922 Py_DECREF(substring);
3923 return result;
3924}
3925
3926static char encode__doc__[] =
3927"S.encode([encoding[,errors]]) -> string\n\
3928\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003929Return an encoded string version of S. Default encoding is the current\n\
3930default string encoding. errors may be given to set a different error\n\
3931handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3932a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933
3934static PyObject *
3935unicode_encode(PyUnicodeObject *self, PyObject *args)
3936{
3937 char *encoding = NULL;
3938 char *errors = NULL;
3939 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3940 return NULL;
3941 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3942}
3943
3944static char expandtabs__doc__[] =
3945"S.expandtabs([tabsize]) -> unicode\n\
3946\n\
3947Return a copy of S where all tab characters are expanded using spaces.\n\
3948If tabsize is not given, a tab size of 8 characters is assumed.";
3949
3950static PyObject*
3951unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3952{
3953 Py_UNICODE *e;
3954 Py_UNICODE *p;
3955 Py_UNICODE *q;
3956 int i, j;
3957 PyUnicodeObject *u;
3958 int tabsize = 8;
3959
3960 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3961 return NULL;
3962
Thomas Wouters7e474022000-07-16 12:04:32 +00003963 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003964 i = j = 0;
3965 e = self->str + self->length;
3966 for (p = self->str; p < e; p++)
3967 if (*p == '\t') {
3968 if (tabsize > 0)
3969 j += tabsize - (j % tabsize);
3970 }
3971 else {
3972 j++;
3973 if (*p == '\n' || *p == '\r') {
3974 i += j;
3975 j = 0;
3976 }
3977 }
3978
3979 /* Second pass: create output string and fill it */
3980 u = _PyUnicode_New(i + j);
3981 if (!u)
3982 return NULL;
3983
3984 j = 0;
3985 q = u->str;
3986
3987 for (p = self->str; p < e; p++)
3988 if (*p == '\t') {
3989 if (tabsize > 0) {
3990 i = tabsize - (j % tabsize);
3991 j += i;
3992 while (i--)
3993 *q++ = ' ';
3994 }
3995 }
3996 else {
3997 j++;
3998 *q++ = *p;
3999 if (*p == '\n' || *p == '\r')
4000 j = 0;
4001 }
4002
4003 return (PyObject*) u;
4004}
4005
4006static char find__doc__[] =
4007"S.find(sub [,start [,end]]) -> int\n\
4008\n\
4009Return the lowest index in S where substring sub is found,\n\
4010such that sub is contained within s[start,end]. Optional\n\
4011arguments start and end are interpreted as in slice notation.\n\
4012\n\
4013Return -1 on failure.";
4014
4015static PyObject *
4016unicode_find(PyUnicodeObject *self, PyObject *args)
4017{
4018 PyUnicodeObject *substring;
4019 int start = 0;
4020 int end = INT_MAX;
4021 PyObject *result;
4022
Guido van Rossumb8872e62000-05-09 14:14:27 +00004023 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4024 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004025 return NULL;
4026 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4027 (PyObject *)substring);
4028 if (substring == NULL)
4029 return NULL;
4030
4031 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4032
4033 Py_DECREF(substring);
4034 return result;
4035}
4036
4037static PyObject *
4038unicode_getitem(PyUnicodeObject *self, int index)
4039{
4040 if (index < 0 || index >= self->length) {
4041 PyErr_SetString(PyExc_IndexError, "string index out of range");
4042 return NULL;
4043 }
4044
4045 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4046}
4047
4048static long
4049unicode_hash(PyUnicodeObject *self)
4050{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004051 /* Since Unicode objects compare equal to their ASCII string
4052 counterparts, they should use the individual character values
4053 as basis for their hash value. This is needed to assure that
4054 strings and Unicode objects behave in the same way as
4055 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004056
Fredrik Lundhdde61642000-07-10 18:27:47 +00004057 register int len;
4058 register Py_UNICODE *p;
4059 register long x;
4060
Guido van Rossumd57fd912000-03-10 22:53:23 +00004061 if (self->hash != -1)
4062 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004063 len = PyUnicode_GET_SIZE(self);
4064 p = PyUnicode_AS_UNICODE(self);
4065 x = *p << 7;
4066 while (--len >= 0)
4067 x = (1000003*x) ^ *p++;
4068 x ^= PyUnicode_GET_SIZE(self);
4069 if (x == -1)
4070 x = -2;
4071 self->hash = x;
4072 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004073}
4074
4075static char index__doc__[] =
4076"S.index(sub [,start [,end]]) -> int\n\
4077\n\
4078Like S.find() but raise ValueError when the substring is not found.";
4079
4080static PyObject *
4081unicode_index(PyUnicodeObject *self, PyObject *args)
4082{
4083 int result;
4084 PyUnicodeObject *substring;
4085 int start = 0;
4086 int end = INT_MAX;
4087
Guido van Rossumb8872e62000-05-09 14:14:27 +00004088 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4089 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090 return NULL;
4091
4092 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4093 (PyObject *)substring);
4094 if (substring == NULL)
4095 return NULL;
4096
4097 result = findstring(self, substring, start, end, 1);
4098
4099 Py_DECREF(substring);
4100 if (result < 0) {
4101 PyErr_SetString(PyExc_ValueError, "substring not found");
4102 return NULL;
4103 }
4104 return PyInt_FromLong(result);
4105}
4106
4107static char islower__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004108"S.islower() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004109\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004110Return True if all cased characters in S are lowercase and there is\n\
4111at least one cased character in S, False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004112
4113static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004114unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115{
4116 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4117 register const Py_UNICODE *e;
4118 int cased;
4119
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120 /* Shortcut for single character strings */
4121 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004122 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004123
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004124 /* Special case for empty strings */
4125 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004126 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004127
Guido van Rossumd57fd912000-03-10 22:53:23 +00004128 e = p + PyUnicode_GET_SIZE(self);
4129 cased = 0;
4130 for (; p < e; p++) {
4131 register const Py_UNICODE ch = *p;
4132
4133 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004134 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135 else if (!cased && Py_UNICODE_ISLOWER(ch))
4136 cased = 1;
4137 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004138 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139}
4140
4141static char isupper__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004142"S.isupper() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004144Return True if all cased characters in S are uppercase and there is\n\
4145at least one cased character in S, False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146
4147static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004148unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004149{
4150 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4151 register const Py_UNICODE *e;
4152 int cased;
4153
Guido van Rossumd57fd912000-03-10 22:53:23 +00004154 /* Shortcut for single character strings */
4155 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004156 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004157
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004158 /* Special case for empty strings */
4159 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004160 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004161
Guido van Rossumd57fd912000-03-10 22:53:23 +00004162 e = p + PyUnicode_GET_SIZE(self);
4163 cased = 0;
4164 for (; p < e; p++) {
4165 register const Py_UNICODE ch = *p;
4166
4167 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004168 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169 else if (!cased && Py_UNICODE_ISUPPER(ch))
4170 cased = 1;
4171 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004172 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173}
4174
4175static char istitle__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004176"S.istitle() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004177\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004178Return True if S is a titlecased string, i.e. upper- and titlecase\n\
4179characters may only follow uncased characters and lowercase characters\n\
4180only cased ones. Return False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004181
4182static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004183unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184{
4185 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4186 register const Py_UNICODE *e;
4187 int cased, previous_is_cased;
4188
Guido van Rossumd57fd912000-03-10 22:53:23 +00004189 /* Shortcut for single character strings */
4190 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004191 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4192 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004193
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004194 /* Special case for empty strings */
4195 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004196 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004197
Guido van Rossumd57fd912000-03-10 22:53:23 +00004198 e = p + PyUnicode_GET_SIZE(self);
4199 cased = 0;
4200 previous_is_cased = 0;
4201 for (; p < e; p++) {
4202 register const Py_UNICODE ch = *p;
4203
4204 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4205 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004206 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004207 previous_is_cased = 1;
4208 cased = 1;
4209 }
4210 else if (Py_UNICODE_ISLOWER(ch)) {
4211 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004212 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004213 previous_is_cased = 1;
4214 cased = 1;
4215 }
4216 else
4217 previous_is_cased = 0;
4218 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004219 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004220}
4221
4222static char isspace__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004223"S.isspace() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004224\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004225Return True if there are only whitespace characters in S,\n\
4226False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004227
4228static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004229unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004230{
4231 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4232 register const Py_UNICODE *e;
4233
Guido van Rossumd57fd912000-03-10 22:53:23 +00004234 /* Shortcut for single character strings */
4235 if (PyUnicode_GET_SIZE(self) == 1 &&
4236 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004237 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004238
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004239 /* Special case for empty strings */
4240 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004241 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004242
Guido van Rossumd57fd912000-03-10 22:53:23 +00004243 e = p + PyUnicode_GET_SIZE(self);
4244 for (; p < e; p++) {
4245 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004246 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004247 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004248 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004249}
4250
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004251static char isalpha__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004252"S.isalpha() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004253\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004254Return True if all characters in S are alphabetic\n\
4255and there is at least one character in S, False otherwise.";
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004256
4257static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004258unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004259{
4260 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4261 register const Py_UNICODE *e;
4262
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004263 /* Shortcut for single character strings */
4264 if (PyUnicode_GET_SIZE(self) == 1 &&
4265 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004266 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004267
4268 /* Special case for empty strings */
4269 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004270 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004271
4272 e = p + PyUnicode_GET_SIZE(self);
4273 for (; p < e; p++) {
4274 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004275 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004276 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004277 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004278}
4279
4280static char isalnum__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004281"S.isalnum() -> bool\n\
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004282\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004283Return True if all characters in S are alphanumeric\n\
4284and there is at least one character in S, False otherwise.";
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004285
4286static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004287unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004288{
4289 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4290 register const Py_UNICODE *e;
4291
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004292 /* Shortcut for single character strings */
4293 if (PyUnicode_GET_SIZE(self) == 1 &&
4294 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004295 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004296
4297 /* Special case for empty strings */
4298 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004299 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004300
4301 e = p + PyUnicode_GET_SIZE(self);
4302 for (; p < e; p++) {
4303 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004304 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004305 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004306 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004307}
4308
Guido van Rossumd57fd912000-03-10 22:53:23 +00004309static char isdecimal__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004310"S.isdecimal() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004312Return True if there are only decimal characters in S,\n\
4313False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004314
4315static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004316unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004317{
4318 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4319 register const Py_UNICODE *e;
4320
Guido van Rossumd57fd912000-03-10 22:53:23 +00004321 /* Shortcut for single character strings */
4322 if (PyUnicode_GET_SIZE(self) == 1 &&
4323 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004324 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004326 /* Special case for empty strings */
4327 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004328 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004329
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330 e = p + PyUnicode_GET_SIZE(self);
4331 for (; p < e; p++) {
4332 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004333 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004335 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336}
4337
4338static char isdigit__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004339"S.isdigit() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004341Return True if there are only digit characters in S,\n\
4342False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343
4344static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004345unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004346{
4347 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4348 register const Py_UNICODE *e;
4349
Guido van Rossumd57fd912000-03-10 22:53:23 +00004350 /* Shortcut for single character strings */
4351 if (PyUnicode_GET_SIZE(self) == 1 &&
4352 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004353 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004355 /* Special case for empty strings */
4356 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004357 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004358
Guido van Rossumd57fd912000-03-10 22:53:23 +00004359 e = p + PyUnicode_GET_SIZE(self);
4360 for (; p < e; p++) {
4361 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004362 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004363 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004364 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365}
4366
4367static char isnumeric__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004368"S.isnumeric() -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004369\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004370Return True if there are only numeric characters in S,\n\
4371False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004372
4373static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004374unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004375{
4376 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4377 register const Py_UNICODE *e;
4378
Guido van Rossumd57fd912000-03-10 22:53:23 +00004379 /* Shortcut for single character strings */
4380 if (PyUnicode_GET_SIZE(self) == 1 &&
4381 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004382 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004384 /* Special case for empty strings */
4385 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004386 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004387
Guido van Rossumd57fd912000-03-10 22:53:23 +00004388 e = p + PyUnicode_GET_SIZE(self);
4389 for (; p < e; p++) {
4390 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004391 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004392 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004393 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394}
4395
4396static char join__doc__[] =
4397"S.join(sequence) -> unicode\n\
4398\n\
4399Return a string which is the concatenation of the strings in the\n\
4400sequence. The separator between elements is S.";
4401
4402static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004403unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004405 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004406}
4407
4408static int
4409unicode_length(PyUnicodeObject *self)
4410{
4411 return self->length;
4412}
4413
4414static char ljust__doc__[] =
4415"S.ljust(width) -> unicode\n\
4416\n\
4417Return S left justified in a Unicode string of length width. Padding is\n\
4418done using spaces.";
4419
4420static PyObject *
4421unicode_ljust(PyUnicodeObject *self, PyObject *args)
4422{
4423 int width;
4424 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4425 return NULL;
4426
Tim Peters7a29bd52001-09-12 03:03:31 +00004427 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004428 Py_INCREF(self);
4429 return (PyObject*) self;
4430 }
4431
4432 return (PyObject*) pad(self, 0, width - self->length, ' ');
4433}
4434
4435static char lower__doc__[] =
4436"S.lower() -> unicode\n\
4437\n\
4438Return a copy of the string S converted to lowercase.";
4439
4440static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004441unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004442{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004443 return fixup(self, fixlower);
4444}
4445
4446static char lstrip__doc__[] =
4447"S.lstrip() -> unicode\n\
4448\n\
4449Return a copy of the string S with leading whitespace removed.";
4450
4451static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004452unicode_lstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454 return strip(self, 1, 0);
4455}
4456
4457static PyObject*
4458unicode_repeat(PyUnicodeObject *str, int len)
4459{
4460 PyUnicodeObject *u;
4461 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004462 int nchars;
4463 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004464
4465 if (len < 0)
4466 len = 0;
4467
Tim Peters7a29bd52001-09-12 03:03:31 +00004468 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004469 /* no repeat, return original string */
4470 Py_INCREF(str);
4471 return (PyObject*) str;
4472 }
Tim Peters8f422462000-09-09 06:13:41 +00004473
4474 /* ensure # of chars needed doesn't overflow int and # of bytes
4475 * needed doesn't overflow size_t
4476 */
4477 nchars = len * str->length;
4478 if (len && nchars / len != str->length) {
4479 PyErr_SetString(PyExc_OverflowError,
4480 "repeated string is too long");
4481 return NULL;
4482 }
4483 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4484 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4485 PyErr_SetString(PyExc_OverflowError,
4486 "repeated string is too long");
4487 return NULL;
4488 }
4489 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004490 if (!u)
4491 return NULL;
4492
4493 p = u->str;
4494
4495 while (len-- > 0) {
4496 Py_UNICODE_COPY(p, str->str, str->length);
4497 p += str->length;
4498 }
4499
4500 return (PyObject*) u;
4501}
4502
4503PyObject *PyUnicode_Replace(PyObject *obj,
4504 PyObject *subobj,
4505 PyObject *replobj,
4506 int maxcount)
4507{
4508 PyObject *self;
4509 PyObject *str1;
4510 PyObject *str2;
4511 PyObject *result;
4512
4513 self = PyUnicode_FromObject(obj);
4514 if (self == NULL)
4515 return NULL;
4516 str1 = PyUnicode_FromObject(subobj);
4517 if (str1 == NULL) {
4518 Py_DECREF(self);
4519 return NULL;
4520 }
4521 str2 = PyUnicode_FromObject(replobj);
4522 if (str2 == NULL) {
4523 Py_DECREF(self);
4524 Py_DECREF(str1);
4525 return NULL;
4526 }
4527 result = replace((PyUnicodeObject *)self,
4528 (PyUnicodeObject *)str1,
4529 (PyUnicodeObject *)str2,
4530 maxcount);
4531 Py_DECREF(self);
4532 Py_DECREF(str1);
4533 Py_DECREF(str2);
4534 return result;
4535}
4536
4537static char replace__doc__[] =
4538"S.replace (old, new[, maxsplit]) -> unicode\n\
4539\n\
4540Return a copy of S with all occurrences of substring\n\
4541old replaced by new. If the optional argument maxsplit is\n\
4542given, only the first maxsplit occurrences are replaced.";
4543
4544static PyObject*
4545unicode_replace(PyUnicodeObject *self, PyObject *args)
4546{
4547 PyUnicodeObject *str1;
4548 PyUnicodeObject *str2;
4549 int maxcount = -1;
4550 PyObject *result;
4551
4552 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4553 return NULL;
4554 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4555 if (str1 == NULL)
4556 return NULL;
4557 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4558 if (str2 == NULL)
4559 return NULL;
4560
4561 result = replace(self, str1, str2, maxcount);
4562
4563 Py_DECREF(str1);
4564 Py_DECREF(str2);
4565 return result;
4566}
4567
4568static
4569PyObject *unicode_repr(PyObject *unicode)
4570{
4571 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4572 PyUnicode_GET_SIZE(unicode),
4573 1);
4574}
4575
4576static char rfind__doc__[] =
4577"S.rfind(sub [,start [,end]]) -> int\n\
4578\n\
4579Return the highest index in S where substring sub is found,\n\
4580such that sub is contained within s[start,end]. Optional\n\
4581arguments start and end are interpreted as in slice notation.\n\
4582\n\
4583Return -1 on failure.";
4584
4585static PyObject *
4586unicode_rfind(PyUnicodeObject *self, PyObject *args)
4587{
4588 PyUnicodeObject *substring;
4589 int start = 0;
4590 int end = INT_MAX;
4591 PyObject *result;
4592
Guido van Rossumb8872e62000-05-09 14:14:27 +00004593 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4594 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004595 return NULL;
4596 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4597 (PyObject *)substring);
4598 if (substring == NULL)
4599 return NULL;
4600
4601 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4602
4603 Py_DECREF(substring);
4604 return result;
4605}
4606
4607static char rindex__doc__[] =
4608"S.rindex(sub [,start [,end]]) -> int\n\
4609\n\
4610Like S.rfind() but raise ValueError when the substring is not found.";
4611
4612static PyObject *
4613unicode_rindex(PyUnicodeObject *self, PyObject *args)
4614{
4615 int result;
4616 PyUnicodeObject *substring;
4617 int start = 0;
4618 int end = INT_MAX;
4619
Guido van Rossumb8872e62000-05-09 14:14:27 +00004620 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4621 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622 return NULL;
4623 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4624 (PyObject *)substring);
4625 if (substring == NULL)
4626 return NULL;
4627
4628 result = findstring(self, substring, start, end, -1);
4629
4630 Py_DECREF(substring);
4631 if (result < 0) {
4632 PyErr_SetString(PyExc_ValueError, "substring not found");
4633 return NULL;
4634 }
4635 return PyInt_FromLong(result);
4636}
4637
4638static char rjust__doc__[] =
4639"S.rjust(width) -> unicode\n\
4640\n\
4641Return S right justified in a Unicode string of length width. Padding is\n\
4642done using spaces.";
4643
4644static PyObject *
4645unicode_rjust(PyUnicodeObject *self, PyObject *args)
4646{
4647 int width;
4648 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4649 return NULL;
4650
Tim Peters7a29bd52001-09-12 03:03:31 +00004651 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004652 Py_INCREF(self);
4653 return (PyObject*) self;
4654 }
4655
4656 return (PyObject*) pad(self, width - self->length, 0, ' ');
4657}
4658
4659static char rstrip__doc__[] =
4660"S.rstrip() -> unicode\n\
4661\n\
4662Return a copy of the string S with trailing whitespace removed.";
4663
4664static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004665unicode_rstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004666{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667 return strip(self, 0, 1);
4668}
4669
4670static PyObject*
4671unicode_slice(PyUnicodeObject *self, int start, int end)
4672{
4673 /* standard clamping */
4674 if (start < 0)
4675 start = 0;
4676 if (end < 0)
4677 end = 0;
4678 if (end > self->length)
4679 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004680 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004681 /* full slice, return original string */
4682 Py_INCREF(self);
4683 return (PyObject*) self;
4684 }
4685 if (start > end)
4686 start = end;
4687 /* copy slice */
4688 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4689 end - start);
4690}
4691
4692PyObject *PyUnicode_Split(PyObject *s,
4693 PyObject *sep,
4694 int maxsplit)
4695{
4696 PyObject *result;
4697
4698 s = PyUnicode_FromObject(s);
4699 if (s == NULL)
4700 return NULL;
4701 if (sep != NULL) {
4702 sep = PyUnicode_FromObject(sep);
4703 if (sep == NULL) {
4704 Py_DECREF(s);
4705 return NULL;
4706 }
4707 }
4708
4709 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4710
4711 Py_DECREF(s);
4712 Py_XDECREF(sep);
4713 return result;
4714}
4715
4716static char split__doc__[] =
4717"S.split([sep [,maxsplit]]) -> list of strings\n\
4718\n\
4719Return a list of the words in S, using sep as the\n\
4720delimiter string. If maxsplit is given, at most maxsplit\n\
4721splits are done. If sep is not specified, any whitespace string\n\
4722is a separator.";
4723
4724static PyObject*
4725unicode_split(PyUnicodeObject *self, PyObject *args)
4726{
4727 PyObject *substring = Py_None;
4728 int maxcount = -1;
4729
4730 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4731 return NULL;
4732
4733 if (substring == Py_None)
4734 return split(self, NULL, maxcount);
4735 else if (PyUnicode_Check(substring))
4736 return split(self, (PyUnicodeObject *)substring, maxcount);
4737 else
4738 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4739}
4740
4741static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004742"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743\n\
4744Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004745Line breaks are not included in the resulting list unless keepends\n\
4746is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747
4748static PyObject*
4749unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4750{
Guido van Rossum86662912000-04-11 15:38:46 +00004751 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004752
Guido van Rossum86662912000-04-11 15:38:46 +00004753 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754 return NULL;
4755
Guido van Rossum86662912000-04-11 15:38:46 +00004756 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757}
4758
4759static
4760PyObject *unicode_str(PyUnicodeObject *self)
4761{
Fred Drakee4315f52000-05-09 19:53:39 +00004762 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763}
4764
4765static char strip__doc__[] =
4766"S.strip() -> unicode\n\
4767\n\
4768Return a copy of S with leading and trailing whitespace removed.";
4769
4770static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004771unicode_strip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773 return strip(self, 1, 1);
4774}
4775
4776static char swapcase__doc__[] =
4777"S.swapcase() -> unicode\n\
4778\n\
4779Return a copy of S with uppercase characters converted to lowercase\n\
4780and vice versa.";
4781
4782static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004783unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785 return fixup(self, fixswapcase);
4786}
4787
4788static char translate__doc__[] =
4789"S.translate(table) -> unicode\n\
4790\n\
4791Return a copy of the string S, where all characters have been mapped\n\
4792through the given translation table, which must be a mapping of\n\
4793Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4794are left untouched. Characters mapped to None are deleted.";
4795
4796static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004797unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004799 return PyUnicode_TranslateCharmap(self->str,
4800 self->length,
4801 table,
4802 "ignore");
4803}
4804
4805static char upper__doc__[] =
4806"S.upper() -> unicode\n\
4807\n\
4808Return a copy of S converted to uppercase.";
4809
4810static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004811unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813 return fixup(self, fixupper);
4814}
4815
Guido van Rossumd57fd912000-03-10 22:53:23 +00004816static char zfill__doc__[] =
4817"S.zfill(width) -> unicode\n\
4818\n\
4819Pad a numeric string x with zeros on the left, to fill a field\n\
4820of the specified width. The string x is never truncated.";
4821
4822static PyObject *
4823unicode_zfill(PyUnicodeObject *self, PyObject *args)
4824{
4825 int fill;
4826 PyUnicodeObject *u;
4827
4828 int width;
4829 if (!PyArg_ParseTuple(args, "i:zfill", &width))
4830 return NULL;
4831
4832 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00004833 if (PyUnicode_CheckExact(self)) {
4834 Py_INCREF(self);
4835 return (PyObject*) self;
4836 }
4837 else
4838 return PyUnicode_FromUnicode(
4839 PyUnicode_AS_UNICODE(self),
4840 PyUnicode_GET_SIZE(self)
4841 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00004842 }
4843
4844 fill = width - self->length;
4845
4846 u = pad(self, fill, 0, '0');
4847
Walter Dörwald068325e2002-04-15 13:36:47 +00004848 if (u == NULL)
4849 return NULL;
4850
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851 if (u->str[fill] == '+' || u->str[fill] == '-') {
4852 /* move sign to beginning of string */
4853 u->str[0] = u->str[fill];
4854 u->str[fill] = '0';
4855 }
4856
4857 return (PyObject*) u;
4858}
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859
#if 0
/* Debugging aid (compiled out): report unicode_freelist_size as a
   Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
4867
4868static char startswith__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004869"S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004871Return True if S starts with the specified prefix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872optional start, test S beginning at that position. With optional end, stop\n\
4873comparing S at that position.";
4874
4875static PyObject *
4876unicode_startswith(PyUnicodeObject *self,
4877 PyObject *args)
4878{
4879 PyUnicodeObject *substring;
4880 int start = 0;
4881 int end = INT_MAX;
4882 PyObject *result;
4883
Guido van Rossumb8872e62000-05-09 14:14:27 +00004884 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4885 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886 return NULL;
4887 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4888 (PyObject *)substring);
4889 if (substring == NULL)
4890 return NULL;
4891
Guido van Rossum77f6a652002-04-03 22:41:51 +00004892 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893
4894 Py_DECREF(substring);
4895 return result;
4896}
4897
4898
4899static char endswith__doc__[] =
Guido van Rossum77f6a652002-04-03 22:41:51 +00004900"S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004901\n\
Guido van Rossum77f6a652002-04-03 22:41:51 +00004902Return True if S ends with the specified suffix, False otherwise. With\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903optional start, test S beginning at that position. With optional end, stop\n\
4904comparing S at that position.";
4905
4906static PyObject *
4907unicode_endswith(PyUnicodeObject *self,
4908 PyObject *args)
4909{
4910 PyUnicodeObject *substring;
4911 int start = 0;
4912 int end = INT_MAX;
4913 PyObject *result;
4914
Guido van Rossumb8872e62000-05-09 14:14:27 +00004915 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4916 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917 return NULL;
4918 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4919 (PyObject *)substring);
4920 if (substring == NULL)
4921 return NULL;
4922
Guido van Rossum77f6a652002-04-03 22:41:51 +00004923 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924
4925 Py_DECREF(substring);
4926 return result;
4927}
4928
4929
4930static PyMethodDef unicode_methods[] = {
4931
4932 /* Order is according to common usage: often used methods should
4933 appear first, since lookup is done sequentially. */
4934
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004935 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4936 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4937 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4938 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4939 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4940 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4941 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4942 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4943 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4944 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4945 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4946 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4947 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4948 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4949/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4950 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4951 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4952 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4953 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4954 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4955 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4956 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4957 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4958 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4959 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4960 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4961 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4962 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4963 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4964 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4965 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4966 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4967 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4968 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4969 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004970 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
Walter Dörwald068325e2002-04-15 13:36:47 +00004971#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004972 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004973#endif
4974
4975#if 0
4976 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004977 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004978#endif
4979
4980 {NULL, NULL}
4981};
4982
Guido van Rossumd57fd912000-03-10 22:53:23 +00004983static PySequenceMethods unicode_as_sequence = {
4984 (inquiry) unicode_length, /* sq_length */
4985 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4986 (intargfunc) unicode_repeat, /* sq_repeat */
4987 (intargfunc) unicode_getitem, /* sq_item */
4988 (intintargfunc) unicode_slice, /* sq_slice */
4989 0, /* sq_ass_item */
4990 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004991 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004992};
4993
4994static int
4995unicode_buffer_getreadbuf(PyUnicodeObject *self,
4996 int index,
4997 const void **ptr)
4998{
4999 if (index != 0) {
5000 PyErr_SetString(PyExc_SystemError,
5001 "accessing non-existent unicode segment");
5002 return -1;
5003 }
5004 *ptr = (void *) self->str;
5005 return PyUnicode_GET_DATA_SIZE(self);
5006}
5007
5008static int
5009unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5010 const void **ptr)
5011{
5012 PyErr_SetString(PyExc_TypeError,
5013 "cannot use unicode as modifyable buffer");
5014 return -1;
5015}
5016
5017static int
5018unicode_buffer_getsegcount(PyUnicodeObject *self,
5019 int *lenp)
5020{
5021 if (lenp)
5022 *lenp = PyUnicode_GET_DATA_SIZE(self);
5023 return 1;
5024}
5025
5026static int
5027unicode_buffer_getcharbuf(PyUnicodeObject *self,
5028 int index,
5029 const void **ptr)
5030{
5031 PyObject *str;
5032
5033 if (index != 0) {
5034 PyErr_SetString(PyExc_SystemError,
5035 "accessing non-existent unicode segment");
5036 return -1;
5037 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005038 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005039 if (str == NULL)
5040 return -1;
5041 *ptr = (void *) PyString_AS_STRING(str);
5042 return PyString_GET_SIZE(str);
5043}
5044
5045/* Helpers for PyUnicode_Format() */
5046
5047static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005048getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049{
5050 int argidx = *p_argidx;
5051 if (argidx < arglen) {
5052 (*p_argidx)++;
5053 if (arglen < 0)
5054 return args;
5055 else
5056 return PyTuple_GetItem(args, argidx);
5057 }
5058 PyErr_SetString(PyExc_TypeError,
5059 "not enough arguments for format string");
5060 return NULL;
5061}
5062
/* Conversion flag bits, set by PyUnicode_Format() while scanning the
   characters that follow '%'. */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
5068
5069static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005070int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005071{
5072 register int i;
5073 int len;
5074 va_list va;
5075 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005077
5078 /* First, format the string as char array, then expand to Py_UNICODE
5079 array. */
5080 charbuffer = (char *)buffer;
5081 len = vsprintf(charbuffer, format, va);
5082 for (i = len - 1; i >= 0; i--)
5083 buffer[i] = (Py_UNICODE) charbuffer[i];
5084
5085 va_end(va);
5086 return len;
5087}
5088
5089static int
5090formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005091 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092 int flags,
5093 int prec,
5094 int type,
5095 PyObject *v)
5096{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005097 /* fmt = '%#.' + `prec` + `type`
5098 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099 char fmt[20];
5100 double x;
5101
5102 x = PyFloat_AsDouble(v);
5103 if (x == -1.0 && PyErr_Occurred())
5104 return -1;
5105 if (prec < 0)
5106 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5108 type = 'g';
Barry Warsawe5c492d2001-11-28 21:00:41 +00005109 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5110 (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005111 /* worst case length calc to ensure no buffer overrun:
5112 fmt = %#.<prec>g
5113 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5114 for any double rep.)
5115 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5116 If prec=0 the effective precision is 1 (the leading digit is
5117 always given), therefore increase by one to 10+prec. */
5118 if (buflen <= (size_t)10 + (size_t)prec) {
5119 PyErr_SetString(PyExc_OverflowError,
5120 "formatted float is too long (precision too long?)");
5121 return -1;
5122 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123 return usprintf(buf, fmt, x);
5124}
5125
Tim Peters38fd5b62000-09-21 05:43:11 +00005126static PyObject*
5127formatlong(PyObject *val, int flags, int prec, int type)
5128{
5129 char *buf;
5130 int i, len;
5131 PyObject *str; /* temporary string object. */
5132 PyUnicodeObject *result;
5133
5134 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5135 if (!str)
5136 return NULL;
5137 result = _PyUnicode_New(len);
5138 for (i = 0; i < len; i++)
5139 result->str[i] = buf[i];
5140 result->str[len] = 0;
5141 Py_DECREF(str);
5142 return (PyObject*)result;
5143}
5144
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145static int
5146formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005147 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005148 int flags,
5149 int prec,
5150 int type,
5151 PyObject *v)
5152{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005153 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005154 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5155 * + 1 + 1
5156 * = 24
5157 */
Tim Peters38fd5b62000-09-21 05:43:11 +00005158 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159 long x;
5160
5161 x = PyInt_AsLong(v);
5162 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005163 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005164 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005165 prec = 1;
5166
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005167 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005168 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
5169 */
5170 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005171 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005172 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005173 return -1;
5174 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005175
5176 if ((flags & F_ALT) &&
5177 (type == 'x' || type == 'X')) {
5178 /* When converting under %#x or %#X, there are a number
5179 * of issues that cause pain:
5180 * - when 0 is being converted, the C standard leaves off
5181 * the '0x' or '0X', which is inconsistent with other
5182 * %#x/%#X conversions and inconsistent with Python's
5183 * hex() function
5184 * - there are platforms that violate the standard and
5185 * convert 0 with the '0x' or '0X'
5186 * (Metrowerks, Compaq Tru64)
5187 * - there are platforms that give '0x' when converting
5188 * under %#X, but convert 0 in accordance with the
5189 * standard (OS/2 EMX)
5190 *
5191 * We can achieve the desired consistency by inserting our
5192 * own '0x' or '0X' prefix, and substituting %x/%X in place
5193 * of %#x/%#X.
5194 *
5195 * Note that this is the same approach as used in
5196 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005197 */
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005198 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
5199 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005200 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005201 else {
5202 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5203 (flags&F_ALT) ? "#" : "",
5204 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005205 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005206 return usprintf(buf, fmt, x);
5207}
5208
5209static int
5210formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005211 size_t buflen,
5212 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005214 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005215 if (PyUnicode_Check(v)) {
5216 if (PyUnicode_GET_SIZE(v) != 1)
5217 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005219 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005221 else if (PyString_Check(v)) {
5222 if (PyString_GET_SIZE(v) != 1)
5223 goto onError;
5224 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5225 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226
5227 else {
5228 /* Integer input truncated to a character */
5229 long x;
5230 x = PyInt_AsLong(v);
5231 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005232 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005233 buf[0] = (char) x;
5234 }
5235 buf[1] = '\0';
5236 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005237
5238 onError:
5239 PyErr_SetString(PyExc_TypeError,
5240 "%c requires int or char");
5241 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005242}
5243
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the size, in Py_UNICODE elements, of the scratch buffer
   into which floats, ints and chars are formatted.

   XXX This is a magic number.  Every formatting routine checks its own
   bounds so the buffer cannot overflow, but allocating a buffer of the
   right size for each conversion might be a better solution.  For now
   the fixed size is sufficient.
*/
#define FORMATBUFLEN (size_t)120
5253
Guido van Rossumd57fd912000-03-10 22:53:23 +00005254PyObject *PyUnicode_Format(PyObject *format,
5255 PyObject *args)
5256{
5257 Py_UNICODE *fmt, *res;
5258 int fmtcnt, rescnt, reslen, arglen, argidx;
5259 int args_owned = 0;
5260 PyUnicodeObject *result = NULL;
5261 PyObject *dict = NULL;
5262 PyObject *uformat;
5263
5264 if (format == NULL || args == NULL) {
5265 PyErr_BadInternalCall();
5266 return NULL;
5267 }
5268 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005269 if (uformat == NULL)
5270 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271 fmt = PyUnicode_AS_UNICODE(uformat);
5272 fmtcnt = PyUnicode_GET_SIZE(uformat);
5273
5274 reslen = rescnt = fmtcnt + 100;
5275 result = _PyUnicode_New(reslen);
5276 if (result == NULL)
5277 goto onError;
5278 res = PyUnicode_AS_UNICODE(result);
5279
5280 if (PyTuple_Check(args)) {
5281 arglen = PyTuple_Size(args);
5282 argidx = 0;
5283 }
5284 else {
5285 arglen = -1;
5286 argidx = -2;
5287 }
5288 if (args->ob_type->tp_as_mapping)
5289 dict = args;
5290
5291 while (--fmtcnt >= 0) {
5292 if (*fmt != '%') {
5293 if (--rescnt < 0) {
5294 rescnt = fmtcnt + 100;
5295 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005296 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 return NULL;
5298 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5299 --rescnt;
5300 }
5301 *res++ = *fmt++;
5302 }
5303 else {
5304 /* Got a format specifier */
5305 int flags = 0;
5306 int width = -1;
5307 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308 Py_UNICODE c = '\0';
5309 Py_UNICODE fill;
5310 PyObject *v = NULL;
5311 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005312 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 Py_UNICODE sign;
5314 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005315 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316
5317 fmt++;
5318 if (*fmt == '(') {
5319 Py_UNICODE *keystart;
5320 int keylen;
5321 PyObject *key;
5322 int pcount = 1;
5323
5324 if (dict == NULL) {
5325 PyErr_SetString(PyExc_TypeError,
5326 "format requires a mapping");
5327 goto onError;
5328 }
5329 ++fmt;
5330 --fmtcnt;
5331 keystart = fmt;
5332 /* Skip over balanced parentheses */
5333 while (pcount > 0 && --fmtcnt >= 0) {
5334 if (*fmt == ')')
5335 --pcount;
5336 else if (*fmt == '(')
5337 ++pcount;
5338 fmt++;
5339 }
5340 keylen = fmt - keystart - 1;
5341 if (fmtcnt < 0 || pcount > 0) {
5342 PyErr_SetString(PyExc_ValueError,
5343 "incomplete format key");
5344 goto onError;
5345 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005346#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00005347 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005348 then looked up since Python uses strings to hold
5349 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005350 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005351 key = PyUnicode_EncodeUTF8(keystart,
5352 keylen,
5353 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005354#else
5355 key = PyUnicode_FromUnicode(keystart, keylen);
5356#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357 if (key == NULL)
5358 goto onError;
5359 if (args_owned) {
5360 Py_DECREF(args);
5361 args_owned = 0;
5362 }
5363 args = PyObject_GetItem(dict, key);
5364 Py_DECREF(key);
5365 if (args == NULL) {
5366 goto onError;
5367 }
5368 args_owned = 1;
5369 arglen = -1;
5370 argidx = -2;
5371 }
5372 while (--fmtcnt >= 0) {
5373 switch (c = *fmt++) {
5374 case '-': flags |= F_LJUST; continue;
5375 case '+': flags |= F_SIGN; continue;
5376 case ' ': flags |= F_BLANK; continue;
5377 case '#': flags |= F_ALT; continue;
5378 case '0': flags |= F_ZERO; continue;
5379 }
5380 break;
5381 }
5382 if (c == '*') {
5383 v = getnextarg(args, arglen, &argidx);
5384 if (v == NULL)
5385 goto onError;
5386 if (!PyInt_Check(v)) {
5387 PyErr_SetString(PyExc_TypeError,
5388 "* wants int");
5389 goto onError;
5390 }
5391 width = PyInt_AsLong(v);
5392 if (width < 0) {
5393 flags |= F_LJUST;
5394 width = -width;
5395 }
5396 if (--fmtcnt >= 0)
5397 c = *fmt++;
5398 }
5399 else if (c >= '0' && c <= '9') {
5400 width = c - '0';
5401 while (--fmtcnt >= 0) {
5402 c = *fmt++;
5403 if (c < '0' || c > '9')
5404 break;
5405 if ((width*10) / 10 != width) {
5406 PyErr_SetString(PyExc_ValueError,
5407 "width too big");
5408 goto onError;
5409 }
5410 width = width*10 + (c - '0');
5411 }
5412 }
5413 if (c == '.') {
5414 prec = 0;
5415 if (--fmtcnt >= 0)
5416 c = *fmt++;
5417 if (c == '*') {
5418 v = getnextarg(args, arglen, &argidx);
5419 if (v == NULL)
5420 goto onError;
5421 if (!PyInt_Check(v)) {
5422 PyErr_SetString(PyExc_TypeError,
5423 "* wants int");
5424 goto onError;
5425 }
5426 prec = PyInt_AsLong(v);
5427 if (prec < 0)
5428 prec = 0;
5429 if (--fmtcnt >= 0)
5430 c = *fmt++;
5431 }
5432 else if (c >= '0' && c <= '9') {
5433 prec = c - '0';
5434 while (--fmtcnt >= 0) {
5435 c = Py_CHARMASK(*fmt++);
5436 if (c < '0' || c > '9')
5437 break;
5438 if ((prec*10) / 10 != prec) {
5439 PyErr_SetString(PyExc_ValueError,
5440 "prec too big");
5441 goto onError;
5442 }
5443 prec = prec*10 + (c - '0');
5444 }
5445 }
5446 } /* prec */
5447 if (fmtcnt >= 0) {
5448 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005449 if (--fmtcnt >= 0)
5450 c = *fmt++;
5451 }
5452 }
5453 if (fmtcnt < 0) {
5454 PyErr_SetString(PyExc_ValueError,
5455 "incomplete format");
5456 goto onError;
5457 }
5458 if (c != '%') {
5459 v = getnextarg(args, arglen, &argidx);
5460 if (v == NULL)
5461 goto onError;
5462 }
5463 sign = 0;
5464 fill = ' ';
5465 switch (c) {
5466
5467 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005468 pbuf = formatbuf;
5469 /* presume that buffer length is at least 1 */
5470 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005471 len = 1;
5472 break;
5473
5474 case 's':
5475 case 'r':
5476 if (PyUnicode_Check(v) && c == 's') {
5477 temp = v;
5478 Py_INCREF(temp);
5479 }
5480 else {
5481 PyObject *unicode;
5482 if (c == 's')
5483 temp = PyObject_Str(v);
5484 else
5485 temp = PyObject_Repr(v);
5486 if (temp == NULL)
5487 goto onError;
5488 if (!PyString_Check(temp)) {
5489 /* XXX Note: this should never happen, since
5490 PyObject_Repr() and PyObject_Str() assure
5491 this */
5492 Py_DECREF(temp);
5493 PyErr_SetString(PyExc_TypeError,
5494 "%s argument has non-string str()");
5495 goto onError;
5496 }
Fred Drakee4315f52000-05-09 19:53:39 +00005497 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005498 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005499 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005500 "strict");
5501 Py_DECREF(temp);
5502 temp = unicode;
5503 if (temp == NULL)
5504 goto onError;
5505 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005506 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507 len = PyUnicode_GET_SIZE(temp);
5508 if (prec >= 0 && len > prec)
5509 len = prec;
5510 break;
5511
5512 case 'i':
5513 case 'd':
5514 case 'u':
5515 case 'o':
5516 case 'x':
5517 case 'X':
5518 if (c == 'i')
5519 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005520 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005521 temp = formatlong(v, flags, prec, c);
5522 if (!temp)
5523 goto onError;
5524 pbuf = PyUnicode_AS_UNICODE(temp);
5525 len = PyUnicode_GET_SIZE(temp);
5526 /* unbounded ints can always produce
5527 a sign character! */
5528 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005529 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005530 else {
5531 pbuf = formatbuf;
5532 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5533 flags, prec, c, v);
5534 if (len < 0)
5535 goto onError;
5536 /* only d conversion is signed */
5537 sign = c == 'd';
5538 }
5539 if (flags & F_ZERO)
5540 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005541 break;
5542
5543 case 'e':
5544 case 'E':
5545 case 'f':
5546 case 'g':
5547 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005548 pbuf = formatbuf;
5549 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5550 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005551 if (len < 0)
5552 goto onError;
5553 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005554 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005555 fill = '0';
5556 break;
5557
5558 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005559 pbuf = formatbuf;
5560 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005561 if (len < 0)
5562 goto onError;
5563 break;
5564
5565 default:
5566 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005567 "unsupported format character '%c' (0x%x) "
5568 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005569 (31<=c && c<=126) ? c : '?',
5570 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005571 goto onError;
5572 }
5573 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005574 if (*pbuf == '-' || *pbuf == '+') {
5575 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005576 len--;
5577 }
5578 else if (flags & F_SIGN)
5579 sign = '+';
5580 else if (flags & F_BLANK)
5581 sign = ' ';
5582 else
5583 sign = 0;
5584 }
5585 if (width < len)
5586 width = len;
5587 if (rescnt < width + (sign != 0)) {
5588 reslen -= rescnt;
5589 rescnt = width + fmtcnt + 100;
5590 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005591 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592 return NULL;
5593 res = PyUnicode_AS_UNICODE(result)
5594 + reslen - rescnt;
5595 }
5596 if (sign) {
5597 if (fill != ' ')
5598 *res++ = sign;
5599 rescnt--;
5600 if (width > len)
5601 width--;
5602 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005603 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5604 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005605 assert(pbuf[1] == c);
5606 if (fill != ' ') {
5607 *res++ = *pbuf++;
5608 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005609 }
Tim Petersfff53252001-04-12 18:38:48 +00005610 rescnt -= 2;
5611 width -= 2;
5612 if (width < 0)
5613 width = 0;
5614 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005615 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005616 if (width > len && !(flags & F_LJUST)) {
5617 do {
5618 --rescnt;
5619 *res++ = fill;
5620 } while (--width > len);
5621 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005622 if (fill == ' ') {
5623 if (sign)
5624 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005625 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005626 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005627 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005628 *res++ = *pbuf++;
5629 *res++ = *pbuf++;
5630 }
5631 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005632 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005633 res += len;
5634 rescnt -= len;
5635 while (--width >= len) {
5636 --rescnt;
5637 *res++ = ' ';
5638 }
5639 if (dict && (argidx < arglen) && c != '%') {
5640 PyErr_SetString(PyExc_TypeError,
5641 "not all arguments converted");
5642 goto onError;
5643 }
5644 Py_XDECREF(temp);
5645 } /* '%' */
5646 } /* until end */
5647 if (argidx < arglen && !dict) {
5648 PyErr_SetString(PyExc_TypeError,
5649 "not all arguments converted");
5650 goto onError;
5651 }
5652
5653 if (args_owned) {
5654 Py_DECREF(args);
5655 }
5656 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005657 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005658 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005659 return (PyObject *)result;
5660
5661 onError:
5662 Py_XDECREF(result);
5663 Py_DECREF(uformat);
5664 if (args_owned) {
5665 Py_DECREF(args);
5666 }
5667 return NULL;
5668}
5669
5670static PyBufferProcs unicode_as_buffer = {
5671 (getreadbufferproc) unicode_buffer_getreadbuf,
5672 (getwritebufferproc) unicode_buffer_getwritebuf,
5673 (getsegcountproc) unicode_buffer_getsegcount,
5674 (getcharbufferproc) unicode_buffer_getcharbuf,
5675};
5676
Guido van Rossume023fe02001-08-30 03:12:59 +00005677staticforward PyObject *
5678unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5679
Tim Peters6d6c1a32001-08-02 04:15:00 +00005680static PyObject *
5681unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5682{
5683 PyObject *x = NULL;
5684 static char *kwlist[] = {"string", "encoding", "errors", 0};
5685 char *encoding = NULL;
5686 char *errors = NULL;
5687
Guido van Rossume023fe02001-08-30 03:12:59 +00005688 if (type != &PyUnicode_Type)
5689 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005690 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5691 kwlist, &x, &encoding, &errors))
5692 return NULL;
5693 if (x == NULL)
5694 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00005695 if (encoding == NULL && errors == NULL)
5696 return PyObject_Unicode(x);
5697 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00005698 return PyUnicode_FromEncodedObject(x, encoding, errors);
5699}
5700
Guido van Rossume023fe02001-08-30 03:12:59 +00005701static PyObject *
5702unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5703{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005704 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005705 int n;
5706
5707 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5708 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5709 if (tmp == NULL)
5710 return NULL;
5711 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005712 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5713 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005714 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005715 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5716 if (pnew->str == NULL) {
5717 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005718 PyObject_Del(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005719 return NULL;
5720 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005721 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5722 pnew->length = n;
5723 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005724 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005725 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005726}
5727
Tim Peters6d6c1a32001-08-02 04:15:00 +00005728static char unicode_doc[] =
5729"unicode(string [, encoding[, errors]]) -> object\n\
5730\n\
5731Create a new Unicode object from the given encoded string.\n\
5732encoding defaults to the current default string encoding and \n\
5733errors, defining the error handling, to 'strict'.";
5734
Guido van Rossumd57fd912000-03-10 22:53:23 +00005735PyTypeObject PyUnicode_Type = {
5736 PyObject_HEAD_INIT(&PyType_Type)
5737 0, /* ob_size */
5738 "unicode", /* tp_name */
5739 sizeof(PyUnicodeObject), /* tp_size */
5740 0, /* tp_itemsize */
5741 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00005742 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005744 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005745 0, /* tp_setattr */
5746 (cmpfunc) unicode_compare, /* tp_compare */
5747 (reprfunc) unicode_repr, /* tp_repr */
5748 0, /* tp_as_number */
5749 &unicode_as_sequence, /* tp_as_sequence */
5750 0, /* tp_as_mapping */
5751 (hashfunc) unicode_hash, /* tp_hash*/
5752 0, /* tp_call*/
5753 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005754 PyObject_GenericGetAttr, /* tp_getattro */
5755 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005757 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005758 unicode_doc, /* tp_doc */
5759 0, /* tp_traverse */
5760 0, /* tp_clear */
5761 0, /* tp_richcompare */
5762 0, /* tp_weaklistoffset */
5763 0, /* tp_iter */
5764 0, /* tp_iternext */
5765 unicode_methods, /* tp_methods */
5766 0, /* tp_members */
5767 0, /* tp_getset */
5768 0, /* tp_base */
5769 0, /* tp_dict */
5770 0, /* tp_descr_get */
5771 0, /* tp_descr_set */
5772 0, /* tp_dictoffset */
5773 0, /* tp_init */
5774 0, /* tp_alloc */
5775 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005776 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005777};
5778
5779/* Initialize the Unicode implementation */
5780
Thomas Wouters78890102000-07-22 19:25:51 +00005781void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005782{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005783 int i;
5784
Fred Drakee4315f52000-05-09 19:53:39 +00005785 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005786 unicode_freelist = NULL;
5787 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005789 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005790 for (i = 0; i < 256; i++)
5791 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005792}
5793
5794/* Finalize the Unicode implementation */
5795
5796void
Thomas Wouters78890102000-07-22 19:25:51 +00005797_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005798{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005799 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005800 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005801
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005802 Py_XDECREF(unicode_empty);
5803 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005804
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005805 for (i = 0; i < 256; i++) {
5806 if (unicode_latin1[i]) {
5807 Py_DECREF(unicode_latin1[i]);
5808 unicode_latin1[i] = NULL;
5809 }
5810 }
5811
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005812 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005813 PyUnicodeObject *v = u;
5814 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005815 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005816 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005817 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005818 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005819 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005820 unicode_freelist = NULL;
5821 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005822}