blob: 29ba2e449848d8cb21046aad150e9d19064e6ebe [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects kept on the free list. */
#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for objects parked on the free list.

   A free-listed object whose length is below this limit keeps its
   character buffer allocated, so that reusing the object does not need
   another malloc().  Objects at or above the limit have their buffer
   released before they are put on the list.

   Worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + malloc()-overhead)
   bytes of unused memory around.

   A limit of 0 effectively disables the optimization.

   Note: this is an experimental feature.  If Unicode objects cause core
   dumps, turn it off.
*/
#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order switch; little endian is assumed unless the build defines
   WORDS_BIGENDIAN. */
#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000204 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Neil Schemenauer58aa8612002-04-12 03:07:20 +0000222 PyObject_Del(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum604ddf82001-12-06 20:03:56 +0000229 if (PyUnicode_CheckExact(unicode) &&
230 unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000231 /* Keep-Alive optimization */
232 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000233 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000234 unicode->str = NULL;
235 unicode->length = 0;
236 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000237 if (unicode->defenc) {
238 Py_DECREF(unicode->defenc);
239 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000240 }
241 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 *(PyUnicodeObject **)unicode = unicode_freelist;
243 unicode_freelist = unicode;
244 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 }
246 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000247 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000248 Py_XDECREF(unicode->defenc);
Guido van Rossum604ddf82001-12-06 20:03:56 +0000249 unicode->ob_type->tp_free((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000250 }
251}
252
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000253int PyUnicode_Resize(PyObject **unicode,
254 int length)
255{
256 register PyUnicodeObject *v;
257
258 /* Argument checks */
259 if (unicode == NULL) {
260 PyErr_BadInternalCall();
261 return -1;
262 }
263 v = (PyUnicodeObject *)*unicode;
264 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
265 PyErr_BadInternalCall();
266 return -1;
267 }
268
269 /* Resizing unicode_empty and single character objects is not
270 possible since these are being shared. We simply return a fresh
271 copy with the same Unicode content. */
272 if (v->length != length &&
273 (v == unicode_empty || v->length == 1)) {
274 PyUnicodeObject *w = _PyUnicode_New(length);
275 if (w == NULL)
276 return -1;
277 Py_UNICODE_COPY(w->str, v->str,
278 length < v->length ? length : v->length);
279 *unicode = (PyObject *)w;
280 return 0;
281 }
282
283 /* Note that we don't have to modify *unicode for unshared Unicode
284 objects, since we can modify them in-place. */
285 return unicode_resize(v, length);
286}
287
288/* Internal API for use in unicodeobject.c only ! */
289#define _PyUnicode_Resize(unicodevar, length) \
290 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
291
Guido van Rossumd57fd912000-03-10 22:53:23 +0000292PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
293 int size)
294{
295 PyUnicodeObject *unicode;
296
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000297 /* If the Unicode data is known at construction time, we can apply
298 some optimizations which share commonly used objects. */
299 if (u != NULL) {
300
301 /* Optimization for empty strings */
302 if (size == 0 && unicode_empty != NULL) {
303 Py_INCREF(unicode_empty);
304 return (PyObject *)unicode_empty;
305 }
306
307 /* Single character Unicode objects in the Latin-1 range are
308 shared when using this constructor */
309 if (size == 1 && *u < 256) {
310 unicode = unicode_latin1[*u];
311 if (!unicode) {
312 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000313 if (!unicode)
314 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000315 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 unicode_latin1[*u] = unicode;
317 }
318 Py_INCREF(unicode);
319 return (PyObject *)unicode;
320 }
321 }
322
Guido van Rossumd57fd912000-03-10 22:53:23 +0000323 unicode = _PyUnicode_New(size);
324 if (!unicode)
325 return NULL;
326
327 /* Copy the Unicode data into the new object */
328 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000329 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000330
331 return (PyObject *)unicode;
332}
333
334#ifdef HAVE_WCHAR_H
335
336PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
337 int size)
338{
339 PyUnicodeObject *unicode;
340
341 if (w == NULL) {
342 PyErr_BadInternalCall();
343 return NULL;
344 }
345
346 unicode = _PyUnicode_New(size);
347 if (!unicode)
348 return NULL;
349
350 /* Copy the wchar_t data into the new object */
351#ifdef HAVE_USABLE_WCHAR_T
352 memcpy(unicode->str, w, size * sizeof(wchar_t));
353#else
354 {
355 register Py_UNICODE *u;
356 register int i;
357 u = PyUnicode_AS_UNICODE(unicode);
358 for (i = size; i >= 0; i--)
359 *u++ = *w++;
360 }
361#endif
362
363 return (PyObject *)unicode;
364}
365
366int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
367 register wchar_t *w,
368 int size)
369{
370 if (unicode == NULL) {
371 PyErr_BadInternalCall();
372 return -1;
373 }
374 if (size > PyUnicode_GET_SIZE(unicode))
375 size = PyUnicode_GET_SIZE(unicode);
376#ifdef HAVE_USABLE_WCHAR_T
377 memcpy(w, unicode->str, size * sizeof(wchar_t));
378#else
379 {
380 register Py_UNICODE *u;
381 register int i;
382 u = PyUnicode_AS_UNICODE(unicode);
383 for (i = size; i >= 0; i--)
384 *w++ = *u++;
385 }
386#endif
387
388 return size;
389}
390
391#endif
392
393PyObject *PyUnicode_FromObject(register PyObject *obj)
394{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000395 /* XXX Perhaps we should make this API an alias of
396 PyObject_Unicode() instead ?! */
397 if (PyUnicode_CheckExact(obj)) {
398 Py_INCREF(obj);
399 return obj;
400 }
401 if (PyUnicode_Check(obj)) {
402 /* For a Unicode subtype that's not a Unicode object,
403 return a true Unicode object with the same data. */
404 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
405 PyUnicode_GET_SIZE(obj));
406 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000407 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
408}
409
410PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
411 const char *encoding,
412 const char *errors)
413{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000414 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000416 int owned = 0;
417 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418
419 if (obj == NULL) {
420 PyErr_BadInternalCall();
421 return NULL;
422 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000423
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000424#if 0
425 /* For b/w compatibility we also accept Unicode objects provided
Marc-André Lemburgb5507ec2001-10-19 12:02:29 +0000426 that no encodings is given and then redirect to
427 PyObject_Unicode() which then applies the additional logic for
428 Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000429
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000430 NOTE: This API should really only be used for object which
431 represent *encoded* Unicode !
432
433 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000434 if (PyUnicode_Check(obj)) {
435 if (encoding) {
436 PyErr_SetString(PyExc_TypeError,
437 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000438 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000439 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000440 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000441 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000442#else
443 if (PyUnicode_Check(obj)) {
444 PyErr_SetString(PyExc_TypeError,
445 "decoding Unicode is not supported");
446 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000448#endif
449
450 /* Coerce object */
451 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000452 s = PyString_AS_STRING(obj);
453 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000454 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000455 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
456 /* Overwrite the error message with something more useful in
457 case of a TypeError. */
458 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000459 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000460 "coercing to Unicode: need string or buffer, "
461 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000462 obj->ob_type->tp_name);
463 goto onError;
464 }
465
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000466 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000467 if (len == 0) {
468 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000469 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000471 else
472 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000473
Greg Steinaf36a3a2000-07-17 09:04:43 +0000474 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000475 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000476 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000477 return v;
478
479 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000480 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000481 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000482 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000483 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484}
485
486PyObject *PyUnicode_Decode(const char *s,
487 int size,
488 const char *encoding,
489 const char *errors)
490{
491 PyObject *buffer = NULL, *unicode;
492
Fred Drakee4315f52000-05-09 19:53:39 +0000493 if (encoding == NULL)
494 encoding = PyUnicode_GetDefaultEncoding();
495
496 /* Shortcuts for common default encodings */
497 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000499 else if (strcmp(encoding, "latin-1") == 0)
500 return PyUnicode_DecodeLatin1(s, size, errors);
501 else if (strcmp(encoding, "ascii") == 0)
502 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000503
504 /* Decode via the codec registry */
505 buffer = PyBuffer_FromMemory((void *)s, size);
506 if (buffer == NULL)
507 goto onError;
508 unicode = PyCodec_Decode(buffer, encoding, errors);
509 if (unicode == NULL)
510 goto onError;
511 if (!PyUnicode_Check(unicode)) {
512 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000513 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 unicode->ob_type->tp_name);
515 Py_DECREF(unicode);
516 goto onError;
517 }
518 Py_DECREF(buffer);
519 return unicode;
520
521 onError:
522 Py_XDECREF(buffer);
523 return NULL;
524}
525
526PyObject *PyUnicode_Encode(const Py_UNICODE *s,
527 int size,
528 const char *encoding,
529 const char *errors)
530{
531 PyObject *v, *unicode;
532
533 unicode = PyUnicode_FromUnicode(s, size);
534 if (unicode == NULL)
535 return NULL;
536 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
537 Py_DECREF(unicode);
538 return v;
539}
540
541PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
542 const char *encoding,
543 const char *errors)
544{
545 PyObject *v;
546
547 if (!PyUnicode_Check(unicode)) {
548 PyErr_BadArgument();
549 goto onError;
550 }
Fred Drakee4315f52000-05-09 19:53:39 +0000551
552 if (encoding == NULL)
553 encoding = PyUnicode_GetDefaultEncoding();
554
555 /* Shortcuts for common default encodings */
556 if (errors == NULL) {
557 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000558 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000559 else if (strcmp(encoding, "latin-1") == 0)
560 return PyUnicode_AsLatin1String(unicode);
561 else if (strcmp(encoding, "ascii") == 0)
562 return PyUnicode_AsASCIIString(unicode);
563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000564
565 /* Encode via the codec registry */
566 v = PyCodec_Encode(unicode, encoding, errors);
567 if (v == NULL)
568 goto onError;
569 /* XXX Should we really enforce this ? */
570 if (!PyString_Check(v)) {
571 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000572 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000573 v->ob_type->tp_name);
574 Py_DECREF(v);
575 goto onError;
576 }
577 return v;
578
579 onError:
580 return NULL;
581}
582
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000583PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
584 const char *errors)
585{
586 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
587
588 if (v)
589 return v;
590 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
591 if (v && errors == NULL)
592 ((PyUnicodeObject *)unicode)->defenc = v;
593 return v;
594}
595
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
597{
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
601 }
602 return PyUnicode_AS_UNICODE(unicode);
603
604 onError:
605 return NULL;
606}
607
608int PyUnicode_GetSize(PyObject *unicode)
609{
610 if (!PyUnicode_Check(unicode)) {
611 PyErr_BadArgument();
612 goto onError;
613 }
614 return PyUnicode_GET_SIZE(unicode);
615
616 onError:
617 return -1;
618}
619
Thomas Wouters78890102000-07-22 19:25:51 +0000620const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000621{
622 return unicode_default_encoding;
623}
624
625int PyUnicode_SetDefaultEncoding(const char *encoding)
626{
627 PyObject *v;
628
629 /* Make sure the encoding is valid. As side effect, this also
630 loads the encoding into the codec registry cache. */
631 v = _PyCodec_Lookup(encoding);
632 if (v == NULL)
633 goto onError;
634 Py_DECREF(v);
635 strncpy(unicode_default_encoding,
636 encoding,
637 sizeof(unicode_default_encoding));
638 return 0;
639
640 onError:
641 return -1;
642}
643
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000644/* --- UTF-7 Codec -------------------------------------------------------- */
645
646/* see RFC2152 for details */
647
/* Classification of the 128 ASCII characters for the UTF-7 codec:
   whether a character is special, i.e. cannot be directly encoded.

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   One row per 16 code points; the comment gives the row's first code
   point. */
static
char utf7_special[128] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 */ 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
666
/* SPECIAL(c, encodeO, encodeWS): true when character c has to be
   base64 encoded.  That is always the case for c > 127 and for table
   class 1; class 2 (whitespace) and class 3 (Set O) only count when
   the matching flag is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
     "abcdefghijklmnopqrstuvwxyz" \
     "0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is accepted as a base64 digit. */
#define B64CHAR(c) (isalnum(c) || (c) == '+' || (c) == '/')

/* UB64(c): six-bit value of base64 digit c; only meaningful when
   B64CHAR(c) holds. */
#define UB64(c) \
    ((c) == '+' ? 62 : \
     (c) == '/' ? 63 : \
     (c) >= 'a' ? (c) - 71 : \
     (c) >= 'A' ? (c) - 65 : \
     (c) + 4)

/* ENCODE(out, ch, bits): while at least six bits are pending in ch,
   emit them as base64 digits through out, most significant first.
   out and bits are updated in place. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in ch, take the next 16-bit unit and store it through out.

   A unit in the range 0xDC00..0xDFFF is treated as part of a surrogate
   pair, which cannot be represented in a 16-bit character: surrogate
   is set, errmsg is assigned and control jumps to utf7Error.  The unit
   after such an error is dropped without further checks, since an
   error has already been reported for the pair.

   The expansion relies on a variable `errmsg` and a label `utf7Error`
   being available at the point of use. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }
701
702static
703int utf7_decoding_error(Py_UNICODE **dest,
704 const char *errors,
705 const char *details)
706{
707 if ((errors == NULL) ||
708 (strcmp(errors,"strict") == 0)) {
709 PyErr_Format(PyExc_UnicodeError,
710 "UTF-7 decoding error: %.400s",
711 details);
712 return -1;
713 }
714 else if (strcmp(errors,"ignore") == 0) {
715 return 0;
716 }
717 else if (strcmp(errors,"replace") == 0) {
718 if (dest != NULL) {
719 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
720 (*dest)++;
721 }
722 return 0;
723 }
724 else {
725 PyErr_Format(PyExc_ValueError,
726 "UTF-7 decoding error; unknown error handling code: %.400s",
727 errors);
728 return -1;
729 }
730}
731
732PyObject *PyUnicode_DecodeUTF7(const char *s,
733 int size,
734 const char *errors)
735{
736 const char *e;
737 PyUnicodeObject *unicode;
738 Py_UNICODE *p;
739 const char *errmsg = "";
740 int inShift = 0;
741 unsigned int bitsleft = 0;
742 unsigned long charsleft = 0;
743 int surrogate = 0;
744
745 unicode = _PyUnicode_New(size);
746 if (!unicode)
747 return NULL;
748 if (size == 0)
749 return (PyObject *)unicode;
750
751 p = unicode->str;
752 e = s + size;
753
754 while (s < e) {
755 Py_UNICODE ch = *s;
756
757 if (inShift) {
758 if ((ch == '-') || !B64CHAR(ch)) {
759 inShift = 0;
760 s++;
761
762 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
763 if (bitsleft >= 6) {
764 /* The shift sequence has a partial character in it. If
765 bitsleft < 6 then we could just classify it as padding
766 but that is not the case here */
767
768 errmsg = "partial character in shift sequence";
769 goto utf7Error;
770 }
771 /* According to RFC2152 the remaining bits should be zero. We
772 choose to signal an error/insert a replacement character
773 here so indicate the potential of a misencoded character. */
774
775 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
776 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
777 errmsg = "non-zero padding bits in shift sequence";
778 goto utf7Error;
779 }
780
781 if (ch == '-') {
782 if ((s < e) && (*(s) == '-')) {
783 *p++ = '-';
784 inShift = 1;
785 }
786 } else if (SPECIAL(ch,0,0)) {
787 errmsg = "unexpected special character";
788 goto utf7Error;
789 } else {
790 *p++ = ch;
791 }
792 } else {
793 charsleft = (charsleft << 6) | UB64(ch);
794 bitsleft += 6;
795 s++;
796 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
797 }
798 }
799 else if ( ch == '+' ) {
800 s++;
801 if (s < e && *s == '-') {
802 s++;
803 *p++ = '+';
804 } else
805 {
806 inShift = 1;
807 bitsleft = 0;
808 }
809 }
810 else if (SPECIAL(ch,0,0)) {
811 errmsg = "unexpected special character";
812 s++;
813 goto utf7Error;
814 }
815 else {
816 *p++ = ch;
817 s++;
818 }
819 continue;
820 utf7Error:
821 if (utf7_decoding_error(&p, errors, errmsg))
822 goto onError;
823 }
824
825 if (inShift) {
826 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
827 goto onError;
828 }
829
830 if (_PyUnicode_Resize(&unicode, p - unicode->str))
831 goto onError;
832
833 return (PyObject *)unicode;
834
835onError:
836 Py_DECREF(unicode);
837 return NULL;
838}
839
840
841PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
842 int size,
843 int encodeSetO,
844 int encodeWhiteSpace,
845 const char *errors)
846{
847 PyObject *v;
848 /* It might be possible to tighten this worst case */
849 unsigned int cbAllocated = 5 * size;
850 int inShift = 0;
851 int i = 0;
852 unsigned int bitsleft = 0;
853 unsigned long charsleft = 0;
854 char * out;
855 char * start;
856
857 if (size == 0)
858 return PyString_FromStringAndSize(NULL, 0);
859
860 v = PyString_FromStringAndSize(NULL, cbAllocated);
861 if (v == NULL)
862 return NULL;
863
864 start = out = PyString_AS_STRING(v);
865 for (;i < size; ++i) {
866 Py_UNICODE ch = s[i];
867
868 if (!inShift) {
869 if (ch == '+') {
870 *out++ = '+';
871 *out++ = '-';
872 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
873 charsleft = ch;
874 bitsleft = 16;
875 *out++ = '+';
876 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
877 inShift = bitsleft > 0;
878 } else {
879 *out++ = (char) ch;
880 }
881 } else {
882 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
883 *out++ = B64(charsleft << (6-bitsleft));
884 charsleft = 0;
885 bitsleft = 0;
886 /* Characters not in the BASE64 set implicitly unshift the sequence
887 so no '-' is required, except if the character is itself a '-' */
888 if (B64CHAR(ch) || ch == '-') {
889 *out++ = '-';
890 }
891 inShift = 0;
892 *out++ = (char) ch;
893 } else {
894 bitsleft += 16;
895 charsleft = (charsleft << 16) | ch;
896 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
897
898 /* If the next character is special then we dont' need to terminate
899 the shift sequence. If the next character is not a BASE64 character
900 or '-' then the shift sequence will be terminated implicitly and we
901 don't have to insert a '-'. */
902
903 if (bitsleft == 0) {
904 if (i + 1 < size) {
905 Py_UNICODE ch2 = s[i+1];
906
907 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
908
909 } else if (B64CHAR(ch2) || ch2 == '-') {
910 *out++ = '-';
911 inShift = 0;
912 } else {
913 inShift = 0;
914 }
915
916 }
917 else {
918 *out++ = '-';
919 inShift = 0;
920 }
921 }
922 }
923 }
924 }
925 if (bitsleft) {
926 *out++= B64(charsleft << (6-bitsleft) );
927 *out++ = '-';
928 }
929
930 if (_PyString_Resize(&v, out - start)) {
931 Py_DECREF(v);
932 return NULL;
933 }
934 return v;
935}
936
937#undef SPECIAL
938#undef B64
939#undef B64CHAR
940#undef UB64
941#undef ENCODE
942#undef DECODE
943
Guido van Rossumd57fd912000-03-10 22:53:23 +0000944/* --- UTF-8 Codec -------------------------------------------------------- */
945
/* Maps the first byte of a UTF-8 sequence to the total length of that
   sequence in bytes.  A zero entry marks a byte that cannot start a
   sequence.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00-0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x00 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x10 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x30 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x50 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x70 */
    /* 0x80-0xBF: never valid as the first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xA0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0xB0 */
    /* 0xC0-0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xC0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,   /* 0xD0 */
    /* 0xE0-0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,   /* 0xE0 */
    /* 0xF0-0xFD: four, five and six byte sequences; 0xFE/0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0    /* 0xF0 */
};
967
968static
969int utf8_decoding_error(const char **source,
970 Py_UNICODE **dest,
971 const char *errors,
972 const char *details)
973{
974 if ((errors == NULL) ||
975 (strcmp(errors,"strict") == 0)) {
976 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000977 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000978 details);
979 return -1;
980 }
981 else if (strcmp(errors,"ignore") == 0) {
982 (*source)++;
983 return 0;
984 }
985 else if (strcmp(errors,"replace") == 0) {
986 (*source)++;
987 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
988 (*dest)++;
989 return 0;
990 }
991 else {
992 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000993 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000994 errors);
995 return -1;
996 }
997}
998
Guido van Rossumd57fd912000-03-10 22:53:23 +0000999PyObject *PyUnicode_DecodeUTF8(const char *s,
1000 int size,
1001 const char *errors)
1002{
1003 int n;
1004 const char *e;
1005 PyUnicodeObject *unicode;
1006 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001007 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008
1009 /* Note: size will always be longer than the resulting Unicode
1010 character count */
1011 unicode = _PyUnicode_New(size);
1012 if (!unicode)
1013 return NULL;
1014 if (size == 0)
1015 return (PyObject *)unicode;
1016
1017 /* Unpack UTF-8 encoded data */
1018 p = unicode->str;
1019 e = s + size;
1020
1021 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001022 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023
1024 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001025 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026 s++;
1027 continue;
1028 }
1029
1030 n = utf8_code_length[ch];
1031
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001032 if (s + n > e) {
1033 errmsg = "unexpected end of data";
1034 goto utf8Error;
1035 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036
1037 switch (n) {
1038
1039 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001040 errmsg = "unexpected code byte";
1041 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042
1043 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001044 errmsg = "internal error";
1045 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046
1047 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001048 if ((s[1] & 0xc0) != 0x80) {
1049 errmsg = "invalid data";
1050 goto utf8Error;
1051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001052 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001053 if (ch < 0x80) {
1054 errmsg = "illegal encoding";
1055 goto utf8Error;
1056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001058 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059 break;
1060
1061 case 3:
1062 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001063 (s[2] & 0xc0) != 0x80) {
1064 errmsg = "invalid data";
1065 goto utf8Error;
1066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001067 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001068 if (ch < 0x0800) {
1069 /* Note: UTF-8 encodings of surrogates are considered
1070 legal UTF-8 sequences;
1071
1072 XXX For wide builds (UCS-4) we should probably try
1073 to recombine the surrogates into a single code
1074 unit.
1075 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001076 errmsg = "illegal encoding";
1077 goto utf8Error;
1078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079 else
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001080 *p++ = (Py_UNICODE)ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001081 break;
1082
1083 case 4:
1084 if ((s[1] & 0xc0) != 0x80 ||
1085 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001086 (s[3] & 0xc0) != 0x80) {
1087 errmsg = "invalid data";
1088 goto utf8Error;
1089 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001090 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1091 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1092 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001093 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001094 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001095 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001096 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001097 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001098 errmsg = "illegal encoding";
1099 goto utf8Error;
1100 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001101#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001102 *p++ = (Py_UNICODE)ch;
1103#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001104 /* compute and append the two surrogates: */
1105
1106 /* translate from 10000..10FFFF to 0..FFFF */
1107 ch -= 0x10000;
1108
1109 /* high surrogate = top 10 bits added to D800 */
1110 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1111
1112 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001113 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001114#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 break;
1116
1117 default:
1118 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001119 errmsg = "unsupported Unicode code range";
1120 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 }
1122 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001123 continue;
1124
1125 utf8Error:
1126 if (utf8_decoding_error(&s, &p, errors, errmsg))
1127 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128 }
1129
1130 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001131 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132 goto onError;
1133
1134 return (PyObject *)unicode;
1135
1136onError:
1137 Py_DECREF(unicode);
1138 return NULL;
1139}
1140
/* Not used anymore, now that the encoder supports UTF-16
   surrogates.  Kept compiled out for reference. */
#if 0
/* Handle a UTF-8 encoding problem according to the `errors` policy.
   Returns 0 if encoding may continue, -1 with an exception set
   otherwise.  "replace" stores a '?' at *dest and advances it. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        **dest = '?';
        (*dest)++;
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174
Marc-André Lemburg68e69332002-04-10 20:36:13 +00001175/* Allocation strategy: we default to Latin-1, then do one resize
1176 whenever we hit an order boundary. The assumption is that
1177 characters from higher orders usually occur often enough to warrant
1178 this.
1179*/
1180
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1182 int size,
1183 const char *errors)
1184{
1185 PyObject *v;
1186 char *p;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001187 int i = 0;
Marc-André Lemburg68e69332002-04-10 20:36:13 +00001188 int overalloc = 2;
1189 int len;
1190
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00001191 /* Short-cut for emtpy strings */
1192 if (size == 0)
1193 return PyString_FromStringAndSize(NULL, 0);
1194
Marc-André Lemburg68e69332002-04-10 20:36:13 +00001195 v = PyString_FromStringAndSize(NULL, overalloc * size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001196 if (v == NULL)
1197 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001199 p = PyString_AS_STRING(v);
Marc-André Lemburg68e69332002-04-10 20:36:13 +00001200
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001201 while (i < size) {
1202 Py_UCS4 ch = s[i++];
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001203
Marc-André Lemburg68e69332002-04-10 20:36:13 +00001204 if (ch < 0x80)
1205 /* Encode ASCII */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001206 *p++ = (char) ch;
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001207
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208 else if (ch < 0x0800) {
Marc-André Lemburg68e69332002-04-10 20:36:13 +00001209 /* Encode Latin-1 */
Marc-André Lemburgdc724d62002-02-06 18:20:19 +00001210 *p++ = (char)(0xc0 | (ch >> 6));
1211 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001212 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001213
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001214 else {
Marc-André Lemburg68e69332002-04-10 20:36:13 +00001215 /* Encode UCS2 Unicode ordinals */
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001216 if (ch < 0x10000) {
Marc-André Lemburg68e69332002-04-10 20:36:13 +00001217
1218 /* Special case: check for high surrogate */
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001219 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
1220 Py_UCS4 ch2 = s[i];
Marc-André Lemburg68e69332002-04-10 20:36:13 +00001221 /* Check for low surrogate and combine the two to
1222 form a UCS4 value */
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001223 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Marc-André Lemburg68e69332002-04-10 20:36:13 +00001224 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
1225 i++;
1226 goto encodeUCS4;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001227 }
Marc-André Lemburg3688a882002-02-06 18:09:02 +00001228 /* Fall through: handles isolated high surrogates */
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001229 }
Marc-André Lemburg68e69332002-04-10 20:36:13 +00001230
1231 if (overalloc < 3) {
1232 len = (int)(p - PyString_AS_STRING(v));
1233 overalloc = 3;
1234 if (_PyString_Resize(&v, overalloc * size))
1235 goto onError;
1236 p = PyString_AS_STRING(v) + len;
1237 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001238 *p++ = (char)(0xe0 | (ch >> 12));
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001239 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1240 *p++ = (char)(0x80 | (ch & 0x3f));
Marc-André Lemburg68e69332002-04-10 20:36:13 +00001241 continue;
Marc-André Lemburge7c6ee42002-02-06 18:18:03 +00001242 }
Marc-André Lemburg68e69332002-04-10 20:36:13 +00001243
1244 /* Encode UCS4 Unicode ordinals */
1245 encodeUCS4:
1246 if (overalloc < 4) {
1247 len = (int)(p - PyString_AS_STRING(v));
1248 overalloc = 4;
1249 if (_PyString_Resize(&v, overalloc * size))
1250 goto onError;
1251 p = PyString_AS_STRING(v) + len;
1252 }
1253 *p++ = (char)(0xf0 | (ch >> 18));
1254 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
1255 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1256 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001257 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258 }
1259 *p = '\0';
Marc-André Lemburg68e69332002-04-10 20:36:13 +00001260 if (_PyString_Resize(&v, (int)(p - PyString_AS_STRING(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001261 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001262 return v;
1263
1264 onError:
1265 Py_DECREF(v);
1266 return NULL;
1267}
1268
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1270{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001271 if (!PyUnicode_Check(unicode)) {
1272 PyErr_BadArgument();
1273 return NULL;
1274 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001275 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1276 PyUnicode_GET_SIZE(unicode),
1277 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001278}
1279
1280/* --- UTF-16 Codec ------------------------------------------------------- */
1281
1282static
Tim Peters772747b2001-08-09 22:21:55 +00001283int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001284 const char *errors,
1285 const char *details)
1286{
1287 if ((errors == NULL) ||
1288 (strcmp(errors,"strict") == 0)) {
1289 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001290 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291 details);
1292 return -1;
1293 }
1294 else if (strcmp(errors,"ignore") == 0) {
1295 return 0;
1296 }
1297 else if (strcmp(errors,"replace") == 0) {
1298 if (dest) {
1299 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1300 (*dest)++;
1301 }
1302 return 0;
1303 }
1304 else {
1305 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001306 "UTF-16 decoding error; "
1307 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 errors);
1309 return -1;
1310 }
1311}
1312
Tim Peters772747b2001-08-09 22:21:55 +00001313PyObject *
1314PyUnicode_DecodeUTF16(const char *s,
1315 int size,
1316 const char *errors,
1317 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001318{
1319 PyUnicodeObject *unicode;
1320 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001321 const unsigned char *q, *e;
1322 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001323 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001324 /* Offsets from q for retrieving byte pairs in the right order. */
1325#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1326 int ihi = 1, ilo = 0;
1327#else
1328 int ihi = 0, ilo = 1;
1329#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001330
1331 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001332 if (size & 1) {
1333 if (utf16_decoding_error(NULL, errors, "truncated data"))
1334 return NULL;
1335 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001336 }
1337
1338 /* Note: size will always be longer than the resulting Unicode
1339 character count */
1340 unicode = _PyUnicode_New(size);
1341 if (!unicode)
1342 return NULL;
1343 if (size == 0)
1344 return (PyObject *)unicode;
1345
1346 /* Unpack UTF-16 encoded data */
1347 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001348 q = (unsigned char *)s;
1349 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350
1351 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001352 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001353
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001354 /* Check for BOM marks (U+FEFF) in the input and adjust current
1355 byte order setting accordingly. In native mode, the leading BOM
1356 mark is skipped, in all other modes, it is copied to the output
1357 stream as-is (giving a ZWNBSP character). */
1358 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001359 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001360#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001361 if (bom == 0xFEFF) {
1362 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001363 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001364 }
1365 else if (bom == 0xFFFE) {
1366 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001367 bo = 1;
1368 }
1369#else
Tim Peters772747b2001-08-09 22:21:55 +00001370 if (bom == 0xFEFF) {
1371 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001372 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001373 }
1374 else if (bom == 0xFFFE) {
1375 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001376 bo = -1;
1377 }
1378#endif
1379 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001380
Tim Peters772747b2001-08-09 22:21:55 +00001381 if (bo == -1) {
1382 /* force LE */
1383 ihi = 1;
1384 ilo = 0;
1385 }
1386 else if (bo == 1) {
1387 /* force BE */
1388 ihi = 0;
1389 ilo = 1;
1390 }
1391
1392 while (q < e) {
1393 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1394 q += 2;
1395
Guido van Rossumd57fd912000-03-10 22:53:23 +00001396 if (ch < 0xD800 || ch > 0xDFFF) {
1397 *p++ = ch;
1398 continue;
1399 }
1400
1401 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001402 if (q >= e) {
1403 errmsg = "unexpected end of data";
1404 goto utf16Error;
1405 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001406 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001407 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1408 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001409 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001410#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001411 *p++ = ch;
1412 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001413#else
1414 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001415#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001416 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001417 }
1418 else {
1419 errmsg = "illegal UTF-16 surrogate";
1420 goto utf16Error;
1421 }
1422
Guido van Rossumd57fd912000-03-10 22:53:23 +00001423 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001424 errmsg = "illegal encoding";
1425 /* Fall through to report the error */
1426
1427 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001428 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001429 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430 }
1431
1432 if (byteorder)
1433 *byteorder = bo;
1434
1435 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001436 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001437 goto onError;
1438
1439 return (PyObject *)unicode;
1440
1441onError:
1442 Py_DECREF(unicode);
1443 return NULL;
1444}
1445
Tim Peters772747b2001-08-09 22:21:55 +00001446PyObject *
1447PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1448 int size,
1449 const char *errors,
1450 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001451{
1452 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001453 unsigned char *p;
1454 int i, pairs;
1455 /* Offsets from p for storing byte pairs in the right order. */
1456#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1457 int ihi = 1, ilo = 0;
1458#else
1459 int ihi = 0, ilo = 1;
1460#endif
1461
1462#define STORECHAR(CH) \
1463 do { \
1464 p[ihi] = ((CH) >> 8) & 0xff; \
1465 p[ilo] = (CH) & 0xff; \
1466 p += 2; \
1467 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001468
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001469 for (i = pairs = 0; i < size; i++)
1470 if (s[i] >= 0x10000)
1471 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001472 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001473 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001474 if (v == NULL)
1475 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001476
Tim Peters772747b2001-08-09 22:21:55 +00001477 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001478 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001479 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001480 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001481 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001482
1483 if (byteorder == -1) {
1484 /* force LE */
1485 ihi = 1;
1486 ilo = 0;
1487 }
1488 else if (byteorder == 1) {
1489 /* force BE */
1490 ihi = 0;
1491 ilo = 1;
1492 }
1493
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001494 while (size-- > 0) {
1495 Py_UNICODE ch = *s++;
1496 Py_UNICODE ch2 = 0;
1497 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001498 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1499 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500 }
Tim Peters772747b2001-08-09 22:21:55 +00001501 STORECHAR(ch);
1502 if (ch2)
1503 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001504 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001506#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507}
1508
1509PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1510{
1511 if (!PyUnicode_Check(unicode)) {
1512 PyErr_BadArgument();
1513 return NULL;
1514 }
1515 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1516 PyUnicode_GET_SIZE(unicode),
1517 NULL,
1518 0);
1519}
1520
1521/* --- Unicode Escape Codec ----------------------------------------------- */
1522
1523static
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001524int unicodeescape_decoding_error(Py_UNICODE **x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001525 const char *errors,
1526 const char *details)
1527{
1528 if ((errors == NULL) ||
1529 (strcmp(errors,"strict") == 0)) {
1530 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001531 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001532 details);
1533 return -1;
1534 }
1535 else if (strcmp(errors,"ignore") == 0) {
1536 return 0;
1537 }
1538 else if (strcmp(errors,"replace") == 0) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001539 **x = Py_UNICODE_REPLACEMENT_CHARACTER;
1540 (*x)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001541 return 0;
1542 }
1543 else {
1544 PyErr_Format(PyExc_ValueError,
1545 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001546 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001547 errors);
1548 return -1;
1549 }
1550}
1551
Fredrik Lundh06d12682001-01-24 07:59:11 +00001552static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001553
Guido van Rossumd57fd912000-03-10 22:53:23 +00001554PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1555 int size,
1556 const char *errors)
1557{
1558 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001559 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001560 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001561 char* message;
1562 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1563
Guido van Rossumd57fd912000-03-10 22:53:23 +00001564 /* Escaped strings will always be longer than the resulting
1565 Unicode string, so we start with size here and then reduce the
1566 length after conversion to the true value. */
1567 v = _PyUnicode_New(size);
1568 if (v == NULL)
1569 goto onError;
1570 if (size == 0)
1571 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001572
Guido van Rossumd57fd912000-03-10 22:53:23 +00001573 p = buf = PyUnicode_AS_UNICODE(v);
1574 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001575
Guido van Rossumd57fd912000-03-10 22:53:23 +00001576 while (s < end) {
1577 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001578 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001579 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001580
1581 /* Non-escape characters are interpreted as Unicode ordinals */
1582 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001583 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001584 continue;
1585 }
1586
1587 /* \ - Escapes */
1588 s++;
1589 switch (*s++) {
1590
1591 /* \x escapes */
1592 case '\n': break;
1593 case '\\': *p++ = '\\'; break;
1594 case '\'': *p++ = '\''; break;
1595 case '\"': *p++ = '\"'; break;
1596 case 'b': *p++ = '\b'; break;
1597 case 'f': *p++ = '\014'; break; /* FF */
1598 case 't': *p++ = '\t'; break;
1599 case 'n': *p++ = '\n'; break;
1600 case 'r': *p++ = '\r'; break;
1601 case 'v': *p++ = '\013'; break; /* VT */
1602 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1603
1604 /* \OOO (octal) escapes */
1605 case '0': case '1': case '2': case '3':
1606 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001607 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001609 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001610 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001611 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001612 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001613 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001614 break;
1615
Fredrik Lundhccc74732001-02-18 22:13:49 +00001616 /* hex escapes */
1617 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001618 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001619 digits = 2;
1620 message = "truncated \\xXX escape";
1621 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001622
Fredrik Lundhccc74732001-02-18 22:13:49 +00001623 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001624 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001625 digits = 4;
1626 message = "truncated \\uXXXX escape";
1627 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001628
Fredrik Lundhccc74732001-02-18 22:13:49 +00001629 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001630 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001631 digits = 8;
1632 message = "truncated \\UXXXXXXXX escape";
1633 hexescape:
1634 chr = 0;
1635 for (i = 0; i < digits; i++) {
1636 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001637 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001638 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001639 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001640 chr = 0xffffffff;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001641 i++;
1642 break;
1643 }
1644 chr = (chr<<4) & ~0xF;
1645 if (c >= '0' && c <= '9')
1646 chr += c - '0';
1647 else if (c >= 'a' && c <= 'f')
1648 chr += 10 + c - 'a';
1649 else
1650 chr += 10 + c - 'A';
1651 }
1652 s += i;
Walter Dörwald8c077222002-03-25 11:16:18 +00001653 if (chr == 0xffffffff)
1654 /* _decoding_error will have already written into the
1655 target buffer. */
1656 break;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001657 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001658 /* when we get here, chr is a 32-bit unicode character */
1659 if (chr <= 0xffff)
1660 /* UCS-2 character */
1661 *p++ = (Py_UNICODE) chr;
1662 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001663 /* UCS-4 character. Either store directly, or as
Walter Dörwald8c077222002-03-25 11:16:18 +00001664 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001665#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001666 *p++ = chr;
1667#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001668 chr -= 0x10000L;
1669 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001670 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001671#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001672 } else {
1673 if (unicodeescape_decoding_error(
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001674 &p, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001675 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001676 )
1677 goto onError;
1678 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001679 break;
1680
1681 /* \N{name} */
1682 case 'N':
1683 message = "malformed \\N character escape";
1684 if (ucnhash_CAPI == NULL) {
1685 /* load the unicode data module */
1686 PyObject *m, *v;
1687 m = PyImport_ImportModule("unicodedata");
1688 if (m == NULL)
1689 goto ucnhashError;
1690 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1691 Py_DECREF(m);
1692 if (v == NULL)
1693 goto ucnhashError;
1694 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1695 Py_DECREF(v);
1696 if (ucnhash_CAPI == NULL)
1697 goto ucnhashError;
1698 }
1699 if (*s == '{') {
1700 const char *start = s+1;
1701 /* look for the closing brace */
1702 while (*s != '}' && s < end)
1703 s++;
1704 if (s > start && s < end && *s == '}') {
1705 /* found a name. look it up in the unicode database */
1706 message = "unknown Unicode character name";
1707 s++;
1708 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1709 goto store;
1710 }
1711 }
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001712 if (unicodeescape_decoding_error(&p, errors, message))
Fredrik Lundhccc74732001-02-18 22:13:49 +00001713 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001714 break;
1715
1716 default:
Walter Dörwald8c077222002-03-25 11:16:18 +00001717 if (s > end) {
1718 if (unicodeescape_decoding_error(&p, errors, "\\ at end of string"))
1719 goto onError;
1720 }
1721 else {
1722 *p++ = '\\';
1723 *p++ = (unsigned char)s[-1];
1724 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001725 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001726 }
1727 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001728 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Walter Dörwald8c077222002-03-25 11:16:18 +00001729 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730 return (PyObject *)v;
Walter Dörwald8c077222002-03-25 11:16:18 +00001731
Fredrik Lundhccc74732001-02-18 22:13:49 +00001732ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001733 PyErr_SetString(
1734 PyExc_UnicodeError,
1735 "\\N escapes not supported (can't load unicodedata module)"
1736 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001737 return NULL;
1738
Fredrik Lundhccc74732001-02-18 22:13:49 +00001739onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001740 Py_XDECREF(v);
1741 return NULL;
1742}
1743
1744/* Return a Unicode-Escape string version of the Unicode object.
1745
1746 If quotes is true, the string is enclosed in u"" or u'' quotes as
1747 appropriate.
1748
1749*/
1750
Barry Warsaw51ac5802000-03-20 16:36:48 +00001751static const Py_UNICODE *findchar(const Py_UNICODE *s,
1752 int size,
1753 Py_UNICODE ch);
1754
Guido van Rossumd57fd912000-03-10 22:53:23 +00001755static
1756PyObject *unicodeescape_string(const Py_UNICODE *s,
1757 int size,
1758 int quotes)
1759{
1760 PyObject *repr;
1761 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001762
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001763 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764
1765 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1766 if (repr == NULL)
1767 return NULL;
1768
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001769 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001770
1771 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772 *p++ = 'u';
1773 *p++ = (findchar(s, size, '\'') &&
1774 !findchar(s, size, '"')) ? '"' : '\'';
1775 }
1776 while (size-- > 0) {
1777 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001778
Guido van Rossumd57fd912000-03-10 22:53:23 +00001779 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001780 if (quotes &&
1781 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 *p++ = '\\';
1783 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001784 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001785 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001786
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001787#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001788 /* Map 21-bit characters to '\U00xxxxxx' */
1789 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001790 int offset = p - PyString_AS_STRING(repr);
1791
1792 /* Resize the string if necessary */
1793 if (offset + 12 > PyString_GET_SIZE(repr)) {
1794 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1795 goto onError;
1796 p = PyString_AS_STRING(repr) + offset;
1797 }
1798
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001799 *p++ = '\\';
1800 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001801 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1802 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1803 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1804 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1805 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1806 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1807 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001808 *p++ = hexdigit[ch & 0x0000000F];
1809 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001810 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001811#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001812 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1813 else if (ch >= 0xD800 && ch < 0xDC00) {
1814 Py_UNICODE ch2;
1815 Py_UCS4 ucs;
1816
1817 ch2 = *s++;
1818 size--;
1819 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1820 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1821 *p++ = '\\';
1822 *p++ = 'U';
1823 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1824 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1825 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1826 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1827 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1828 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1829 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1830 *p++ = hexdigit[ucs & 0x0000000F];
1831 continue;
1832 }
1833 /* Fall through: isolated surrogates are copied as-is */
1834 s--;
1835 size++;
1836 }
1837
Guido van Rossumd57fd912000-03-10 22:53:23 +00001838 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001839 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840 *p++ = '\\';
1841 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001842 *p++ = hexdigit[(ch >> 12) & 0x000F];
1843 *p++ = hexdigit[(ch >> 8) & 0x000F];
1844 *p++ = hexdigit[(ch >> 4) & 0x000F];
1845 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001847
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001848 /* Map special whitespace to '\t', \n', '\r' */
1849 else if (ch == '\t') {
1850 *p++ = '\\';
1851 *p++ = 't';
1852 }
1853 else if (ch == '\n') {
1854 *p++ = '\\';
1855 *p++ = 'n';
1856 }
1857 else if (ch == '\r') {
1858 *p++ = '\\';
1859 *p++ = 'r';
1860 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001861
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001862 /* Map non-printable US ASCII to '\xhh' */
Marc-André Lemburg11326de2001-11-28 12:56:20 +00001863 else if (ch < ' ' || ch >= 0x7F) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001865 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001866 *p++ = hexdigit[(ch >> 4) & 0x000F];
1867 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001869
Guido van Rossumd57fd912000-03-10 22:53:23 +00001870 /* Copy everything else as-is */
1871 else
1872 *p++ = (char) ch;
1873 }
1874 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001875 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001876
1877 *p = '\0';
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001878 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001879 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880
1881 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001882
1883 onError:
1884 Py_DECREF(repr);
1885 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001886}
1887
1888PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1889 int size)
1890{
1891 return unicodeescape_string(s, size, 0);
1892}
1893
1894PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1895{
1896 if (!PyUnicode_Check(unicode)) {
1897 PyErr_BadArgument();
1898 return NULL;
1899 }
1900 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1901 PyUnicode_GET_SIZE(unicode));
1902}
1903
1904/* --- Raw Unicode Escape Codec ------------------------------------------- */
1905
1906PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1907 int size,
1908 const char *errors)
1909{
1910 PyUnicodeObject *v;
1911 Py_UNICODE *p, *buf;
1912 const char *end;
1913 const char *bs;
1914
1915 /* Escaped strings will always be longer than the resulting
1916 Unicode string, so we start with size here and then reduce the
1917 length after conversion to the true value. */
1918 v = _PyUnicode_New(size);
1919 if (v == NULL)
1920 goto onError;
1921 if (size == 0)
1922 return (PyObject *)v;
1923 p = buf = PyUnicode_AS_UNICODE(v);
1924 end = s + size;
1925 while (s < end) {
1926 unsigned char c;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001927 Py_UCS4 x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001928 int i;
1929
1930 /* Non-escape characters are interpreted as Unicode ordinals */
1931 if (*s != '\\') {
1932 *p++ = (unsigned char)*s++;
1933 continue;
1934 }
1935
1936 /* \u-escapes are only interpreted iff the number of leading
1937 backslashes if odd */
1938 bs = s;
1939 for (;s < end;) {
1940 if (*s != '\\')
1941 break;
1942 *p++ = (unsigned char)*s++;
1943 }
1944 if (((s - bs) & 1) == 0 ||
1945 s >= end ||
1946 *s != 'u') {
1947 continue;
1948 }
1949 p--;
1950 s++;
1951
1952 /* \uXXXX with 4 hex digits */
1953 for (x = 0, i = 0; i < 4; i++) {
1954 c = (unsigned char)s[i];
1955 if (!isxdigit(c)) {
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001956 if (unicodeescape_decoding_error(&p, errors,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957 "truncated \\uXXXX"))
1958 goto onError;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001959 x = 0xffffffff;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001960 i++;
1961 break;
1962 }
1963 x = (x<<4) & ~0xF;
1964 if (c >= '0' && c <= '9')
1965 x += c - '0';
1966 else if (c >= 'a' && c <= 'f')
1967 x += 10 + c - 'a';
1968 else
1969 x += 10 + c - 'A';
1970 }
1971 s += i;
Martin v. Löwis047c05e2002-03-21 08:55:28 +00001972 if (x != 0xffffffff)
1973 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001974 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001975 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001976 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001977 return (PyObject *)v;
1978
1979 onError:
1980 Py_XDECREF(v);
1981 return NULL;
1982}
1983
1984PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1985 int size)
1986{
1987 PyObject *repr;
1988 char *p;
1989 char *q;
1990
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001991 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992
1993 repr = PyString_FromStringAndSize(NULL, 6 * size);
1994 if (repr == NULL)
1995 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001996 if (size == 0)
1997 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001998
1999 p = q = PyString_AS_STRING(repr);
2000 while (size-- > 0) {
2001 Py_UNICODE ch = *s++;
2002 /* Map 16-bit characters to '\uxxxx' */
2003 if (ch >= 256) {
2004 *p++ = '\\';
2005 *p++ = 'u';
2006 *p++ = hexdigit[(ch >> 12) & 0xf];
2007 *p++ = hexdigit[(ch >> 8) & 0xf];
2008 *p++ = hexdigit[(ch >> 4) & 0xf];
2009 *p++ = hexdigit[ch & 15];
2010 }
2011 /* Copy everything else as-is */
2012 else
2013 *p++ = (char) ch;
2014 }
2015 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002016 if (_PyString_Resize(&repr, p - q))
2017 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002018
2019 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002020
2021 onError:
2022 Py_DECREF(repr);
2023 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002024}
2025
2026PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2027{
2028 if (!PyUnicode_Check(unicode)) {
2029 PyErr_BadArgument();
2030 return NULL;
2031 }
2032 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2033 PyUnicode_GET_SIZE(unicode));
2034}
2035
2036/* --- Latin-1 Codec ------------------------------------------------------ */
2037
2038PyObject *PyUnicode_DecodeLatin1(const char *s,
2039 int size,
2040 const char *errors)
2041{
2042 PyUnicodeObject *v;
2043 Py_UNICODE *p;
2044
2045 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002046 if (size == 1 && *(unsigned char*)s < 256) {
2047 Py_UNICODE r = *(unsigned char*)s;
2048 return PyUnicode_FromUnicode(&r, 1);
2049 }
2050
Guido van Rossumd57fd912000-03-10 22:53:23 +00002051 v = _PyUnicode_New(size);
2052 if (v == NULL)
2053 goto onError;
2054 if (size == 0)
2055 return (PyObject *)v;
2056 p = PyUnicode_AS_UNICODE(v);
2057 while (size-- > 0)
2058 *p++ = (unsigned char)*s++;
2059 return (PyObject *)v;
2060
2061 onError:
2062 Py_XDECREF(v);
2063 return NULL;
2064}
2065
2066static
2067int latin1_encoding_error(const Py_UNICODE **source,
2068 char **dest,
2069 const char *errors,
2070 const char *details)
2071{
2072 if ((errors == NULL) ||
2073 (strcmp(errors,"strict") == 0)) {
2074 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002075 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002076 details);
2077 return -1;
2078 }
2079 else if (strcmp(errors,"ignore") == 0) {
2080 return 0;
2081 }
2082 else if (strcmp(errors,"replace") == 0) {
2083 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002084 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 return 0;
2086 }
2087 else {
2088 PyErr_Format(PyExc_ValueError,
2089 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002090 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002091 errors);
2092 return -1;
2093 }
2094}
2095
2096PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2097 int size,
2098 const char *errors)
2099{
2100 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002101 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002102
Guido van Rossumd57fd912000-03-10 22:53:23 +00002103 repr = PyString_FromStringAndSize(NULL, size);
2104 if (repr == NULL)
2105 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002106 if (size == 0)
2107 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002108
2109 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002110 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002111 while (size-- > 0) {
2112 Py_UNICODE ch = *p++;
2113 if (ch >= 256) {
2114 if (latin1_encoding_error(&p, &s, errors,
2115 "ordinal not in range(256)"))
2116 goto onError;
2117 }
2118 else
2119 *s++ = (char)ch;
2120 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002121 /* Resize if error handling skipped some characters */
2122 if (s - start < PyString_GET_SIZE(repr))
2123 if (_PyString_Resize(&repr, s - start))
2124 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125 return repr;
2126
2127 onError:
2128 Py_DECREF(repr);
2129 return NULL;
2130}
2131
2132PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2133{
2134 if (!PyUnicode_Check(unicode)) {
2135 PyErr_BadArgument();
2136 return NULL;
2137 }
2138 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2139 PyUnicode_GET_SIZE(unicode),
2140 NULL);
2141}
2142
2143/* --- 7-bit ASCII Codec -------------------------------------------------- */
2144
2145static
2146int ascii_decoding_error(const char **source,
2147 Py_UNICODE **dest,
2148 const char *errors,
2149 const char *details)
2150{
2151 if ((errors == NULL) ||
2152 (strcmp(errors,"strict") == 0)) {
2153 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002154 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002155 details);
2156 return -1;
2157 }
2158 else if (strcmp(errors,"ignore") == 0) {
2159 return 0;
2160 }
2161 else if (strcmp(errors,"replace") == 0) {
2162 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2163 (*dest)++;
2164 return 0;
2165 }
2166 else {
2167 PyErr_Format(PyExc_ValueError,
2168 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002169 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170 errors);
2171 return -1;
2172 }
2173}
2174
2175PyObject *PyUnicode_DecodeASCII(const char *s,
2176 int size,
2177 const char *errors)
2178{
2179 PyUnicodeObject *v;
2180 Py_UNICODE *p;
2181
2182 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002183 if (size == 1 && *(unsigned char*)s < 128) {
2184 Py_UNICODE r = *(unsigned char*)s;
2185 return PyUnicode_FromUnicode(&r, 1);
2186 }
2187
Guido van Rossumd57fd912000-03-10 22:53:23 +00002188 v = _PyUnicode_New(size);
2189 if (v == NULL)
2190 goto onError;
2191 if (size == 0)
2192 return (PyObject *)v;
2193 p = PyUnicode_AS_UNICODE(v);
2194 while (size-- > 0) {
2195 register unsigned char c;
2196
2197 c = (unsigned char)*s++;
2198 if (c < 128)
2199 *p++ = c;
2200 else if (ascii_decoding_error(&s, &p, errors,
2201 "ordinal not in range(128)"))
2202 goto onError;
2203 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002204 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002205 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002206 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 return (PyObject *)v;
2208
2209 onError:
2210 Py_XDECREF(v);
2211 return NULL;
2212}
2213
2214static
2215int ascii_encoding_error(const Py_UNICODE **source,
2216 char **dest,
2217 const char *errors,
2218 const char *details)
2219{
2220 if ((errors == NULL) ||
2221 (strcmp(errors,"strict") == 0)) {
2222 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002223 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224 details);
2225 return -1;
2226 }
2227 else if (strcmp(errors,"ignore") == 0) {
2228 return 0;
2229 }
2230 else if (strcmp(errors,"replace") == 0) {
2231 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002232 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233 return 0;
2234 }
2235 else {
2236 PyErr_Format(PyExc_ValueError,
2237 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002238 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002239 errors);
2240 return -1;
2241 }
2242}
2243
2244PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2245 int size,
2246 const char *errors)
2247{
2248 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002249 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002250
Guido van Rossumd57fd912000-03-10 22:53:23 +00002251 repr = PyString_FromStringAndSize(NULL, size);
2252 if (repr == NULL)
2253 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002254 if (size == 0)
2255 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002256
2257 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002258 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002259 while (size-- > 0) {
2260 Py_UNICODE ch = *p++;
2261 if (ch >= 128) {
2262 if (ascii_encoding_error(&p, &s, errors,
2263 "ordinal not in range(128)"))
2264 goto onError;
2265 }
2266 else
2267 *s++ = (char)ch;
2268 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002269 /* Resize if error handling skipped some characters */
2270 if (s - start < PyString_GET_SIZE(repr))
2271 if (_PyString_Resize(&repr, s - start))
2272 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002273 return repr;
2274
2275 onError:
2276 Py_DECREF(repr);
2277 return NULL;
2278}
2279
2280PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2281{
2282 if (!PyUnicode_Check(unicode)) {
2283 PyErr_BadArgument();
2284 return NULL;
2285 }
2286 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2287 PyUnicode_GET_SIZE(unicode),
2288 NULL);
2289}
2290
Fredrik Lundh30831632001-06-26 15:11:00 +00002291#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002292
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002293/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002294
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002295PyObject *PyUnicode_DecodeMBCS(const char *s,
2296 int size,
2297 const char *errors)
2298{
2299 PyUnicodeObject *v;
2300 Py_UNICODE *p;
2301
2302 /* First get the size of the result */
2303 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002304 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002305 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2306
2307 v = _PyUnicode_New(usize);
2308 if (v == NULL)
2309 return NULL;
2310 if (usize == 0)
2311 return (PyObject *)v;
2312 p = PyUnicode_AS_UNICODE(v);
2313 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2314 Py_DECREF(v);
2315 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2316 }
2317
2318 return (PyObject *)v;
2319}
2320
2321PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2322 int size,
2323 const char *errors)
2324{
2325 PyObject *repr;
2326 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002327 DWORD mbcssize;
2328
2329 /* If there are no characters, bail now! */
2330 if (size==0)
2331 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002332
2333 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002334 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002335 if (mbcssize==0)
2336 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2337
2338 repr = PyString_FromStringAndSize(NULL, mbcssize);
2339 if (repr == NULL)
2340 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002341 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002342 return repr;
2343
2344 /* Do the conversion */
2345 s = PyString_AS_STRING(repr);
2346 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2347 Py_DECREF(repr);
2348 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2349 }
2350 return repr;
2351}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002352
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002353#endif /* MS_WIN32 */
2354
Guido van Rossumd57fd912000-03-10 22:53:23 +00002355/* --- Character Mapping Codec -------------------------------------------- */
2356
2357static
2358int charmap_decoding_error(const char **source,
2359 Py_UNICODE **dest,
2360 const char *errors,
2361 const char *details)
2362{
2363 if ((errors == NULL) ||
2364 (strcmp(errors,"strict") == 0)) {
2365 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002366 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002367 details);
2368 return -1;
2369 }
2370 else if (strcmp(errors,"ignore") == 0) {
2371 return 0;
2372 }
2373 else if (strcmp(errors,"replace") == 0) {
2374 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2375 (*dest)++;
2376 return 0;
2377 }
2378 else {
2379 PyErr_Format(PyExc_ValueError,
2380 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002381 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002382 errors);
2383 return -1;
2384 }
2385}
2386
2387PyObject *PyUnicode_DecodeCharmap(const char *s,
2388 int size,
2389 PyObject *mapping,
2390 const char *errors)
2391{
2392 PyUnicodeObject *v;
2393 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002394 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002395
2396 /* Default to Latin-1 */
2397 if (mapping == NULL)
2398 return PyUnicode_DecodeLatin1(s, size, errors);
2399
2400 v = _PyUnicode_New(size);
2401 if (v == NULL)
2402 goto onError;
2403 if (size == 0)
2404 return (PyObject *)v;
2405 p = PyUnicode_AS_UNICODE(v);
2406 while (size-- > 0) {
2407 unsigned char ch = *s++;
2408 PyObject *w, *x;
2409
2410 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2411 w = PyInt_FromLong((long)ch);
2412 if (w == NULL)
2413 goto onError;
2414 x = PyObject_GetItem(mapping, w);
2415 Py_DECREF(w);
2416 if (x == NULL) {
2417 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002418 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002419 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002420 x = Py_None;
2421 Py_INCREF(x);
2422 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002423 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002424 }
2425
2426 /* Apply mapping */
2427 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002428 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002429 if (value < 0 || value > 65535) {
2430 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002431 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002432 Py_DECREF(x);
2433 goto onError;
2434 }
2435 *p++ = (Py_UNICODE)value;
2436 }
2437 else if (x == Py_None) {
2438 /* undefined mapping */
2439 if (charmap_decoding_error(&s, &p, errors,
2440 "character maps to <undefined>")) {
2441 Py_DECREF(x);
2442 goto onError;
2443 }
2444 }
2445 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002446 int targetsize = PyUnicode_GET_SIZE(x);
2447
2448 if (targetsize == 1)
2449 /* 1-1 mapping */
2450 *p++ = *PyUnicode_AS_UNICODE(x);
2451
2452 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002453 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002454 if (targetsize > extrachars) {
2455 /* resize first */
2456 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2457 int needed = (targetsize - extrachars) + \
2458 (targetsize << 2);
2459 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002460 if (_PyUnicode_Resize(&v,
2461 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002462 Py_DECREF(x);
2463 goto onError;
2464 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002465 p = PyUnicode_AS_UNICODE(v) + oldpos;
2466 }
2467 Py_UNICODE_COPY(p,
2468 PyUnicode_AS_UNICODE(x),
2469 targetsize);
2470 p += targetsize;
2471 extrachars -= targetsize;
2472 }
2473 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474 }
2475 else {
2476 /* wrong return value */
2477 PyErr_SetString(PyExc_TypeError,
2478 "character mapping must return integer, None or unicode");
2479 Py_DECREF(x);
2480 goto onError;
2481 }
2482 Py_DECREF(x);
2483 }
2484 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002485 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486 goto onError;
2487 return (PyObject *)v;
2488
2489 onError:
2490 Py_XDECREF(v);
2491 return NULL;
2492}
2493
2494static
2495int charmap_encoding_error(const Py_UNICODE **source,
2496 char **dest,
2497 const char *errors,
2498 const char *details)
2499{
2500 if ((errors == NULL) ||
2501 (strcmp(errors,"strict") == 0)) {
2502 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002503 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002504 details);
2505 return -1;
2506 }
2507 else if (strcmp(errors,"ignore") == 0) {
2508 return 0;
2509 }
2510 else if (strcmp(errors,"replace") == 0) {
2511 **dest = '?';
2512 (*dest)++;
2513 return 0;
2514 }
2515 else {
2516 PyErr_Format(PyExc_ValueError,
2517 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002518 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002519 errors);
2520 return -1;
2521 }
2522}
2523
2524PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2525 int size,
2526 PyObject *mapping,
2527 const char *errors)
2528{
2529 PyObject *v;
2530 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002531 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002532
2533 /* Default to Latin-1 */
2534 if (mapping == NULL)
2535 return PyUnicode_EncodeLatin1(p, size, errors);
2536
2537 v = PyString_FromStringAndSize(NULL, size);
2538 if (v == NULL)
2539 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002540 if (size == 0)
2541 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002542 s = PyString_AS_STRING(v);
2543 while (size-- > 0) {
2544 Py_UNICODE ch = *p++;
2545 PyObject *w, *x;
2546
2547 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2548 w = PyInt_FromLong((long)ch);
2549 if (w == NULL)
2550 goto onError;
2551 x = PyObject_GetItem(mapping, w);
2552 Py_DECREF(w);
2553 if (x == NULL) {
2554 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002555 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002557 x = Py_None;
2558 Py_INCREF(x);
2559 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002560 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002561 }
2562
2563 /* Apply mapping */
2564 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002565 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002566 if (value < 0 || value > 255) {
2567 PyErr_SetString(PyExc_TypeError,
2568 "character mapping must be in range(256)");
2569 Py_DECREF(x);
2570 goto onError;
2571 }
2572 *s++ = (char)value;
2573 }
2574 else if (x == Py_None) {
2575 /* undefined mapping */
2576 if (charmap_encoding_error(&p, &s, errors,
2577 "character maps to <undefined>")) {
2578 Py_DECREF(x);
2579 goto onError;
2580 }
2581 }
2582 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002583 int targetsize = PyString_GET_SIZE(x);
2584
2585 if (targetsize == 1)
2586 /* 1-1 mapping */
2587 *s++ = *PyString_AS_STRING(x);
2588
2589 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002590 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002591 if (targetsize > extrachars) {
2592 /* resize first */
2593 int oldpos = (int)(s - PyString_AS_STRING(v));
2594 int needed = (targetsize - extrachars) + \
2595 (targetsize << 2);
2596 extrachars += needed;
2597 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002598 Py_DECREF(x);
2599 goto onError;
2600 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002601 s = PyString_AS_STRING(v) + oldpos;
2602 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002603 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002604 s += targetsize;
2605 extrachars -= targetsize;
2606 }
2607 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002608 }
2609 else {
2610 /* wrong return value */
2611 PyErr_SetString(PyExc_TypeError,
2612 "character mapping must return integer, None or unicode");
2613 Py_DECREF(x);
2614 goto onError;
2615 }
2616 Py_DECREF(x);
2617 }
2618 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2619 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2620 goto onError;
2621 return v;
2622
2623 onError:
2624 Py_DECREF(v);
2625 return NULL;
2626}
2627
2628PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2629 PyObject *mapping)
2630{
2631 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2632 PyErr_BadArgument();
2633 return NULL;
2634 }
2635 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2636 PyUnicode_GET_SIZE(unicode),
2637 mapping,
2638 NULL);
2639}
2640
2641static
2642int translate_error(const Py_UNICODE **source,
2643 Py_UNICODE **dest,
2644 const char *errors,
2645 const char *details)
2646{
2647 if ((errors == NULL) ||
2648 (strcmp(errors,"strict") == 0)) {
2649 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002650 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002651 details);
2652 return -1;
2653 }
2654 else if (strcmp(errors,"ignore") == 0) {
2655 return 0;
2656 }
2657 else if (strcmp(errors,"replace") == 0) {
2658 **dest = '?';
2659 (*dest)++;
2660 return 0;
2661 }
2662 else {
2663 PyErr_Format(PyExc_ValueError,
2664 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002665 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666 errors);
2667 return -1;
2668 }
2669}
2670
2671PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2672 int size,
2673 PyObject *mapping,
2674 const char *errors)
2675{
2676 PyUnicodeObject *v;
2677 Py_UNICODE *p;
2678
2679 if (mapping == NULL) {
2680 PyErr_BadArgument();
2681 return NULL;
2682 }
2683
2684 /* Output will never be longer than input */
2685 v = _PyUnicode_New(size);
2686 if (v == NULL)
2687 goto onError;
2688 if (size == 0)
2689 goto done;
2690 p = PyUnicode_AS_UNICODE(v);
2691 while (size-- > 0) {
2692 Py_UNICODE ch = *s++;
2693 PyObject *w, *x;
2694
2695 /* Get mapping */
2696 w = PyInt_FromLong(ch);
2697 if (w == NULL)
2698 goto onError;
2699 x = PyObject_GetItem(mapping, w);
2700 Py_DECREF(w);
2701 if (x == NULL) {
2702 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2703 /* No mapping found: default to 1-1 mapping */
2704 PyErr_Clear();
2705 *p++ = ch;
2706 continue;
2707 }
2708 goto onError;
2709 }
2710
2711 /* Apply mapping */
2712 if (PyInt_Check(x))
2713 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2714 else if (x == Py_None) {
2715 /* undefined mapping */
2716 if (translate_error(&s, &p, errors,
2717 "character maps to <undefined>")) {
2718 Py_DECREF(x);
2719 goto onError;
2720 }
2721 }
2722 else if (PyUnicode_Check(x)) {
2723 if (PyUnicode_GET_SIZE(x) != 1) {
2724 /* 1-n mapping */
2725 PyErr_SetString(PyExc_NotImplementedError,
2726 "1-n mappings are currently not implemented");
2727 Py_DECREF(x);
2728 goto onError;
2729 }
2730 *p++ = *PyUnicode_AS_UNICODE(x);
2731 }
2732 else {
2733 /* wrong return value */
2734 PyErr_SetString(PyExc_TypeError,
2735 "translate mapping must return integer, None or unicode");
2736 Py_DECREF(x);
2737 goto onError;
2738 }
2739 Py_DECREF(x);
2740 }
2741 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002742 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002743 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002744
2745 done:
2746 return (PyObject *)v;
2747
2748 onError:
2749 Py_XDECREF(v);
2750 return NULL;
2751}
2752
2753PyObject *PyUnicode_Translate(PyObject *str,
2754 PyObject *mapping,
2755 const char *errors)
2756{
2757 PyObject *result;
2758
2759 str = PyUnicode_FromObject(str);
2760 if (str == NULL)
2761 goto onError;
2762 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2763 PyUnicode_GET_SIZE(str),
2764 mapping,
2765 errors);
2766 Py_DECREF(str);
2767 return result;
2768
2769 onError:
2770 Py_XDECREF(str);
2771 return NULL;
2772}
2773
Guido van Rossum9e896b32000-04-05 20:11:21 +00002774/* --- Decimal Encoder ---------------------------------------------------- */
2775
2776int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2777 int length,
2778 char *output,
2779 const char *errors)
2780{
2781 Py_UNICODE *p, *end;
2782
2783 if (output == NULL) {
2784 PyErr_BadArgument();
2785 return -1;
2786 }
2787
2788 p = s;
2789 end = s + length;
2790 while (p < end) {
2791 register Py_UNICODE ch = *p++;
2792 int decimal;
2793
2794 if (Py_UNICODE_ISSPACE(ch)) {
2795 *output++ = ' ';
2796 continue;
2797 }
2798 decimal = Py_UNICODE_TODECIMAL(ch);
2799 if (decimal >= 0) {
2800 *output++ = '0' + decimal;
2801 continue;
2802 }
Guido van Rossumba477042000-04-06 18:18:10 +00002803 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002804 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002805 continue;
2806 }
2807 /* All other characters are considered invalid */
2808 if (errors == NULL || strcmp(errors, "strict") == 0) {
2809 PyErr_SetString(PyExc_ValueError,
2810 "invalid decimal Unicode string");
2811 goto onError;
2812 }
2813 else if (strcmp(errors, "ignore") == 0)
2814 continue;
2815 else if (strcmp(errors, "replace") == 0) {
2816 *output++ = '?';
2817 continue;
2818 }
2819 }
2820 /* 0-terminate the output string */
2821 *output++ = '\0';
2822 return 0;
2823
2824 onError:
2825 return -1;
2826}
2827
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828/* --- Helpers ------------------------------------------------------------ */
2829
2830static
2831int count(PyUnicodeObject *self,
2832 int start,
2833 int end,
2834 PyUnicodeObject *substring)
2835{
2836 int count = 0;
2837
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002838 if (start < 0)
2839 start += self->length;
2840 if (start < 0)
2841 start = 0;
2842 if (end > self->length)
2843 end = self->length;
2844 if (end < 0)
2845 end += self->length;
2846 if (end < 0)
2847 end = 0;
2848
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002849 if (substring->length == 0)
2850 return (end - start + 1);
2851
Guido van Rossumd57fd912000-03-10 22:53:23 +00002852 end -= substring->length;
2853
2854 while (start <= end)
2855 if (Py_UNICODE_MATCH(self, start, substring)) {
2856 count++;
2857 start += substring->length;
2858 } else
2859 start++;
2860
2861 return count;
2862}
2863
2864int PyUnicode_Count(PyObject *str,
2865 PyObject *substr,
2866 int start,
2867 int end)
2868{
2869 int result;
2870
2871 str = PyUnicode_FromObject(str);
2872 if (str == NULL)
2873 return -1;
2874 substr = PyUnicode_FromObject(substr);
2875 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002876 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877 return -1;
2878 }
2879
2880 result = count((PyUnicodeObject *)str,
2881 start, end,
2882 (PyUnicodeObject *)substr);
2883
2884 Py_DECREF(str);
2885 Py_DECREF(substr);
2886 return result;
2887}
2888
2889static
2890int findstring(PyUnicodeObject *self,
2891 PyUnicodeObject *substring,
2892 int start,
2893 int end,
2894 int direction)
2895{
2896 if (start < 0)
2897 start += self->length;
2898 if (start < 0)
2899 start = 0;
2900
2901 if (substring->length == 0)
2902 return start;
2903
2904 if (end > self->length)
2905 end = self->length;
2906 if (end < 0)
2907 end += self->length;
2908 if (end < 0)
2909 end = 0;
2910
2911 end -= substring->length;
2912
2913 if (direction < 0) {
2914 for (; end >= start; end--)
2915 if (Py_UNICODE_MATCH(self, end, substring))
2916 return end;
2917 } else {
2918 for (; start <= end; start++)
2919 if (Py_UNICODE_MATCH(self, start, substring))
2920 return start;
2921 }
2922
2923 return -1;
2924}
2925
2926int PyUnicode_Find(PyObject *str,
2927 PyObject *substr,
2928 int start,
2929 int end,
2930 int direction)
2931{
2932 int result;
2933
2934 str = PyUnicode_FromObject(str);
2935 if (str == NULL)
2936 return -1;
2937 substr = PyUnicode_FromObject(substr);
2938 if (substr == NULL) {
2939 Py_DECREF(substr);
2940 return -1;
2941 }
2942
2943 result = findstring((PyUnicodeObject *)str,
2944 (PyUnicodeObject *)substr,
2945 start, end, direction);
2946 Py_DECREF(str);
2947 Py_DECREF(substr);
2948 return result;
2949}
2950
2951static
2952int tailmatch(PyUnicodeObject *self,
2953 PyUnicodeObject *substring,
2954 int start,
2955 int end,
2956 int direction)
2957{
2958 if (start < 0)
2959 start += self->length;
2960 if (start < 0)
2961 start = 0;
2962
2963 if (substring->length == 0)
2964 return 1;
2965
2966 if (end > self->length)
2967 end = self->length;
2968 if (end < 0)
2969 end += self->length;
2970 if (end < 0)
2971 end = 0;
2972
2973 end -= substring->length;
2974 if (end < start)
2975 return 0;
2976
2977 if (direction > 0) {
2978 if (Py_UNICODE_MATCH(self, end, substring))
2979 return 1;
2980 } else {
2981 if (Py_UNICODE_MATCH(self, start, substring))
2982 return 1;
2983 }
2984
2985 return 0;
2986}
2987
2988int PyUnicode_Tailmatch(PyObject *str,
2989 PyObject *substr,
2990 int start,
2991 int end,
2992 int direction)
2993{
2994 int result;
2995
2996 str = PyUnicode_FromObject(str);
2997 if (str == NULL)
2998 return -1;
2999 substr = PyUnicode_FromObject(substr);
3000 if (substr == NULL) {
3001 Py_DECREF(substr);
3002 return -1;
3003 }
3004
3005 result = tailmatch((PyUnicodeObject *)str,
3006 (PyUnicodeObject *)substr,
3007 start, end, direction);
3008 Py_DECREF(str);
3009 Py_DECREF(substr);
3010 return result;
3011}
3012
3013static
3014const Py_UNICODE *findchar(const Py_UNICODE *s,
3015 int size,
3016 Py_UNICODE ch)
3017{
3018 /* like wcschr, but doesn't stop at NULL characters */
3019
3020 while (size-- > 0) {
3021 if (*s == ch)
3022 return s;
3023 s++;
3024 }
3025
3026 return NULL;
3027}
3028
3029/* Apply fixfct filter to the Unicode object self and return a
3030 reference to the modified object */
3031
3032static
3033PyObject *fixup(PyUnicodeObject *self,
3034 int (*fixfct)(PyUnicodeObject *s))
3035{
3036
3037 PyUnicodeObject *u;
3038
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003039 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003040 if (u == NULL)
3041 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003042
3043 Py_UNICODE_COPY(u->str, self->str, self->length);
3044
Tim Peters7a29bd52001-09-12 03:03:31 +00003045 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 /* fixfct should return TRUE if it modified the buffer. If
3047 FALSE, return a reference to the original buffer instead
3048 (to save space, not time) */
3049 Py_INCREF(self);
3050 Py_DECREF(u);
3051 return (PyObject*) self;
3052 }
3053 return (PyObject*) u;
3054}
3055
3056static
3057int fixupper(PyUnicodeObject *self)
3058{
3059 int len = self->length;
3060 Py_UNICODE *s = self->str;
3061 int status = 0;
3062
3063 while (len-- > 0) {
3064 register Py_UNICODE ch;
3065
3066 ch = Py_UNICODE_TOUPPER(*s);
3067 if (ch != *s) {
3068 status = 1;
3069 *s = ch;
3070 }
3071 s++;
3072 }
3073
3074 return status;
3075}
3076
3077static
3078int fixlower(PyUnicodeObject *self)
3079{
3080 int len = self->length;
3081 Py_UNICODE *s = self->str;
3082 int status = 0;
3083
3084 while (len-- > 0) {
3085 register Py_UNICODE ch;
3086
3087 ch = Py_UNICODE_TOLOWER(*s);
3088 if (ch != *s) {
3089 status = 1;
3090 *s = ch;
3091 }
3092 s++;
3093 }
3094
3095 return status;
3096}
3097
3098static
3099int fixswapcase(PyUnicodeObject *self)
3100{
3101 int len = self->length;
3102 Py_UNICODE *s = self->str;
3103 int status = 0;
3104
3105 while (len-- > 0) {
3106 if (Py_UNICODE_ISUPPER(*s)) {
3107 *s = Py_UNICODE_TOLOWER(*s);
3108 status = 1;
3109 } else if (Py_UNICODE_ISLOWER(*s)) {
3110 *s = Py_UNICODE_TOUPPER(*s);
3111 status = 1;
3112 }
3113 s++;
3114 }
3115
3116 return status;
3117}
3118
3119static
3120int fixcapitalize(PyUnicodeObject *self)
3121{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003122 int len = self->length;
3123 Py_UNICODE *s = self->str;
3124 int status = 0;
3125
3126 if (len == 0)
3127 return 0;
3128 if (Py_UNICODE_ISLOWER(*s)) {
3129 *s = Py_UNICODE_TOUPPER(*s);
3130 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003131 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003132 s++;
3133 while (--len > 0) {
3134 if (Py_UNICODE_ISUPPER(*s)) {
3135 *s = Py_UNICODE_TOLOWER(*s);
3136 status = 1;
3137 }
3138 s++;
3139 }
3140 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003141}
3142
3143static
3144int fixtitle(PyUnicodeObject *self)
3145{
3146 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3147 register Py_UNICODE *e;
3148 int previous_is_cased;
3149
3150 /* Shortcut for single character strings */
3151 if (PyUnicode_GET_SIZE(self) == 1) {
3152 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3153 if (*p != ch) {
3154 *p = ch;
3155 return 1;
3156 }
3157 else
3158 return 0;
3159 }
3160
3161 e = p + PyUnicode_GET_SIZE(self);
3162 previous_is_cased = 0;
3163 for (; p < e; p++) {
3164 register const Py_UNICODE ch = *p;
3165
3166 if (previous_is_cased)
3167 *p = Py_UNICODE_TOLOWER(ch);
3168 else
3169 *p = Py_UNICODE_TOTITLE(ch);
3170
3171 if (Py_UNICODE_ISLOWER(ch) ||
3172 Py_UNICODE_ISUPPER(ch) ||
3173 Py_UNICODE_ISTITLE(ch))
3174 previous_is_cased = 1;
3175 else
3176 previous_is_cased = 0;
3177 }
3178 return 1;
3179}
3180
3181PyObject *PyUnicode_Join(PyObject *separator,
3182 PyObject *seq)
3183{
3184 Py_UNICODE *sep;
3185 int seplen;
3186 PyUnicodeObject *res = NULL;
3187 int reslen = 0;
3188 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189 int sz = 100;
3190 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003191 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003192
Tim Peters2cfe3682001-05-05 05:36:48 +00003193 it = PyObject_GetIter(seq);
3194 if (it == NULL)
3195 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003196
3197 if (separator == NULL) {
3198 Py_UNICODE blank = ' ';
3199 sep = &blank;
3200 seplen = 1;
3201 }
3202 else {
3203 separator = PyUnicode_FromObject(separator);
3204 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003205 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 sep = PyUnicode_AS_UNICODE(separator);
3207 seplen = PyUnicode_GET_SIZE(separator);
3208 }
3209
3210 res = _PyUnicode_New(sz);
3211 if (res == NULL)
3212 goto onError;
3213 p = PyUnicode_AS_UNICODE(res);
3214 reslen = 0;
3215
Tim Peters2cfe3682001-05-05 05:36:48 +00003216 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003218 PyObject *item = PyIter_Next(it);
3219 if (item == NULL) {
3220 if (PyErr_Occurred())
3221 goto onError;
3222 break;
3223 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224 if (!PyUnicode_Check(item)) {
3225 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003226 if (!PyString_Check(item)) {
3227 PyErr_Format(PyExc_TypeError,
3228 "sequence item %i: expected string or Unicode,"
3229 " %.80s found",
3230 i, item->ob_type->tp_name);
3231 Py_DECREF(item);
3232 goto onError;
3233 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 v = PyUnicode_FromObject(item);
3235 Py_DECREF(item);
3236 item = v;
3237 if (item == NULL)
3238 goto onError;
3239 }
3240 itemlen = PyUnicode_GET_SIZE(item);
3241 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003242 if (_PyUnicode_Resize(&res, sz*2)) {
3243 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003245 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 sz *= 2;
3247 p = PyUnicode_AS_UNICODE(res) + reslen;
3248 }
3249 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003250 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003251 p += seplen;
3252 reslen += seplen;
3253 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003254 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 p += itemlen;
3256 reslen += itemlen;
3257 Py_DECREF(item);
3258 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003259 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 goto onError;
3261
3262 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003263 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 return (PyObject *)res;
3265
3266 onError:
3267 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003268 Py_XDECREF(res);
3269 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270 return NULL;
3271}
3272
3273static
3274PyUnicodeObject *pad(PyUnicodeObject *self,
3275 int left,
3276 int right,
3277 Py_UNICODE fill)
3278{
3279 PyUnicodeObject *u;
3280
3281 if (left < 0)
3282 left = 0;
3283 if (right < 0)
3284 right = 0;
3285
Tim Peters7a29bd52001-09-12 03:03:31 +00003286 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003287 Py_INCREF(self);
3288 return self;
3289 }
3290
3291 u = _PyUnicode_New(left + self->length + right);
3292 if (u) {
3293 if (left)
3294 Py_UNICODE_FILL(u->str, fill, left);
3295 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3296 if (right)
3297 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3298 }
3299
3300 return u;
3301}
3302
/* Append the slice data[left:right] as a new Unicode object to the
   list being built.

   The macro relies on names supplied by the function using it: a
   PyObject *str scratch variable, the PyObject *list to append to and
   an onError label that is jumped to if creating or appending the
   slice fails.

   The body is wrapped in do { } while (0) so that an invocation
   followed by a semicolon is exactly one statement, and the arguments
   are parenthesized so that arbitrary expressions can be passed. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3313
3314static
3315PyObject *split_whitespace(PyUnicodeObject *self,
3316 PyObject *list,
3317 int maxcount)
3318{
3319 register int i;
3320 register int j;
3321 int len = self->length;
3322 PyObject *str;
3323
3324 for (i = j = 0; i < len; ) {
3325 /* find a token */
3326 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3327 i++;
3328 j = i;
3329 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3330 i++;
3331 if (j < i) {
3332 if (maxcount-- <= 0)
3333 break;
3334 SPLIT_APPEND(self->str, j, i);
3335 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3336 i++;
3337 j = i;
3338 }
3339 }
3340 if (j < len) {
3341 SPLIT_APPEND(self->str, j, len);
3342 }
3343 return list;
3344
3345 onError:
3346 Py_DECREF(list);
3347 return NULL;
3348}
3349
3350PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003351 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003352{
3353 register int i;
3354 register int j;
3355 int len;
3356 PyObject *list;
3357 PyObject *str;
3358 Py_UNICODE *data;
3359
3360 string = PyUnicode_FromObject(string);
3361 if (string == NULL)
3362 return NULL;
3363 data = PyUnicode_AS_UNICODE(string);
3364 len = PyUnicode_GET_SIZE(string);
3365
Guido van Rossumd57fd912000-03-10 22:53:23 +00003366 list = PyList_New(0);
3367 if (!list)
3368 goto onError;
3369
3370 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003371 int eol;
3372
Guido van Rossumd57fd912000-03-10 22:53:23 +00003373 /* Find a line and append it */
3374 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3375 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003376
3377 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003378 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003379 if (i < len) {
3380 if (data[i] == '\r' && i + 1 < len &&
3381 data[i+1] == '\n')
3382 i += 2;
3383 else
3384 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003385 if (keepends)
3386 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003387 }
Guido van Rossum86662912000-04-11 15:38:46 +00003388 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003389 j = i;
3390 }
3391 if (j < len) {
3392 SPLIT_APPEND(data, j, len);
3393 }
3394
3395 Py_DECREF(string);
3396 return list;
3397
3398 onError:
3399 Py_DECREF(list);
3400 Py_DECREF(string);
3401 return NULL;
3402}
3403
3404static
3405PyObject *split_char(PyUnicodeObject *self,
3406 PyObject *list,
3407 Py_UNICODE ch,
3408 int maxcount)
3409{
3410 register int i;
3411 register int j;
3412 int len = self->length;
3413 PyObject *str;
3414
3415 for (i = j = 0; i < len; ) {
3416 if (self->str[i] == ch) {
3417 if (maxcount-- <= 0)
3418 break;
3419 SPLIT_APPEND(self->str, j, i);
3420 i = j = i + 1;
3421 } else
3422 i++;
3423 }
3424 if (j <= len) {
3425 SPLIT_APPEND(self->str, j, len);
3426 }
3427 return list;
3428
3429 onError:
3430 Py_DECREF(list);
3431 return NULL;
3432}
3433
3434static
3435PyObject *split_substring(PyUnicodeObject *self,
3436 PyObject *list,
3437 PyUnicodeObject *substring,
3438 int maxcount)
3439{
3440 register int i;
3441 register int j;
3442 int len = self->length;
3443 int sublen = substring->length;
3444 PyObject *str;
3445
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003446 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447 if (Py_UNICODE_MATCH(self, i, substring)) {
3448 if (maxcount-- <= 0)
3449 break;
3450 SPLIT_APPEND(self->str, j, i);
3451 i = j = i + sublen;
3452 } else
3453 i++;
3454 }
3455 if (j <= len) {
3456 SPLIT_APPEND(self->str, j, len);
3457 }
3458 return list;
3459
3460 onError:
3461 Py_DECREF(list);
3462 return NULL;
3463}
3464
3465#undef SPLIT_APPEND
3466
3467static
3468PyObject *split(PyUnicodeObject *self,
3469 PyUnicodeObject *substring,
3470 int maxcount)
3471{
3472 PyObject *list;
3473
3474 if (maxcount < 0)
3475 maxcount = INT_MAX;
3476
3477 list = PyList_New(0);
3478 if (!list)
3479 return NULL;
3480
3481 if (substring == NULL)
3482 return split_whitespace(self,list,maxcount);
3483
3484 else if (substring->length == 1)
3485 return split_char(self,list,substring->str[0],maxcount);
3486
3487 else if (substring->length == 0) {
3488 Py_DECREF(list);
3489 PyErr_SetString(PyExc_ValueError, "empty separator");
3490 return NULL;
3491 }
3492 else
3493 return split_substring(self,list,substring,maxcount);
3494}
3495
3496static
3497PyObject *strip(PyUnicodeObject *self,
3498 int left,
3499 int right)
3500{
3501 Py_UNICODE *p = self->str;
3502 int start = 0;
3503 int end = self->length;
3504
3505 if (left)
3506 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3507 start++;
3508
3509 if (right)
3510 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3511 end--;
3512
Tim Peters7a29bd52001-09-12 03:03:31 +00003513 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514 /* couldn't strip anything off, return original string */
3515 Py_INCREF(self);
3516 return (PyObject*) self;
3517 }
3518
3519 return (PyObject*) PyUnicode_FromUnicode(
3520 self->str + start,
3521 end - start
3522 );
3523}
3524
3525static
3526PyObject *replace(PyUnicodeObject *self,
3527 PyUnicodeObject *str1,
3528 PyUnicodeObject *str2,
3529 int maxcount)
3530{
3531 PyUnicodeObject *u;
3532
3533 if (maxcount < 0)
3534 maxcount = INT_MAX;
3535
3536 if (str1->length == 1 && str2->length == 1) {
3537 int i;
3538
3539 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003540 if (!findchar(self->str, self->length, str1->str[0]) &&
3541 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542 /* nothing to replace, return original string */
3543 Py_INCREF(self);
3544 u = self;
3545 } else {
3546 Py_UNICODE u1 = str1->str[0];
3547 Py_UNICODE u2 = str2->str[0];
3548
3549 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003550 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003551 self->length
3552 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003553 if (u != NULL) {
3554 Py_UNICODE_COPY(u->str, self->str,
3555 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556 for (i = 0; i < u->length; i++)
3557 if (u->str[i] == u1) {
3558 if (--maxcount < 0)
3559 break;
3560 u->str[i] = u2;
3561 }
3562 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003563 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564
3565 } else {
3566 int n, i;
3567 Py_UNICODE *p;
3568
3569 /* replace strings */
3570 n = count(self, 0, self->length, str1);
3571 if (n > maxcount)
3572 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003573 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003574 /* nothing to replace, return original string */
3575 Py_INCREF(self);
3576 u = self;
3577 } else {
3578 u = _PyUnicode_New(
3579 self->length + n * (str2->length - str1->length));
3580 if (u) {
3581 i = 0;
3582 p = u->str;
3583 while (i <= self->length - str1->length)
3584 if (Py_UNICODE_MATCH(self, i, str1)) {
3585 /* replace string segment */
3586 Py_UNICODE_COPY(p, str2->str, str2->length);
3587 p += str2->length;
3588 i += str1->length;
3589 if (--n <= 0) {
3590 /* copy remaining part */
3591 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3592 break;
3593 }
3594 } else
3595 *p++ = self->str[i++];
3596 }
3597 }
3598 }
3599
3600 return (PyObject *) u;
3601}
3602
3603/* --- Unicode Object Methods --------------------------------------------- */
3604
3605static char title__doc__[] =
3606"S.title() -> unicode\n\
3607\n\
3608Return a titlecased version of S, i.e. words start with title case\n\
3609characters, all remaining cased characters have lower case.";
3610
3611static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003612unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003613{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614 return fixup(self, fixtitle);
3615}
3616
3617static char capitalize__doc__[] =
3618"S.capitalize() -> unicode\n\
3619\n\
3620Return a capitalized version of S, i.e. make the first character\n\
3621have upper case.";
3622
3623static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003624unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003625{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003626 return fixup(self, fixcapitalize);
3627}
3628
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords(): split at whitespace, capitalize every word and join
   the words again with single blanks.  Currently compiled out. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    int n;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (n = 0; n < PyList_GET_SIZE(words); n++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, n),
                            fixcapitalize);
        if (capitalized == NULL)
            goto finish;
        Py_DECREF(PyList_GET_ITEM(words, n));
        PyList_SET_ITEM(words, n, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

finish:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
3666
3667static char center__doc__[] =
3668"S.center(width) -> unicode\n\
3669\n\
3670Return S centered in a Unicode string of length width. Padding is done\n\
3671using spaces.";
3672
3673static PyObject *
3674unicode_center(PyUnicodeObject *self, PyObject *args)
3675{
3676 int marg, left;
3677 int width;
3678
3679 if (!PyArg_ParseTuple(args, "i:center", &width))
3680 return NULL;
3681
Tim Peters7a29bd52001-09-12 03:03:31 +00003682 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003683 Py_INCREF(self);
3684 return (PyObject*) self;
3685 }
3686
3687 marg = width - self->length;
3688 left = marg / 2 + (marg & width & 1);
3689
3690 return (PyObject*) pad(self, left, marg - left, ' ');
3691}
3692
Marc-André Lemburge5034372000-08-08 08:04:29 +00003693#if 0
3694
3695/* This code should go into some future Unicode collation support
3696 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003697 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003698
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003699/* speedy UTF-16 code point order comparison */
3700/* gleaned from: */
3701/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3702
/* Adjustment table indexed by the top five bits (value >> 11) of a
   16-bit code unit.  Entry 27 (0xD800..0xDFFF) adds 0x2000 and
   entries 28..31 (0xE000..0xFFFF) subtract 0x800; all other entries
   are 0. */
static short utf16Fixup[32] =
{
    /*  0.. 7 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /*  8..15 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 16..23 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 24..31 */ 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};
3710
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711static int
3712unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3713{
3714 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003715
Guido van Rossumd57fd912000-03-10 22:53:23 +00003716 Py_UNICODE *s1 = str1->str;
3717 Py_UNICODE *s2 = str2->str;
3718
3719 len1 = str1->length;
3720 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003721
Guido van Rossumd57fd912000-03-10 22:53:23 +00003722 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003723 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003724
3725 c1 = *s1++;
3726 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003727
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003728 if (c1 > (1<<11) * 26)
3729 c1 += utf16Fixup[c1>>11];
3730 if (c2 > (1<<11) * 26)
3731 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003732 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003733
3734 if (c1 != c2)
3735 return (c1 < c2) ? -1 : 1;
3736
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003737 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738 }
3739
3740 return (len1 < len2) ? -1 : (len1 != len2);
3741}
3742
Marc-André Lemburge5034372000-08-08 08:04:29 +00003743#else
3744
3745static int
3746unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3747{
3748 register int len1, len2;
3749
3750 Py_UNICODE *s1 = str1->str;
3751 Py_UNICODE *s2 = str2->str;
3752
3753 len1 = str1->length;
3754 len2 = str2->length;
3755
3756 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003757 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003758
Fredrik Lundh45714e92001-06-26 16:39:36 +00003759 c1 = *s1++;
3760 c2 = *s2++;
3761
3762 if (c1 != c2)
3763 return (c1 < c2) ? -1 : 1;
3764
Marc-André Lemburge5034372000-08-08 08:04:29 +00003765 len1--; len2--;
3766 }
3767
3768 return (len1 < len2) ? -1 : (len1 != len2);
3769}
3770
3771#endif
3772
Guido van Rossumd57fd912000-03-10 22:53:23 +00003773int PyUnicode_Compare(PyObject *left,
3774 PyObject *right)
3775{
3776 PyUnicodeObject *u = NULL, *v = NULL;
3777 int result;
3778
3779 /* Coerce the two arguments */
3780 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3781 if (u == NULL)
3782 goto onError;
3783 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3784 if (v == NULL)
3785 goto onError;
3786
Thomas Wouters7e474022000-07-16 12:04:32 +00003787 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003788 if (v == u) {
3789 Py_DECREF(u);
3790 Py_DECREF(v);
3791 return 0;
3792 }
3793
3794 result = unicode_compare(u, v);
3795
3796 Py_DECREF(u);
3797 Py_DECREF(v);
3798 return result;
3799
3800onError:
3801 Py_XDECREF(u);
3802 Py_XDECREF(v);
3803 return -1;
3804}
3805
Guido van Rossum403d68b2000-03-13 15:55:09 +00003806int PyUnicode_Contains(PyObject *container,
3807 PyObject *element)
3808{
3809 PyUnicodeObject *u = NULL, *v = NULL;
3810 int result;
3811 register const Py_UNICODE *p, *e;
3812 register Py_UNICODE ch;
3813
3814 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003815 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003816 if (v == NULL) {
3817 PyErr_SetString(PyExc_TypeError,
3818 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003819 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003820 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003821 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3822 if (u == NULL) {
3823 Py_DECREF(v);
3824 goto onError;
3825 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003826
3827 /* Check v in u */
3828 if (PyUnicode_GET_SIZE(v) != 1) {
3829 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003830 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003831 goto onError;
3832 }
3833 ch = *PyUnicode_AS_UNICODE(v);
3834 p = PyUnicode_AS_UNICODE(u);
3835 e = p + PyUnicode_GET_SIZE(u);
3836 result = 0;
3837 while (p < e) {
3838 if (*p++ == ch) {
3839 result = 1;
3840 break;
3841 }
3842 }
3843
3844 Py_DECREF(u);
3845 Py_DECREF(v);
3846 return result;
3847
3848onError:
3849 Py_XDECREF(u);
3850 Py_XDECREF(v);
3851 return -1;
3852}
3853
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854/* Concat to string or Unicode object giving a new Unicode object. */
3855
3856PyObject *PyUnicode_Concat(PyObject *left,
3857 PyObject *right)
3858{
3859 PyUnicodeObject *u = NULL, *v = NULL, *w;
3860
3861 /* Coerce the two arguments */
3862 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3863 if (u == NULL)
3864 goto onError;
3865 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3866 if (v == NULL)
3867 goto onError;
3868
3869 /* Shortcuts */
3870 if (v == unicode_empty) {
3871 Py_DECREF(v);
3872 return (PyObject *)u;
3873 }
3874 if (u == unicode_empty) {
3875 Py_DECREF(u);
3876 return (PyObject *)v;
3877 }
3878
3879 /* Concat the two Unicode strings */
3880 w = _PyUnicode_New(u->length + v->length);
3881 if (w == NULL)
3882 goto onError;
3883 Py_UNICODE_COPY(w->str, u->str, u->length);
3884 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3885
3886 Py_DECREF(u);
3887 Py_DECREF(v);
3888 return (PyObject *)w;
3889
3890onError:
3891 Py_XDECREF(u);
3892 Py_XDECREF(v);
3893 return NULL;
3894}
3895
/* Docstring for the count() method. */
static char count__doc__[] =
    "S.count(sub[, start[, end]]) -> int\n"
    "\n"
    "Return the number of occurrences of substring sub in Unicode string\n"
    "S[start:end]. Optional arguments start and end are\n"
    "interpreted as in slice notation.";
3902
3903static PyObject *
3904unicode_count(PyUnicodeObject *self, PyObject *args)
3905{
3906 PyUnicodeObject *substring;
3907 int start = 0;
3908 int end = INT_MAX;
3909 PyObject *result;
3910
Guido van Rossumb8872e62000-05-09 14:14:27 +00003911 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3912 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003913 return NULL;
3914
3915 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3916 (PyObject *)substring);
3917 if (substring == NULL)
3918 return NULL;
3919
Guido van Rossumd57fd912000-03-10 22:53:23 +00003920 if (start < 0)
3921 start += self->length;
3922 if (start < 0)
3923 start = 0;
3924 if (end > self->length)
3925 end = self->length;
3926 if (end < 0)
3927 end += self->length;
3928 if (end < 0)
3929 end = 0;
3930
3931 result = PyInt_FromLong((long) count(self, start, end, substring));
3932
3933 Py_DECREF(substring);
3934 return result;
3935}
3936
/* Docstring for the encode() method. */
static char encode__doc__[] =
    "S.encode([encoding[,errors]]) -> string\n"
    "\n"
    "Return an encoded string version of S. Default encoding is the current\n"
    "default string encoding. errors may be given to set a different error\n"
    "handling scheme. Default is 'strict' meaning that encoding errors raise\n"
    "a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003944
3945static PyObject *
3946unicode_encode(PyUnicodeObject *self, PyObject *args)
3947{
3948 char *encoding = NULL;
3949 char *errors = NULL;
3950 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3951 return NULL;
3952 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3953}
3954
/* Docstring for the expandtabs() method. */
static char expandtabs__doc__[] =
    "S.expandtabs([tabsize]) -> unicode\n"
    "\n"
    "Return a copy of S where all tab characters are expanded using spaces.\n"
    "If tabsize is not given, a tab size of 8 characters is assumed.";
3960
3961static PyObject*
3962unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3963{
3964 Py_UNICODE *e;
3965 Py_UNICODE *p;
3966 Py_UNICODE *q;
3967 int i, j;
3968 PyUnicodeObject *u;
3969 int tabsize = 8;
3970
3971 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3972 return NULL;
3973
Thomas Wouters7e474022000-07-16 12:04:32 +00003974 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003975 i = j = 0;
3976 e = self->str + self->length;
3977 for (p = self->str; p < e; p++)
3978 if (*p == '\t') {
3979 if (tabsize > 0)
3980 j += tabsize - (j % tabsize);
3981 }
3982 else {
3983 j++;
3984 if (*p == '\n' || *p == '\r') {
3985 i += j;
3986 j = 0;
3987 }
3988 }
3989
3990 /* Second pass: create output string and fill it */
3991 u = _PyUnicode_New(i + j);
3992 if (!u)
3993 return NULL;
3994
3995 j = 0;
3996 q = u->str;
3997
3998 for (p = self->str; p < e; p++)
3999 if (*p == '\t') {
4000 if (tabsize > 0) {
4001 i = tabsize - (j % tabsize);
4002 j += i;
4003 while (i--)
4004 *q++ = ' ';
4005 }
4006 }
4007 else {
4008 j++;
4009 *q++ = *p;
4010 if (*p == '\n' || *p == '\r')
4011 j = 0;
4012 }
4013
4014 return (PyObject*) u;
4015}
4016
/* Docstring for the find() method. */
static char find__doc__[] =
"S.find(sub [,start [,end]]) -> int\n\
\n\
Return the lowest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.";
4025
4026static PyObject *
4027unicode_find(PyUnicodeObject *self, PyObject *args)
4028{
4029 PyUnicodeObject *substring;
4030 int start = 0;
4031 int end = INT_MAX;
4032 PyObject *result;
4033
Guido van Rossumb8872e62000-05-09 14:14:27 +00004034 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4035 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 return NULL;
4037 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4038 (PyObject *)substring);
4039 if (substring == NULL)
4040 return NULL;
4041
4042 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4043
4044 Py_DECREF(substring);
4045 return result;
4046}
4047
4048static PyObject *
4049unicode_getitem(PyUnicodeObject *self, int index)
4050{
4051 if (index < 0 || index >= self->length) {
4052 PyErr_SetString(PyExc_IndexError, "string index out of range");
4053 return NULL;
4054 }
4055
4056 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4057}
4058
4059static long
4060unicode_hash(PyUnicodeObject *self)
4061{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004062 /* Since Unicode objects compare equal to their ASCII string
4063 counterparts, they should use the individual character values
4064 as basis for their hash value. This is needed to assure that
4065 strings and Unicode objects behave in the same way as
4066 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004067
Fredrik Lundhdde61642000-07-10 18:27:47 +00004068 register int len;
4069 register Py_UNICODE *p;
4070 register long x;
4071
Guido van Rossumd57fd912000-03-10 22:53:23 +00004072 if (self->hash != -1)
4073 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004074 len = PyUnicode_GET_SIZE(self);
4075 p = PyUnicode_AS_UNICODE(self);
4076 x = *p << 7;
4077 while (--len >= 0)
4078 x = (1000003*x) ^ *p++;
4079 x ^= PyUnicode_GET_SIZE(self);
4080 if (x == -1)
4081 x = -2;
4082 self->hash = x;
4083 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084}
4085
/* Docstring for the index() method. */
static char index__doc__[] =
    "S.index(sub [,start [,end]]) -> int\n"
    "\n"
    "Like S.find() but raise ValueError when the substring is not found.";
4090
4091static PyObject *
4092unicode_index(PyUnicodeObject *self, PyObject *args)
4093{
4094 int result;
4095 PyUnicodeObject *substring;
4096 int start = 0;
4097 int end = INT_MAX;
4098
Guido van Rossumb8872e62000-05-09 14:14:27 +00004099 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4100 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101 return NULL;
4102
4103 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4104 (PyObject *)substring);
4105 if (substring == NULL)
4106 return NULL;
4107
4108 result = findstring(self, substring, start, end, 1);
4109
4110 Py_DECREF(substring);
4111 if (result < 0) {
4112 PyErr_SetString(PyExc_ValueError, "substring not found");
4113 return NULL;
4114 }
4115 return PyInt_FromLong(result);
4116}
4117
/* Docstring for the islower() method. */
static char islower__doc__[] =
    "S.islower() -> bool\n"
    "\n"
    "Return True if all cased characters in S are lowercase and there is\n"
    "at least one cased character in S, False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004123
4124static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004125unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126{
4127 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4128 register const Py_UNICODE *e;
4129 int cased;
4130
Guido van Rossumd57fd912000-03-10 22:53:23 +00004131 /* Shortcut for single character strings */
4132 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004133 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004135 /* Special case for empty strings */
4136 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004137 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004138
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139 e = p + PyUnicode_GET_SIZE(self);
4140 cased = 0;
4141 for (; p < e; p++) {
4142 register const Py_UNICODE ch = *p;
4143
4144 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004145 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146 else if (!cased && Py_UNICODE_ISLOWER(ch))
4147 cased = 1;
4148 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004149 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004150}
4151
/* Docstring for the isupper() method. */
static char isupper__doc__[] =
    "S.isupper() -> bool\n"
    "\n"
    "Return True if all cased characters in S are uppercase and there is\n"
    "at least one cased character in S, False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004157
4158static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004159unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004160{
4161 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4162 register const Py_UNICODE *e;
4163 int cased;
4164
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165 /* Shortcut for single character strings */
4166 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004167 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004168
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004169 /* Special case for empty strings */
4170 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004171 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004172
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173 e = p + PyUnicode_GET_SIZE(self);
4174 cased = 0;
4175 for (; p < e; p++) {
4176 register const Py_UNICODE ch = *p;
4177
4178 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004179 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004180 else if (!cased && Py_UNICODE_ISUPPER(ch))
4181 cased = 1;
4182 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004183 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184}
4185
/* Docstring for the istitle() method. */
static char istitle__doc__[] =
    "S.istitle() -> bool\n"
    "\n"
    "Return True if S is a titlecased string, i.e. upper- and titlecase\n"
    "characters may only follow uncased characters and lowercase characters\n"
    "only cased ones. Return False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192
4193static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004194unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004195{
4196 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4197 register const Py_UNICODE *e;
4198 int cased, previous_is_cased;
4199
Guido van Rossumd57fd912000-03-10 22:53:23 +00004200 /* Shortcut for single character strings */
4201 if (PyUnicode_GET_SIZE(self) == 1)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004202 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4203 (Py_UNICODE_ISUPPER(*p) != 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004204
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004205 /* Special case for empty strings */
4206 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004207 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004208
Guido van Rossumd57fd912000-03-10 22:53:23 +00004209 e = p + PyUnicode_GET_SIZE(self);
4210 cased = 0;
4211 previous_is_cased = 0;
4212 for (; p < e; p++) {
4213 register const Py_UNICODE ch = *p;
4214
4215 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4216 if (previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004217 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004218 previous_is_cased = 1;
4219 cased = 1;
4220 }
4221 else if (Py_UNICODE_ISLOWER(ch)) {
4222 if (!previous_is_cased)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004223 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004224 previous_is_cased = 1;
4225 cased = 1;
4226 }
4227 else
4228 previous_is_cased = 0;
4229 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004230 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004231}
4232
/* Docstring for the isspace() method.  The wording states the
   empty-string result explicitly, matching isalpha() and isalnum(). */
static char isspace__doc__[] =
"S.isspace() -> bool\n\
\n\
Return True if all characters in S are whitespace\n\
and there is at least one character in S, False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004238
4239static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004240unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004241{
4242 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4243 register const Py_UNICODE *e;
4244
Guido van Rossumd57fd912000-03-10 22:53:23 +00004245 /* Shortcut for single character strings */
4246 if (PyUnicode_GET_SIZE(self) == 1 &&
4247 Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004248 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004249
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004250 /* Special case for empty strings */
4251 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004252 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004253
Guido van Rossumd57fd912000-03-10 22:53:23 +00004254 e = p + PyUnicode_GET_SIZE(self);
4255 for (; p < e; p++) {
4256 if (!Py_UNICODE_ISSPACE(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004257 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004258 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004259 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260}
4261
/* Docstring for the isalpha() method. */
static char isalpha__doc__[] =
    "S.isalpha() -> bool\n"
    "\n"
    "Return True if all characters in S are alphabetic\n"
    "and there is at least one character in S, False otherwise.";
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004267
4268static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004269unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004270{
4271 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4272 register const Py_UNICODE *e;
4273
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004274 /* Shortcut for single character strings */
4275 if (PyUnicode_GET_SIZE(self) == 1 &&
4276 Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004277 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004278
4279 /* Special case for empty strings */
4280 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004281 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004282
4283 e = p + PyUnicode_GET_SIZE(self);
4284 for (; p < e; p++) {
4285 if (!Py_UNICODE_ISALPHA(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004286 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004287 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004288 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004289}
4290
/* Docstring for the isalnum() method. */
static char isalnum__doc__[] =
    "S.isalnum() -> bool\n"
    "\n"
    "Return True if all characters in S are alphanumeric\n"
    "and there is at least one character in S, False otherwise.";
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004296
4297static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004298unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004299{
4300 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4301 register const Py_UNICODE *e;
4302
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004303 /* Shortcut for single character strings */
4304 if (PyUnicode_GET_SIZE(self) == 1 &&
4305 Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004306 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004307
4308 /* Special case for empty strings */
4309 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004310 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004311
4312 e = p + PyUnicode_GET_SIZE(self);
4313 for (; p < e; p++) {
4314 if (!Py_UNICODE_ISALNUM(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004315 return PyBool_FromLong(0);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004316 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004317 return PyBool_FromLong(1);
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004318}
4319
/* Docstring for the isdecimal() method.  The wording states the
   empty-string result explicitly, matching isalpha() and isalnum(). */
static char isdecimal__doc__[] =
"S.isdecimal() -> bool\n\
\n\
Return True if all characters in S are decimal\n\
and there is at least one character in S, False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325
4326static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004327unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328{
4329 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4330 register const Py_UNICODE *e;
4331
Guido van Rossumd57fd912000-03-10 22:53:23 +00004332 /* Shortcut for single character strings */
4333 if (PyUnicode_GET_SIZE(self) == 1 &&
4334 Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004335 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004337 /* Special case for empty strings */
4338 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004339 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004340
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341 e = p + PyUnicode_GET_SIZE(self);
4342 for (; p < e; p++) {
4343 if (!Py_UNICODE_ISDECIMAL(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004344 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004345 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004346 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004347}
4348
/* Docstring for the isdigit() method.  The wording states the
   empty-string result explicitly, matching isalpha() and isalnum(). */
static char isdigit__doc__[] =
"S.isdigit() -> bool\n\
\n\
Return True if all characters in S are digits\n\
and there is at least one character in S, False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354
4355static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004356unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004357{
4358 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4359 register const Py_UNICODE *e;
4360
Guido van Rossumd57fd912000-03-10 22:53:23 +00004361 /* Shortcut for single character strings */
4362 if (PyUnicode_GET_SIZE(self) == 1 &&
4363 Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004364 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004365
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004366 /* Special case for empty strings */
4367 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004368 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004369
Guido van Rossumd57fd912000-03-10 22:53:23 +00004370 e = p + PyUnicode_GET_SIZE(self);
4371 for (; p < e; p++) {
4372 if (!Py_UNICODE_ISDIGIT(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004373 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004374 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004375 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004376}
4377
/* Docstring for the isnumeric() method.  The wording states the
   empty-string result explicitly, matching isalpha() and isalnum(). */
static char isnumeric__doc__[] =
"S.isnumeric() -> bool\n\
\n\
Return True if all characters in S are numeric\n\
and there is at least one character in S, False otherwise.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004383
4384static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004385unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004386{
4387 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4388 register const Py_UNICODE *e;
4389
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390 /* Shortcut for single character strings */
4391 if (PyUnicode_GET_SIZE(self) == 1 &&
4392 Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004393 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004394
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004395 /* Special case for empty strings */
4396 if (PyString_GET_SIZE(self) == 0)
Guido van Rossum77f6a652002-04-03 22:41:51 +00004397 return PyBool_FromLong(0);
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004398
Guido van Rossumd57fd912000-03-10 22:53:23 +00004399 e = p + PyUnicode_GET_SIZE(self);
4400 for (; p < e; p++) {
4401 if (!Py_UNICODE_ISNUMERIC(*p))
Guido van Rossum77f6a652002-04-03 22:41:51 +00004402 return PyBool_FromLong(0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004403 }
Guido van Rossum77f6a652002-04-03 22:41:51 +00004404 return PyBool_FromLong(1);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405}
4406
/* Docstring for the join() method. */
static char join__doc__[] =
    "S.join(sequence) -> unicode\n"
    "\n"
    "Return a string which is the concatenation of the strings in the\n"
    "sequence. The separator between elements is S.";
4412
4413static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004414unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004416 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004417}
4418
4419static int
4420unicode_length(PyUnicodeObject *self)
4421{
4422 return self->length;
4423}
4424
/* Docstring for the ljust() method. */
static char ljust__doc__[] =
    "S.ljust(width) -> unicode\n"
    "\n"
    "Return S left justified in a Unicode string of length width. Padding is\n"
    "done using spaces.";
4430
4431static PyObject *
4432unicode_ljust(PyUnicodeObject *self, PyObject *args)
4433{
4434 int width;
4435 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4436 return NULL;
4437
Tim Peters7a29bd52001-09-12 03:03:31 +00004438 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439 Py_INCREF(self);
4440 return (PyObject*) self;
4441 }
4442
4443 return (PyObject*) pad(self, 0, width - self->length, ' ');
4444}
4445
/* Docstring for the lower() method. */
static char lower__doc__[] =
    "S.lower() -> unicode\n"
    "\n"
    "Return a copy of the string S converted to lowercase.";
4450
4451static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004452unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004453{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454 return fixup(self, fixlower);
4455}
4456
/* Docstring for the lstrip() method. */
static char lstrip__doc__[] =
    "S.lstrip() -> unicode\n"
    "\n"
    "Return a copy of the string S with leading whitespace removed.";
4461
4462static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004463unicode_lstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004464{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465 return strip(self, 1, 0);
4466}
4467
4468static PyObject*
4469unicode_repeat(PyUnicodeObject *str, int len)
4470{
4471 PyUnicodeObject *u;
4472 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004473 int nchars;
4474 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004475
4476 if (len < 0)
4477 len = 0;
4478
Tim Peters7a29bd52001-09-12 03:03:31 +00004479 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480 /* no repeat, return original string */
4481 Py_INCREF(str);
4482 return (PyObject*) str;
4483 }
Tim Peters8f422462000-09-09 06:13:41 +00004484
4485 /* ensure # of chars needed doesn't overflow int and # of bytes
4486 * needed doesn't overflow size_t
4487 */
4488 nchars = len * str->length;
4489 if (len && nchars / len != str->length) {
4490 PyErr_SetString(PyExc_OverflowError,
4491 "repeated string is too long");
4492 return NULL;
4493 }
4494 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4495 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4496 PyErr_SetString(PyExc_OverflowError,
4497 "repeated string is too long");
4498 return NULL;
4499 }
4500 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004501 if (!u)
4502 return NULL;
4503
4504 p = u->str;
4505
4506 while (len-- > 0) {
4507 Py_UNICODE_COPY(p, str->str, str->length);
4508 p += str->length;
4509 }
4510
4511 return (PyObject*) u;
4512}
4513
4514PyObject *PyUnicode_Replace(PyObject *obj,
4515 PyObject *subobj,
4516 PyObject *replobj,
4517 int maxcount)
4518{
4519 PyObject *self;
4520 PyObject *str1;
4521 PyObject *str2;
4522 PyObject *result;
4523
4524 self = PyUnicode_FromObject(obj);
4525 if (self == NULL)
4526 return NULL;
4527 str1 = PyUnicode_FromObject(subobj);
4528 if (str1 == NULL) {
4529 Py_DECREF(self);
4530 return NULL;
4531 }
4532 str2 = PyUnicode_FromObject(replobj);
4533 if (str2 == NULL) {
4534 Py_DECREF(self);
4535 Py_DECREF(str1);
4536 return NULL;
4537 }
4538 result = replace((PyUnicodeObject *)self,
4539 (PyUnicodeObject *)str1,
4540 (PyUnicodeObject *)str2,
4541 maxcount);
4542 Py_DECREF(self);
4543 Py_DECREF(str1);
4544 Py_DECREF(str2);
4545 return result;
4546}
4547
/* Docstring for the replace() method.  The optional third argument
   limits the number of replacements, so it is named maxcount here. */
static char replace__doc__[] =
"S.replace(old, new[, maxcount]) -> unicode\n\
\n\
Return a copy of S with all occurrences of substring\n\
old replaced by new. If the optional argument maxcount is\n\
given, only the first maxcount occurrences are replaced.";
4554
4555static PyObject*
4556unicode_replace(PyUnicodeObject *self, PyObject *args)
4557{
4558 PyUnicodeObject *str1;
4559 PyUnicodeObject *str2;
4560 int maxcount = -1;
4561 PyObject *result;
4562
4563 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4564 return NULL;
4565 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4566 if (str1 == NULL)
4567 return NULL;
4568 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4569 if (str2 == NULL)
4570 return NULL;
4571
4572 result = replace(self, str1, str2, maxcount);
4573
4574 Py_DECREF(str1);
4575 Py_DECREF(str2);
4576 return result;
4577}
4578
4579static
4580PyObject *unicode_repr(PyObject *unicode)
4581{
4582 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4583 PyUnicode_GET_SIZE(unicode),
4584 1);
4585}
4586
/* Docstring for the rfind() method. */
static char rfind__doc__[] =
"S.rfind(sub [,start [,end]]) -> int\n\
\n\
Return the highest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.";
4595
4596static PyObject *
4597unicode_rfind(PyUnicodeObject *self, PyObject *args)
4598{
4599 PyUnicodeObject *substring;
4600 int start = 0;
4601 int end = INT_MAX;
4602 PyObject *result;
4603
Guido van Rossumb8872e62000-05-09 14:14:27 +00004604 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4605 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004606 return NULL;
4607 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4608 (PyObject *)substring);
4609 if (substring == NULL)
4610 return NULL;
4611
4612 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4613
4614 Py_DECREF(substring);
4615 return result;
4616}
4617
/* Docstring for the rindex() method. */
static char rindex__doc__[] =
    "S.rindex(sub [,start [,end]]) -> int\n"
    "\n"
    "Like S.rfind() but raise ValueError when the substring is not found.";
4622
4623static PyObject *
4624unicode_rindex(PyUnicodeObject *self, PyObject *args)
4625{
4626 int result;
4627 PyUnicodeObject *substring;
4628 int start = 0;
4629 int end = INT_MAX;
4630
Guido van Rossumb8872e62000-05-09 14:14:27 +00004631 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4632 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004633 return NULL;
4634 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4635 (PyObject *)substring);
4636 if (substring == NULL)
4637 return NULL;
4638
4639 result = findstring(self, substring, start, end, -1);
4640
4641 Py_DECREF(substring);
4642 if (result < 0) {
4643 PyErr_SetString(PyExc_ValueError, "substring not found");
4644 return NULL;
4645 }
4646 return PyInt_FromLong(result);
4647}
4648
/* Docstring for the rjust() method. */
static char rjust__doc__[] =
    "S.rjust(width) -> unicode\n"
    "\n"
    "Return S right justified in a Unicode string of length width. Padding is\n"
    "done using spaces.";
4654
4655static PyObject *
4656unicode_rjust(PyUnicodeObject *self, PyObject *args)
4657{
4658 int width;
4659 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4660 return NULL;
4661
Tim Peters7a29bd52001-09-12 03:03:31 +00004662 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663 Py_INCREF(self);
4664 return (PyObject*) self;
4665 }
4666
4667 return (PyObject*) pad(self, width - self->length, 0, ' ');
4668}
4669
/* Docstring for the rstrip() method. */
static char rstrip__doc__[] =
    "S.rstrip() -> unicode\n"
    "\n"
    "Return a copy of the string S with trailing whitespace removed.";
4674
4675static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004676unicode_rstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004677{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004678 return strip(self, 0, 1);
4679}
4680
4681static PyObject*
4682unicode_slice(PyUnicodeObject *self, int start, int end)
4683{
4684 /* standard clamping */
4685 if (start < 0)
4686 start = 0;
4687 if (end < 0)
4688 end = 0;
4689 if (end > self->length)
4690 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004691 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004692 /* full slice, return original string */
4693 Py_INCREF(self);
4694 return (PyObject*) self;
4695 }
4696 if (start > end)
4697 start = end;
4698 /* copy slice */
4699 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4700 end - start);
4701}
4702
4703PyObject *PyUnicode_Split(PyObject *s,
4704 PyObject *sep,
4705 int maxsplit)
4706{
4707 PyObject *result;
4708
4709 s = PyUnicode_FromObject(s);
4710 if (s == NULL)
4711 return NULL;
4712 if (sep != NULL) {
4713 sep = PyUnicode_FromObject(sep);
4714 if (sep == NULL) {
4715 Py_DECREF(s);
4716 return NULL;
4717 }
4718 }
4719
4720 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4721
4722 Py_DECREF(s);
4723 Py_XDECREF(sep);
4724 return result;
4725}
4726
/* Docstring shown for unicode.split(). */
static char split__doc__[] =
    "S.split([sep [,maxsplit]]) -> list of strings\n"
    "\n"
    "Return a list of the words in S, using sep as the\n"
    "delimiter string. If maxsplit is given, at most maxsplit\n"
    "splits are done. If sep is not specified, any whitespace string\n"
    "is a separator.";
4734
4735static PyObject*
4736unicode_split(PyUnicodeObject *self, PyObject *args)
4737{
4738 PyObject *substring = Py_None;
4739 int maxcount = -1;
4740
4741 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4742 return NULL;
4743
4744 if (substring == Py_None)
4745 return split(self, NULL, maxcount);
4746 else if (PyUnicode_Check(substring))
4747 return split(self, (PyUnicodeObject *)substring, maxcount);
4748 else
4749 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4750}
4751
/* Docstring shown for unicode.splitlines().  keepends is the single
   optional argument, so the signature carries exactly one bracket pair. */
static char splitlines__doc__[] =
"S.splitlines([keepends]) -> list of strings\n\
\n\
Return a list of the lines in S, breaking at line boundaries.\n\
Line breaks are not included in the resulting list unless keepends\n\
is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758
4759static PyObject*
4760unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4761{
Guido van Rossum86662912000-04-11 15:38:46 +00004762 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004763
Guido van Rossum86662912000-04-11 15:38:46 +00004764 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765 return NULL;
4766
Guido van Rossum86662912000-04-11 15:38:46 +00004767 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768}
4769
/* Return the result of PyUnicode_AsEncodedString() for self, passing NULL
   for both remaining arguments.
   NOTE(review): NULL presumably selects the default encoding and the
   default error handling -- confirm against PyUnicode_AsEncodedString(). */
static
PyObject *unicode_str(PyUnicodeObject *self)
{
    return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
}
4775
/* Docstring shown for unicode.strip(). */
static char strip__doc__[] =
    "S.strip() -> unicode\n"
    "\n"
    "Return a copy of S with leading and trailing whitespace removed.";
4780
/* S.strip(): thin wrapper that returns whatever strip(self, 1, 1) yields.
   NOTE(review): the two int arguments look like "strip leading" and
   "strip trailing" switches (unicode_rstrip passes 0, 1) -- confirm
   against strip(). */
static PyObject *
unicode_strip(PyUnicodeObject *self)
{
    return strip(self, 1, 1);
}
4786
/* Docstring shown for unicode.swapcase(). */
static char swapcase__doc__[] =
    "S.swapcase() -> unicode\n"
    "\n"
    "Return a copy of S with uppercase characters converted to lowercase\n"
    "and vice versa.";
4792
/* S.swapcase(): thin wrapper that hands self and the fixswapcase callback
   to fixup() and returns its result unchanged. */
static PyObject*
unicode_swapcase(PyUnicodeObject *self)
{
    return fixup(self, fixswapcase);
}
4798
/* Docstring shown for unicode.translate(). */
static char translate__doc__[] =
    "S.translate(table) -> unicode\n"
    "\n"
    "Return a copy of the string S, where all characters have been mapped\n"
    "through the given translation table, which must be a mapping of\n"
    "Unicode ordinals to Unicode ordinals or None. Unmapped characters\n"
    "are left untouched. Characters mapped to None are deleted.";
4806
/* S.translate(table): thin wrapper that passes the character buffer and
   length of self, the caller's table and the literal "ignore" to
   PyUnicode_TranslateCharmap() and returns its result.
   NOTE(review): "ignore" is presumably the error-handling mode -- confirm
   against PyUnicode_TranslateCharmap(). */
static PyObject*
unicode_translate(PyUnicodeObject *self, PyObject *table)
{
    return PyUnicode_TranslateCharmap(self->str,
                                      self->length,
                                      table,
                                      "ignore");
}
4815
/* Docstring shown for unicode.upper(). */
static char upper__doc__[] =
    "S.upper() -> unicode\n"
    "\n"
    "Return a copy of S converted to uppercase.";
4820
/* S.upper(): thin wrapper that hands self and the fixupper callback to
   fixup() and returns its result unchanged. */
static PyObject*
unicode_upper(PyUnicodeObject *self)
{
    return fixup(self, fixupper);
}
4826
/* Docstring shown for unicode.zfill().  The string is called S throughout,
   matching the signature line and the other method docstrings. */
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string S with zeros on the left, to fill a field\n\
of the specified width. The string S is never truncated.";
4832
4833static PyObject *
4834unicode_zfill(PyUnicodeObject *self, PyObject *args)
4835{
4836 int fill;
4837 PyUnicodeObject *u;
4838
4839 int width;
4840 if (!PyArg_ParseTuple(args, "i:zfill", &width))
4841 return NULL;
4842
4843 if (self->length >= width) {
Walter Dörwald0fe940c2002-04-15 18:42:15 +00004844 if (PyUnicode_CheckExact(self)) {
4845 Py_INCREF(self);
4846 return (PyObject*) self;
4847 }
4848 else
4849 return PyUnicode_FromUnicode(
4850 PyUnicode_AS_UNICODE(self),
4851 PyUnicode_GET_SIZE(self)
4852 );
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853 }
4854
4855 fill = width - self->length;
4856
4857 u = pad(self, fill, 0, '0');
4858
Walter Dörwald068325e2002-04-15 13:36:47 +00004859 if (u == NULL)
4860 return NULL;
4861
Guido van Rossumd57fd912000-03-10 22:53:23 +00004862 if (u->str[fill] == '+' || u->str[fill] == '-') {
4863 /* move sign to beginning of string */
4864 u->str[0] = u->str[fill];
4865 u->str[fill] = '0';
4866 }
4867
4868 return (PyObject*) u;
4869}
Guido van Rossumd57fd912000-03-10 22:53:23 +00004870
#if 0
/* Debugging aid, currently compiled out: returns the value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(unicode_freelist_size);
}
#endif
4878
/* Docstring shown for unicode.startswith(). */
static char startswith__doc__[] =
    "S.startswith(prefix[, start[, end]]) -> bool\n"
    "\n"
    "Return True if S starts with the specified prefix, False otherwise. With\n"
    "optional start, test S beginning at that position. With optional end, stop\n"
    "comparing S at that position.";
4885
4886static PyObject *
4887unicode_startswith(PyUnicodeObject *self,
4888 PyObject *args)
4889{
4890 PyUnicodeObject *substring;
4891 int start = 0;
4892 int end = INT_MAX;
4893 PyObject *result;
4894
Guido van Rossumb8872e62000-05-09 14:14:27 +00004895 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4896 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004897 return NULL;
4898 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4899 (PyObject *)substring);
4900 if (substring == NULL)
4901 return NULL;
4902
Guido van Rossum77f6a652002-04-03 22:41:51 +00004903 result = PyBool_FromLong(tailmatch(self, substring, start, end, -1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004904
4905 Py_DECREF(substring);
4906 return result;
4907}
4908
4909
/* Docstring shown for unicode.endswith(). */
static char endswith__doc__[] =
    "S.endswith(suffix[, start[, end]]) -> bool\n"
    "\n"
    "Return True if S ends with the specified suffix, False otherwise. With\n"
    "optional start, test S beginning at that position. With optional end, stop\n"
    "comparing S at that position.";
4916
4917static PyObject *
4918unicode_endswith(PyUnicodeObject *self,
4919 PyObject *args)
4920{
4921 PyUnicodeObject *substring;
4922 int start = 0;
4923 int end = INT_MAX;
4924 PyObject *result;
4925
Guido van Rossumb8872e62000-05-09 14:14:27 +00004926 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4927 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004928 return NULL;
4929 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4930 (PyObject *)substring);
4931 if (substring == NULL)
4932 return NULL;
4933
Guido van Rossum77f6a652002-04-03 22:41:51 +00004934 result = PyBool_FromLong(tailmatch(self, substring, start, end, +1));
Guido van Rossumd57fd912000-03-10 22:53:23 +00004935
4936 Py_DECREF(substring);
4937 return result;
4938}
4939
4940
/* Method table of the unicode type.  Every entry lists the Python-visible
   name, the C function implementing it, the calling-convention flag and
   the docstring.  Judging by the definitions in this file, METH_VARARGS
   functions take (self, args) and unpack args with PyArg_ParseTuple(),
   METH_O functions take (self, arg) (e.g. unicode_translate), and
   METH_NOARGS functions take only self.  The table ends with a
   {NULL, NULL} sentinel entry. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
#endif

    {NULL, NULL}
};
4993
/* Sequence slots of the unicode type.  The two assignment slots are left
   0, i.e. no item or slice assignment function is installed. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, /* sq_length */
    (binaryfunc) PyUnicode_Concat, /* sq_concat */
    (intargfunc) unicode_repeat, /* sq_repeat */
    (intargfunc) unicode_getitem, /* sq_item */
    (intintargfunc) unicode_slice, /* sq_slice */
    0, /* sq_ass_item */
    0, /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, /*sq_contains*/
};
5004
5005static int
5006unicode_buffer_getreadbuf(PyUnicodeObject *self,
5007 int index,
5008 const void **ptr)
5009{
5010 if (index != 0) {
5011 PyErr_SetString(PyExc_SystemError,
5012 "accessing non-existent unicode segment");
5013 return -1;
5014 }
5015 *ptr = (void *) self->str;
5016 return PyUnicode_GET_DATA_SIZE(self);
5017}
5018
5019static int
5020unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
5021 const void **ptr)
5022{
5023 PyErr_SetString(PyExc_TypeError,
5024 "cannot use unicode as modifyable buffer");
5025 return -1;
5026}
5027
5028static int
5029unicode_buffer_getsegcount(PyUnicodeObject *self,
5030 int *lenp)
5031{
5032 if (lenp)
5033 *lenp = PyUnicode_GET_DATA_SIZE(self);
5034 return 1;
5035}
5036
5037static int
5038unicode_buffer_getcharbuf(PyUnicodeObject *self,
5039 int index,
5040 const void **ptr)
5041{
5042 PyObject *str;
5043
5044 if (index != 0) {
5045 PyErr_SetString(PyExc_SystemError,
5046 "accessing non-existent unicode segment");
5047 return -1;
5048 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005049 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050 if (str == NULL)
5051 return -1;
5052 *ptr = (void *) PyString_AS_STRING(str);
5053 return PyString_GET_SIZE(str);
5054}
5055
5056/* Helpers for PyUnicode_Format() */
5057
5058static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005059getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005060{
5061 int argidx = *p_argidx;
5062 if (argidx < arglen) {
5063 (*p_argidx)++;
5064 if (arglen < 0)
5065 return args;
5066 else
5067 return PyTuple_GetItem(args, argidx);
5068 }
5069 PyErr_SetString(PyExc_TypeError,
5070 "not enough arguments for format string");
5071 return NULL;
5072}
5073
/* Conversion flags collected by PyUnicode_Format() while it parses a
   format specifier.  Each one is a distinct bit so they can be OR-ed
   together in a single int. */
#define F_LJUST (1<<0)  /* '-' flag; also set for a negative '*' width */
#define F_SIGN (1<<1)   /* '+' flag */
#define F_BLANK (1<<2)  /* ' ' flag */
#define F_ALT (1<<3)    /* '#' flag */
#define F_ZERO (1<<4)   /* '0' flag */
5079
5080static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005081int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082{
5083 register int i;
5084 int len;
5085 va_list va;
5086 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005087 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005088
5089 /* First, format the string as char array, then expand to Py_UNICODE
5090 array. */
5091 charbuffer = (char *)buffer;
5092 len = vsprintf(charbuffer, format, va);
5093 for (i = len - 1; i >= 0; i--)
5094 buffer[i] = (Py_UNICODE) charbuffer[i];
5095
5096 va_end(va);
5097 return len;
5098}
5099
5100static int
5101formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005102 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005103 int flags,
5104 int prec,
5105 int type,
5106 PyObject *v)
5107{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005108 /* fmt = '%#.' + `prec` + `type`
5109 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 char fmt[20];
5111 double x;
5112
5113 x = PyFloat_AsDouble(v);
5114 if (x == -1.0 && PyErr_Occurred())
5115 return -1;
5116 if (prec < 0)
5117 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5119 type = 'g';
Barry Warsawe5c492d2001-11-28 21:00:41 +00005120 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%d%c",
5121 (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005122 /* worst case length calc to ensure no buffer overrun:
5123 fmt = %#.<prec>g
5124 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5125 for any double rep.)
5126 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5127 If prec=0 the effective precision is 1 (the leading digit is
5128 always given), therefore increase by one to 10+prec. */
5129 if (buflen <= (size_t)10 + (size_t)prec) {
5130 PyErr_SetString(PyExc_OverflowError,
5131 "formatted float is too long (precision too long?)");
5132 return -1;
5133 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005134 return usprintf(buf, fmt, x);
5135}
5136
Tim Peters38fd5b62000-09-21 05:43:11 +00005137static PyObject*
5138formatlong(PyObject *val, int flags, int prec, int type)
5139{
5140 char *buf;
5141 int i, len;
5142 PyObject *str; /* temporary string object. */
5143 PyUnicodeObject *result;
5144
5145 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5146 if (!str)
5147 return NULL;
5148 result = _PyUnicode_New(len);
5149 for (i = 0; i < len; i++)
5150 result->str[i] = buf[i];
5151 result->str[len] = 0;
5152 Py_DECREF(str);
5153 return (PyObject*)result;
5154}
5155
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156static int
5157formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005158 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005159 int flags,
5160 int prec,
5161 int type,
5162 PyObject *v)
5163{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005164 /* fmt = '%#.' + `prec` + 'l' + `type`
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005165 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5166 * + 1 + 1
5167 * = 24
5168 */
Tim Peters38fd5b62000-09-21 05:43:11 +00005169 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170 long x;
5171
5172 x = PyInt_AsLong(v);
5173 if (x == -1 && PyErr_Occurred())
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005174 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005175 if (prec < 0)
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005176 prec = 1;
5177
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005178 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005179 * worst case buf = '0x' + [0-9]*prec, where prec >= 11
5180 */
5181 if (buflen <= 13 || buflen <= (size_t)2 + (size_t)prec) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005182 PyErr_SetString(PyExc_OverflowError,
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005183 "formatted integer is too long (precision too large?)");
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005184 return -1;
5185 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005186
5187 if ((flags & F_ALT) &&
5188 (type == 'x' || type == 'X')) {
5189 /* When converting under %#x or %#X, there are a number
5190 * of issues that cause pain:
5191 * - when 0 is being converted, the C standard leaves off
5192 * the '0x' or '0X', which is inconsistent with other
5193 * %#x/%#X conversions and inconsistent with Python's
5194 * hex() function
5195 * - there are platforms that violate the standard and
5196 * convert 0 with the '0x' or '0X'
5197 * (Metrowerks, Compaq Tru64)
5198 * - there are platforms that give '0x' when converting
5199 * under %#X, but convert 0 in accordance with the
5200 * standard (OS/2 EMX)
5201 *
5202 * We can achieve the desired consistency by inserting our
5203 * own '0x' or '0X' prefix, and substituting %x/%X in place
5204 * of %#x/%#X.
5205 *
5206 * Note that this is the same approach as used in
5207 * formatint() in stringobject.c
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005208 */
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005209 PyOS_snprintf(fmt, sizeof(fmt), "0%c%%.%dl%c",
5210 type, prec, type);
Andrew MacIntyrec4874392002-02-26 11:36:35 +00005211 }
Andrew MacIntyre5e9c80d2002-02-28 11:38:24 +00005212 else {
5213 PyOS_snprintf(fmt, sizeof(fmt), "%%%s.%dl%c",
5214 (flags&F_ALT) ? "#" : "",
5215 prec, type);
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005216 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217 return usprintf(buf, fmt, x);
5218}
5219
5220static int
5221formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005222 size_t buflen,
5223 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005225 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005226 if (PyUnicode_Check(v)) {
5227 if (PyUnicode_GET_SIZE(v) != 1)
5228 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005230 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005232 else if (PyString_Check(v)) {
5233 if (PyString_GET_SIZE(v) != 1)
5234 goto onError;
5235 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5236 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005237
5238 else {
5239 /* Integer input truncated to a character */
5240 long x;
5241 x = PyInt_AsLong(v);
5242 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005243 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244 buf[0] = (char) x;
5245 }
5246 buf[1] = '\0';
5247 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005248
5249 onError:
5250 PyErr_SetString(PyExc_TypeError,
5251 "%c requires int or char");
5252 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253}
5254
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005255/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
5256
5257 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
5258 chars are formatted. XXX This is a magic number. Each formatting
5259 routine does bounds checking to ensure no overflow, but a better
5260 solution may be to malloc a buffer of appropriate size for each
5261 format. For now, the current solution is sufficient.
5262*/
#define FORMATBUFLEN ((size_t)120) /* parenthesized: safe in any expression */
5264
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265PyObject *PyUnicode_Format(PyObject *format,
5266 PyObject *args)
5267{
5268 Py_UNICODE *fmt, *res;
5269 int fmtcnt, rescnt, reslen, arglen, argidx;
5270 int args_owned = 0;
5271 PyUnicodeObject *result = NULL;
5272 PyObject *dict = NULL;
5273 PyObject *uformat;
5274
5275 if (format == NULL || args == NULL) {
5276 PyErr_BadInternalCall();
5277 return NULL;
5278 }
5279 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005280 if (uformat == NULL)
5281 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282 fmt = PyUnicode_AS_UNICODE(uformat);
5283 fmtcnt = PyUnicode_GET_SIZE(uformat);
5284
5285 reslen = rescnt = fmtcnt + 100;
5286 result = _PyUnicode_New(reslen);
5287 if (result == NULL)
5288 goto onError;
5289 res = PyUnicode_AS_UNICODE(result);
5290
5291 if (PyTuple_Check(args)) {
5292 arglen = PyTuple_Size(args);
5293 argidx = 0;
5294 }
5295 else {
5296 arglen = -1;
5297 argidx = -2;
5298 }
5299 if (args->ob_type->tp_as_mapping)
5300 dict = args;
5301
5302 while (--fmtcnt >= 0) {
5303 if (*fmt != '%') {
5304 if (--rescnt < 0) {
5305 rescnt = fmtcnt + 100;
5306 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005307 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308 return NULL;
5309 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5310 --rescnt;
5311 }
5312 *res++ = *fmt++;
5313 }
5314 else {
5315 /* Got a format specifier */
5316 int flags = 0;
5317 int width = -1;
5318 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005319 Py_UNICODE c = '\0';
5320 Py_UNICODE fill;
5321 PyObject *v = NULL;
5322 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005323 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005324 Py_UNICODE sign;
5325 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005326 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005327
5328 fmt++;
5329 if (*fmt == '(') {
5330 Py_UNICODE *keystart;
5331 int keylen;
5332 PyObject *key;
5333 int pcount = 1;
5334
5335 if (dict == NULL) {
5336 PyErr_SetString(PyExc_TypeError,
5337 "format requires a mapping");
5338 goto onError;
5339 }
5340 ++fmt;
5341 --fmtcnt;
5342 keystart = fmt;
5343 /* Skip over balanced parentheses */
5344 while (pcount > 0 && --fmtcnt >= 0) {
5345 if (*fmt == ')')
5346 --pcount;
5347 else if (*fmt == '(')
5348 ++pcount;
5349 fmt++;
5350 }
5351 keylen = fmt - keystart - 1;
5352 if (fmtcnt < 0 || pcount > 0) {
5353 PyErr_SetString(PyExc_ValueError,
5354 "incomplete format key");
5355 goto onError;
5356 }
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005357#if 0
Fred Drakee4315f52000-05-09 19:53:39 +00005358 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005359 then looked up since Python uses strings to hold
5360 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005361 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362 key = PyUnicode_EncodeUTF8(keystart,
5363 keylen,
5364 NULL);
Marc-André Lemburg72f82132001-11-20 15:18:49 +00005365#else
5366 key = PyUnicode_FromUnicode(keystart, keylen);
5367#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368 if (key == NULL)
5369 goto onError;
5370 if (args_owned) {
5371 Py_DECREF(args);
5372 args_owned = 0;
5373 }
5374 args = PyObject_GetItem(dict, key);
5375 Py_DECREF(key);
5376 if (args == NULL) {
5377 goto onError;
5378 }
5379 args_owned = 1;
5380 arglen = -1;
5381 argidx = -2;
5382 }
5383 while (--fmtcnt >= 0) {
5384 switch (c = *fmt++) {
5385 case '-': flags |= F_LJUST; continue;
5386 case '+': flags |= F_SIGN; continue;
5387 case ' ': flags |= F_BLANK; continue;
5388 case '#': flags |= F_ALT; continue;
5389 case '0': flags |= F_ZERO; continue;
5390 }
5391 break;
5392 }
5393 if (c == '*') {
5394 v = getnextarg(args, arglen, &argidx);
5395 if (v == NULL)
5396 goto onError;
5397 if (!PyInt_Check(v)) {
5398 PyErr_SetString(PyExc_TypeError,
5399 "* wants int");
5400 goto onError;
5401 }
5402 width = PyInt_AsLong(v);
5403 if (width < 0) {
5404 flags |= F_LJUST;
5405 width = -width;
5406 }
5407 if (--fmtcnt >= 0)
5408 c = *fmt++;
5409 }
5410 else if (c >= '0' && c <= '9') {
5411 width = c - '0';
5412 while (--fmtcnt >= 0) {
5413 c = *fmt++;
5414 if (c < '0' || c > '9')
5415 break;
5416 if ((width*10) / 10 != width) {
5417 PyErr_SetString(PyExc_ValueError,
5418 "width too big");
5419 goto onError;
5420 }
5421 width = width*10 + (c - '0');
5422 }
5423 }
5424 if (c == '.') {
5425 prec = 0;
5426 if (--fmtcnt >= 0)
5427 c = *fmt++;
5428 if (c == '*') {
5429 v = getnextarg(args, arglen, &argidx);
5430 if (v == NULL)
5431 goto onError;
5432 if (!PyInt_Check(v)) {
5433 PyErr_SetString(PyExc_TypeError,
5434 "* wants int");
5435 goto onError;
5436 }
5437 prec = PyInt_AsLong(v);
5438 if (prec < 0)
5439 prec = 0;
5440 if (--fmtcnt >= 0)
5441 c = *fmt++;
5442 }
5443 else if (c >= '0' && c <= '9') {
5444 prec = c - '0';
5445 while (--fmtcnt >= 0) {
5446 c = Py_CHARMASK(*fmt++);
5447 if (c < '0' || c > '9')
5448 break;
5449 if ((prec*10) / 10 != prec) {
5450 PyErr_SetString(PyExc_ValueError,
5451 "prec too big");
5452 goto onError;
5453 }
5454 prec = prec*10 + (c - '0');
5455 }
5456 }
5457 } /* prec */
5458 if (fmtcnt >= 0) {
5459 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005460 if (--fmtcnt >= 0)
5461 c = *fmt++;
5462 }
5463 }
5464 if (fmtcnt < 0) {
5465 PyErr_SetString(PyExc_ValueError,
5466 "incomplete format");
5467 goto onError;
5468 }
5469 if (c != '%') {
5470 v = getnextarg(args, arglen, &argidx);
5471 if (v == NULL)
5472 goto onError;
5473 }
5474 sign = 0;
5475 fill = ' ';
5476 switch (c) {
5477
5478 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005479 pbuf = formatbuf;
5480 /* presume that buffer length is at least 1 */
5481 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005482 len = 1;
5483 break;
5484
5485 case 's':
5486 case 'r':
5487 if (PyUnicode_Check(v) && c == 's') {
5488 temp = v;
5489 Py_INCREF(temp);
5490 }
5491 else {
5492 PyObject *unicode;
5493 if (c == 's')
5494 temp = PyObject_Str(v);
5495 else
5496 temp = PyObject_Repr(v);
5497 if (temp == NULL)
5498 goto onError;
5499 if (!PyString_Check(temp)) {
5500 /* XXX Note: this should never happen, since
5501 PyObject_Repr() and PyObject_Str() assure
5502 this */
5503 Py_DECREF(temp);
5504 PyErr_SetString(PyExc_TypeError,
5505 "%s argument has non-string str()");
5506 goto onError;
5507 }
Fred Drakee4315f52000-05-09 19:53:39 +00005508 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005510 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511 "strict");
5512 Py_DECREF(temp);
5513 temp = unicode;
5514 if (temp == NULL)
5515 goto onError;
5516 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005517 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005518 len = PyUnicode_GET_SIZE(temp);
5519 if (prec >= 0 && len > prec)
5520 len = prec;
5521 break;
5522
5523 case 'i':
5524 case 'd':
5525 case 'u':
5526 case 'o':
5527 case 'x':
5528 case 'X':
5529 if (c == 'i')
5530 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005531 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005532 temp = formatlong(v, flags, prec, c);
5533 if (!temp)
5534 goto onError;
5535 pbuf = PyUnicode_AS_UNICODE(temp);
5536 len = PyUnicode_GET_SIZE(temp);
5537 /* unbounded ints can always produce
5538 a sign character! */
5539 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005540 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005541 else {
5542 pbuf = formatbuf;
5543 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5544 flags, prec, c, v);
5545 if (len < 0)
5546 goto onError;
5547 /* only d conversion is signed */
5548 sign = c == 'd';
5549 }
5550 if (flags & F_ZERO)
5551 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005552 break;
5553
5554 case 'e':
5555 case 'E':
5556 case 'f':
5557 case 'g':
5558 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005559 pbuf = formatbuf;
5560 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5561 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562 if (len < 0)
5563 goto onError;
5564 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005565 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 fill = '0';
5567 break;
5568
5569 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005570 pbuf = formatbuf;
5571 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 if (len < 0)
5573 goto onError;
5574 break;
5575
5576 default:
5577 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005578 "unsupported format character '%c' (0x%x) "
5579 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005580 (31<=c && c<=126) ? c : '?',
5581 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005582 goto onError;
5583 }
5584 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005585 if (*pbuf == '-' || *pbuf == '+') {
5586 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005587 len--;
5588 }
5589 else if (flags & F_SIGN)
5590 sign = '+';
5591 else if (flags & F_BLANK)
5592 sign = ' ';
5593 else
5594 sign = 0;
5595 }
5596 if (width < len)
5597 width = len;
5598 if (rescnt < width + (sign != 0)) {
5599 reslen -= rescnt;
5600 rescnt = width + fmtcnt + 100;
5601 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005602 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005603 return NULL;
5604 res = PyUnicode_AS_UNICODE(result)
5605 + reslen - rescnt;
5606 }
5607 if (sign) {
5608 if (fill != ' ')
5609 *res++ = sign;
5610 rescnt--;
5611 if (width > len)
5612 width--;
5613 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005614 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5615 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005616 assert(pbuf[1] == c);
5617 if (fill != ' ') {
5618 *res++ = *pbuf++;
5619 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005620 }
Tim Petersfff53252001-04-12 18:38:48 +00005621 rescnt -= 2;
5622 width -= 2;
5623 if (width < 0)
5624 width = 0;
5625 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005626 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005627 if (width > len && !(flags & F_LJUST)) {
5628 do {
5629 --rescnt;
5630 *res++ = fill;
5631 } while (--width > len);
5632 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005633 if (fill == ' ') {
5634 if (sign)
5635 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005636 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005637 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005638 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005639 *res++ = *pbuf++;
5640 *res++ = *pbuf++;
5641 }
5642 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005643 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005644 res += len;
5645 rescnt -= len;
5646 while (--width >= len) {
5647 --rescnt;
5648 *res++ = ' ';
5649 }
5650 if (dict && (argidx < arglen) && c != '%') {
5651 PyErr_SetString(PyExc_TypeError,
5652 "not all arguments converted");
5653 goto onError;
5654 }
5655 Py_XDECREF(temp);
5656 } /* '%' */
5657 } /* until end */
5658 if (argidx < arglen && !dict) {
5659 PyErr_SetString(PyExc_TypeError,
5660 "not all arguments converted");
5661 goto onError;
5662 }
5663
5664 if (args_owned) {
5665 Py_DECREF(args);
5666 }
5667 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005668 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005669 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005670 return (PyObject *)result;
5671
5672 onError:
5673 Py_XDECREF(result);
5674 Py_DECREF(uformat);
5675 if (args_owned) {
5676 Py_DECREF(args);
5677 }
5678 return NULL;
5679}
5680
5681static PyBufferProcs unicode_as_buffer = {
5682 (getreadbufferproc) unicode_buffer_getreadbuf,
5683 (getwritebufferproc) unicode_buffer_getwritebuf,
5684 (getsegcountproc) unicode_buffer_getsegcount,
5685 (getcharbufferproc) unicode_buffer_getcharbuf,
5686};
5687
Guido van Rossume023fe02001-08-30 03:12:59 +00005688staticforward PyObject *
5689unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5690
Tim Peters6d6c1a32001-08-02 04:15:00 +00005691static PyObject *
5692unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5693{
5694 PyObject *x = NULL;
5695 static char *kwlist[] = {"string", "encoding", "errors", 0};
5696 char *encoding = NULL;
5697 char *errors = NULL;
5698
Guido van Rossume023fe02001-08-30 03:12:59 +00005699 if (type != &PyUnicode_Type)
5700 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005701 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5702 kwlist, &x, &encoding, &errors))
5703 return NULL;
5704 if (x == NULL)
5705 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00005706 if (encoding == NULL && errors == NULL)
5707 return PyObject_Unicode(x);
5708 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00005709 return PyUnicode_FromEncodedObject(x, encoding, errors);
5710}
5711
Guido van Rossume023fe02001-08-30 03:12:59 +00005712static PyObject *
5713unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5714{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005715 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005716 int n;
5717
5718 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5719 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5720 if (tmp == NULL)
5721 return NULL;
5722 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005723 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5724 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005725 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005726 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5727 if (pnew->str == NULL) {
5728 _Py_ForgetReference((PyObject *)pnew);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005729 PyObject_Del(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005730 return NULL;
5731 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005732 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5733 pnew->length = n;
5734 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005735 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005736 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005737}
5738
/* Docstring for the unicode type (installed as tp_doc of PyUnicode_Type). */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5745
Guido van Rossumd57fd912000-03-10 22:53:23 +00005746PyTypeObject PyUnicode_Type = {
5747 PyObject_HEAD_INIT(&PyType_Type)
5748 0, /* ob_size */
5749 "unicode", /* tp_name */
5750 sizeof(PyUnicodeObject), /* tp_size */
5751 0, /* tp_itemsize */
5752 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00005753 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005754 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005755 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005756 0, /* tp_setattr */
5757 (cmpfunc) unicode_compare, /* tp_compare */
5758 (reprfunc) unicode_repr, /* tp_repr */
5759 0, /* tp_as_number */
5760 &unicode_as_sequence, /* tp_as_sequence */
5761 0, /* tp_as_mapping */
5762 (hashfunc) unicode_hash, /* tp_hash*/
5763 0, /* tp_call*/
5764 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005765 PyObject_GenericGetAttr, /* tp_getattro */
5766 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005767 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005768 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005769 unicode_doc, /* tp_doc */
5770 0, /* tp_traverse */
5771 0, /* tp_clear */
5772 0, /* tp_richcompare */
5773 0, /* tp_weaklistoffset */
5774 0, /* tp_iter */
5775 0, /* tp_iternext */
5776 unicode_methods, /* tp_methods */
5777 0, /* tp_members */
5778 0, /* tp_getset */
5779 0, /* tp_base */
5780 0, /* tp_dict */
5781 0, /* tp_descr_get */
5782 0, /* tp_descr_set */
5783 0, /* tp_dictoffset */
5784 0, /* tp_init */
5785 0, /* tp_alloc */
5786 unicode_new, /* tp_new */
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005787 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005788};
5789
5790/* Initialize the Unicode implementation */
5791
Thomas Wouters78890102000-07-22 19:25:51 +00005792void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005793{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005794 int i;
5795
Fred Drakee4315f52000-05-09 19:53:39 +00005796 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005797 unicode_freelist = NULL;
5798 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005799 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005800 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005801 for (i = 0; i < 256; i++)
5802 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005803}
5804
5805/* Finalize the Unicode implementation */
5806
5807void
Thomas Wouters78890102000-07-22 19:25:51 +00005808_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005809{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005810 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005811 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005813 Py_XDECREF(unicode_empty);
5814 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005815
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005816 for (i = 0; i < 256; i++) {
5817 if (unicode_latin1[i]) {
5818 Py_DECREF(unicode_latin1[i]);
5819 unicode_latin1[i] = NULL;
5820 }
5821 }
5822
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005823 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005824 PyUnicodeObject *v = u;
5825 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005826 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005827 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005828 Py_XDECREF(v->defenc);
Neil Schemenauer58aa8612002-04-12 03:07:20 +00005829 PyObject_Del(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005830 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005831 unicode_freelist = NULL;
5832 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005833}