blob: a29c75b5a34bc900b6161152cc5379137c7172b8 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
/* Limit for the Unicode object free list keep-alive optimization.

   The implementation keeps the allocated Unicode buffer intact for
   every object on the free list whose length (in characters) is below
   this limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) +
    KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
    malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
/* We allocate one more Py_UNICODE element (not byte) than requested to
   make sure the string is always U+0000 terminated -- XXX is this
   needed ?

   XXX This allocator could be enhanced further by ensuring that the
   free list never shrinks below one entry.

*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000222 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
Guido van Rossum9475a232001-10-05 20:51:39 +0000227void unicode_dealloc(register PyUnicodeObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228{
Guido van Rossum9475a232001-10-05 20:51:39 +0000229 if (!PyUnicode_CheckExact(unicode)) {
230 unicode->ob_type->tp_free((PyObject *)unicode);
231 return;
232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000234 /* Keep-Alive optimization */
235 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000236 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 unicode->str = NULL;
238 unicode->length = 0;
239 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000240 if (unicode->defenc) {
241 Py_DECREF(unicode->defenc);
242 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000243 }
244 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000245 *(PyUnicodeObject **)unicode = unicode_freelist;
246 unicode_freelist = unicode;
247 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000248 }
249 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000250 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000251 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000252 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253 }
254}
255
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000256int PyUnicode_Resize(PyObject **unicode,
257 int length)
258{
259 register PyUnicodeObject *v;
260
261 /* Argument checks */
262 if (unicode == NULL) {
263 PyErr_BadInternalCall();
264 return -1;
265 }
266 v = (PyUnicodeObject *)*unicode;
267 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
268 PyErr_BadInternalCall();
269 return -1;
270 }
271
272 /* Resizing unicode_empty and single character objects is not
273 possible since these are being shared. We simply return a fresh
274 copy with the same Unicode content. */
275 if (v->length != length &&
276 (v == unicode_empty || v->length == 1)) {
277 PyUnicodeObject *w = _PyUnicode_New(length);
278 if (w == NULL)
279 return -1;
280 Py_UNICODE_COPY(w->str, v->str,
281 length < v->length ? length : v->length);
282 *unicode = (PyObject *)w;
283 return 0;
284 }
285
286 /* Note that we don't have to modify *unicode for unshared Unicode
287 objects, since we can modify them in-place. */
288 return unicode_resize(v, length);
289}
290
291/* Internal API for use in unicodeobject.c only ! */
292#define _PyUnicode_Resize(unicodevar, length) \
293 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
294
Guido van Rossumd57fd912000-03-10 22:53:23 +0000295PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
296 int size)
297{
298 PyUnicodeObject *unicode;
299
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000300 /* If the Unicode data is known at construction time, we can apply
301 some optimizations which share commonly used objects. */
302 if (u != NULL) {
303
304 /* Optimization for empty strings */
305 if (size == 0 && unicode_empty != NULL) {
306 Py_INCREF(unicode_empty);
307 return (PyObject *)unicode_empty;
308 }
309
310 /* Single character Unicode objects in the Latin-1 range are
311 shared when using this constructor */
312 if (size == 1 && *u < 256) {
313 unicode = unicode_latin1[*u];
314 if (!unicode) {
315 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 if (!unicode)
317 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000318 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000319 unicode_latin1[*u] = unicode;
320 }
321 Py_INCREF(unicode);
322 return (PyObject *)unicode;
323 }
324 }
325
Guido van Rossumd57fd912000-03-10 22:53:23 +0000326 unicode = _PyUnicode_New(size);
327 if (!unicode)
328 return NULL;
329
330 /* Copy the Unicode data into the new object */
331 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000332 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333
334 return (PyObject *)unicode;
335}
336
337#ifdef HAVE_WCHAR_H
338
339PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
340 int size)
341{
342 PyUnicodeObject *unicode;
343
344 if (w == NULL) {
345 PyErr_BadInternalCall();
346 return NULL;
347 }
348
349 unicode = _PyUnicode_New(size);
350 if (!unicode)
351 return NULL;
352
353 /* Copy the wchar_t data into the new object */
354#ifdef HAVE_USABLE_WCHAR_T
355 memcpy(unicode->str, w, size * sizeof(wchar_t));
356#else
357 {
358 register Py_UNICODE *u;
359 register int i;
360 u = PyUnicode_AS_UNICODE(unicode);
361 for (i = size; i >= 0; i--)
362 *u++ = *w++;
363 }
364#endif
365
366 return (PyObject *)unicode;
367}
368
369int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
370 register wchar_t *w,
371 int size)
372{
373 if (unicode == NULL) {
374 PyErr_BadInternalCall();
375 return -1;
376 }
377 if (size > PyUnicode_GET_SIZE(unicode))
378 size = PyUnicode_GET_SIZE(unicode);
379#ifdef HAVE_USABLE_WCHAR_T
380 memcpy(w, unicode->str, size * sizeof(wchar_t));
381#else
382 {
383 register Py_UNICODE *u;
384 register int i;
385 u = PyUnicode_AS_UNICODE(unicode);
386 for (i = size; i >= 0; i--)
387 *w++ = *u++;
388 }
389#endif
390
391 return size;
392}
393
394#endif
395
396PyObject *PyUnicode_FromObject(register PyObject *obj)
397{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000398 /* XXX Perhaps we should make this API an alias of
399 PyObject_Unicode() instead ?! */
400 if (PyUnicode_CheckExact(obj)) {
401 Py_INCREF(obj);
402 return obj;
403 }
404 if (PyUnicode_Check(obj)) {
405 /* For a Unicode subtype that's not a Unicode object,
406 return a true Unicode object with the same data. */
407 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
408 PyUnicode_GET_SIZE(obj));
409 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000410 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
411}
412
413PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
414 const char *encoding,
415 const char *errors)
416{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000417 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000418 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000419 int owned = 0;
420 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000421
422 if (obj == NULL) {
423 PyErr_BadInternalCall();
424 return NULL;
425 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000426
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000427#if 0
428 /* For b/w compatibility we also accept Unicode objects provided
429 that no encodings is given and then redirect to PyObject_Unicode()
430 which then applies the additional logic for Unicode subclasses.
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000431
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000432 NOTE: This API should really only be used for object which
433 represent *encoded* Unicode !
434
435 */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000436 if (PyUnicode_Check(obj)) {
437 if (encoding) {
438 PyErr_SetString(PyExc_TypeError,
439 "decoding Unicode is not supported");
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000440 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000441 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000442 return PyObject_Unicode(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000443 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000444#else
445 if (PyUnicode_Check(obj)) {
446 PyErr_SetString(PyExc_TypeError,
447 "decoding Unicode is not supported");
448 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000449 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000450#endif
451
452 /* Coerce object */
453 if (PyString_Check(obj)) {
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000454 s = PyString_AS_STRING(obj);
455 len = PyString_GET_SIZE(obj);
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000456 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000457 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
458 /* Overwrite the error message with something more useful in
459 case of a TypeError. */
460 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000461 PyErr_Format(PyExc_TypeError,
Guido van Rossumb8c65bc2001-10-19 02:01:31 +0000462 "coercing to Unicode: need string or buffer, "
463 "%.80s found",
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000464 obj->ob_type->tp_name);
465 goto onError;
466 }
467
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000468 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469 if (len == 0) {
470 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000471 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000473 else
474 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000475
Greg Steinaf36a3a2000-07-17 09:04:43 +0000476 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000477 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000478 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000479 return v;
480
481 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000482 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000483 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000484 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000485 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000486}
487
488PyObject *PyUnicode_Decode(const char *s,
489 int size,
490 const char *encoding,
491 const char *errors)
492{
493 PyObject *buffer = NULL, *unicode;
494
Fred Drakee4315f52000-05-09 19:53:39 +0000495 if (encoding == NULL)
496 encoding = PyUnicode_GetDefaultEncoding();
497
498 /* Shortcuts for common default encodings */
499 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000501 else if (strcmp(encoding, "latin-1") == 0)
502 return PyUnicode_DecodeLatin1(s, size, errors);
503 else if (strcmp(encoding, "ascii") == 0)
504 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505
506 /* Decode via the codec registry */
507 buffer = PyBuffer_FromMemory((void *)s, size);
508 if (buffer == NULL)
509 goto onError;
510 unicode = PyCodec_Decode(buffer, encoding, errors);
511 if (unicode == NULL)
512 goto onError;
513 if (!PyUnicode_Check(unicode)) {
514 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000515 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000516 unicode->ob_type->tp_name);
517 Py_DECREF(unicode);
518 goto onError;
519 }
520 Py_DECREF(buffer);
521 return unicode;
522
523 onError:
524 Py_XDECREF(buffer);
525 return NULL;
526}
527
528PyObject *PyUnicode_Encode(const Py_UNICODE *s,
529 int size,
530 const char *encoding,
531 const char *errors)
532{
533 PyObject *v, *unicode;
534
535 unicode = PyUnicode_FromUnicode(s, size);
536 if (unicode == NULL)
537 return NULL;
538 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
539 Py_DECREF(unicode);
540 return v;
541}
542
543PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
544 const char *encoding,
545 const char *errors)
546{
547 PyObject *v;
548
549 if (!PyUnicode_Check(unicode)) {
550 PyErr_BadArgument();
551 goto onError;
552 }
Fred Drakee4315f52000-05-09 19:53:39 +0000553
554 if (encoding == NULL)
555 encoding = PyUnicode_GetDefaultEncoding();
556
557 /* Shortcuts for common default encodings */
558 if (errors == NULL) {
559 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000560 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000561 else if (strcmp(encoding, "latin-1") == 0)
562 return PyUnicode_AsLatin1String(unicode);
563 else if (strcmp(encoding, "ascii") == 0)
564 return PyUnicode_AsASCIIString(unicode);
565 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000566
567 /* Encode via the codec registry */
568 v = PyCodec_Encode(unicode, encoding, errors);
569 if (v == NULL)
570 goto onError;
571 /* XXX Should we really enforce this ? */
572 if (!PyString_Check(v)) {
573 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000574 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000575 v->ob_type->tp_name);
576 Py_DECREF(v);
577 goto onError;
578 }
579 return v;
580
581 onError:
582 return NULL;
583}
584
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000585PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
586 const char *errors)
587{
588 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
589
590 if (v)
591 return v;
592 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
593 if (v && errors == NULL)
594 ((PyUnicodeObject *)unicode)->defenc = v;
595 return v;
596}
597
Guido van Rossumd57fd912000-03-10 22:53:23 +0000598Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
599{
600 if (!PyUnicode_Check(unicode)) {
601 PyErr_BadArgument();
602 goto onError;
603 }
604 return PyUnicode_AS_UNICODE(unicode);
605
606 onError:
607 return NULL;
608}
609
610int PyUnicode_GetSize(PyObject *unicode)
611{
612 if (!PyUnicode_Check(unicode)) {
613 PyErr_BadArgument();
614 goto onError;
615 }
616 return PyUnicode_GET_SIZE(unicode);
617
618 onError:
619 return -1;
620}
621
Thomas Wouters78890102000-07-22 19:25:51 +0000622const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000623{
624 return unicode_default_encoding;
625}
626
627int PyUnicode_SetDefaultEncoding(const char *encoding)
628{
629 PyObject *v;
630
631 /* Make sure the encoding is valid. As side effect, this also
632 loads the encoding into the codec registry cache. */
633 v = _PyCodec_Lookup(encoding);
634 if (v == NULL)
635 goto onError;
636 Py_DECREF(v);
637 strncpy(unicode_default_encoding,
638 encoding,
639 sizeof(unicode_default_encoding));
640 return 0;
641
642 onError:
643 return -1;
644}
645
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000646/* --- UTF-7 Codec -------------------------------------------------------- */
647
648/* see RFC2152 for details */
649
/* Classification of the 128 ASCII characters for UTF-7 (RFC 2152):
   indicates whether a character is special, i.e. cannot be directly
   encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static
const char utf7_special[128] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,

};

/* True if character c must be base64-encoded: it is outside ASCII, is
   always special, or belongs to an optional class (whitespace / Set O)
   whose encoding was requested. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     (encodeWS && (utf7_special[(c)] == 2)) || \
     (encodeO && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to its base64 digit. */
#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is a base64 digit.  isalnum() is only defined for values
   representable as unsigned char (or EOF) and is locale dependent
   above 127, so it is consulted for ASCII values only. */
#define B64CHAR(c) \
    (((unsigned long)(c) < 128 && isalnum((int)(c))) || \
     (c) == '+' || (c) == '/')

/* Map a base64 digit back to its 6-bit value (c must satisfy
   B64CHAR). */
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 digits to *out for as long as at least 6 of the `bits`
   pending bits in `ch` are available; updates out and bits. */
#define ENCODE(out, ch, bits) \
    do { \
        while (bits >= 6) { \
            *out++ = B64(ch >> (bits-6)); \
            bits -= 6; \
        } \
    } while (0)

/* Emit 16-bit characters to *out for as long as at least 16 of the
   `bits` pending bits in `ch` are available; updates out, bits and
   surrogate.  Expands to code that assigns the local `errmsg` and
   jumps to the label `utf7Error` at the point of use.
   NOTE(review): the range tested below (0xDC00-0xDFFF) is the low
   surrogate range although the comments speak of the high surrogate
   coming first -- confirm whether 0xD800-0xDBFF was intended. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while (bits >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
            bits -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high \
                   surrogate so let's not bother seeing if the low \
                   surrogate is correct or not */ \
                surrogate = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't \
                   represent it in a 16-bit character */ \
                surrogate = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *out++ = outCh; \
            } \
        } \
    } while (0)

704static
705int utf7_decoding_error(Py_UNICODE **dest,
706 const char *errors,
707 const char *details)
708{
709 if ((errors == NULL) ||
710 (strcmp(errors,"strict") == 0)) {
711 PyErr_Format(PyExc_UnicodeError,
712 "UTF-7 decoding error: %.400s",
713 details);
714 return -1;
715 }
716 else if (strcmp(errors,"ignore") == 0) {
717 return 0;
718 }
719 else if (strcmp(errors,"replace") == 0) {
720 if (dest != NULL) {
721 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
722 (*dest)++;
723 }
724 return 0;
725 }
726 else {
727 PyErr_Format(PyExc_ValueError,
728 "UTF-7 decoding error; unknown error handling code: %.400s",
729 errors);
730 return -1;
731 }
732}
733
734PyObject *PyUnicode_DecodeUTF7(const char *s,
735 int size,
736 const char *errors)
737{
738 const char *e;
739 PyUnicodeObject *unicode;
740 Py_UNICODE *p;
741 const char *errmsg = "";
742 int inShift = 0;
743 unsigned int bitsleft = 0;
744 unsigned long charsleft = 0;
745 int surrogate = 0;
746
747 unicode = _PyUnicode_New(size);
748 if (!unicode)
749 return NULL;
750 if (size == 0)
751 return (PyObject *)unicode;
752
753 p = unicode->str;
754 e = s + size;
755
756 while (s < e) {
757 Py_UNICODE ch = *s;
758
759 if (inShift) {
760 if ((ch == '-') || !B64CHAR(ch)) {
761 inShift = 0;
762 s++;
763
764 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
765 if (bitsleft >= 6) {
766 /* The shift sequence has a partial character in it. If
767 bitsleft < 6 then we could just classify it as padding
768 but that is not the case here */
769
770 errmsg = "partial character in shift sequence";
771 goto utf7Error;
772 }
773 /* According to RFC2152 the remaining bits should be zero. We
774 choose to signal an error/insert a replacement character
775 here so indicate the potential of a misencoded character. */
776
777 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
778 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
779 errmsg = "non-zero padding bits in shift sequence";
780 goto utf7Error;
781 }
782
783 if (ch == '-') {
784 if ((s < e) && (*(s) == '-')) {
785 *p++ = '-';
786 inShift = 1;
787 }
788 } else if (SPECIAL(ch,0,0)) {
789 errmsg = "unexpected special character";
790 goto utf7Error;
791 } else {
792 *p++ = ch;
793 }
794 } else {
795 charsleft = (charsleft << 6) | UB64(ch);
796 bitsleft += 6;
797 s++;
798 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
799 }
800 }
801 else if ( ch == '+' ) {
802 s++;
803 if (s < e && *s == '-') {
804 s++;
805 *p++ = '+';
806 } else
807 {
808 inShift = 1;
809 bitsleft = 0;
810 }
811 }
812 else if (SPECIAL(ch,0,0)) {
813 errmsg = "unexpected special character";
814 s++;
815 goto utf7Error;
816 }
817 else {
818 *p++ = ch;
819 s++;
820 }
821 continue;
822 utf7Error:
823 if (utf7_decoding_error(&p, errors, errmsg))
824 goto onError;
825 }
826
827 if (inShift) {
828 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
829 goto onError;
830 }
831
832 if (_PyUnicode_Resize(&unicode, p - unicode->str))
833 goto onError;
834
835 return (PyObject *)unicode;
836
837onError:
838 Py_DECREF(unicode);
839 return NULL;
840}
841
842
843PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
844 int size,
845 int encodeSetO,
846 int encodeWhiteSpace,
847 const char *errors)
848{
849 PyObject *v;
850 /* It might be possible to tighten this worst case */
851 unsigned int cbAllocated = 5 * size;
852 int inShift = 0;
853 int i = 0;
854 unsigned int bitsleft = 0;
855 unsigned long charsleft = 0;
856 char * out;
857 char * start;
858
859 if (size == 0)
860 return PyString_FromStringAndSize(NULL, 0);
861
862 v = PyString_FromStringAndSize(NULL, cbAllocated);
863 if (v == NULL)
864 return NULL;
865
866 start = out = PyString_AS_STRING(v);
867 for (;i < size; ++i) {
868 Py_UNICODE ch = s[i];
869
870 if (!inShift) {
871 if (ch == '+') {
872 *out++ = '+';
873 *out++ = '-';
874 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
875 charsleft = ch;
876 bitsleft = 16;
877 *out++ = '+';
878 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
879 inShift = bitsleft > 0;
880 } else {
881 *out++ = (char) ch;
882 }
883 } else {
884 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
885 *out++ = B64(charsleft << (6-bitsleft));
886 charsleft = 0;
887 bitsleft = 0;
888 /* Characters not in the BASE64 set implicitly unshift the sequence
889 so no '-' is required, except if the character is itself a '-' */
890 if (B64CHAR(ch) || ch == '-') {
891 *out++ = '-';
892 }
893 inShift = 0;
894 *out++ = (char) ch;
895 } else {
896 bitsleft += 16;
897 charsleft = (charsleft << 16) | ch;
898 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
899
900 /* If the next character is special then we dont' need to terminate
901 the shift sequence. If the next character is not a BASE64 character
902 or '-' then the shift sequence will be terminated implicitly and we
903 don't have to insert a '-'. */
904
905 if (bitsleft == 0) {
906 if (i + 1 < size) {
907 Py_UNICODE ch2 = s[i+1];
908
909 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
910
911 } else if (B64CHAR(ch2) || ch2 == '-') {
912 *out++ = '-';
913 inShift = 0;
914 } else {
915 inShift = 0;
916 }
917
918 }
919 else {
920 *out++ = '-';
921 inShift = 0;
922 }
923 }
924 }
925 }
926 }
927 if (bitsleft) {
928 *out++= B64(charsleft << (6-bitsleft) );
929 *out++ = '-';
930 }
931
932 if (_PyString_Resize(&v, out - start)) {
933 Py_DECREF(v);
934 return NULL;
935 }
936 return v;
937}
938
939#undef SPECIAL
940#undef B64
941#undef B64CHAR
942#undef UB64
943#undef ENCODE
944#undef DECODE
945
Guido van Rossumd57fd912000-03-10 22:53:23 +0000946/* --- UTF-8 Codec -------------------------------------------------------- */
947
/* UTF-8 sequence length, indexed by the first byte of a sequence.
   A zero entry marks a byte that cannot start a sequence.
   See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00-0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x00 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x10 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x30 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x50 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x70 */

    /* 0x80-0xBF: not valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0xA0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0xB0 */

    /* 0xC0-0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,     /* 0xC0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,     /* 0xD0 */

    /* 0xE0-0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,     /* 0xE0 */

    /* 0xF0-0xF7: four, 0xF8-0xFB: five, 0xFC-0xFD: six,
       0xFE-0xFF: not valid as a first byte */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0      /* 0xF0 */
};
969
970static
971int utf8_decoding_error(const char **source,
972 Py_UNICODE **dest,
973 const char *errors,
974 const char *details)
975{
976 if ((errors == NULL) ||
977 (strcmp(errors,"strict") == 0)) {
978 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000979 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000980 details);
981 return -1;
982 }
983 else if (strcmp(errors,"ignore") == 0) {
984 (*source)++;
985 return 0;
986 }
987 else if (strcmp(errors,"replace") == 0) {
988 (*source)++;
989 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
990 (*dest)++;
991 return 0;
992 }
993 else {
994 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000995 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000996 errors);
997 return -1;
998 }
999}
1000
Guido van Rossumd57fd912000-03-10 22:53:23 +00001001PyObject *PyUnicode_DecodeUTF8(const char *s,
1002 int size,
1003 const char *errors)
1004{
1005 int n;
1006 const char *e;
1007 PyUnicodeObject *unicode;
1008 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001009 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001010
1011 /* Note: size will always be longer than the resulting Unicode
1012 character count */
1013 unicode = _PyUnicode_New(size);
1014 if (!unicode)
1015 return NULL;
1016 if (size == 0)
1017 return (PyObject *)unicode;
1018
1019 /* Unpack UTF-8 encoded data */
1020 p = unicode->str;
1021 e = s + size;
1022
1023 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001024 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001025
1026 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001027 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001028 s++;
1029 continue;
1030 }
1031
1032 n = utf8_code_length[ch];
1033
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001034 if (s + n > e) {
1035 errmsg = "unexpected end of data";
1036 goto utf8Error;
1037 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001038
1039 switch (n) {
1040
1041 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001042 errmsg = "unexpected code byte";
1043 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001044
1045 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001046 errmsg = "internal error";
1047 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001048
1049 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001050 if ((s[1] & 0xc0) != 0x80) {
1051 errmsg = "invalid data";
1052 goto utf8Error;
1053 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001054 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001055 if (ch < 0x80) {
1056 errmsg = "illegal encoding";
1057 goto utf8Error;
1058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001060 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001061 break;
1062
1063 case 3:
1064 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001065 (s[2] & 0xc0) != 0x80) {
1066 errmsg = "invalid data";
1067 goto utf8Error;
1068 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001069 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001070 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
1071 errmsg = "illegal encoding";
1072 goto utf8Error;
1073 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001075 *p++ = (Py_UNICODE)ch;
1076 break;
1077
1078 case 4:
1079 if ((s[1] & 0xc0) != 0x80 ||
1080 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001081 (s[3] & 0xc0) != 0x80) {
1082 errmsg = "invalid data";
1083 goto utf8Error;
1084 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001085 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1086 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1087 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001088 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001089 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001090 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001091 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001092 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001093 errmsg = "illegal encoding";
1094 goto utf8Error;
1095 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001096#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001097 *p++ = (Py_UNICODE)ch;
1098#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001099 /* compute and append the two surrogates: */
1100
1101 /* translate from 10000..10FFFF to 0..FFFF */
1102 ch -= 0x10000;
1103
1104 /* high surrogate = top 10 bits added to D800 */
1105 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1106
1107 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001108 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001109#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110 break;
1111
1112 default:
1113 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001114 errmsg = "unsupported Unicode code range";
1115 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116 }
1117 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001118 continue;
1119
1120 utf8Error:
1121 if (utf8_decoding_error(&s, &p, errors, errmsg))
1122 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001123 }
1124
1125 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001126 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001127 goto onError;
1128
1129 return (PyObject *)unicode;
1130
1131onError:
1132 Py_DECREF(unicode);
1133 return NULL;
1134}
1135
/* Retired now that the encoder copes with UTF-16 surrogates itself;
   kept compiled out. */
#if 0
/* Apply the error policy named by `errors` to a UTF-8 encoding problem:
   "strict"/NULL raises UnicodeError, "ignore" does nothing, "replace"
   stores '?' at **dest and advances *dest, anything else raises
   ValueError.  Returns 0 to carry on, -1 when an exception was set. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    const int strict = (errors == NULL) || (strcmp(errors, "strict") == 0);

    if (strict) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169
1170PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1171 int size,
1172 const char *errors)
1173{
1174 PyObject *v;
1175 char *p;
1176 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001177 Py_UCS4 ch2;
1178 unsigned int cbAllocated = 3 * size;
1179 unsigned int cbWritten = 0;
1180 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001181
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001182 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183 if (v == NULL)
1184 return NULL;
1185 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001186 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187
1188 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001189 while (i < size) {
1190 Py_UCS4 ch = s[i++];
1191 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001193 cbWritten++;
1194 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001195 else if (ch < 0x0800) {
1196 *p++ = 0xc0 | (ch >> 6);
1197 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001198 cbWritten += 2;
1199 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001200 else if (ch < 0x10000) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001201 /* Check for high surrogate */
1202 if (0xD800 <= ch && ch <= 0xDBFF) {
1203 if (i != size) {
1204 ch2 = s[i];
1205 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1206
1207 if (cbWritten >= (cbAllocated - 4)) {
1208 /* Provide enough room for some more
1209 surrogates */
1210 cbAllocated += 4*10;
1211 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001212 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001213 }
1214
1215 /* combine the two values */
1216 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
1217
1218 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +00001219 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001220 i++;
1221 cbWritten += 4;
1222 }
1223 }
1224 }
1225 else {
1226 *p++ = (char)(0xe0 | (ch >> 12));
1227 cbWritten += 3;
1228 }
1229 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1230 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001231 } else {
1232 *p++ = 0xf0 | (ch>>18);
1233 *p++ = 0x80 | ((ch>>12) & 0x3f);
1234 *p++ = 0x80 | ((ch>>6) & 0x3f);
1235 *p++ = 0x80 | (ch & 0x3f);
1236 cbWritten += 4;
1237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001238 }
1239 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001240 if (_PyString_Resize(&v, p - q))
1241 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242 return v;
1243
1244 onError:
1245 Py_DECREF(v);
1246 return NULL;
1247}
1248
Guido van Rossumd57fd912000-03-10 22:53:23 +00001249PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1250{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001251 if (!PyUnicode_Check(unicode)) {
1252 PyErr_BadArgument();
1253 return NULL;
1254 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001255 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1256 PyUnicode_GET_SIZE(unicode),
1257 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001258}
1259
1260/* --- UTF-16 Codec ------------------------------------------------------- */
1261
1262static
Tim Peters772747b2001-08-09 22:21:55 +00001263int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 const char *errors,
1265 const char *details)
1266{
1267 if ((errors == NULL) ||
1268 (strcmp(errors,"strict") == 0)) {
1269 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001270 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001271 details);
1272 return -1;
1273 }
1274 else if (strcmp(errors,"ignore") == 0) {
1275 return 0;
1276 }
1277 else if (strcmp(errors,"replace") == 0) {
1278 if (dest) {
1279 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1280 (*dest)++;
1281 }
1282 return 0;
1283 }
1284 else {
1285 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001286 "UTF-16 decoding error; "
1287 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288 errors);
1289 return -1;
1290 }
1291}
1292
Tim Peters772747b2001-08-09 22:21:55 +00001293PyObject *
1294PyUnicode_DecodeUTF16(const char *s,
1295 int size,
1296 const char *errors,
1297 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001298{
1299 PyUnicodeObject *unicode;
1300 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001301 const unsigned char *q, *e;
1302 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001303 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001304 /* Offsets from q for retrieving byte pairs in the right order. */
1305#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1306 int ihi = 1, ilo = 0;
1307#else
1308 int ihi = 0, ilo = 1;
1309#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001310
1311 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001312 if (size & 1) {
1313 if (utf16_decoding_error(NULL, errors, "truncated data"))
1314 return NULL;
1315 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001316 }
1317
1318 /* Note: size will always be longer than the resulting Unicode
1319 character count */
1320 unicode = _PyUnicode_New(size);
1321 if (!unicode)
1322 return NULL;
1323 if (size == 0)
1324 return (PyObject *)unicode;
1325
1326 /* Unpack UTF-16 encoded data */
1327 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001328 q = (unsigned char *)s;
1329 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001330
1331 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001332 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001333
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001334 /* Check for BOM marks (U+FEFF) in the input and adjust current
1335 byte order setting accordingly. In native mode, the leading BOM
1336 mark is skipped, in all other modes, it is copied to the output
1337 stream as-is (giving a ZWNBSP character). */
1338 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001339 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001340#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001341 if (bom == 0xFEFF) {
1342 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001343 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001344 }
1345 else if (bom == 0xFFFE) {
1346 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001347 bo = 1;
1348 }
1349#else
Tim Peters772747b2001-08-09 22:21:55 +00001350 if (bom == 0xFEFF) {
1351 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001352 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001353 }
1354 else if (bom == 0xFFFE) {
1355 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001356 bo = -1;
1357 }
1358#endif
1359 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001360
Tim Peters772747b2001-08-09 22:21:55 +00001361 if (bo == -1) {
1362 /* force LE */
1363 ihi = 1;
1364 ilo = 0;
1365 }
1366 else if (bo == 1) {
1367 /* force BE */
1368 ihi = 0;
1369 ilo = 1;
1370 }
1371
1372 while (q < e) {
1373 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1374 q += 2;
1375
Guido van Rossumd57fd912000-03-10 22:53:23 +00001376 if (ch < 0xD800 || ch > 0xDFFF) {
1377 *p++ = ch;
1378 continue;
1379 }
1380
1381 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001382 if (q >= e) {
1383 errmsg = "unexpected end of data";
1384 goto utf16Error;
1385 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001386 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001387 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1388 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001389 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001390#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001391 *p++ = ch;
1392 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001393#else
1394 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001395#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001396 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001397 }
1398 else {
1399 errmsg = "illegal UTF-16 surrogate";
1400 goto utf16Error;
1401 }
1402
Guido van Rossumd57fd912000-03-10 22:53:23 +00001403 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001404 errmsg = "illegal encoding";
1405 /* Fall through to report the error */
1406
1407 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001408 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001409 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001410 }
1411
1412 if (byteorder)
1413 *byteorder = bo;
1414
1415 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001416 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417 goto onError;
1418
1419 return (PyObject *)unicode;
1420
1421onError:
1422 Py_DECREF(unicode);
1423 return NULL;
1424}
1425
Tim Peters772747b2001-08-09 22:21:55 +00001426PyObject *
1427PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1428 int size,
1429 const char *errors,
1430 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001431{
1432 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001433 unsigned char *p;
1434 int i, pairs;
1435 /* Offsets from p for storing byte pairs in the right order. */
1436#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1437 int ihi = 1, ilo = 0;
1438#else
1439 int ihi = 0, ilo = 1;
1440#endif
1441
1442#define STORECHAR(CH) \
1443 do { \
1444 p[ihi] = ((CH) >> 8) & 0xff; \
1445 p[ilo] = (CH) & 0xff; \
1446 p += 2; \
1447 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001448
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001449 for (i = pairs = 0; i < size; i++)
1450 if (s[i] >= 0x10000)
1451 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001452 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001453 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001454 if (v == NULL)
1455 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001456
Tim Peters772747b2001-08-09 22:21:55 +00001457 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001458 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001459 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001460 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001461 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001462
1463 if (byteorder == -1) {
1464 /* force LE */
1465 ihi = 1;
1466 ilo = 0;
1467 }
1468 else if (byteorder == 1) {
1469 /* force BE */
1470 ihi = 0;
1471 ilo = 1;
1472 }
1473
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001474 while (size-- > 0) {
1475 Py_UNICODE ch = *s++;
1476 Py_UNICODE ch2 = 0;
1477 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001478 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1479 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480 }
Tim Peters772747b2001-08-09 22:21:55 +00001481 STORECHAR(ch);
1482 if (ch2)
1483 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001484 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001485 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001486#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001487}
1488
1489PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1490{
1491 if (!PyUnicode_Check(unicode)) {
1492 PyErr_BadArgument();
1493 return NULL;
1494 }
1495 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1496 PyUnicode_GET_SIZE(unicode),
1497 NULL,
1498 0);
1499}
1500
1501/* --- Unicode Escape Codec ----------------------------------------------- */
1502
1503static
1504int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001505 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506 const char *errors,
1507 const char *details)
1508{
1509 if ((errors == NULL) ||
1510 (strcmp(errors,"strict") == 0)) {
1511 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001512 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001513 details);
1514 return -1;
1515 }
1516 else if (strcmp(errors,"ignore") == 0) {
1517 return 0;
1518 }
1519 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001520 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001521 return 0;
1522 }
1523 else {
1524 PyErr_Format(PyExc_ValueError,
1525 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001526 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001527 errors);
1528 return -1;
1529 }
1530}
1531
/* Character-name lookup table exported by the unicodedata module as its
   "ucnhash_CAPI" attribute.  PyUnicode_DecodeUnicodeEscape loads it on
   demand the first time it meets a \N{name} escape while this is NULL. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001533
Guido van Rossumd57fd912000-03-10 22:53:23 +00001534PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1535 int size,
1536 const char *errors)
1537{
1538 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001539 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001540 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001541 char* message;
1542 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1543
Guido van Rossumd57fd912000-03-10 22:53:23 +00001544 /* Escaped strings will always be longer than the resulting
1545 Unicode string, so we start with size here and then reduce the
1546 length after conversion to the true value. */
1547 v = _PyUnicode_New(size);
1548 if (v == NULL)
1549 goto onError;
1550 if (size == 0)
1551 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001552
Guido van Rossumd57fd912000-03-10 22:53:23 +00001553 p = buf = PyUnicode_AS_UNICODE(v);
1554 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001555
Guido van Rossumd57fd912000-03-10 22:53:23 +00001556 while (s < end) {
1557 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001558 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001559 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001560
1561 /* Non-escape characters are interpreted as Unicode ordinals */
1562 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001563 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001564 continue;
1565 }
1566
1567 /* \ - Escapes */
1568 s++;
1569 switch (*s++) {
1570
1571 /* \x escapes */
1572 case '\n': break;
1573 case '\\': *p++ = '\\'; break;
1574 case '\'': *p++ = '\''; break;
1575 case '\"': *p++ = '\"'; break;
1576 case 'b': *p++ = '\b'; break;
1577 case 'f': *p++ = '\014'; break; /* FF */
1578 case 't': *p++ = '\t'; break;
1579 case 'n': *p++ = '\n'; break;
1580 case 'r': *p++ = '\r'; break;
1581 case 'v': *p++ = '\013'; break; /* VT */
1582 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1583
1584 /* \OOO (octal) escapes */
1585 case '0': case '1': case '2': case '3':
1586 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001587 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001588 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001589 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001590 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001591 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001592 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001593 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001594 break;
1595
Fredrik Lundhccc74732001-02-18 22:13:49 +00001596 /* hex escapes */
1597 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001598 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001599 digits = 2;
1600 message = "truncated \\xXX escape";
1601 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001602
Fredrik Lundhccc74732001-02-18 22:13:49 +00001603 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001604 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001605 digits = 4;
1606 message = "truncated \\uXXXX escape";
1607 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608
Fredrik Lundhccc74732001-02-18 22:13:49 +00001609 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001610 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001611 digits = 8;
1612 message = "truncated \\UXXXXXXXX escape";
1613 hexescape:
1614 chr = 0;
1615 for (i = 0; i < digits; i++) {
1616 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001617 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001618 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001619 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001620 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001621 i++;
1622 break;
1623 }
1624 chr = (chr<<4) & ~0xF;
1625 if (c >= '0' && c <= '9')
1626 chr += c - '0';
1627 else if (c >= 'a' && c <= 'f')
1628 chr += 10 + c - 'a';
1629 else
1630 chr += 10 + c - 'A';
1631 }
1632 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001633 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001634 /* when we get here, chr is a 32-bit unicode character */
1635 if (chr <= 0xffff)
1636 /* UCS-2 character */
1637 *p++ = (Py_UNICODE) chr;
1638 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001639 /* UCS-4 character. Either store directly, or as
1640 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001641#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001642 *p++ = chr;
1643#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001644 chr -= 0x10000L;
1645 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001646 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001647#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001648 } else {
1649 if (unicodeescape_decoding_error(
1650 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001651 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001652 )
1653 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001654 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001655 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001656 break;
1657
1658 /* \N{name} */
1659 case 'N':
1660 message = "malformed \\N character escape";
1661 if (ucnhash_CAPI == NULL) {
1662 /* load the unicode data module */
1663 PyObject *m, *v;
1664 m = PyImport_ImportModule("unicodedata");
1665 if (m == NULL)
1666 goto ucnhashError;
1667 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1668 Py_DECREF(m);
1669 if (v == NULL)
1670 goto ucnhashError;
1671 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1672 Py_DECREF(v);
1673 if (ucnhash_CAPI == NULL)
1674 goto ucnhashError;
1675 }
1676 if (*s == '{') {
1677 const char *start = s+1;
1678 /* look for the closing brace */
1679 while (*s != '}' && s < end)
1680 s++;
1681 if (s > start && s < end && *s == '}') {
1682 /* found a name. look it up in the unicode database */
1683 message = "unknown Unicode character name";
1684 s++;
1685 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1686 goto store;
1687 }
1688 }
1689 if (unicodeescape_decoding_error(&s, &x, errors, message))
1690 goto onError;
1691 *p++ = x;
1692 break;
1693
1694 default:
1695 *p++ = '\\';
1696 *p++ = (unsigned char)s[-1];
1697 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698 }
1699 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001700 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001701 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001702 return (PyObject *)v;
1703
Fredrik Lundhccc74732001-02-18 22:13:49 +00001704ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001705 PyErr_SetString(
1706 PyExc_UnicodeError,
1707 "\\N escapes not supported (can't load unicodedata module)"
1708 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001709 return NULL;
1710
Fredrik Lundhccc74732001-02-18 22:13:49 +00001711onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001712 Py_XDECREF(v);
1713 return NULL;
1714}
1715
1716/* Return a Unicode-Escape string version of the Unicode object.
1717
1718 If quotes is true, the string is enclosed in u"" or u'' quotes as
1719 appropriate.
1720
1721*/
1722
Barry Warsaw51ac5802000-03-20 16:36:48 +00001723static const Py_UNICODE *findchar(const Py_UNICODE *s,
1724 int size,
1725 Py_UNICODE ch);
1726
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727static
1728PyObject *unicodeescape_string(const Py_UNICODE *s,
1729 int size,
1730 int quotes)
1731{
1732 PyObject *repr;
1733 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001735 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001736
1737 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1738 if (repr == NULL)
1739 return NULL;
1740
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001741 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001742
1743 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001744 *p++ = 'u';
1745 *p++ = (findchar(s, size, '\'') &&
1746 !findchar(s, size, '"')) ? '"' : '\'';
1747 }
1748 while (size-- > 0) {
1749 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001750
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001752 if (quotes &&
1753 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754 *p++ = '\\';
1755 *p++ = (char) ch;
Guido van Rossumad9744a2001-09-21 15:38:17 +00001756 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001758
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001759#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001760 /* Map 21-bit characters to '\U00xxxxxx' */
1761 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001762 int offset = p - PyString_AS_STRING(repr);
1763
1764 /* Resize the string if necessary */
1765 if (offset + 12 > PyString_GET_SIZE(repr)) {
1766 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1767 goto onError;
1768 p = PyString_AS_STRING(repr) + offset;
1769 }
1770
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001771 *p++ = '\\';
1772 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001773 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1774 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1775 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1776 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1777 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1778 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1779 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001780 *p++ = hexdigit[ch & 0x0000000F];
1781 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001782 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001783#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001784 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1785 else if (ch >= 0xD800 && ch < 0xDC00) {
1786 Py_UNICODE ch2;
1787 Py_UCS4 ucs;
1788
1789 ch2 = *s++;
1790 size--;
1791 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1792 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1793 *p++ = '\\';
1794 *p++ = 'U';
1795 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1796 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1797 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1798 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1799 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1800 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1801 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1802 *p++ = hexdigit[ucs & 0x0000000F];
1803 continue;
1804 }
1805 /* Fall through: isolated surrogates are copied as-is */
1806 s--;
1807 size++;
1808 }
1809
Guido van Rossumd57fd912000-03-10 22:53:23 +00001810 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001811 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001812 *p++ = '\\';
1813 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001814 *p++ = hexdigit[(ch >> 12) & 0x000F];
1815 *p++ = hexdigit[(ch >> 8) & 0x000F];
1816 *p++ = hexdigit[(ch >> 4) & 0x000F];
1817 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001818 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001819
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001820 /* Map special whitespace to '\t', \n', '\r' */
1821 else if (ch == '\t') {
1822 *p++ = '\\';
1823 *p++ = 't';
1824 }
1825 else if (ch == '\n') {
1826 *p++ = '\\';
1827 *p++ = 'n';
1828 }
1829 else if (ch == '\r') {
1830 *p++ = '\\';
1831 *p++ = 'r';
1832 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001833
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001834 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001835 else if (ch < ' ' || ch >= 128) {
1836 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001837 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001838 *p++ = hexdigit[(ch >> 4) & 0x000F];
1839 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001841
Guido van Rossumd57fd912000-03-10 22:53:23 +00001842 /* Copy everything else as-is */
1843 else
1844 *p++ = (char) ch;
1845 }
1846 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001847 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848
1849 *p = '\0';
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001850 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001851 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001852
1853 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001854
1855 onError:
1856 Py_DECREF(repr);
1857 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001858}
1859
1860PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1861 int size)
1862{
1863 return unicodeescape_string(s, size, 0);
1864}
1865
1866PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1867{
1868 if (!PyUnicode_Check(unicode)) {
1869 PyErr_BadArgument();
1870 return NULL;
1871 }
1872 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1873 PyUnicode_GET_SIZE(unicode));
1874}
1875
1876/* --- Raw Unicode Escape Codec ------------------------------------------- */
1877
1878PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1879 int size,
1880 const char *errors)
1881{
1882 PyUnicodeObject *v;
1883 Py_UNICODE *p, *buf;
1884 const char *end;
1885 const char *bs;
1886
1887 /* Escaped strings will always be longer than the resulting
1888 Unicode string, so we start with size here and then reduce the
1889 length after conversion to the true value. */
1890 v = _PyUnicode_New(size);
1891 if (v == NULL)
1892 goto onError;
1893 if (size == 0)
1894 return (PyObject *)v;
1895 p = buf = PyUnicode_AS_UNICODE(v);
1896 end = s + size;
1897 while (s < end) {
1898 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001899 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001900 int i;
1901
1902 /* Non-escape characters are interpreted as Unicode ordinals */
1903 if (*s != '\\') {
1904 *p++ = (unsigned char)*s++;
1905 continue;
1906 }
1907
1908 /* \u-escapes are only interpreted iff the number of leading
1909 backslashes if odd */
1910 bs = s;
1911 for (;s < end;) {
1912 if (*s != '\\')
1913 break;
1914 *p++ = (unsigned char)*s++;
1915 }
1916 if (((s - bs) & 1) == 0 ||
1917 s >= end ||
1918 *s != 'u') {
1919 continue;
1920 }
1921 p--;
1922 s++;
1923
1924 /* \uXXXX with 4 hex digits */
1925 for (x = 0, i = 0; i < 4; i++) {
1926 c = (unsigned char)s[i];
1927 if (!isxdigit(c)) {
1928 if (unicodeescape_decoding_error(&s, &x, errors,
1929 "truncated \\uXXXX"))
1930 goto onError;
1931 i++;
1932 break;
1933 }
1934 x = (x<<4) & ~0xF;
1935 if (c >= '0' && c <= '9')
1936 x += c - '0';
1937 else if (c >= 'a' && c <= 'f')
1938 x += 10 + c - 'a';
1939 else
1940 x += 10 + c - 'A';
1941 }
1942 s += i;
1943 *p++ = x;
1944 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001945 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001946 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001947 return (PyObject *)v;
1948
1949 onError:
1950 Py_XDECREF(v);
1951 return NULL;
1952}
1953
1954PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1955 int size)
1956{
1957 PyObject *repr;
1958 char *p;
1959 char *q;
1960
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001961 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001962
1963 repr = PyString_FromStringAndSize(NULL, 6 * size);
1964 if (repr == NULL)
1965 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001966 if (size == 0)
1967 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001968
1969 p = q = PyString_AS_STRING(repr);
1970 while (size-- > 0) {
1971 Py_UNICODE ch = *s++;
1972 /* Map 16-bit characters to '\uxxxx' */
1973 if (ch >= 256) {
1974 *p++ = '\\';
1975 *p++ = 'u';
1976 *p++ = hexdigit[(ch >> 12) & 0xf];
1977 *p++ = hexdigit[(ch >> 8) & 0xf];
1978 *p++ = hexdigit[(ch >> 4) & 0xf];
1979 *p++ = hexdigit[ch & 15];
1980 }
1981 /* Copy everything else as-is */
1982 else
1983 *p++ = (char) ch;
1984 }
1985 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001986 if (_PyString_Resize(&repr, p - q))
1987 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988
1989 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001990
1991 onError:
1992 Py_DECREF(repr);
1993 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001994}
1995
1996PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1997{
1998 if (!PyUnicode_Check(unicode)) {
1999 PyErr_BadArgument();
2000 return NULL;
2001 }
2002 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2003 PyUnicode_GET_SIZE(unicode));
2004}
2005
2006/* --- Latin-1 Codec ------------------------------------------------------ */
2007
2008PyObject *PyUnicode_DecodeLatin1(const char *s,
2009 int size,
2010 const char *errors)
2011{
2012 PyUnicodeObject *v;
2013 Py_UNICODE *p;
2014
2015 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002016 if (size == 1 && *(unsigned char*)s < 256) {
2017 Py_UNICODE r = *(unsigned char*)s;
2018 return PyUnicode_FromUnicode(&r, 1);
2019 }
2020
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 v = _PyUnicode_New(size);
2022 if (v == NULL)
2023 goto onError;
2024 if (size == 0)
2025 return (PyObject *)v;
2026 p = PyUnicode_AS_UNICODE(v);
2027 while (size-- > 0)
2028 *p++ = (unsigned char)*s++;
2029 return (PyObject *)v;
2030
2031 onError:
2032 Py_XDECREF(v);
2033 return NULL;
2034}
2035
2036static
2037int latin1_encoding_error(const Py_UNICODE **source,
2038 char **dest,
2039 const char *errors,
2040 const char *details)
2041{
2042 if ((errors == NULL) ||
2043 (strcmp(errors,"strict") == 0)) {
2044 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002045 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046 details);
2047 return -1;
2048 }
2049 else if (strcmp(errors,"ignore") == 0) {
2050 return 0;
2051 }
2052 else if (strcmp(errors,"replace") == 0) {
2053 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002054 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002055 return 0;
2056 }
2057 else {
2058 PyErr_Format(PyExc_ValueError,
2059 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002060 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061 errors);
2062 return -1;
2063 }
2064}
2065
2066PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2067 int size,
2068 const char *errors)
2069{
2070 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002071 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002072
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073 repr = PyString_FromStringAndSize(NULL, size);
2074 if (repr == NULL)
2075 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002076 if (size == 0)
2077 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078
2079 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002080 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002081 while (size-- > 0) {
2082 Py_UNICODE ch = *p++;
2083 if (ch >= 256) {
2084 if (latin1_encoding_error(&p, &s, errors,
2085 "ordinal not in range(256)"))
2086 goto onError;
2087 }
2088 else
2089 *s++ = (char)ch;
2090 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002091 /* Resize if error handling skipped some characters */
2092 if (s - start < PyString_GET_SIZE(repr))
2093 if (_PyString_Resize(&repr, s - start))
2094 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002095 return repr;
2096
2097 onError:
2098 Py_DECREF(repr);
2099 return NULL;
2100}
2101
2102PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2103{
2104 if (!PyUnicode_Check(unicode)) {
2105 PyErr_BadArgument();
2106 return NULL;
2107 }
2108 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2109 PyUnicode_GET_SIZE(unicode),
2110 NULL);
2111}
2112
2113/* --- 7-bit ASCII Codec -------------------------------------------------- */
2114
2115static
2116int ascii_decoding_error(const char **source,
2117 Py_UNICODE **dest,
2118 const char *errors,
2119 const char *details)
2120{
2121 if ((errors == NULL) ||
2122 (strcmp(errors,"strict") == 0)) {
2123 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002124 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125 details);
2126 return -1;
2127 }
2128 else if (strcmp(errors,"ignore") == 0) {
2129 return 0;
2130 }
2131 else if (strcmp(errors,"replace") == 0) {
2132 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2133 (*dest)++;
2134 return 0;
2135 }
2136 else {
2137 PyErr_Format(PyExc_ValueError,
2138 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002139 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002140 errors);
2141 return -1;
2142 }
2143}
2144
2145PyObject *PyUnicode_DecodeASCII(const char *s,
2146 int size,
2147 const char *errors)
2148{
2149 PyUnicodeObject *v;
2150 Py_UNICODE *p;
2151
2152 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002153 if (size == 1 && *(unsigned char*)s < 128) {
2154 Py_UNICODE r = *(unsigned char*)s;
2155 return PyUnicode_FromUnicode(&r, 1);
2156 }
2157
Guido van Rossumd57fd912000-03-10 22:53:23 +00002158 v = _PyUnicode_New(size);
2159 if (v == NULL)
2160 goto onError;
2161 if (size == 0)
2162 return (PyObject *)v;
2163 p = PyUnicode_AS_UNICODE(v);
2164 while (size-- > 0) {
2165 register unsigned char c;
2166
2167 c = (unsigned char)*s++;
2168 if (c < 128)
2169 *p++ = c;
2170 else if (ascii_decoding_error(&s, &p, errors,
2171 "ordinal not in range(128)"))
2172 goto onError;
2173 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002174 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002175 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002176 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177 return (PyObject *)v;
2178
2179 onError:
2180 Py_XDECREF(v);
2181 return NULL;
2182}
2183
2184static
2185int ascii_encoding_error(const Py_UNICODE **source,
2186 char **dest,
2187 const char *errors,
2188 const char *details)
2189{
2190 if ((errors == NULL) ||
2191 (strcmp(errors,"strict") == 0)) {
2192 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002193 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002194 details);
2195 return -1;
2196 }
2197 else if (strcmp(errors,"ignore") == 0) {
2198 return 0;
2199 }
2200 else if (strcmp(errors,"replace") == 0) {
2201 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002202 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002203 return 0;
2204 }
2205 else {
2206 PyErr_Format(PyExc_ValueError,
2207 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002208 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209 errors);
2210 return -1;
2211 }
2212}
2213
2214PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2215 int size,
2216 const char *errors)
2217{
2218 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002219 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002220
Guido van Rossumd57fd912000-03-10 22:53:23 +00002221 repr = PyString_FromStringAndSize(NULL, size);
2222 if (repr == NULL)
2223 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002224 if (size == 0)
2225 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002226
2227 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002228 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002229 while (size-- > 0) {
2230 Py_UNICODE ch = *p++;
2231 if (ch >= 128) {
2232 if (ascii_encoding_error(&p, &s, errors,
2233 "ordinal not in range(128)"))
2234 goto onError;
2235 }
2236 else
2237 *s++ = (char)ch;
2238 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002239 /* Resize if error handling skipped some characters */
2240 if (s - start < PyString_GET_SIZE(repr))
2241 if (_PyString_Resize(&repr, s - start))
2242 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002243 return repr;
2244
2245 onError:
2246 Py_DECREF(repr);
2247 return NULL;
2248}
2249
2250PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2251{
2252 if (!PyUnicode_Check(unicode)) {
2253 PyErr_BadArgument();
2254 return NULL;
2255 }
2256 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2257 PyUnicode_GET_SIZE(unicode),
2258 NULL);
2259}
2260
Fredrik Lundh30831632001-06-26 15:11:00 +00002261#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002262
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002263/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002264
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002265PyObject *PyUnicode_DecodeMBCS(const char *s,
2266 int size,
2267 const char *errors)
2268{
2269 PyUnicodeObject *v;
2270 Py_UNICODE *p;
2271
2272 /* First get the size of the result */
2273 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002274 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002275 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2276
2277 v = _PyUnicode_New(usize);
2278 if (v == NULL)
2279 return NULL;
2280 if (usize == 0)
2281 return (PyObject *)v;
2282 p = PyUnicode_AS_UNICODE(v);
2283 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2284 Py_DECREF(v);
2285 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2286 }
2287
2288 return (PyObject *)v;
2289}
2290
2291PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2292 int size,
2293 const char *errors)
2294{
2295 PyObject *repr;
2296 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002297 DWORD mbcssize;
2298
2299 /* If there are no characters, bail now! */
2300 if (size==0)
2301 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002302
2303 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002304 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002305 if (mbcssize==0)
2306 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2307
2308 repr = PyString_FromStringAndSize(NULL, mbcssize);
2309 if (repr == NULL)
2310 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002311 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002312 return repr;
2313
2314 /* Do the conversion */
2315 s = PyString_AS_STRING(repr);
2316 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2317 Py_DECREF(repr);
2318 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2319 }
2320 return repr;
2321}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002322
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002323#endif /* MS_WIN32 */
2324
Guido van Rossumd57fd912000-03-10 22:53:23 +00002325/* --- Character Mapping Codec -------------------------------------------- */
2326
2327static
2328int charmap_decoding_error(const char **source,
2329 Py_UNICODE **dest,
2330 const char *errors,
2331 const char *details)
2332{
2333 if ((errors == NULL) ||
2334 (strcmp(errors,"strict") == 0)) {
2335 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002336 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002337 details);
2338 return -1;
2339 }
2340 else if (strcmp(errors,"ignore") == 0) {
2341 return 0;
2342 }
2343 else if (strcmp(errors,"replace") == 0) {
2344 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2345 (*dest)++;
2346 return 0;
2347 }
2348 else {
2349 PyErr_Format(PyExc_ValueError,
2350 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002351 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002352 errors);
2353 return -1;
2354 }
2355}
2356
2357PyObject *PyUnicode_DecodeCharmap(const char *s,
2358 int size,
2359 PyObject *mapping,
2360 const char *errors)
2361{
2362 PyUnicodeObject *v;
2363 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002364 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002365
2366 /* Default to Latin-1 */
2367 if (mapping == NULL)
2368 return PyUnicode_DecodeLatin1(s, size, errors);
2369
2370 v = _PyUnicode_New(size);
2371 if (v == NULL)
2372 goto onError;
2373 if (size == 0)
2374 return (PyObject *)v;
2375 p = PyUnicode_AS_UNICODE(v);
2376 while (size-- > 0) {
2377 unsigned char ch = *s++;
2378 PyObject *w, *x;
2379
2380 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2381 w = PyInt_FromLong((long)ch);
2382 if (w == NULL)
2383 goto onError;
2384 x = PyObject_GetItem(mapping, w);
2385 Py_DECREF(w);
2386 if (x == NULL) {
2387 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002388 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002389 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002390 x = Py_None;
2391 Py_INCREF(x);
2392 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002393 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002394 }
2395
2396 /* Apply mapping */
2397 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002398 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002399 if (value < 0 || value > 65535) {
2400 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002401 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002402 Py_DECREF(x);
2403 goto onError;
2404 }
2405 *p++ = (Py_UNICODE)value;
2406 }
2407 else if (x == Py_None) {
2408 /* undefined mapping */
2409 if (charmap_decoding_error(&s, &p, errors,
2410 "character maps to <undefined>")) {
2411 Py_DECREF(x);
2412 goto onError;
2413 }
2414 }
2415 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002416 int targetsize = PyUnicode_GET_SIZE(x);
2417
2418 if (targetsize == 1)
2419 /* 1-1 mapping */
2420 *p++ = *PyUnicode_AS_UNICODE(x);
2421
2422 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002423 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002424 if (targetsize > extrachars) {
2425 /* resize first */
2426 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2427 int needed = (targetsize - extrachars) + \
2428 (targetsize << 2);
2429 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002430 if (_PyUnicode_Resize(&v,
2431 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002432 Py_DECREF(x);
2433 goto onError;
2434 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002435 p = PyUnicode_AS_UNICODE(v) + oldpos;
2436 }
2437 Py_UNICODE_COPY(p,
2438 PyUnicode_AS_UNICODE(x),
2439 targetsize);
2440 p += targetsize;
2441 extrachars -= targetsize;
2442 }
2443 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002444 }
2445 else {
2446 /* wrong return value */
2447 PyErr_SetString(PyExc_TypeError,
2448 "character mapping must return integer, None or unicode");
2449 Py_DECREF(x);
2450 goto onError;
2451 }
2452 Py_DECREF(x);
2453 }
2454 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002455 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002456 goto onError;
2457 return (PyObject *)v;
2458
2459 onError:
2460 Py_XDECREF(v);
2461 return NULL;
2462}
2463
2464static
2465int charmap_encoding_error(const Py_UNICODE **source,
2466 char **dest,
2467 const char *errors,
2468 const char *details)
2469{
2470 if ((errors == NULL) ||
2471 (strcmp(errors,"strict") == 0)) {
2472 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002473 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002474 details);
2475 return -1;
2476 }
2477 else if (strcmp(errors,"ignore") == 0) {
2478 return 0;
2479 }
2480 else if (strcmp(errors,"replace") == 0) {
2481 **dest = '?';
2482 (*dest)++;
2483 return 0;
2484 }
2485 else {
2486 PyErr_Format(PyExc_ValueError,
2487 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002488 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002489 errors);
2490 return -1;
2491 }
2492}
2493
2494PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2495 int size,
2496 PyObject *mapping,
2497 const char *errors)
2498{
2499 PyObject *v;
2500 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002501 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002502
2503 /* Default to Latin-1 */
2504 if (mapping == NULL)
2505 return PyUnicode_EncodeLatin1(p, size, errors);
2506
2507 v = PyString_FromStringAndSize(NULL, size);
2508 if (v == NULL)
2509 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002510 if (size == 0)
2511 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002512 s = PyString_AS_STRING(v);
2513 while (size-- > 0) {
2514 Py_UNICODE ch = *p++;
2515 PyObject *w, *x;
2516
2517 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2518 w = PyInt_FromLong((long)ch);
2519 if (w == NULL)
2520 goto onError;
2521 x = PyObject_GetItem(mapping, w);
2522 Py_DECREF(w);
2523 if (x == NULL) {
2524 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002525 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002527 x = Py_None;
2528 Py_INCREF(x);
2529 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002530 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002531 }
2532
2533 /* Apply mapping */
2534 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002535 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002536 if (value < 0 || value > 255) {
2537 PyErr_SetString(PyExc_TypeError,
2538 "character mapping must be in range(256)");
2539 Py_DECREF(x);
2540 goto onError;
2541 }
2542 *s++ = (char)value;
2543 }
2544 else if (x == Py_None) {
2545 /* undefined mapping */
2546 if (charmap_encoding_error(&p, &s, errors,
2547 "character maps to <undefined>")) {
2548 Py_DECREF(x);
2549 goto onError;
2550 }
2551 }
2552 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002553 int targetsize = PyString_GET_SIZE(x);
2554
2555 if (targetsize == 1)
2556 /* 1-1 mapping */
2557 *s++ = *PyString_AS_STRING(x);
2558
2559 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002560 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002561 if (targetsize > extrachars) {
2562 /* resize first */
2563 int oldpos = (int)(s - PyString_AS_STRING(v));
2564 int needed = (targetsize - extrachars) + \
2565 (targetsize << 2);
2566 extrachars += needed;
2567 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002568 Py_DECREF(x);
2569 goto onError;
2570 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002571 s = PyString_AS_STRING(v) + oldpos;
2572 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002573 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002574 s += targetsize;
2575 extrachars -= targetsize;
2576 }
2577 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002578 }
2579 else {
2580 /* wrong return value */
2581 PyErr_SetString(PyExc_TypeError,
2582 "character mapping must return integer, None or unicode");
2583 Py_DECREF(x);
2584 goto onError;
2585 }
2586 Py_DECREF(x);
2587 }
2588 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2589 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2590 goto onError;
2591 return v;
2592
2593 onError:
2594 Py_DECREF(v);
2595 return NULL;
2596}
2597
2598PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2599 PyObject *mapping)
2600{
2601 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2602 PyErr_BadArgument();
2603 return NULL;
2604 }
2605 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2606 PyUnicode_GET_SIZE(unicode),
2607 mapping,
2608 NULL);
2609}
2610
2611static
2612int translate_error(const Py_UNICODE **source,
2613 Py_UNICODE **dest,
2614 const char *errors,
2615 const char *details)
2616{
2617 if ((errors == NULL) ||
2618 (strcmp(errors,"strict") == 0)) {
2619 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002620 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002621 details);
2622 return -1;
2623 }
2624 else if (strcmp(errors,"ignore") == 0) {
2625 return 0;
2626 }
2627 else if (strcmp(errors,"replace") == 0) {
2628 **dest = '?';
2629 (*dest)++;
2630 return 0;
2631 }
2632 else {
2633 PyErr_Format(PyExc_ValueError,
2634 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002635 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002636 errors);
2637 return -1;
2638 }
2639}
2640
2641PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2642 int size,
2643 PyObject *mapping,
2644 const char *errors)
2645{
2646 PyUnicodeObject *v;
2647 Py_UNICODE *p;
2648
2649 if (mapping == NULL) {
2650 PyErr_BadArgument();
2651 return NULL;
2652 }
2653
2654 /* Output will never be longer than input */
2655 v = _PyUnicode_New(size);
2656 if (v == NULL)
2657 goto onError;
2658 if (size == 0)
2659 goto done;
2660 p = PyUnicode_AS_UNICODE(v);
2661 while (size-- > 0) {
2662 Py_UNICODE ch = *s++;
2663 PyObject *w, *x;
2664
2665 /* Get mapping */
2666 w = PyInt_FromLong(ch);
2667 if (w == NULL)
2668 goto onError;
2669 x = PyObject_GetItem(mapping, w);
2670 Py_DECREF(w);
2671 if (x == NULL) {
2672 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2673 /* No mapping found: default to 1-1 mapping */
2674 PyErr_Clear();
2675 *p++ = ch;
2676 continue;
2677 }
2678 goto onError;
2679 }
2680
2681 /* Apply mapping */
2682 if (PyInt_Check(x))
2683 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2684 else if (x == Py_None) {
2685 /* undefined mapping */
2686 if (translate_error(&s, &p, errors,
2687 "character maps to <undefined>")) {
2688 Py_DECREF(x);
2689 goto onError;
2690 }
2691 }
2692 else if (PyUnicode_Check(x)) {
2693 if (PyUnicode_GET_SIZE(x) != 1) {
2694 /* 1-n mapping */
2695 PyErr_SetString(PyExc_NotImplementedError,
2696 "1-n mappings are currently not implemented");
2697 Py_DECREF(x);
2698 goto onError;
2699 }
2700 *p++ = *PyUnicode_AS_UNICODE(x);
2701 }
2702 else {
2703 /* wrong return value */
2704 PyErr_SetString(PyExc_TypeError,
2705 "translate mapping must return integer, None or unicode");
2706 Py_DECREF(x);
2707 goto onError;
2708 }
2709 Py_DECREF(x);
2710 }
2711 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002712 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002714
2715 done:
2716 return (PyObject *)v;
2717
2718 onError:
2719 Py_XDECREF(v);
2720 return NULL;
2721}
2722
2723PyObject *PyUnicode_Translate(PyObject *str,
2724 PyObject *mapping,
2725 const char *errors)
2726{
2727 PyObject *result;
2728
2729 str = PyUnicode_FromObject(str);
2730 if (str == NULL)
2731 goto onError;
2732 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2733 PyUnicode_GET_SIZE(str),
2734 mapping,
2735 errors);
2736 Py_DECREF(str);
2737 return result;
2738
2739 onError:
2740 Py_XDECREF(str);
2741 return NULL;
2742}
2743
Guido van Rossum9e896b32000-04-05 20:11:21 +00002744/* --- Decimal Encoder ---------------------------------------------------- */
2745
2746int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2747 int length,
2748 char *output,
2749 const char *errors)
2750{
2751 Py_UNICODE *p, *end;
2752
2753 if (output == NULL) {
2754 PyErr_BadArgument();
2755 return -1;
2756 }
2757
2758 p = s;
2759 end = s + length;
2760 while (p < end) {
2761 register Py_UNICODE ch = *p++;
2762 int decimal;
2763
2764 if (Py_UNICODE_ISSPACE(ch)) {
2765 *output++ = ' ';
2766 continue;
2767 }
2768 decimal = Py_UNICODE_TODECIMAL(ch);
2769 if (decimal >= 0) {
2770 *output++ = '0' + decimal;
2771 continue;
2772 }
Guido van Rossumba477042000-04-06 18:18:10 +00002773 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002774 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002775 continue;
2776 }
2777 /* All other characters are considered invalid */
2778 if (errors == NULL || strcmp(errors, "strict") == 0) {
2779 PyErr_SetString(PyExc_ValueError,
2780 "invalid decimal Unicode string");
2781 goto onError;
2782 }
2783 else if (strcmp(errors, "ignore") == 0)
2784 continue;
2785 else if (strcmp(errors, "replace") == 0) {
2786 *output++ = '?';
2787 continue;
2788 }
2789 }
2790 /* 0-terminate the output string */
2791 *output++ = '\0';
2792 return 0;
2793
2794 onError:
2795 return -1;
2796}
2797
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798/* --- Helpers ------------------------------------------------------------ */
2799
2800static
2801int count(PyUnicodeObject *self,
2802 int start,
2803 int end,
2804 PyUnicodeObject *substring)
2805{
2806 int count = 0;
2807
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002808 if (start < 0)
2809 start += self->length;
2810 if (start < 0)
2811 start = 0;
2812 if (end > self->length)
2813 end = self->length;
2814 if (end < 0)
2815 end += self->length;
2816 if (end < 0)
2817 end = 0;
2818
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002819 if (substring->length == 0)
2820 return (end - start + 1);
2821
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822 end -= substring->length;
2823
2824 while (start <= end)
2825 if (Py_UNICODE_MATCH(self, start, substring)) {
2826 count++;
2827 start += substring->length;
2828 } else
2829 start++;
2830
2831 return count;
2832}
2833
2834int PyUnicode_Count(PyObject *str,
2835 PyObject *substr,
2836 int start,
2837 int end)
2838{
2839 int result;
2840
2841 str = PyUnicode_FromObject(str);
2842 if (str == NULL)
2843 return -1;
2844 substr = PyUnicode_FromObject(substr);
2845 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002846 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 return -1;
2848 }
2849
2850 result = count((PyUnicodeObject *)str,
2851 start, end,
2852 (PyUnicodeObject *)substr);
2853
2854 Py_DECREF(str);
2855 Py_DECREF(substr);
2856 return result;
2857}
2858
2859static
2860int findstring(PyUnicodeObject *self,
2861 PyUnicodeObject *substring,
2862 int start,
2863 int end,
2864 int direction)
2865{
2866 if (start < 0)
2867 start += self->length;
2868 if (start < 0)
2869 start = 0;
2870
2871 if (substring->length == 0)
2872 return start;
2873
2874 if (end > self->length)
2875 end = self->length;
2876 if (end < 0)
2877 end += self->length;
2878 if (end < 0)
2879 end = 0;
2880
2881 end -= substring->length;
2882
2883 if (direction < 0) {
2884 for (; end >= start; end--)
2885 if (Py_UNICODE_MATCH(self, end, substring))
2886 return end;
2887 } else {
2888 for (; start <= end; start++)
2889 if (Py_UNICODE_MATCH(self, start, substring))
2890 return start;
2891 }
2892
2893 return -1;
2894}
2895
2896int PyUnicode_Find(PyObject *str,
2897 PyObject *substr,
2898 int start,
2899 int end,
2900 int direction)
2901{
2902 int result;
2903
2904 str = PyUnicode_FromObject(str);
2905 if (str == NULL)
2906 return -1;
2907 substr = PyUnicode_FromObject(substr);
2908 if (substr == NULL) {
2909 Py_DECREF(substr);
2910 return -1;
2911 }
2912
2913 result = findstring((PyUnicodeObject *)str,
2914 (PyUnicodeObject *)substr,
2915 start, end, direction);
2916 Py_DECREF(str);
2917 Py_DECREF(substr);
2918 return result;
2919}
2920
2921static
2922int tailmatch(PyUnicodeObject *self,
2923 PyUnicodeObject *substring,
2924 int start,
2925 int end,
2926 int direction)
2927{
2928 if (start < 0)
2929 start += self->length;
2930 if (start < 0)
2931 start = 0;
2932
2933 if (substring->length == 0)
2934 return 1;
2935
2936 if (end > self->length)
2937 end = self->length;
2938 if (end < 0)
2939 end += self->length;
2940 if (end < 0)
2941 end = 0;
2942
2943 end -= substring->length;
2944 if (end < start)
2945 return 0;
2946
2947 if (direction > 0) {
2948 if (Py_UNICODE_MATCH(self, end, substring))
2949 return 1;
2950 } else {
2951 if (Py_UNICODE_MATCH(self, start, substring))
2952 return 1;
2953 }
2954
2955 return 0;
2956}
2957
2958int PyUnicode_Tailmatch(PyObject *str,
2959 PyObject *substr,
2960 int start,
2961 int end,
2962 int direction)
2963{
2964 int result;
2965
2966 str = PyUnicode_FromObject(str);
2967 if (str == NULL)
2968 return -1;
2969 substr = PyUnicode_FromObject(substr);
2970 if (substr == NULL) {
2971 Py_DECREF(substr);
2972 return -1;
2973 }
2974
2975 result = tailmatch((PyUnicodeObject *)str,
2976 (PyUnicodeObject *)substr,
2977 start, end, direction);
2978 Py_DECREF(str);
2979 Py_DECREF(substr);
2980 return result;
2981}
2982
2983static
2984const Py_UNICODE *findchar(const Py_UNICODE *s,
2985 int size,
2986 Py_UNICODE ch)
2987{
2988 /* like wcschr, but doesn't stop at NULL characters */
2989
2990 while (size-- > 0) {
2991 if (*s == ch)
2992 return s;
2993 s++;
2994 }
2995
2996 return NULL;
2997}
2998
2999/* Apply fixfct filter to the Unicode object self and return a
3000 reference to the modified object */
3001
3002static
3003PyObject *fixup(PyUnicodeObject *self,
3004 int (*fixfct)(PyUnicodeObject *s))
3005{
3006
3007 PyUnicodeObject *u;
3008
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003009 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003010 if (u == NULL)
3011 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003012
3013 Py_UNICODE_COPY(u->str, self->str, self->length);
3014
Tim Peters7a29bd52001-09-12 03:03:31 +00003015 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003016 /* fixfct should return TRUE if it modified the buffer. If
3017 FALSE, return a reference to the original buffer instead
3018 (to save space, not time) */
3019 Py_INCREF(self);
3020 Py_DECREF(u);
3021 return (PyObject*) self;
3022 }
3023 return (PyObject*) u;
3024}
3025
3026static
3027int fixupper(PyUnicodeObject *self)
3028{
3029 int len = self->length;
3030 Py_UNICODE *s = self->str;
3031 int status = 0;
3032
3033 while (len-- > 0) {
3034 register Py_UNICODE ch;
3035
3036 ch = Py_UNICODE_TOUPPER(*s);
3037 if (ch != *s) {
3038 status = 1;
3039 *s = ch;
3040 }
3041 s++;
3042 }
3043
3044 return status;
3045}
3046
3047static
3048int fixlower(PyUnicodeObject *self)
3049{
3050 int len = self->length;
3051 Py_UNICODE *s = self->str;
3052 int status = 0;
3053
3054 while (len-- > 0) {
3055 register Py_UNICODE ch;
3056
3057 ch = Py_UNICODE_TOLOWER(*s);
3058 if (ch != *s) {
3059 status = 1;
3060 *s = ch;
3061 }
3062 s++;
3063 }
3064
3065 return status;
3066}
3067
3068static
3069int fixswapcase(PyUnicodeObject *self)
3070{
3071 int len = self->length;
3072 Py_UNICODE *s = self->str;
3073 int status = 0;
3074
3075 while (len-- > 0) {
3076 if (Py_UNICODE_ISUPPER(*s)) {
3077 *s = Py_UNICODE_TOLOWER(*s);
3078 status = 1;
3079 } else if (Py_UNICODE_ISLOWER(*s)) {
3080 *s = Py_UNICODE_TOUPPER(*s);
3081 status = 1;
3082 }
3083 s++;
3084 }
3085
3086 return status;
3087}
3088
3089static
3090int fixcapitalize(PyUnicodeObject *self)
3091{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003092 int len = self->length;
3093 Py_UNICODE *s = self->str;
3094 int status = 0;
3095
3096 if (len == 0)
3097 return 0;
3098 if (Py_UNICODE_ISLOWER(*s)) {
3099 *s = Py_UNICODE_TOUPPER(*s);
3100 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003101 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003102 s++;
3103 while (--len > 0) {
3104 if (Py_UNICODE_ISUPPER(*s)) {
3105 *s = Py_UNICODE_TOLOWER(*s);
3106 status = 1;
3107 }
3108 s++;
3109 }
3110 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003111}
3112
3113static
3114int fixtitle(PyUnicodeObject *self)
3115{
3116 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3117 register Py_UNICODE *e;
3118 int previous_is_cased;
3119
3120 /* Shortcut for single character strings */
3121 if (PyUnicode_GET_SIZE(self) == 1) {
3122 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3123 if (*p != ch) {
3124 *p = ch;
3125 return 1;
3126 }
3127 else
3128 return 0;
3129 }
3130
3131 e = p + PyUnicode_GET_SIZE(self);
3132 previous_is_cased = 0;
3133 for (; p < e; p++) {
3134 register const Py_UNICODE ch = *p;
3135
3136 if (previous_is_cased)
3137 *p = Py_UNICODE_TOLOWER(ch);
3138 else
3139 *p = Py_UNICODE_TOTITLE(ch);
3140
3141 if (Py_UNICODE_ISLOWER(ch) ||
3142 Py_UNICODE_ISUPPER(ch) ||
3143 Py_UNICODE_ISTITLE(ch))
3144 previous_is_cased = 1;
3145 else
3146 previous_is_cased = 0;
3147 }
3148 return 1;
3149}
3150
3151PyObject *PyUnicode_Join(PyObject *separator,
3152 PyObject *seq)
3153{
3154 Py_UNICODE *sep;
3155 int seplen;
3156 PyUnicodeObject *res = NULL;
3157 int reslen = 0;
3158 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159 int sz = 100;
3160 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003161 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003162
Tim Peters2cfe3682001-05-05 05:36:48 +00003163 it = PyObject_GetIter(seq);
3164 if (it == NULL)
3165 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166
3167 if (separator == NULL) {
3168 Py_UNICODE blank = ' ';
3169 sep = &blank;
3170 seplen = 1;
3171 }
3172 else {
3173 separator = PyUnicode_FromObject(separator);
3174 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003175 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003176 sep = PyUnicode_AS_UNICODE(separator);
3177 seplen = PyUnicode_GET_SIZE(separator);
3178 }
3179
3180 res = _PyUnicode_New(sz);
3181 if (res == NULL)
3182 goto onError;
3183 p = PyUnicode_AS_UNICODE(res);
3184 reslen = 0;
3185
Tim Peters2cfe3682001-05-05 05:36:48 +00003186 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003187 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003188 PyObject *item = PyIter_Next(it);
3189 if (item == NULL) {
3190 if (PyErr_Occurred())
3191 goto onError;
3192 break;
3193 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003194 if (!PyUnicode_Check(item)) {
3195 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003196 if (!PyString_Check(item)) {
3197 PyErr_Format(PyExc_TypeError,
3198 "sequence item %i: expected string or Unicode,"
3199 " %.80s found",
3200 i, item->ob_type->tp_name);
3201 Py_DECREF(item);
3202 goto onError;
3203 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204 v = PyUnicode_FromObject(item);
3205 Py_DECREF(item);
3206 item = v;
3207 if (item == NULL)
3208 goto onError;
3209 }
3210 itemlen = PyUnicode_GET_SIZE(item);
3211 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003212 if (_PyUnicode_Resize(&res, sz*2)) {
3213 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003215 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003216 sz *= 2;
3217 p = PyUnicode_AS_UNICODE(res) + reslen;
3218 }
3219 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003220 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221 p += seplen;
3222 reslen += seplen;
3223 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003224 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225 p += itemlen;
3226 reslen += itemlen;
3227 Py_DECREF(item);
3228 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003229 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003230 goto onError;
3231
3232 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003233 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 return (PyObject *)res;
3235
3236 onError:
3237 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003238 Py_XDECREF(res);
3239 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003240 return NULL;
3241}
3242
3243static
3244PyUnicodeObject *pad(PyUnicodeObject *self,
3245 int left,
3246 int right,
3247 Py_UNICODE fill)
3248{
3249 PyUnicodeObject *u;
3250
3251 if (left < 0)
3252 left = 0;
3253 if (right < 0)
3254 right = 0;
3255
Tim Peters7a29bd52001-09-12 03:03:31 +00003256 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003257 Py_INCREF(self);
3258 return self;
3259 }
3260
3261 u = _PyUnicode_New(left + self->length + right);
3262 if (u) {
3263 if (left)
3264 Py_UNICODE_FILL(u->str, fill, left);
3265 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3266 if (right)
3267 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3268 }
3269
3270 return u;
3271}
3272
/* Append the slice data[left:right] to list as a new Unicode object.

   The macro relies on names from the scope it is expanded in: a
   PyObject *str scratch variable, the target list and an onError label
   that is jumped to when creating the slice or appending it fails.  The
   temporary reference held in str is released in either case.

   Wrapped in do { } while (0) so that SPLIT_APPEND(...); is a single
   statement; the arguments are parenthesized. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3283
3284static
3285PyObject *split_whitespace(PyUnicodeObject *self,
3286 PyObject *list,
3287 int maxcount)
3288{
3289 register int i;
3290 register int j;
3291 int len = self->length;
3292 PyObject *str;
3293
3294 for (i = j = 0; i < len; ) {
3295 /* find a token */
3296 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3297 i++;
3298 j = i;
3299 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3300 i++;
3301 if (j < i) {
3302 if (maxcount-- <= 0)
3303 break;
3304 SPLIT_APPEND(self->str, j, i);
3305 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3306 i++;
3307 j = i;
3308 }
3309 }
3310 if (j < len) {
3311 SPLIT_APPEND(self->str, j, len);
3312 }
3313 return list;
3314
3315 onError:
3316 Py_DECREF(list);
3317 return NULL;
3318}
3319
3320PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003321 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003322{
3323 register int i;
3324 register int j;
3325 int len;
3326 PyObject *list;
3327 PyObject *str;
3328 Py_UNICODE *data;
3329
3330 string = PyUnicode_FromObject(string);
3331 if (string == NULL)
3332 return NULL;
3333 data = PyUnicode_AS_UNICODE(string);
3334 len = PyUnicode_GET_SIZE(string);
3335
Guido van Rossumd57fd912000-03-10 22:53:23 +00003336 list = PyList_New(0);
3337 if (!list)
3338 goto onError;
3339
3340 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003341 int eol;
3342
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343 /* Find a line and append it */
3344 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3345 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003346
3347 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003348 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003349 if (i < len) {
3350 if (data[i] == '\r' && i + 1 < len &&
3351 data[i+1] == '\n')
3352 i += 2;
3353 else
3354 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003355 if (keepends)
3356 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003357 }
Guido van Rossum86662912000-04-11 15:38:46 +00003358 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359 j = i;
3360 }
3361 if (j < len) {
3362 SPLIT_APPEND(data, j, len);
3363 }
3364
3365 Py_DECREF(string);
3366 return list;
3367
3368 onError:
3369 Py_DECREF(list);
3370 Py_DECREF(string);
3371 return NULL;
3372}
3373
3374static
3375PyObject *split_char(PyUnicodeObject *self,
3376 PyObject *list,
3377 Py_UNICODE ch,
3378 int maxcount)
3379{
3380 register int i;
3381 register int j;
3382 int len = self->length;
3383 PyObject *str;
3384
3385 for (i = j = 0; i < len; ) {
3386 if (self->str[i] == ch) {
3387 if (maxcount-- <= 0)
3388 break;
3389 SPLIT_APPEND(self->str, j, i);
3390 i = j = i + 1;
3391 } else
3392 i++;
3393 }
3394 if (j <= len) {
3395 SPLIT_APPEND(self->str, j, len);
3396 }
3397 return list;
3398
3399 onError:
3400 Py_DECREF(list);
3401 return NULL;
3402}
3403
3404static
3405PyObject *split_substring(PyUnicodeObject *self,
3406 PyObject *list,
3407 PyUnicodeObject *substring,
3408 int maxcount)
3409{
3410 register int i;
3411 register int j;
3412 int len = self->length;
3413 int sublen = substring->length;
3414 PyObject *str;
3415
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003416 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003417 if (Py_UNICODE_MATCH(self, i, substring)) {
3418 if (maxcount-- <= 0)
3419 break;
3420 SPLIT_APPEND(self->str, j, i);
3421 i = j = i + sublen;
3422 } else
3423 i++;
3424 }
3425 if (j <= len) {
3426 SPLIT_APPEND(self->str, j, len);
3427 }
3428 return list;
3429
3430 onError:
3431 Py_DECREF(list);
3432 return NULL;
3433}
3434
3435#undef SPLIT_APPEND
3436
3437static
3438PyObject *split(PyUnicodeObject *self,
3439 PyUnicodeObject *substring,
3440 int maxcount)
3441{
3442 PyObject *list;
3443
3444 if (maxcount < 0)
3445 maxcount = INT_MAX;
3446
3447 list = PyList_New(0);
3448 if (!list)
3449 return NULL;
3450
3451 if (substring == NULL)
3452 return split_whitespace(self,list,maxcount);
3453
3454 else if (substring->length == 1)
3455 return split_char(self,list,substring->str[0],maxcount);
3456
3457 else if (substring->length == 0) {
3458 Py_DECREF(list);
3459 PyErr_SetString(PyExc_ValueError, "empty separator");
3460 return NULL;
3461 }
3462 else
3463 return split_substring(self,list,substring,maxcount);
3464}
3465
3466static
3467PyObject *strip(PyUnicodeObject *self,
3468 int left,
3469 int right)
3470{
3471 Py_UNICODE *p = self->str;
3472 int start = 0;
3473 int end = self->length;
3474
3475 if (left)
3476 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3477 start++;
3478
3479 if (right)
3480 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3481 end--;
3482
Tim Peters7a29bd52001-09-12 03:03:31 +00003483 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003484 /* couldn't strip anything off, return original string */
3485 Py_INCREF(self);
3486 return (PyObject*) self;
3487 }
3488
3489 return (PyObject*) PyUnicode_FromUnicode(
3490 self->str + start,
3491 end - start
3492 );
3493}
3494
3495static
3496PyObject *replace(PyUnicodeObject *self,
3497 PyUnicodeObject *str1,
3498 PyUnicodeObject *str2,
3499 int maxcount)
3500{
3501 PyUnicodeObject *u;
3502
3503 if (maxcount < 0)
3504 maxcount = INT_MAX;
3505
3506 if (str1->length == 1 && str2->length == 1) {
3507 int i;
3508
3509 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003510 if (!findchar(self->str, self->length, str1->str[0]) &&
3511 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003512 /* nothing to replace, return original string */
3513 Py_INCREF(self);
3514 u = self;
3515 } else {
3516 Py_UNICODE u1 = str1->str[0];
3517 Py_UNICODE u2 = str2->str[0];
3518
3519 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003520 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003521 self->length
3522 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003523 if (u != NULL) {
3524 Py_UNICODE_COPY(u->str, self->str,
3525 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003526 for (i = 0; i < u->length; i++)
3527 if (u->str[i] == u1) {
3528 if (--maxcount < 0)
3529 break;
3530 u->str[i] = u2;
3531 }
3532 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003533 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003534
3535 } else {
3536 int n, i;
3537 Py_UNICODE *p;
3538
3539 /* replace strings */
3540 n = count(self, 0, self->length, str1);
3541 if (n > maxcount)
3542 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003543 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544 /* nothing to replace, return original string */
3545 Py_INCREF(self);
3546 u = self;
3547 } else {
3548 u = _PyUnicode_New(
3549 self->length + n * (str2->length - str1->length));
3550 if (u) {
3551 i = 0;
3552 p = u->str;
3553 while (i <= self->length - str1->length)
3554 if (Py_UNICODE_MATCH(self, i, str1)) {
3555 /* replace string segment */
3556 Py_UNICODE_COPY(p, str2->str, str2->length);
3557 p += str2->length;
3558 i += str1->length;
3559 if (--n <= 0) {
3560 /* copy remaining part */
3561 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3562 break;
3563 }
3564 } else
3565 *p++ = self->str[i++];
3566 }
3567 }
3568 }
3569
3570 return (PyObject *) u;
3571}
3572
3573/* --- Unicode Object Methods --------------------------------------------- */
3574
3575static char title__doc__[] =
3576"S.title() -> unicode\n\
3577\n\
3578Return a titlecased version of S, i.e. words start with title case\n\
3579characters, all remaining cased characters have lower case.";
3580
3581static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003582unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003584 return fixup(self, fixtitle);
3585}
3586
3587static char capitalize__doc__[] =
3588"S.capitalize() -> unicode\n\
3589\n\
3590Return a capitalized version of S, i.e. make the first character\n\
3591have upper case.";
3592
3593static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003594unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003595{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596 return fixup(self, fixcapitalize);
3597}
3598
#if 0
static char capwords__doc__[] =
    "S.capwords() -> unicode\n"
    "\n"
    "Apply .capitalize() to all words in S and return the result with\n"
    "normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords() (compiled out): split self at whitespace, capitalize
   every word and join the words again with single blanks. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    int idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *fixed = fixup((PyUnicodeObject *)word, fixcapitalize);

        if (fixed == NULL)
            goto done;          /* joined is still NULL */
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, fixed);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return joined;
}
#endif
3636
3637static char center__doc__[] =
3638"S.center(width) -> unicode\n\
3639\n\
3640Return S centered in a Unicode string of length width. Padding is done\n\
3641using spaces.";
3642
3643static PyObject *
3644unicode_center(PyUnicodeObject *self, PyObject *args)
3645{
3646 int marg, left;
3647 int width;
3648
3649 if (!PyArg_ParseTuple(args, "i:center", &width))
3650 return NULL;
3651
Tim Peters7a29bd52001-09-12 03:03:31 +00003652 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653 Py_INCREF(self);
3654 return (PyObject*) self;
3655 }
3656
3657 marg = width - self->length;
3658 left = marg / 2 + (marg & width & 1);
3659
3660 return (PyObject*) pad(self, left, marg - left, ' ');
3661}
3662
Marc-André Lemburge5034372000-08-08 08:04:29 +00003663#if 0
3664
3665/* This code should go into some future Unicode collation support
3666 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003667 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003668
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003669/* speedy UTF-16 code point order comparison */
3670/* gleaned from: */
3671/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3672
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003673static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003674{
3675 0, 0, 0, 0, 0, 0, 0, 0,
3676 0, 0, 0, 0, 0, 0, 0, 0,
3677 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003678 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003679};
3680
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681static int
3682unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3683{
3684 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003685
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686 Py_UNICODE *s1 = str1->str;
3687 Py_UNICODE *s2 = str2->str;
3688
3689 len1 = str1->length;
3690 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003691
Guido van Rossumd57fd912000-03-10 22:53:23 +00003692 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003693 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003694
3695 c1 = *s1++;
3696 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003697
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003698 if (c1 > (1<<11) * 26)
3699 c1 += utf16Fixup[c1>>11];
3700 if (c2 > (1<<11) * 26)
3701 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003702 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003703
3704 if (c1 != c2)
3705 return (c1 < c2) ? -1 : 1;
3706
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003707 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 }
3709
3710 return (len1 < len2) ? -1 : (len1 != len2);
3711}
3712
Marc-André Lemburge5034372000-08-08 08:04:29 +00003713#else
3714
3715static int
3716unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3717{
3718 register int len1, len2;
3719
3720 Py_UNICODE *s1 = str1->str;
3721 Py_UNICODE *s2 = str2->str;
3722
3723 len1 = str1->length;
3724 len2 = str2->length;
3725
3726 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003727 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003728
Fredrik Lundh45714e92001-06-26 16:39:36 +00003729 c1 = *s1++;
3730 c2 = *s2++;
3731
3732 if (c1 != c2)
3733 return (c1 < c2) ? -1 : 1;
3734
Marc-André Lemburge5034372000-08-08 08:04:29 +00003735 len1--; len2--;
3736 }
3737
3738 return (len1 < len2) ? -1 : (len1 != len2);
3739}
3740
3741#endif
3742
Guido van Rossumd57fd912000-03-10 22:53:23 +00003743int PyUnicode_Compare(PyObject *left,
3744 PyObject *right)
3745{
3746 PyUnicodeObject *u = NULL, *v = NULL;
3747 int result;
3748
3749 /* Coerce the two arguments */
3750 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3751 if (u == NULL)
3752 goto onError;
3753 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3754 if (v == NULL)
3755 goto onError;
3756
Thomas Wouters7e474022000-07-16 12:04:32 +00003757 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758 if (v == u) {
3759 Py_DECREF(u);
3760 Py_DECREF(v);
3761 return 0;
3762 }
3763
3764 result = unicode_compare(u, v);
3765
3766 Py_DECREF(u);
3767 Py_DECREF(v);
3768 return result;
3769
3770onError:
3771 Py_XDECREF(u);
3772 Py_XDECREF(v);
3773 return -1;
3774}
3775
Guido van Rossum403d68b2000-03-13 15:55:09 +00003776int PyUnicode_Contains(PyObject *container,
3777 PyObject *element)
3778{
3779 PyUnicodeObject *u = NULL, *v = NULL;
3780 int result;
3781 register const Py_UNICODE *p, *e;
3782 register Py_UNICODE ch;
3783
3784 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003785 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003786 if (v == NULL) {
3787 PyErr_SetString(PyExc_TypeError,
3788 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003789 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003790 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003791 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3792 if (u == NULL) {
3793 Py_DECREF(v);
3794 goto onError;
3795 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003796
3797 /* Check v in u */
3798 if (PyUnicode_GET_SIZE(v) != 1) {
3799 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003800 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003801 goto onError;
3802 }
3803 ch = *PyUnicode_AS_UNICODE(v);
3804 p = PyUnicode_AS_UNICODE(u);
3805 e = p + PyUnicode_GET_SIZE(u);
3806 result = 0;
3807 while (p < e) {
3808 if (*p++ == ch) {
3809 result = 1;
3810 break;
3811 }
3812 }
3813
3814 Py_DECREF(u);
3815 Py_DECREF(v);
3816 return result;
3817
3818onError:
3819 Py_XDECREF(u);
3820 Py_XDECREF(v);
3821 return -1;
3822}
3823
Guido van Rossumd57fd912000-03-10 22:53:23 +00003824/* Concat to string or Unicode object giving a new Unicode object. */
3825
3826PyObject *PyUnicode_Concat(PyObject *left,
3827 PyObject *right)
3828{
3829 PyUnicodeObject *u = NULL, *v = NULL, *w;
3830
3831 /* Coerce the two arguments */
3832 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3833 if (u == NULL)
3834 goto onError;
3835 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3836 if (v == NULL)
3837 goto onError;
3838
3839 /* Shortcuts */
3840 if (v == unicode_empty) {
3841 Py_DECREF(v);
3842 return (PyObject *)u;
3843 }
3844 if (u == unicode_empty) {
3845 Py_DECREF(u);
3846 return (PyObject *)v;
3847 }
3848
3849 /* Concat the two Unicode strings */
3850 w = _PyUnicode_New(u->length + v->length);
3851 if (w == NULL)
3852 goto onError;
3853 Py_UNICODE_COPY(w->str, u->str, u->length);
3854 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3855
3856 Py_DECREF(u);
3857 Py_DECREF(v);
3858 return (PyObject *)w;
3859
3860onError:
3861 Py_XDECREF(u);
3862 Py_XDECREF(v);
3863 return NULL;
3864}
3865
3866static char count__doc__[] =
3867"S.count(sub[, start[, end]]) -> int\n\
3868\n\
3869Return the number of occurrences of substring sub in Unicode string\n\
3870S[start:end]. Optional arguments start and end are\n\
3871interpreted as in slice notation.";
3872
3873static PyObject *
3874unicode_count(PyUnicodeObject *self, PyObject *args)
3875{
3876 PyUnicodeObject *substring;
3877 int start = 0;
3878 int end = INT_MAX;
3879 PyObject *result;
3880
Guido van Rossumb8872e62000-05-09 14:14:27 +00003881 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3882 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003883 return NULL;
3884
3885 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3886 (PyObject *)substring);
3887 if (substring == NULL)
3888 return NULL;
3889
Guido van Rossumd57fd912000-03-10 22:53:23 +00003890 if (start < 0)
3891 start += self->length;
3892 if (start < 0)
3893 start = 0;
3894 if (end > self->length)
3895 end = self->length;
3896 if (end < 0)
3897 end += self->length;
3898 if (end < 0)
3899 end = 0;
3900
3901 result = PyInt_FromLong((long) count(self, start, end, substring));
3902
3903 Py_DECREF(substring);
3904 return result;
3905}
3906
3907static char encode__doc__[] =
3908"S.encode([encoding[,errors]]) -> string\n\
3909\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003910Return an encoded string version of S. Default encoding is the current\n\
3911default string encoding. errors may be given to set a different error\n\
3912handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3913a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003914
3915static PyObject *
3916unicode_encode(PyUnicodeObject *self, PyObject *args)
3917{
3918 char *encoding = NULL;
3919 char *errors = NULL;
3920 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3921 return NULL;
3922 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3923}
3924
3925static char expandtabs__doc__[] =
3926"S.expandtabs([tabsize]) -> unicode\n\
3927\n\
3928Return a copy of S where all tab characters are expanded using spaces.\n\
3929If tabsize is not given, a tab size of 8 characters is assumed.";
3930
3931static PyObject*
3932unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3933{
3934 Py_UNICODE *e;
3935 Py_UNICODE *p;
3936 Py_UNICODE *q;
3937 int i, j;
3938 PyUnicodeObject *u;
3939 int tabsize = 8;
3940
3941 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3942 return NULL;
3943
Thomas Wouters7e474022000-07-16 12:04:32 +00003944 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003945 i = j = 0;
3946 e = self->str + self->length;
3947 for (p = self->str; p < e; p++)
3948 if (*p == '\t') {
3949 if (tabsize > 0)
3950 j += tabsize - (j % tabsize);
3951 }
3952 else {
3953 j++;
3954 if (*p == '\n' || *p == '\r') {
3955 i += j;
3956 j = 0;
3957 }
3958 }
3959
3960 /* Second pass: create output string and fill it */
3961 u = _PyUnicode_New(i + j);
3962 if (!u)
3963 return NULL;
3964
3965 j = 0;
3966 q = u->str;
3967
3968 for (p = self->str; p < e; p++)
3969 if (*p == '\t') {
3970 if (tabsize > 0) {
3971 i = tabsize - (j % tabsize);
3972 j += i;
3973 while (i--)
3974 *q++ = ' ';
3975 }
3976 }
3977 else {
3978 j++;
3979 *q++ = *p;
3980 if (*p == '\n' || *p == '\r')
3981 j = 0;
3982 }
3983
3984 return (PyObject*) u;
3985}
3986
3987static char find__doc__[] =
3988"S.find(sub [,start [,end]]) -> int\n\
3989\n\
3990Return the lowest index in S where substring sub is found,\n\
3991such that sub is contained within s[start,end]. Optional\n\
3992arguments start and end are interpreted as in slice notation.\n\
3993\n\
3994Return -1 on failure.";
3995
3996static PyObject *
3997unicode_find(PyUnicodeObject *self, PyObject *args)
3998{
3999 PyUnicodeObject *substring;
4000 int start = 0;
4001 int end = INT_MAX;
4002 PyObject *result;
4003
Guido van Rossumb8872e62000-05-09 14:14:27 +00004004 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4005 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004006 return NULL;
4007 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4008 (PyObject *)substring);
4009 if (substring == NULL)
4010 return NULL;
4011
4012 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4013
4014 Py_DECREF(substring);
4015 return result;
4016}
4017
4018static PyObject *
4019unicode_getitem(PyUnicodeObject *self, int index)
4020{
4021 if (index < 0 || index >= self->length) {
4022 PyErr_SetString(PyExc_IndexError, "string index out of range");
4023 return NULL;
4024 }
4025
4026 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4027}
4028
4029static long
4030unicode_hash(PyUnicodeObject *self)
4031{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004032 /* Since Unicode objects compare equal to their ASCII string
4033 counterparts, they should use the individual character values
4034 as basis for their hash value. This is needed to assure that
4035 strings and Unicode objects behave in the same way as
4036 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004037
Fredrik Lundhdde61642000-07-10 18:27:47 +00004038 register int len;
4039 register Py_UNICODE *p;
4040 register long x;
4041
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042 if (self->hash != -1)
4043 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004044 len = PyUnicode_GET_SIZE(self);
4045 p = PyUnicode_AS_UNICODE(self);
4046 x = *p << 7;
4047 while (--len >= 0)
4048 x = (1000003*x) ^ *p++;
4049 x ^= PyUnicode_GET_SIZE(self);
4050 if (x == -1)
4051 x = -2;
4052 self->hash = x;
4053 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004054}
4055
4056static char index__doc__[] =
4057"S.index(sub [,start [,end]]) -> int\n\
4058\n\
4059Like S.find() but raise ValueError when the substring is not found.";
4060
4061static PyObject *
4062unicode_index(PyUnicodeObject *self, PyObject *args)
4063{
4064 int result;
4065 PyUnicodeObject *substring;
4066 int start = 0;
4067 int end = INT_MAX;
4068
Guido van Rossumb8872e62000-05-09 14:14:27 +00004069 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4070 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004071 return NULL;
4072
4073 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4074 (PyObject *)substring);
4075 if (substring == NULL)
4076 return NULL;
4077
4078 result = findstring(self, substring, start, end, 1);
4079
4080 Py_DECREF(substring);
4081 if (result < 0) {
4082 PyErr_SetString(PyExc_ValueError, "substring not found");
4083 return NULL;
4084 }
4085 return PyInt_FromLong(result);
4086}
4087
4088static char islower__doc__[] =
4089"S.islower() -> int\n\
4090\n\
4091Return 1 if all cased characters in S are lowercase and there is\n\
4092at least one cased character in S, 0 otherwise.";
4093
4094static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004095unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096{
4097 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4098 register const Py_UNICODE *e;
4099 int cased;
4100
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101 /* Shortcut for single character strings */
4102 if (PyUnicode_GET_SIZE(self) == 1)
4103 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
4104
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004105 /* Special case for empty strings */
4106 if (PyString_GET_SIZE(self) == 0)
4107 return PyInt_FromLong(0);
4108
Guido van Rossumd57fd912000-03-10 22:53:23 +00004109 e = p + PyUnicode_GET_SIZE(self);
4110 cased = 0;
4111 for (; p < e; p++) {
4112 register const Py_UNICODE ch = *p;
4113
4114 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4115 return PyInt_FromLong(0);
4116 else if (!cased && Py_UNICODE_ISLOWER(ch))
4117 cased = 1;
4118 }
4119 return PyInt_FromLong(cased);
4120}
4121
4122static char isupper__doc__[] =
4123"S.isupper() -> int\n\
4124\n\
4125Return 1 if all cased characters in S are uppercase and there is\n\
4126at least one cased character in S, 0 otherwise.";
4127
4128static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004129unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004130{
4131 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4132 register const Py_UNICODE *e;
4133 int cased;
4134
Guido van Rossumd57fd912000-03-10 22:53:23 +00004135 /* Shortcut for single character strings */
4136 if (PyUnicode_GET_SIZE(self) == 1)
4137 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4138
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004139 /* Special case for empty strings */
4140 if (PyString_GET_SIZE(self) == 0)
4141 return PyInt_FromLong(0);
4142
Guido van Rossumd57fd912000-03-10 22:53:23 +00004143 e = p + PyUnicode_GET_SIZE(self);
4144 cased = 0;
4145 for (; p < e; p++) {
4146 register const Py_UNICODE ch = *p;
4147
4148 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4149 return PyInt_FromLong(0);
4150 else if (!cased && Py_UNICODE_ISUPPER(ch))
4151 cased = 1;
4152 }
4153 return PyInt_FromLong(cased);
4154}
4155
4156static char istitle__doc__[] =
4157"S.istitle() -> int\n\
4158\n\
4159Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4160may only follow uncased characters and lowercase characters only cased\n\
4161ones. Return 0 otherwise.";
4162
4163static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004164unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004165{
4166 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4167 register const Py_UNICODE *e;
4168 int cased, previous_is_cased;
4169
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170 /* Shortcut for single character strings */
4171 if (PyUnicode_GET_SIZE(self) == 1)
4172 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4173 (Py_UNICODE_ISUPPER(*p) != 0));
4174
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004175 /* Special case for empty strings */
4176 if (PyString_GET_SIZE(self) == 0)
4177 return PyInt_FromLong(0);
4178
Guido van Rossumd57fd912000-03-10 22:53:23 +00004179 e = p + PyUnicode_GET_SIZE(self);
4180 cased = 0;
4181 previous_is_cased = 0;
4182 for (; p < e; p++) {
4183 register const Py_UNICODE ch = *p;
4184
4185 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4186 if (previous_is_cased)
4187 return PyInt_FromLong(0);
4188 previous_is_cased = 1;
4189 cased = 1;
4190 }
4191 else if (Py_UNICODE_ISLOWER(ch)) {
4192 if (!previous_is_cased)
4193 return PyInt_FromLong(0);
4194 previous_is_cased = 1;
4195 cased = 1;
4196 }
4197 else
4198 previous_is_cased = 0;
4199 }
4200 return PyInt_FromLong(cased);
4201}
4202
4203static char isspace__doc__[] =
4204"S.isspace() -> int\n\
4205\n\
4206Return 1 if there are only whitespace characters in S,\n\
42070 otherwise.";
4208
4209static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004210unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004211{
4212 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4213 register const Py_UNICODE *e;
4214
Guido van Rossumd57fd912000-03-10 22:53:23 +00004215 /* Shortcut for single character strings */
4216 if (PyUnicode_GET_SIZE(self) == 1 &&
4217 Py_UNICODE_ISSPACE(*p))
4218 return PyInt_FromLong(1);
4219
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004220 /* Special case for empty strings */
4221 if (PyString_GET_SIZE(self) == 0)
4222 return PyInt_FromLong(0);
4223
Guido van Rossumd57fd912000-03-10 22:53:23 +00004224 e = p + PyUnicode_GET_SIZE(self);
4225 for (; p < e; p++) {
4226 if (!Py_UNICODE_ISSPACE(*p))
4227 return PyInt_FromLong(0);
4228 }
4229 return PyInt_FromLong(1);
4230}
4231
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004232static char isalpha__doc__[] =
4233"S.isalpha() -> int\n\
4234\n\
4235Return 1 if all characters in S are alphabetic\n\
4236and there is at least one character in S, 0 otherwise.";
4237
4238static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004239unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004240{
4241 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4242 register const Py_UNICODE *e;
4243
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004244 /* Shortcut for single character strings */
4245 if (PyUnicode_GET_SIZE(self) == 1 &&
4246 Py_UNICODE_ISALPHA(*p))
4247 return PyInt_FromLong(1);
4248
4249 /* Special case for empty strings */
4250 if (PyString_GET_SIZE(self) == 0)
4251 return PyInt_FromLong(0);
4252
4253 e = p + PyUnicode_GET_SIZE(self);
4254 for (; p < e; p++) {
4255 if (!Py_UNICODE_ISALPHA(*p))
4256 return PyInt_FromLong(0);
4257 }
4258 return PyInt_FromLong(1);
4259}
4260
4261static char isalnum__doc__[] =
4262"S.isalnum() -> int\n\
4263\n\
4264Return 1 if all characters in S are alphanumeric\n\
4265and there is at least one character in S, 0 otherwise.";
4266
4267static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004268unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004269{
4270 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4271 register const Py_UNICODE *e;
4272
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004273 /* Shortcut for single character strings */
4274 if (PyUnicode_GET_SIZE(self) == 1 &&
4275 Py_UNICODE_ISALNUM(*p))
4276 return PyInt_FromLong(1);
4277
4278 /* Special case for empty strings */
4279 if (PyString_GET_SIZE(self) == 0)
4280 return PyInt_FromLong(0);
4281
4282 e = p + PyUnicode_GET_SIZE(self);
4283 for (; p < e; p++) {
4284 if (!Py_UNICODE_ISALNUM(*p))
4285 return PyInt_FromLong(0);
4286 }
4287 return PyInt_FromLong(1);
4288}
4289
Guido van Rossumd57fd912000-03-10 22:53:23 +00004290static char isdecimal__doc__[] =
4291"S.isdecimal() -> int\n\
4292\n\
4293Return 1 if there are only decimal characters in S,\n\
42940 otherwise.";
4295
4296static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004297unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004298{
4299 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4300 register const Py_UNICODE *e;
4301
Guido van Rossumd57fd912000-03-10 22:53:23 +00004302 /* Shortcut for single character strings */
4303 if (PyUnicode_GET_SIZE(self) == 1 &&
4304 Py_UNICODE_ISDECIMAL(*p))
4305 return PyInt_FromLong(1);
4306
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004307 /* Special case for empty strings */
4308 if (PyString_GET_SIZE(self) == 0)
4309 return PyInt_FromLong(0);
4310
Guido van Rossumd57fd912000-03-10 22:53:23 +00004311 e = p + PyUnicode_GET_SIZE(self);
4312 for (; p < e; p++) {
4313 if (!Py_UNICODE_ISDECIMAL(*p))
4314 return PyInt_FromLong(0);
4315 }
4316 return PyInt_FromLong(1);
4317}
4318
4319static char isdigit__doc__[] =
4320"S.isdigit() -> int\n\
4321\n\
4322Return 1 if there are only digit characters in S,\n\
43230 otherwise.";
4324
4325static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004326unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327{
4328 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4329 register const Py_UNICODE *e;
4330
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331 /* Shortcut for single character strings */
4332 if (PyUnicode_GET_SIZE(self) == 1 &&
4333 Py_UNICODE_ISDIGIT(*p))
4334 return PyInt_FromLong(1);
4335
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004336 /* Special case for empty strings */
4337 if (PyString_GET_SIZE(self) == 0)
4338 return PyInt_FromLong(0);
4339
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340 e = p + PyUnicode_GET_SIZE(self);
4341 for (; p < e; p++) {
4342 if (!Py_UNICODE_ISDIGIT(*p))
4343 return PyInt_FromLong(0);
4344 }
4345 return PyInt_FromLong(1);
4346}
4347
4348static char isnumeric__doc__[] =
4349"S.isnumeric() -> int\n\
4350\n\
4351Return 1 if there are only numeric characters in S,\n\
43520 otherwise.";
4353
4354static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004355unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004356{
4357 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4358 register const Py_UNICODE *e;
4359
Guido van Rossumd57fd912000-03-10 22:53:23 +00004360 /* Shortcut for single character strings */
4361 if (PyUnicode_GET_SIZE(self) == 1 &&
4362 Py_UNICODE_ISNUMERIC(*p))
4363 return PyInt_FromLong(1);
4364
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004365 /* Special case for empty strings */
4366 if (PyString_GET_SIZE(self) == 0)
4367 return PyInt_FromLong(0);
4368
Guido van Rossumd57fd912000-03-10 22:53:23 +00004369 e = p + PyUnicode_GET_SIZE(self);
4370 for (; p < e; p++) {
4371 if (!Py_UNICODE_ISNUMERIC(*p))
4372 return PyInt_FromLong(0);
4373 }
4374 return PyInt_FromLong(1);
4375}
4376
4377static char join__doc__[] =
4378"S.join(sequence) -> unicode\n\
4379\n\
4380Return a string which is the concatenation of the strings in the\n\
4381sequence. The separator between elements is S.";
4382
4383static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004384unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004385{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004386 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004387}
4388
4389static int
4390unicode_length(PyUnicodeObject *self)
4391{
4392 return self->length;
4393}
4394
4395static char ljust__doc__[] =
4396"S.ljust(width) -> unicode\n\
4397\n\
4398Return S left justified in a Unicode string of length width. Padding is\n\
4399done using spaces.";
4400
4401static PyObject *
4402unicode_ljust(PyUnicodeObject *self, PyObject *args)
4403{
4404 int width;
4405 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4406 return NULL;
4407
Tim Peters7a29bd52001-09-12 03:03:31 +00004408 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004409 Py_INCREF(self);
4410 return (PyObject*) self;
4411 }
4412
4413 return (PyObject*) pad(self, 0, width - self->length, ' ');
4414}
4415
4416static char lower__doc__[] =
4417"S.lower() -> unicode\n\
4418\n\
4419Return a copy of the string S converted to lowercase.";
4420
4421static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004422unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004424 return fixup(self, fixlower);
4425}
4426
4427static char lstrip__doc__[] =
4428"S.lstrip() -> unicode\n\
4429\n\
4430Return a copy of the string S with leading whitespace removed.";
4431
4432static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004433unicode_lstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004434{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004435 return strip(self, 1, 0);
4436}
4437
4438static PyObject*
4439unicode_repeat(PyUnicodeObject *str, int len)
4440{
4441 PyUnicodeObject *u;
4442 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004443 int nchars;
4444 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004445
4446 if (len < 0)
4447 len = 0;
4448
Tim Peters7a29bd52001-09-12 03:03:31 +00004449 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004450 /* no repeat, return original string */
4451 Py_INCREF(str);
4452 return (PyObject*) str;
4453 }
Tim Peters8f422462000-09-09 06:13:41 +00004454
4455 /* ensure # of chars needed doesn't overflow int and # of bytes
4456 * needed doesn't overflow size_t
4457 */
4458 nchars = len * str->length;
4459 if (len && nchars / len != str->length) {
4460 PyErr_SetString(PyExc_OverflowError,
4461 "repeated string is too long");
4462 return NULL;
4463 }
4464 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4465 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4466 PyErr_SetString(PyExc_OverflowError,
4467 "repeated string is too long");
4468 return NULL;
4469 }
4470 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004471 if (!u)
4472 return NULL;
4473
4474 p = u->str;
4475
4476 while (len-- > 0) {
4477 Py_UNICODE_COPY(p, str->str, str->length);
4478 p += str->length;
4479 }
4480
4481 return (PyObject*) u;
4482}
4483
4484PyObject *PyUnicode_Replace(PyObject *obj,
4485 PyObject *subobj,
4486 PyObject *replobj,
4487 int maxcount)
4488{
4489 PyObject *self;
4490 PyObject *str1;
4491 PyObject *str2;
4492 PyObject *result;
4493
4494 self = PyUnicode_FromObject(obj);
4495 if (self == NULL)
4496 return NULL;
4497 str1 = PyUnicode_FromObject(subobj);
4498 if (str1 == NULL) {
4499 Py_DECREF(self);
4500 return NULL;
4501 }
4502 str2 = PyUnicode_FromObject(replobj);
4503 if (str2 == NULL) {
4504 Py_DECREF(self);
4505 Py_DECREF(str1);
4506 return NULL;
4507 }
4508 result = replace((PyUnicodeObject *)self,
4509 (PyUnicodeObject *)str1,
4510 (PyUnicodeObject *)str2,
4511 maxcount);
4512 Py_DECREF(self);
4513 Py_DECREF(str1);
4514 Py_DECREF(str2);
4515 return result;
4516}
4517
4518static char replace__doc__[] =
4519"S.replace (old, new[, maxsplit]) -> unicode\n\
4520\n\
4521Return a copy of S with all occurrences of substring\n\
4522old replaced by new. If the optional argument maxsplit is\n\
4523given, only the first maxsplit occurrences are replaced.";
4524
4525static PyObject*
4526unicode_replace(PyUnicodeObject *self, PyObject *args)
4527{
4528 PyUnicodeObject *str1;
4529 PyUnicodeObject *str2;
4530 int maxcount = -1;
4531 PyObject *result;
4532
4533 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4534 return NULL;
4535 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4536 if (str1 == NULL)
4537 return NULL;
4538 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4539 if (str2 == NULL)
4540 return NULL;
4541
4542 result = replace(self, str1, str2, maxcount);
4543
4544 Py_DECREF(str1);
4545 Py_DECREF(str2);
4546 return result;
4547}
4548
4549static
4550PyObject *unicode_repr(PyObject *unicode)
4551{
4552 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4553 PyUnicode_GET_SIZE(unicode),
4554 1);
4555}
4556
4557static char rfind__doc__[] =
4558"S.rfind(sub [,start [,end]]) -> int\n\
4559\n\
4560Return the highest index in S where substring sub is found,\n\
4561such that sub is contained within s[start,end]. Optional\n\
4562arguments start and end are interpreted as in slice notation.\n\
4563\n\
4564Return -1 on failure.";
4565
4566static PyObject *
4567unicode_rfind(PyUnicodeObject *self, PyObject *args)
4568{
4569 PyUnicodeObject *substring;
4570 int start = 0;
4571 int end = INT_MAX;
4572 PyObject *result;
4573
Guido van Rossumb8872e62000-05-09 14:14:27 +00004574 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4575 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004576 return NULL;
4577 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4578 (PyObject *)substring);
4579 if (substring == NULL)
4580 return NULL;
4581
4582 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4583
4584 Py_DECREF(substring);
4585 return result;
4586}
4587
4588static char rindex__doc__[] =
4589"S.rindex(sub [,start [,end]]) -> int\n\
4590\n\
4591Like S.rfind() but raise ValueError when the substring is not found.";
4592
4593static PyObject *
4594unicode_rindex(PyUnicodeObject *self, PyObject *args)
4595{
4596 int result;
4597 PyUnicodeObject *substring;
4598 int start = 0;
4599 int end = INT_MAX;
4600
Guido van Rossumb8872e62000-05-09 14:14:27 +00004601 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4602 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004603 return NULL;
4604 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4605 (PyObject *)substring);
4606 if (substring == NULL)
4607 return NULL;
4608
4609 result = findstring(self, substring, start, end, -1);
4610
4611 Py_DECREF(substring);
4612 if (result < 0) {
4613 PyErr_SetString(PyExc_ValueError, "substring not found");
4614 return NULL;
4615 }
4616 return PyInt_FromLong(result);
4617}
4618
4619static char rjust__doc__[] =
4620"S.rjust(width) -> unicode\n\
4621\n\
4622Return S right justified in a Unicode string of length width. Padding is\n\
4623done using spaces.";
4624
4625static PyObject *
4626unicode_rjust(PyUnicodeObject *self, PyObject *args)
4627{
4628 int width;
4629 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4630 return NULL;
4631
Tim Peters7a29bd52001-09-12 03:03:31 +00004632 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004633 Py_INCREF(self);
4634 return (PyObject*) self;
4635 }
4636
4637 return (PyObject*) pad(self, width - self->length, 0, ' ');
4638}
4639
4640static char rstrip__doc__[] =
4641"S.rstrip() -> unicode\n\
4642\n\
4643Return a copy of the string S with trailing whitespace removed.";
4644
4645static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004646unicode_rstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004647{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004648 return strip(self, 0, 1);
4649}
4650
4651static PyObject*
4652unicode_slice(PyUnicodeObject *self, int start, int end)
4653{
4654 /* standard clamping */
4655 if (start < 0)
4656 start = 0;
4657 if (end < 0)
4658 end = 0;
4659 if (end > self->length)
4660 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004661 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004662 /* full slice, return original string */
4663 Py_INCREF(self);
4664 return (PyObject*) self;
4665 }
4666 if (start > end)
4667 start = end;
4668 /* copy slice */
4669 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4670 end - start);
4671}
4672
4673PyObject *PyUnicode_Split(PyObject *s,
4674 PyObject *sep,
4675 int maxsplit)
4676{
4677 PyObject *result;
4678
4679 s = PyUnicode_FromObject(s);
4680 if (s == NULL)
4681 return NULL;
4682 if (sep != NULL) {
4683 sep = PyUnicode_FromObject(sep);
4684 if (sep == NULL) {
4685 Py_DECREF(s);
4686 return NULL;
4687 }
4688 }
4689
4690 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4691
4692 Py_DECREF(s);
4693 Py_XDECREF(sep);
4694 return result;
4695}
4696
4697static char split__doc__[] =
4698"S.split([sep [,maxsplit]]) -> list of strings\n\
4699\n\
4700Return a list of the words in S, using sep as the\n\
4701delimiter string. If maxsplit is given, at most maxsplit\n\
4702splits are done. If sep is not specified, any whitespace string\n\
4703is a separator.";
4704
4705static PyObject*
4706unicode_split(PyUnicodeObject *self, PyObject *args)
4707{
4708 PyObject *substring = Py_None;
4709 int maxcount = -1;
4710
4711 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4712 return NULL;
4713
4714 if (substring == Py_None)
4715 return split(self, NULL, maxcount);
4716 else if (PyUnicode_Check(substring))
4717 return split(self, (PyUnicodeObject *)substring, maxcount);
4718 else
4719 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4720}
4721
4722static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004723"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004724\n\
4725Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004726Line breaks are not included in the resulting list unless keepends\n\
4727is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728
4729static PyObject*
4730unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4731{
Guido van Rossum86662912000-04-11 15:38:46 +00004732 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733
Guido van Rossum86662912000-04-11 15:38:46 +00004734 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735 return NULL;
4736
Guido van Rossum86662912000-04-11 15:38:46 +00004737 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738}
4739
4740static
4741PyObject *unicode_str(PyUnicodeObject *self)
4742{
Fred Drakee4315f52000-05-09 19:53:39 +00004743 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744}
4745
4746static char strip__doc__[] =
4747"S.strip() -> unicode\n\
4748\n\
4749Return a copy of S with leading and trailing whitespace removed.";
4750
4751static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004752unicode_strip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004754 return strip(self, 1, 1);
4755}
4756
4757static char swapcase__doc__[] =
4758"S.swapcase() -> unicode\n\
4759\n\
4760Return a copy of S with uppercase characters converted to lowercase\n\
4761and vice versa.";
4762
4763static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004764unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766 return fixup(self, fixswapcase);
4767}
4768
4769static char translate__doc__[] =
4770"S.translate(table) -> unicode\n\
4771\n\
4772Return a copy of the string S, where all characters have been mapped\n\
4773through the given translation table, which must be a mapping of\n\
4774Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4775are left untouched. Characters mapped to None are deleted.";
4776
4777static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004778unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780 return PyUnicode_TranslateCharmap(self->str,
4781 self->length,
4782 table,
4783 "ignore");
4784}
4785
4786static char upper__doc__[] =
4787"S.upper() -> unicode\n\
4788\n\
4789Return a copy of S converted to uppercase.";
4790
4791static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004792unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794 return fixup(self, fixupper);
4795}
4796
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n"
"\n"
"Pad a numeric string x with zeros on the left, to fill a field\n"
"of the specified width. The string x is never truncated.";

/* Method implementation of S.zfill(width) (currently compiled out).

   If S is already at least width characters long it is returned as is
   with an extra reference.  Otherwise pad() is asked for a copy with
   `fill` '0' characters in front; if the first original character
   (now at index fill) is '+' or '-', it is moved to the front of the
   result.

   Returns NULL if argument parsing fails or pad() returns NULL. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() presumably returns NULL on failure (ljust/rjust return its
       result directly to the caller); do not dereference it then. */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4832
#if 0
/* Debugging helper (currently compiled out): returns the value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size_obj;

    size_obj = PyInt_FromLong(unicode_freelist_size);
    return size_obj;
}
#endif
4840
4841static char startswith__doc__[] =
4842"S.startswith(prefix[, start[, end]]) -> int\n\
4843\n\
4844Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4845optional start, test S beginning at that position. With optional end, stop\n\
4846comparing S at that position.";
4847
4848static PyObject *
4849unicode_startswith(PyUnicodeObject *self,
4850 PyObject *args)
4851{
4852 PyUnicodeObject *substring;
4853 int start = 0;
4854 int end = INT_MAX;
4855 PyObject *result;
4856
Guido van Rossumb8872e62000-05-09 14:14:27 +00004857 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4858 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859 return NULL;
4860 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4861 (PyObject *)substring);
4862 if (substring == NULL)
4863 return NULL;
4864
4865 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4866
4867 Py_DECREF(substring);
4868 return result;
4869}
4870
4871
4872static char endswith__doc__[] =
4873"S.endswith(suffix[, start[, end]]) -> int\n\
4874\n\
4875Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4876optional start, test S beginning at that position. With optional end, stop\n\
4877comparing S at that position.";
4878
4879static PyObject *
4880unicode_endswith(PyUnicodeObject *self,
4881 PyObject *args)
4882{
4883 PyUnicodeObject *substring;
4884 int start = 0;
4885 int end = INT_MAX;
4886 PyObject *result;
4887
Guido van Rossumb8872e62000-05-09 14:14:27 +00004888 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4889 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004890 return NULL;
4891 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4892 (PyObject *)substring);
4893 if (substring == NULL)
4894 return NULL;
4895
4896 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4897
4898 Py_DECREF(substring);
4899 return result;
4900}
4901
4902
4903static PyMethodDef unicode_methods[] = {
4904
4905 /* Order is according to common usage: often used methods should
4906 appear first, since lookup is done sequentially. */
4907
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004908 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4909 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4910 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4911 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4912 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4913 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4914 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4915 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4916 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4917 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4918 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4919 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4920 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4921 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4922/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4923 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4924 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4925 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4926 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4927 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4928 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4929 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4930 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4931 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4932 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4933 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4934 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4935 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4936 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4937 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4938 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4939 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4940 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4941 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4942 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004943#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004944 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
4945 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004946#endif
4947
4948#if 0
4949 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004950 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004951#endif
4952
4953 {NULL, NULL}
4954};
4955
Guido van Rossumd57fd912000-03-10 22:53:23 +00004956static PySequenceMethods unicode_as_sequence = {
4957 (inquiry) unicode_length, /* sq_length */
4958 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4959 (intargfunc) unicode_repeat, /* sq_repeat */
4960 (intargfunc) unicode_getitem, /* sq_item */
4961 (intintargfunc) unicode_slice, /* sq_slice */
4962 0, /* sq_ass_item */
4963 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004964 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004965};
4966
4967static int
4968unicode_buffer_getreadbuf(PyUnicodeObject *self,
4969 int index,
4970 const void **ptr)
4971{
4972 if (index != 0) {
4973 PyErr_SetString(PyExc_SystemError,
4974 "accessing non-existent unicode segment");
4975 return -1;
4976 }
4977 *ptr = (void *) self->str;
4978 return PyUnicode_GET_DATA_SIZE(self);
4979}
4980
4981static int
4982unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4983 const void **ptr)
4984{
4985 PyErr_SetString(PyExc_TypeError,
4986 "cannot use unicode as modifyable buffer");
4987 return -1;
4988}
4989
4990static int
4991unicode_buffer_getsegcount(PyUnicodeObject *self,
4992 int *lenp)
4993{
4994 if (lenp)
4995 *lenp = PyUnicode_GET_DATA_SIZE(self);
4996 return 1;
4997}
4998
4999static int
5000unicode_buffer_getcharbuf(PyUnicodeObject *self,
5001 int index,
5002 const void **ptr)
5003{
5004 PyObject *str;
5005
5006 if (index != 0) {
5007 PyErr_SetString(PyExc_SystemError,
5008 "accessing non-existent unicode segment");
5009 return -1;
5010 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005011 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005012 if (str == NULL)
5013 return -1;
5014 *ptr = (void *) PyString_AS_STRING(str);
5015 return PyString_GET_SIZE(str);
5016}
5017
5018/* Helpers for PyUnicode_Format() */
5019
5020static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005021getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005022{
5023 int argidx = *p_argidx;
5024 if (argidx < arglen) {
5025 (*p_argidx)++;
5026 if (arglen < 0)
5027 return args;
5028 else
5029 return PyTuple_GetItem(args, argidx);
5030 }
5031 PyErr_SetString(PyExc_TypeError,
5032 "not enough arguments for format string");
5033 return NULL;
5034}
5035
/* Bit flags collected while parsing a format specifier; each is set by
   the flag character noted on the right. */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
5041
5042static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005043int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005044{
5045 register int i;
5046 int len;
5047 va_list va;
5048 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050
5051 /* First, format the string as char array, then expand to Py_UNICODE
5052 array. */
5053 charbuffer = (char *)buffer;
5054 len = vsprintf(charbuffer, format, va);
5055 for (i = len - 1; i >= 0; i--)
5056 buffer[i] = (Py_UNICODE) charbuffer[i];
5057
5058 va_end(va);
5059 return len;
5060}
5061
5062static int
5063formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005064 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005065 int flags,
5066 int prec,
5067 int type,
5068 PyObject *v)
5069{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005070 /* fmt = '%#.' + `prec` + `type`
5071 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072 char fmt[20];
5073 double x;
5074
5075 x = PyFloat_AsDouble(v);
5076 if (x == -1.0 && PyErr_Occurred())
5077 return -1;
5078 if (prec < 0)
5079 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5081 type = 'g';
5082 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005083 /* worst case length calc to ensure no buffer overrun:
5084 fmt = %#.<prec>g
5085 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5086 for any double rep.)
5087 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5088 If prec=0 the effective precision is 1 (the leading digit is
5089 always given), therefore increase by one to 10+prec. */
5090 if (buflen <= (size_t)10 + (size_t)prec) {
5091 PyErr_SetString(PyExc_OverflowError,
5092 "formatted float is too long (precision too long?)");
5093 return -1;
5094 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095 return usprintf(buf, fmt, x);
5096}
5097
Tim Peters38fd5b62000-09-21 05:43:11 +00005098static PyObject*
5099formatlong(PyObject *val, int flags, int prec, int type)
5100{
5101 char *buf;
5102 int i, len;
5103 PyObject *str; /* temporary string object. */
5104 PyUnicodeObject *result;
5105
5106 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5107 if (!str)
5108 return NULL;
5109 result = _PyUnicode_New(len);
5110 for (i = 0; i < len; i++)
5111 result->str[i] = buf[i];
5112 result->str[len] = 0;
5113 Py_DECREF(str);
5114 return (PyObject*)result;
5115}
5116
Guido van Rossumd57fd912000-03-10 22:53:23 +00005117static int
5118formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005119 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120 int flags,
5121 int prec,
5122 int type,
5123 PyObject *v)
5124{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005125 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00005126 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5127 + 1 + 1 = 24*/
5128 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005129 long x;
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005130 int use_native_c_format = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131
5132 x = PyInt_AsLong(v);
5133 if (x == -1 && PyErr_Occurred())
5134 return -1;
5135 if (prec < 0)
5136 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005137 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5138 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
5139 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
5140 PyErr_SetString(PyExc_OverflowError,
5141 "formatted integer is too long (precision too long?)");
5142 return -1;
5143 }
Tim Petersfff53252001-04-12 18:38:48 +00005144 /* When converting 0 under %#x or %#X, C leaves off the base marker,
5145 * but we want it (for consistency with other %#x conversions, and
5146 * for consistency with Python's hex() function).
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005147 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
5148 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
5149 * So add it only if the platform doesn't already.
Tim Petersfff53252001-04-12 18:38:48 +00005150 */
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005151 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
5152 /* Only way to know what the platform does is to try it. */
5153 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
5154 if (fmt[1] != (char)type) {
5155 /* Supply our own leading 0x/0X -- needed under std C */
5156 use_native_c_format = 0;
5157 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
5158 }
5159 }
5160 if (use_native_c_format)
5161 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005162 return usprintf(buf, fmt, x);
5163}
5164
5165static int
5166formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005167 size_t buflen,
5168 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005169{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005170 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005171 if (PyUnicode_Check(v)) {
5172 if (PyUnicode_GET_SIZE(v) != 1)
5173 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005174 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005175 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005177 else if (PyString_Check(v)) {
5178 if (PyString_GET_SIZE(v) != 1)
5179 goto onError;
5180 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5181 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182
5183 else {
5184 /* Integer input truncated to a character */
5185 long x;
5186 x = PyInt_AsLong(v);
5187 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005188 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005189 buf[0] = (char) x;
5190 }
5191 buf[1] = '\0';
5192 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005193
5194 onError:
5195 PyErr_SetString(PyExc_TypeError,
5196 "%c requires int or char");
5197 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198}
5199
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of elements in the scratch buffer in which
   floats, ints and chars are formatted.  XXX This is a magic number.
   Each formatting routine does bounds checking to ensure no overflow,
   but a better solution may be to malloc a buffer of appropriate size
   for each format.  For now, the current solution is sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
5209
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210PyObject *PyUnicode_Format(PyObject *format,
5211 PyObject *args)
5212{
5213 Py_UNICODE *fmt, *res;
5214 int fmtcnt, rescnt, reslen, arglen, argidx;
5215 int args_owned = 0;
5216 PyUnicodeObject *result = NULL;
5217 PyObject *dict = NULL;
5218 PyObject *uformat;
5219
5220 if (format == NULL || args == NULL) {
5221 PyErr_BadInternalCall();
5222 return NULL;
5223 }
5224 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005225 if (uformat == NULL)
5226 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005227 fmt = PyUnicode_AS_UNICODE(uformat);
5228 fmtcnt = PyUnicode_GET_SIZE(uformat);
5229
5230 reslen = rescnt = fmtcnt + 100;
5231 result = _PyUnicode_New(reslen);
5232 if (result == NULL)
5233 goto onError;
5234 res = PyUnicode_AS_UNICODE(result);
5235
5236 if (PyTuple_Check(args)) {
5237 arglen = PyTuple_Size(args);
5238 argidx = 0;
5239 }
5240 else {
5241 arglen = -1;
5242 argidx = -2;
5243 }
5244 if (args->ob_type->tp_as_mapping)
5245 dict = args;
5246
5247 while (--fmtcnt >= 0) {
5248 if (*fmt != '%') {
5249 if (--rescnt < 0) {
5250 rescnt = fmtcnt + 100;
5251 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005252 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253 return NULL;
5254 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5255 --rescnt;
5256 }
5257 *res++ = *fmt++;
5258 }
5259 else {
5260 /* Got a format specifier */
5261 int flags = 0;
5262 int width = -1;
5263 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 Py_UNICODE c = '\0';
5265 Py_UNICODE fill;
5266 PyObject *v = NULL;
5267 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005268 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005269 Py_UNICODE sign;
5270 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005271 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272
5273 fmt++;
5274 if (*fmt == '(') {
5275 Py_UNICODE *keystart;
5276 int keylen;
5277 PyObject *key;
5278 int pcount = 1;
5279
5280 if (dict == NULL) {
5281 PyErr_SetString(PyExc_TypeError,
5282 "format requires a mapping");
5283 goto onError;
5284 }
5285 ++fmt;
5286 --fmtcnt;
5287 keystart = fmt;
5288 /* Skip over balanced parentheses */
5289 while (pcount > 0 && --fmtcnt >= 0) {
5290 if (*fmt == ')')
5291 --pcount;
5292 else if (*fmt == '(')
5293 ++pcount;
5294 fmt++;
5295 }
5296 keylen = fmt - keystart - 1;
5297 if (fmtcnt < 0 || pcount > 0) {
5298 PyErr_SetString(PyExc_ValueError,
5299 "incomplete format key");
5300 goto onError;
5301 }
Fred Drakee4315f52000-05-09 19:53:39 +00005302 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005303 then looked up since Python uses strings to hold
5304 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005305 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306 key = PyUnicode_EncodeUTF8(keystart,
5307 keylen,
5308 NULL);
5309 if (key == NULL)
5310 goto onError;
5311 if (args_owned) {
5312 Py_DECREF(args);
5313 args_owned = 0;
5314 }
5315 args = PyObject_GetItem(dict, key);
5316 Py_DECREF(key);
5317 if (args == NULL) {
5318 goto onError;
5319 }
5320 args_owned = 1;
5321 arglen = -1;
5322 argidx = -2;
5323 }
5324 while (--fmtcnt >= 0) {
5325 switch (c = *fmt++) {
5326 case '-': flags |= F_LJUST; continue;
5327 case '+': flags |= F_SIGN; continue;
5328 case ' ': flags |= F_BLANK; continue;
5329 case '#': flags |= F_ALT; continue;
5330 case '0': flags |= F_ZERO; continue;
5331 }
5332 break;
5333 }
5334 if (c == '*') {
5335 v = getnextarg(args, arglen, &argidx);
5336 if (v == NULL)
5337 goto onError;
5338 if (!PyInt_Check(v)) {
5339 PyErr_SetString(PyExc_TypeError,
5340 "* wants int");
5341 goto onError;
5342 }
5343 width = PyInt_AsLong(v);
5344 if (width < 0) {
5345 flags |= F_LJUST;
5346 width = -width;
5347 }
5348 if (--fmtcnt >= 0)
5349 c = *fmt++;
5350 }
5351 else if (c >= '0' && c <= '9') {
5352 width = c - '0';
5353 while (--fmtcnt >= 0) {
5354 c = *fmt++;
5355 if (c < '0' || c > '9')
5356 break;
5357 if ((width*10) / 10 != width) {
5358 PyErr_SetString(PyExc_ValueError,
5359 "width too big");
5360 goto onError;
5361 }
5362 width = width*10 + (c - '0');
5363 }
5364 }
5365 if (c == '.') {
5366 prec = 0;
5367 if (--fmtcnt >= 0)
5368 c = *fmt++;
5369 if (c == '*') {
5370 v = getnextarg(args, arglen, &argidx);
5371 if (v == NULL)
5372 goto onError;
5373 if (!PyInt_Check(v)) {
5374 PyErr_SetString(PyExc_TypeError,
5375 "* wants int");
5376 goto onError;
5377 }
5378 prec = PyInt_AsLong(v);
5379 if (prec < 0)
5380 prec = 0;
5381 if (--fmtcnt >= 0)
5382 c = *fmt++;
5383 }
5384 else if (c >= '0' && c <= '9') {
5385 prec = c - '0';
5386 while (--fmtcnt >= 0) {
5387 c = Py_CHARMASK(*fmt++);
5388 if (c < '0' || c > '9')
5389 break;
5390 if ((prec*10) / 10 != prec) {
5391 PyErr_SetString(PyExc_ValueError,
5392 "prec too big");
5393 goto onError;
5394 }
5395 prec = prec*10 + (c - '0');
5396 }
5397 }
5398 } /* prec */
5399 if (fmtcnt >= 0) {
5400 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005401 if (--fmtcnt >= 0)
5402 c = *fmt++;
5403 }
5404 }
5405 if (fmtcnt < 0) {
5406 PyErr_SetString(PyExc_ValueError,
5407 "incomplete format");
5408 goto onError;
5409 }
5410 if (c != '%') {
5411 v = getnextarg(args, arglen, &argidx);
5412 if (v == NULL)
5413 goto onError;
5414 }
5415 sign = 0;
5416 fill = ' ';
5417 switch (c) {
5418
5419 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005420 pbuf = formatbuf;
5421 /* presume that buffer length is at least 1 */
5422 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005423 len = 1;
5424 break;
5425
5426 case 's':
5427 case 'r':
5428 if (PyUnicode_Check(v) && c == 's') {
5429 temp = v;
5430 Py_INCREF(temp);
5431 }
5432 else {
5433 PyObject *unicode;
5434 if (c == 's')
5435 temp = PyObject_Str(v);
5436 else
5437 temp = PyObject_Repr(v);
5438 if (temp == NULL)
5439 goto onError;
5440 if (!PyString_Check(temp)) {
5441 /* XXX Note: this should never happen, since
5442 PyObject_Repr() and PyObject_Str() assure
5443 this */
5444 Py_DECREF(temp);
5445 PyErr_SetString(PyExc_TypeError,
5446 "%s argument has non-string str()");
5447 goto onError;
5448 }
Fred Drakee4315f52000-05-09 19:53:39 +00005449 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005450 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005451 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005452 "strict");
5453 Py_DECREF(temp);
5454 temp = unicode;
5455 if (temp == NULL)
5456 goto onError;
5457 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005458 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005459 len = PyUnicode_GET_SIZE(temp);
5460 if (prec >= 0 && len > prec)
5461 len = prec;
5462 break;
5463
5464 case 'i':
5465 case 'd':
5466 case 'u':
5467 case 'o':
5468 case 'x':
5469 case 'X':
5470 if (c == 'i')
5471 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005472 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005473 temp = formatlong(v, flags, prec, c);
5474 if (!temp)
5475 goto onError;
5476 pbuf = PyUnicode_AS_UNICODE(temp);
5477 len = PyUnicode_GET_SIZE(temp);
5478 /* unbounded ints can always produce
5479 a sign character! */
5480 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005481 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005482 else {
5483 pbuf = formatbuf;
5484 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5485 flags, prec, c, v);
5486 if (len < 0)
5487 goto onError;
5488 /* only d conversion is signed */
5489 sign = c == 'd';
5490 }
5491 if (flags & F_ZERO)
5492 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005493 break;
5494
5495 case 'e':
5496 case 'E':
5497 case 'f':
5498 case 'g':
5499 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005500 pbuf = formatbuf;
5501 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5502 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005503 if (len < 0)
5504 goto onError;
5505 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005506 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507 fill = '0';
5508 break;
5509
5510 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005511 pbuf = formatbuf;
5512 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005513 if (len < 0)
5514 goto onError;
5515 break;
5516
5517 default:
5518 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005519 "unsupported format character '%c' (0x%x) "
5520 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005521 (31<=c && c<=126) ? c : '?',
5522 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005523 goto onError;
5524 }
5525 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005526 if (*pbuf == '-' || *pbuf == '+') {
5527 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005528 len--;
5529 }
5530 else if (flags & F_SIGN)
5531 sign = '+';
5532 else if (flags & F_BLANK)
5533 sign = ' ';
5534 else
5535 sign = 0;
5536 }
5537 if (width < len)
5538 width = len;
5539 if (rescnt < width + (sign != 0)) {
5540 reslen -= rescnt;
5541 rescnt = width + fmtcnt + 100;
5542 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005543 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005544 return NULL;
5545 res = PyUnicode_AS_UNICODE(result)
5546 + reslen - rescnt;
5547 }
5548 if (sign) {
5549 if (fill != ' ')
5550 *res++ = sign;
5551 rescnt--;
5552 if (width > len)
5553 width--;
5554 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005555 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5556 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005557 assert(pbuf[1] == c);
5558 if (fill != ' ') {
5559 *res++ = *pbuf++;
5560 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005561 }
Tim Petersfff53252001-04-12 18:38:48 +00005562 rescnt -= 2;
5563 width -= 2;
5564 if (width < 0)
5565 width = 0;
5566 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005567 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005568 if (width > len && !(flags & F_LJUST)) {
5569 do {
5570 --rescnt;
5571 *res++ = fill;
5572 } while (--width > len);
5573 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005574 if (fill == ' ') {
5575 if (sign)
5576 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005577 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005578 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005579 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005580 *res++ = *pbuf++;
5581 *res++ = *pbuf++;
5582 }
5583 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005584 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005585 res += len;
5586 rescnt -= len;
5587 while (--width >= len) {
5588 --rescnt;
5589 *res++ = ' ';
5590 }
5591 if (dict && (argidx < arglen) && c != '%') {
5592 PyErr_SetString(PyExc_TypeError,
5593 "not all arguments converted");
5594 goto onError;
5595 }
5596 Py_XDECREF(temp);
5597 } /* '%' */
5598 } /* until end */
5599 if (argidx < arglen && !dict) {
5600 PyErr_SetString(PyExc_TypeError,
5601 "not all arguments converted");
5602 goto onError;
5603 }
5604
5605 if (args_owned) {
5606 Py_DECREF(args);
5607 }
5608 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005609 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005610 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005611 return (PyObject *)result;
5612
5613 onError:
5614 Py_XDECREF(result);
5615 Py_DECREF(uformat);
5616 if (args_owned) {
5617 Py_DECREF(args);
5618 }
5619 return NULL;
5620}
5621
5622static PyBufferProcs unicode_as_buffer = {
5623 (getreadbufferproc) unicode_buffer_getreadbuf,
5624 (getwritebufferproc) unicode_buffer_getwritebuf,
5625 (getsegcountproc) unicode_buffer_getsegcount,
5626 (getcharbufferproc) unicode_buffer_getcharbuf,
5627};
5628
Guido van Rossume023fe02001-08-30 03:12:59 +00005629staticforward PyObject *
5630unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5631
Tim Peters6d6c1a32001-08-02 04:15:00 +00005632static PyObject *
5633unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5634{
5635 PyObject *x = NULL;
5636 static char *kwlist[] = {"string", "encoding", "errors", 0};
5637 char *encoding = NULL;
5638 char *errors = NULL;
5639
Guido van Rossume023fe02001-08-30 03:12:59 +00005640 if (type != &PyUnicode_Type)
5641 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005642 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5643 kwlist, &x, &encoding, &errors))
5644 return NULL;
5645 if (x == NULL)
5646 return (PyObject *)_PyUnicode_New(0);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00005647 if (encoding == NULL && errors == NULL)
5648 return PyObject_Unicode(x);
5649 else
Tim Peters6d6c1a32001-08-02 04:15:00 +00005650 return PyUnicode_FromEncodedObject(x, encoding, errors);
5651}
5652
Guido van Rossume023fe02001-08-30 03:12:59 +00005653static PyObject *
5654unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5655{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005656 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005657 int n;
5658
5659 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5660 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5661 if (tmp == NULL)
5662 return NULL;
5663 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005664 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5665 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005666 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005667 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5668 if (pnew->str == NULL) {
5669 _Py_ForgetReference((PyObject *)pnew);
5670 PyObject_DEL(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005671 return NULL;
5672 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005673 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5674 pnew->length = n;
5675 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005676 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005677 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005678}
5679
/* Docstring installed as tp_doc of PyUnicode_Type. */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5686
Guido van Rossumd57fd912000-03-10 22:53:23 +00005687PyTypeObject PyUnicode_Type = {
5688 PyObject_HEAD_INIT(&PyType_Type)
5689 0, /* ob_size */
5690 "unicode", /* tp_name */
5691 sizeof(PyUnicodeObject), /* tp_size */
5692 0, /* tp_itemsize */
5693 /* Slots */
Guido van Rossum9475a232001-10-05 20:51:39 +00005694 (destructor)unicode_dealloc, /* tp_dealloc */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005695 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005696 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005697 0, /* tp_setattr */
5698 (cmpfunc) unicode_compare, /* tp_compare */
5699 (reprfunc) unicode_repr, /* tp_repr */
5700 0, /* tp_as_number */
5701 &unicode_as_sequence, /* tp_as_sequence */
5702 0, /* tp_as_mapping */
5703 (hashfunc) unicode_hash, /* tp_hash*/
5704 0, /* tp_call*/
5705 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005706 PyObject_GenericGetAttr, /* tp_getattro */
5707 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005708 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005709 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005710 unicode_doc, /* tp_doc */
5711 0, /* tp_traverse */
5712 0, /* tp_clear */
5713 0, /* tp_richcompare */
5714 0, /* tp_weaklistoffset */
5715 0, /* tp_iter */
5716 0, /* tp_iternext */
5717 unicode_methods, /* tp_methods */
5718 0, /* tp_members */
5719 0, /* tp_getset */
5720 0, /* tp_base */
5721 0, /* tp_dict */
5722 0, /* tp_descr_get */
5723 0, /* tp_descr_set */
5724 0, /* tp_dictoffset */
5725 0, /* tp_init */
5726 0, /* tp_alloc */
5727 unicode_new, /* tp_new */
Guido van Rossum9475a232001-10-05 20:51:39 +00005728 _PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729};
5730
5731/* Initialize the Unicode implementation */
5732
Thomas Wouters78890102000-07-22 19:25:51 +00005733void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005735 int i;
5736
Fred Drakee4315f52000-05-09 19:53:39 +00005737 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005738 unicode_freelist = NULL;
5739 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005741 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005742 for (i = 0; i < 256; i++)
5743 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744}
5745
5746/* Finalize the Unicode implementation */
5747
5748void
Thomas Wouters78890102000-07-22 19:25:51 +00005749_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005751 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005752 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005754 Py_XDECREF(unicode_empty);
5755 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005756
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005757 for (i = 0; i < 256; i++) {
5758 if (unicode_latin1[i]) {
5759 Py_DECREF(unicode_latin1[i]);
5760 unicode_latin1[i] = NULL;
5761 }
5762 }
5763
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005764 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 PyUnicodeObject *v = u;
5766 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005767 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005768 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005769 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005770 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005772 unicode_freelist = NULL;
5773 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774}