blob: 896e80f794993621c085dee594548ab7b653907d [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
/* We allocate one extra Py_UNICODE element (not just one byte) to make
   sure the string is always terminated by a 0 code unit -- XXX is this
   needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000222 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
227void _PyUnicode_Free(register PyUnicodeObject *unicode)
228{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000230 /* Keep-Alive optimization */
231 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000232 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 unicode->str = NULL;
234 unicode->length = 0;
235 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000236 if (unicode->defenc) {
237 Py_DECREF(unicode->defenc);
238 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 }
240 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 *(PyUnicodeObject **)unicode = unicode_freelist;
242 unicode_freelist = unicode;
243 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244 }
245 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000247 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249 }
250}
251
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252int PyUnicode_Resize(PyObject **unicode,
253 int length)
254{
255 register PyUnicodeObject *v;
256
257 /* Argument checks */
258 if (unicode == NULL) {
259 PyErr_BadInternalCall();
260 return -1;
261 }
262 v = (PyUnicodeObject *)*unicode;
263 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
264 PyErr_BadInternalCall();
265 return -1;
266 }
267
268 /* Resizing unicode_empty and single character objects is not
269 possible since these are being shared. We simply return a fresh
270 copy with the same Unicode content. */
271 if (v->length != length &&
272 (v == unicode_empty || v->length == 1)) {
273 PyUnicodeObject *w = _PyUnicode_New(length);
274 if (w == NULL)
275 return -1;
276 Py_UNICODE_COPY(w->str, v->str,
277 length < v->length ? length : v->length);
278 *unicode = (PyObject *)w;
279 return 0;
280 }
281
282 /* Note that we don't have to modify *unicode for unshared Unicode
283 objects, since we can modify them in-place. */
284 return unicode_resize(v, length);
285}
286
/* Internal API for use in unicodeobject.c only ! Forwards to
   PyUnicode_Resize() after casting the argument to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
292 int size)
293{
294 PyUnicodeObject *unicode;
295
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000296 /* If the Unicode data is known at construction time, we can apply
297 some optimizations which share commonly used objects. */
298 if (u != NULL) {
299
300 /* Optimization for empty strings */
301 if (size == 0 && unicode_empty != NULL) {
302 Py_INCREF(unicode_empty);
303 return (PyObject *)unicode_empty;
304 }
305
306 /* Single character Unicode objects in the Latin-1 range are
307 shared when using this constructor */
308 if (size == 1 && *u < 256) {
309 unicode = unicode_latin1[*u];
310 if (!unicode) {
311 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000312 if (!unicode)
313 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000314 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000315 unicode_latin1[*u] = unicode;
316 }
317 Py_INCREF(unicode);
318 return (PyObject *)unicode;
319 }
320 }
321
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 unicode = _PyUnicode_New(size);
323 if (!unicode)
324 return NULL;
325
326 /* Copy the Unicode data into the new object */
327 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000328 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329
330 return (PyObject *)unicode;
331}
332
333#ifdef HAVE_WCHAR_H
334
335PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
336 int size)
337{
338 PyUnicodeObject *unicode;
339
340 if (w == NULL) {
341 PyErr_BadInternalCall();
342 return NULL;
343 }
344
345 unicode = _PyUnicode_New(size);
346 if (!unicode)
347 return NULL;
348
349 /* Copy the wchar_t data into the new object */
350#ifdef HAVE_USABLE_WCHAR_T
351 memcpy(unicode->str, w, size * sizeof(wchar_t));
352#else
353 {
354 register Py_UNICODE *u;
355 register int i;
356 u = PyUnicode_AS_UNICODE(unicode);
357 for (i = size; i >= 0; i--)
358 *u++ = *w++;
359 }
360#endif
361
362 return (PyObject *)unicode;
363}
364
365int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
366 register wchar_t *w,
367 int size)
368{
369 if (unicode == NULL) {
370 PyErr_BadInternalCall();
371 return -1;
372 }
373 if (size > PyUnicode_GET_SIZE(unicode))
374 size = PyUnicode_GET_SIZE(unicode);
375#ifdef HAVE_USABLE_WCHAR_T
376 memcpy(w, unicode->str, size * sizeof(wchar_t));
377#else
378 {
379 register Py_UNICODE *u;
380 register int i;
381 u = PyUnicode_AS_UNICODE(unicode);
382 for (i = size; i >= 0; i--)
383 *w++ = *u++;
384 }
385#endif
386
387 return size;
388}
389
390#endif
391
392PyObject *PyUnicode_FromObject(register PyObject *obj)
393{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000394 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
395}
396
397PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
398 const char *encoding,
399 const char *errors)
400{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000401 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000403 int owned = 0;
404 PyObject *v;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000405 int reclevel;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000406
407 if (obj == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
410 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000411
412 /* Coerce object */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000413 for (reclevel = 0; reclevel < 2; reclevel++) {
414
415 if (PyUnicode_Check(obj)) {
416 if (encoding) {
417 PyErr_SetString(PyExc_TypeError,
418 "decoding Unicode is not supported");
419 goto onError;
420 }
421 if (PyUnicode_CheckExact(obj)) {
422 Py_INCREF(obj);
423 v = obj;
424 }
425 else {
426 /* For a subclass of unicode, return a true unicode object
427 with the same string value. */
428 v = PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
429 PyUnicode_GET_SIZE(obj));
430 }
431 goto done;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000432 }
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000433 else if (PyString_Check(obj)) {
434 s = PyString_AS_STRING(obj);
435 len = PyString_GET_SIZE(obj);
436 break;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000437 }
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000438 else {
439 PyObject *w;
440
441 /* Try char buffer interface */
442 if (PyObject_AsCharBuffer(obj, &s, &len))
443 PyErr_Clear();
444 else
445 break;
446
447 /* Mimic the behaviour of str(object) if everything else
448 fails (see PyObject_Str()); this also covers instances
449 which implement __str__. */
450 if (obj->ob_type->tp_str == NULL)
451 w = PyObject_Repr(obj);
452 else
453 w = (*obj->ob_type->tp_str)(obj);
454 if (w == NULL)
455 goto onError;
456 if (owned) {
457 Py_DECREF(obj);
458 }
459 obj = w;
460 owned = 1;
Tim Peters78e0fc72001-09-11 03:07:38 +0000461 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000462 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000463
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000464 if (s == NULL) {
465 PyErr_Format(PyExc_TypeError,
466 "coercing to Unicode: __str__ recursion limit exceeded "
467 "(last type: %.80s)",
468 obj->ob_type->tp_name);
469 goto onError;
470 }
471
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000472 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000473 if (len == 0) {
474 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000475 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000476 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000477 else
478 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000479
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000480 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000481 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000482 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000483 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000484 return v;
485
486 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000487 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000488 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000489 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000491}
492
493PyObject *PyUnicode_Decode(const char *s,
494 int size,
495 const char *encoding,
496 const char *errors)
497{
498 PyObject *buffer = NULL, *unicode;
499
Fred Drakee4315f52000-05-09 19:53:39 +0000500 if (encoding == NULL)
501 encoding = PyUnicode_GetDefaultEncoding();
502
503 /* Shortcuts for common default encodings */
504 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000506 else if (strcmp(encoding, "latin-1") == 0)
507 return PyUnicode_DecodeLatin1(s, size, errors);
508 else if (strcmp(encoding, "ascii") == 0)
509 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510
511 /* Decode via the codec registry */
512 buffer = PyBuffer_FromMemory((void *)s, size);
513 if (buffer == NULL)
514 goto onError;
515 unicode = PyCodec_Decode(buffer, encoding, errors);
516 if (unicode == NULL)
517 goto onError;
518 if (!PyUnicode_Check(unicode)) {
519 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000520 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000521 unicode->ob_type->tp_name);
522 Py_DECREF(unicode);
523 goto onError;
524 }
525 Py_DECREF(buffer);
526 return unicode;
527
528 onError:
529 Py_XDECREF(buffer);
530 return NULL;
531}
532
533PyObject *PyUnicode_Encode(const Py_UNICODE *s,
534 int size,
535 const char *encoding,
536 const char *errors)
537{
538 PyObject *v, *unicode;
539
540 unicode = PyUnicode_FromUnicode(s, size);
541 if (unicode == NULL)
542 return NULL;
543 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
544 Py_DECREF(unicode);
545 return v;
546}
547
548PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
549 const char *encoding,
550 const char *errors)
551{
552 PyObject *v;
553
554 if (!PyUnicode_Check(unicode)) {
555 PyErr_BadArgument();
556 goto onError;
557 }
Fred Drakee4315f52000-05-09 19:53:39 +0000558
559 if (encoding == NULL)
560 encoding = PyUnicode_GetDefaultEncoding();
561
562 /* Shortcuts for common default encodings */
563 if (errors == NULL) {
564 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000565 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000566 else if (strcmp(encoding, "latin-1") == 0)
567 return PyUnicode_AsLatin1String(unicode);
568 else if (strcmp(encoding, "ascii") == 0)
569 return PyUnicode_AsASCIIString(unicode);
570 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000571
572 /* Encode via the codec registry */
573 v = PyCodec_Encode(unicode, encoding, errors);
574 if (v == NULL)
575 goto onError;
576 /* XXX Should we really enforce this ? */
577 if (!PyString_Check(v)) {
578 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000579 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000580 v->ob_type->tp_name);
581 Py_DECREF(v);
582 goto onError;
583 }
584 return v;
585
586 onError:
587 return NULL;
588}
589
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000590PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
591 const char *errors)
592{
593 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
594
595 if (v)
596 return v;
597 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
598 if (v && errors == NULL)
599 ((PyUnicodeObject *)unicode)->defenc = v;
600 return v;
601}
602
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
604{
605 if (!PyUnicode_Check(unicode)) {
606 PyErr_BadArgument();
607 goto onError;
608 }
609 return PyUnicode_AS_UNICODE(unicode);
610
611 onError:
612 return NULL;
613}
614
615int PyUnicode_GetSize(PyObject *unicode)
616{
617 if (!PyUnicode_Check(unicode)) {
618 PyErr_BadArgument();
619 goto onError;
620 }
621 return PyUnicode_GET_SIZE(unicode);
622
623 onError:
624 return -1;
625}
626
Thomas Wouters78890102000-07-22 19:25:51 +0000627const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000628{
629 return unicode_default_encoding;
630}
631
632int PyUnicode_SetDefaultEncoding(const char *encoding)
633{
634 PyObject *v;
635
636 /* Make sure the encoding is valid. As side effect, this also
637 loads the encoding into the codec registry cache. */
638 v = _PyCodec_Lookup(encoding);
639 if (v == NULL)
640 goto onError;
641 Py_DECREF(v);
642 strncpy(unicode_default_encoding,
643 encoding,
644 sizeof(unicode_default_encoding));
645 return 0;
646
647 onError:
648 return -1;
649}
650
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +0000651/* --- UTF-7 Codec -------------------------------------------------------- */
652
653/* see RFC2152 for details */
654
/* Classification of the 128 ASCII characters for the UTF-7 codec,
   indicating whether a character is special, i.e. cannot be directly
   encoded:
       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional) */
static char utf7_special[128] = {
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
673
/* SPECIAL(c, encodeO, encodeWS): true when character c has to be
   base64-encoded: anything above 127, anything classified 1 in
   utf7_special, and -- depending on the two flags -- whitespace
   (class 2) and RFC2152 Set O characters (class 3). */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c)>127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n) ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true when c is a base64 digit. isalnum() is only
   defined for values representable as unsigned char (or EOF) and is
   locale dependent above 127, so it is consulted for 7-bit values
   only; everything else is never a base64 digit. */
#define B64CHAR(c) \
    (((unsigned long)(c) < 128 && isalnum((int)(c))) || \
     (c) == '+' || (c) == '/')

/* UB64(c): value (0..63) of the base64 digit c. Only meaningful when
   B64CHAR(c) is true. */
#define UB64(c) ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
                 (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits): while at least six bits are pending in ch,
   emit the topmost six as a base64 digit through out and reduce bits
   accordingly. Expands to a bare while statement and modifies out and
   bits. */
#define ENCODE(out, ch, bits) \
    while (bits >= 6) { \
        *out++ = B64(ch >> (bits-6)); \
        bits -= 6; \
    }

/* DECODE(out, ch, bits, surrogate): while at least sixteen bits are
   pending in ch, extract the topmost sixteen as one character. The
   character is stored through out unless it lies in 0xDC00..0xDFFF,
   in which case an error is signalled by setting errmsg and jumping
   to the utf7Error label (both must exist at the expansion site); the
   character following such an error is dropped silently. */
#define DECODE(out, ch, bits, surrogate) \
    while (bits >= 16) { \
        Py_UNICODE outCh = (Py_UNICODE) ((ch >> (bits-16)) & 0xffff); \
        bits -= 16; \
        if (surrogate) { \
            /* We have already generated an error for the high surrogate \
               so let's not bother seeing if the low surrogate is correct or not */ \
            surrogate = 0; \
        } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
            /* This is a surrogate pair. Unfortunately we can't represent \
               it in a 16-bit character */ \
            surrogate = 1; \
            errmsg = "code pairs are not supported"; \
            goto utf7Error; \
        } else { \
            *out++ = outCh; \
        } \
    }

708
709static
710int utf7_decoding_error(Py_UNICODE **dest,
711 const char *errors,
712 const char *details)
713{
714 if ((errors == NULL) ||
715 (strcmp(errors,"strict") == 0)) {
716 PyErr_Format(PyExc_UnicodeError,
717 "UTF-7 decoding error: %.400s",
718 details);
719 return -1;
720 }
721 else if (strcmp(errors,"ignore") == 0) {
722 return 0;
723 }
724 else if (strcmp(errors,"replace") == 0) {
725 if (dest != NULL) {
726 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
727 (*dest)++;
728 }
729 return 0;
730 }
731 else {
732 PyErr_Format(PyExc_ValueError,
733 "UTF-7 decoding error; unknown error handling code: %.400s",
734 errors);
735 return -1;
736 }
737}
738
739PyObject *PyUnicode_DecodeUTF7(const char *s,
740 int size,
741 const char *errors)
742{
743 const char *e;
744 PyUnicodeObject *unicode;
745 Py_UNICODE *p;
746 const char *errmsg = "";
747 int inShift = 0;
748 unsigned int bitsleft = 0;
749 unsigned long charsleft = 0;
750 int surrogate = 0;
751
752 unicode = _PyUnicode_New(size);
753 if (!unicode)
754 return NULL;
755 if (size == 0)
756 return (PyObject *)unicode;
757
758 p = unicode->str;
759 e = s + size;
760
761 while (s < e) {
762 Py_UNICODE ch = *s;
763
764 if (inShift) {
765 if ((ch == '-') || !B64CHAR(ch)) {
766 inShift = 0;
767 s++;
768
769 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
770 if (bitsleft >= 6) {
771 /* The shift sequence has a partial character in it. If
772 bitsleft < 6 then we could just classify it as padding
773 but that is not the case here */
774
775 errmsg = "partial character in shift sequence";
776 goto utf7Error;
777 }
778 /* According to RFC2152 the remaining bits should be zero. We
779 choose to signal an error/insert a replacement character
780 here so indicate the potential of a misencoded character. */
781
782 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
783 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
784 errmsg = "non-zero padding bits in shift sequence";
785 goto utf7Error;
786 }
787
788 if (ch == '-') {
789 if ((s < e) && (*(s) == '-')) {
790 *p++ = '-';
791 inShift = 1;
792 }
793 } else if (SPECIAL(ch,0,0)) {
794 errmsg = "unexpected special character";
795 goto utf7Error;
796 } else {
797 *p++ = ch;
798 }
799 } else {
800 charsleft = (charsleft << 6) | UB64(ch);
801 bitsleft += 6;
802 s++;
803 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
804 }
805 }
806 else if ( ch == '+' ) {
807 s++;
808 if (s < e && *s == '-') {
809 s++;
810 *p++ = '+';
811 } else
812 {
813 inShift = 1;
814 bitsleft = 0;
815 }
816 }
817 else if (SPECIAL(ch,0,0)) {
818 errmsg = "unexpected special character";
819 s++;
820 goto utf7Error;
821 }
822 else {
823 *p++ = ch;
824 s++;
825 }
826 continue;
827 utf7Error:
828 if (utf7_decoding_error(&p, errors, errmsg))
829 goto onError;
830 }
831
832 if (inShift) {
833 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
834 goto onError;
835 }
836
837 if (_PyUnicode_Resize(&unicode, p - unicode->str))
838 goto onError;
839
840 return (PyObject *)unicode;
841
842onError:
843 Py_DECREF(unicode);
844 return NULL;
845}
846
847
848PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
849 int size,
850 int encodeSetO,
851 int encodeWhiteSpace,
852 const char *errors)
853{
854 PyObject *v;
855 /* It might be possible to tighten this worst case */
856 unsigned int cbAllocated = 5 * size;
857 int inShift = 0;
858 int i = 0;
859 unsigned int bitsleft = 0;
860 unsigned long charsleft = 0;
861 char * out;
862 char * start;
863
864 if (size == 0)
865 return PyString_FromStringAndSize(NULL, 0);
866
867 v = PyString_FromStringAndSize(NULL, cbAllocated);
868 if (v == NULL)
869 return NULL;
870
871 start = out = PyString_AS_STRING(v);
872 for (;i < size; ++i) {
873 Py_UNICODE ch = s[i];
874
875 if (!inShift) {
876 if (ch == '+') {
877 *out++ = '+';
878 *out++ = '-';
879 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
880 charsleft = ch;
881 bitsleft = 16;
882 *out++ = '+';
883 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
884 inShift = bitsleft > 0;
885 } else {
886 *out++ = (char) ch;
887 }
888 } else {
889 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
890 *out++ = B64(charsleft << (6-bitsleft));
891 charsleft = 0;
892 bitsleft = 0;
893 /* Characters not in the BASE64 set implicitly unshift the sequence
894 so no '-' is required, except if the character is itself a '-' */
895 if (B64CHAR(ch) || ch == '-') {
896 *out++ = '-';
897 }
898 inShift = 0;
899 *out++ = (char) ch;
900 } else {
901 bitsleft += 16;
902 charsleft = (charsleft << 16) | ch;
903 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
904
905 /* If the next character is special then we dont' need to terminate
906 the shift sequence. If the next character is not a BASE64 character
907 or '-' then the shift sequence will be terminated implicitly and we
908 don't have to insert a '-'. */
909
910 if (bitsleft == 0) {
911 if (i + 1 < size) {
912 Py_UNICODE ch2 = s[i+1];
913
914 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
915
916 } else if (B64CHAR(ch2) || ch2 == '-') {
917 *out++ = '-';
918 inShift = 0;
919 } else {
920 inShift = 0;
921 }
922
923 }
924 else {
925 *out++ = '-';
926 inShift = 0;
927 }
928 }
929 }
930 }
931 }
932 if (bitsleft) {
933 *out++= B64(charsleft << (6-bitsleft) );
934 *out++ = '-';
935 }
936
937 if (_PyString_Resize(&v, out - start)) {
938 Py_DECREF(v);
939 return NULL;
940 }
941 return v;
942}
943
944#undef SPECIAL
945#undef B64
946#undef B64CHAR
947#undef UB64
948#undef ENCODE
949#undef DECODE
950
Guido van Rossumd57fd912000-03-10 22:53:23 +0000951/* --- UTF-8 Codec -------------------------------------------------------- */
952
/* Map UTF-8 encoded prefix byte to sequence length. zero means
   illegal prefix. see RFC 2279 for details */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not valid as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six byte sequences; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
974
975static
976int utf8_decoding_error(const char **source,
977 Py_UNICODE **dest,
978 const char *errors,
979 const char *details)
980{
981 if ((errors == NULL) ||
982 (strcmp(errors,"strict") == 0)) {
983 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000984 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000985 details);
986 return -1;
987 }
988 else if (strcmp(errors,"ignore") == 0) {
989 (*source)++;
990 return 0;
991 }
992 else if (strcmp(errors,"replace") == 0) {
993 (*source)++;
994 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
995 (*dest)++;
996 return 0;
997 }
998 else {
999 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001000 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001001 errors);
1002 return -1;
1003 }
1004}
1005
Guido van Rossumd57fd912000-03-10 22:53:23 +00001006PyObject *PyUnicode_DecodeUTF8(const char *s,
1007 int size,
1008 const char *errors)
1009{
1010 int n;
1011 const char *e;
1012 PyUnicodeObject *unicode;
1013 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001014 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001015
1016 /* Note: size will always be longer than the resulting Unicode
1017 character count */
1018 unicode = _PyUnicode_New(size);
1019 if (!unicode)
1020 return NULL;
1021 if (size == 0)
1022 return (PyObject *)unicode;
1023
1024 /* Unpack UTF-8 encoded data */
1025 p = unicode->str;
1026 e = s + size;
1027
1028 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001029 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001030
1031 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001032 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001033 s++;
1034 continue;
1035 }
1036
1037 n = utf8_code_length[ch];
1038
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001039 if (s + n > e) {
1040 errmsg = "unexpected end of data";
1041 goto utf8Error;
1042 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043
1044 switch (n) {
1045
1046 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001047 errmsg = "unexpected code byte";
1048 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001049
1050 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001051 errmsg = "internal error";
1052 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001053
1054 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001055 if ((s[1] & 0xc0) != 0x80) {
1056 errmsg = "invalid data";
1057 goto utf8Error;
1058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001060 if (ch < 0x80) {
1061 errmsg = "illegal encoding";
1062 goto utf8Error;
1063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001064 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001065 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001066 break;
1067
1068 case 3:
1069 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001070 (s[2] & 0xc0) != 0x80) {
1071 errmsg = "invalid data";
1072 goto utf8Error;
1073 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001075 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
1076 errmsg = "illegal encoding";
1077 goto utf8Error;
1078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001080 *p++ = (Py_UNICODE)ch;
1081 break;
1082
1083 case 4:
1084 if ((s[1] & 0xc0) != 0x80 ||
1085 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001086 (s[3] & 0xc0) != 0x80) {
1087 errmsg = "invalid data";
1088 goto utf8Error;
1089 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001090 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1091 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1092 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001093 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001094 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001095 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001096 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001097 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001098 errmsg = "illegal encoding";
1099 goto utf8Error;
1100 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001101#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001102 *p++ = (Py_UNICODE)ch;
1103#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001104 /* compute and append the two surrogates: */
1105
1106 /* translate from 10000..10FFFF to 0..FFFF */
1107 ch -= 0x10000;
1108
1109 /* high surrogate = top 10 bits added to D800 */
1110 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1111
1112 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001113 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001114#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 break;
1116
1117 default:
1118 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001119 errmsg = "unsupported Unicode code range";
1120 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 }
1122 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001123 continue;
1124
1125 utf8Error:
1126 if (utf8_decoding_error(&s, &p, errors, errmsg))
1127 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128 }
1129
1130 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001131 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132 goto onError;
1133
1134 return (PyObject *)unicode;
1135
1136onError:
1137 Py_DECREF(unicode);
1138 return NULL;
1139}
1140
/* Disabled: the UTF-8 encoder handles UTF-16 surrogates itself, so
   this error helper has no callers left. */
#if 0
/* Apply the error handling scheme named by errors to a UTF-8
   encoding problem described by details.  "replace" appends '?' at
   *dest; "ignore" does nothing.  Returns 0 to continue and -1 with
   an exception set otherwise. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    int strict = (errors == NULL) || (strcmp(errors, "strict") == 0);

    if (strict) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174
1175PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1176 int size,
1177 const char *errors)
1178{
1179 PyObject *v;
1180 char *p;
1181 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001182 Py_UCS4 ch2;
1183 unsigned int cbAllocated = 3 * size;
1184 unsigned int cbWritten = 0;
1185 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001186
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001187 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188 if (v == NULL)
1189 return NULL;
1190 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001191 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192
1193 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001194 while (i < size) {
1195 Py_UCS4 ch = s[i++];
1196 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001198 cbWritten++;
1199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 else if (ch < 0x0800) {
1201 *p++ = 0xc0 | (ch >> 6);
1202 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001203 cbWritten += 2;
1204 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001205 else if (ch < 0x10000) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001206 /* Check for high surrogate */
1207 if (0xD800 <= ch && ch <= 0xDBFF) {
1208 if (i != size) {
1209 ch2 = s[i];
1210 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1211
1212 if (cbWritten >= (cbAllocated - 4)) {
1213 /* Provide enough room for some more
1214 surrogates */
1215 cbAllocated += 4*10;
1216 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001217 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001218 }
1219
1220 /* combine the two values */
1221 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
1222
1223 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +00001224 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001225 i++;
1226 cbWritten += 4;
1227 }
1228 }
1229 }
1230 else {
1231 *p++ = (char)(0xe0 | (ch >> 12));
1232 cbWritten += 3;
1233 }
1234 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1235 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001236 } else {
1237 *p++ = 0xf0 | (ch>>18);
1238 *p++ = 0x80 | ((ch>>12) & 0x3f);
1239 *p++ = 0x80 | ((ch>>6) & 0x3f);
1240 *p++ = 0x80 | (ch & 0x3f);
1241 cbWritten += 4;
1242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243 }
1244 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001245 if (_PyString_Resize(&v, p - q))
1246 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 return v;
1248
1249 onError:
1250 Py_DECREF(v);
1251 return NULL;
1252}
1253
Guido van Rossumd57fd912000-03-10 22:53:23 +00001254PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1255{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 if (!PyUnicode_Check(unicode)) {
1257 PyErr_BadArgument();
1258 return NULL;
1259 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001260 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1261 PyUnicode_GET_SIZE(unicode),
1262 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001263}
1264
1265/* --- UTF-16 Codec ------------------------------------------------------- */
1266
1267static
Tim Peters772747b2001-08-09 22:21:55 +00001268int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269 const char *errors,
1270 const char *details)
1271{
1272 if ((errors == NULL) ||
1273 (strcmp(errors,"strict") == 0)) {
1274 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001275 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 details);
1277 return -1;
1278 }
1279 else if (strcmp(errors,"ignore") == 0) {
1280 return 0;
1281 }
1282 else if (strcmp(errors,"replace") == 0) {
1283 if (dest) {
1284 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1285 (*dest)++;
1286 }
1287 return 0;
1288 }
1289 else {
1290 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001291 "UTF-16 decoding error; "
1292 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 errors);
1294 return -1;
1295 }
1296}
1297
Tim Peters772747b2001-08-09 22:21:55 +00001298PyObject *
1299PyUnicode_DecodeUTF16(const char *s,
1300 int size,
1301 const char *errors,
1302 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303{
1304 PyUnicodeObject *unicode;
1305 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001306 const unsigned char *q, *e;
1307 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001308 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001309 /* Offsets from q for retrieving byte pairs in the right order. */
1310#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1311 int ihi = 1, ilo = 0;
1312#else
1313 int ihi = 0, ilo = 1;
1314#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001315
1316 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001317 if (size & 1) {
1318 if (utf16_decoding_error(NULL, errors, "truncated data"))
1319 return NULL;
1320 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321 }
1322
1323 /* Note: size will always be longer than the resulting Unicode
1324 character count */
1325 unicode = _PyUnicode_New(size);
1326 if (!unicode)
1327 return NULL;
1328 if (size == 0)
1329 return (PyObject *)unicode;
1330
1331 /* Unpack UTF-16 encoded data */
1332 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001333 q = (unsigned char *)s;
1334 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001335
1336 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001337 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001338
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001339 /* Check for BOM marks (U+FEFF) in the input and adjust current
1340 byte order setting accordingly. In native mode, the leading BOM
1341 mark is skipped, in all other modes, it is copied to the output
1342 stream as-is (giving a ZWNBSP character). */
1343 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001344 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001345#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001346 if (bom == 0xFEFF) {
1347 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001348 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001349 }
1350 else if (bom == 0xFFFE) {
1351 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001352 bo = 1;
1353 }
1354#else
Tim Peters772747b2001-08-09 22:21:55 +00001355 if (bom == 0xFEFF) {
1356 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001357 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001358 }
1359 else if (bom == 0xFFFE) {
1360 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001361 bo = -1;
1362 }
1363#endif
1364 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365
Tim Peters772747b2001-08-09 22:21:55 +00001366 if (bo == -1) {
1367 /* force LE */
1368 ihi = 1;
1369 ilo = 0;
1370 }
1371 else if (bo == 1) {
1372 /* force BE */
1373 ihi = 0;
1374 ilo = 1;
1375 }
1376
1377 while (q < e) {
1378 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1379 q += 2;
1380
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 if (ch < 0xD800 || ch > 0xDFFF) {
1382 *p++ = ch;
1383 continue;
1384 }
1385
1386 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001387 if (q >= e) {
1388 errmsg = "unexpected end of data";
1389 goto utf16Error;
1390 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001391 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001392 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1393 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001394 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001395#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001396 *p++ = ch;
1397 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001398#else
1399 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001400#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001401 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001402 }
1403 else {
1404 errmsg = "illegal UTF-16 surrogate";
1405 goto utf16Error;
1406 }
1407
Guido van Rossumd57fd912000-03-10 22:53:23 +00001408 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001409 errmsg = "illegal encoding";
1410 /* Fall through to report the error */
1411
1412 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001413 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001414 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001415 }
1416
1417 if (byteorder)
1418 *byteorder = bo;
1419
1420 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001421 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001422 goto onError;
1423
1424 return (PyObject *)unicode;
1425
1426onError:
1427 Py_DECREF(unicode);
1428 return NULL;
1429}
1430
Tim Peters772747b2001-08-09 22:21:55 +00001431PyObject *
1432PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1433 int size,
1434 const char *errors,
1435 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001436{
1437 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001438 unsigned char *p;
1439 int i, pairs;
1440 /* Offsets from p for storing byte pairs in the right order. */
1441#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1442 int ihi = 1, ilo = 0;
1443#else
1444 int ihi = 0, ilo = 1;
1445#endif
1446
1447#define STORECHAR(CH) \
1448 do { \
1449 p[ihi] = ((CH) >> 8) & 0xff; \
1450 p[ilo] = (CH) & 0xff; \
1451 p += 2; \
1452 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001453
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001454 for (i = pairs = 0; i < size; i++)
1455 if (s[i] >= 0x10000)
1456 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001457 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001458 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001459 if (v == NULL)
1460 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001461
Tim Peters772747b2001-08-09 22:21:55 +00001462 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001463 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001464 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001465 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001466 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001467
1468 if (byteorder == -1) {
1469 /* force LE */
1470 ihi = 1;
1471 ilo = 0;
1472 }
1473 else if (byteorder == 1) {
1474 /* force BE */
1475 ihi = 0;
1476 ilo = 1;
1477 }
1478
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001479 while (size-- > 0) {
1480 Py_UNICODE ch = *s++;
1481 Py_UNICODE ch2 = 0;
1482 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001483 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1484 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001485 }
Tim Peters772747b2001-08-09 22:21:55 +00001486 STORECHAR(ch);
1487 if (ch2)
1488 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001490 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001491#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001492}
1493
1494PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1495{
1496 if (!PyUnicode_Check(unicode)) {
1497 PyErr_BadArgument();
1498 return NULL;
1499 }
1500 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1501 PyUnicode_GET_SIZE(unicode),
1502 NULL,
1503 0);
1504}
1505
1506/* --- Unicode Escape Codec ----------------------------------------------- */
1507
1508static
1509int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001510 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001511 const char *errors,
1512 const char *details)
1513{
1514 if ((errors == NULL) ||
1515 (strcmp(errors,"strict") == 0)) {
1516 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001517 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518 details);
1519 return -1;
1520 }
1521 else if (strcmp(errors,"ignore") == 0) {
1522 return 0;
1523 }
1524 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001525 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 return 0;
1527 }
1528 else {
1529 PyErr_Format(PyExc_ValueError,
1530 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001531 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001532 errors);
1533 return -1;
1534 }
1535}
1536
Fredrik Lundh06d12682001-01-24 07:59:11 +00001537static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001538
Guido van Rossumd57fd912000-03-10 22:53:23 +00001539PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1540 int size,
1541 const char *errors)
1542{
1543 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001544 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001545 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001546 char* message;
1547 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1548
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549 /* Escaped strings will always be longer than the resulting
1550 Unicode string, so we start with size here and then reduce the
1551 length after conversion to the true value. */
1552 v = _PyUnicode_New(size);
1553 if (v == NULL)
1554 goto onError;
1555 if (size == 0)
1556 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001557
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558 p = buf = PyUnicode_AS_UNICODE(v);
1559 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001560
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561 while (s < end) {
1562 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001563 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001564 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001565
1566 /* Non-escape characters are interpreted as Unicode ordinals */
1567 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001568 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569 continue;
1570 }
1571
1572 /* \ - Escapes */
1573 s++;
1574 switch (*s++) {
1575
1576 /* \x escapes */
1577 case '\n': break;
1578 case '\\': *p++ = '\\'; break;
1579 case '\'': *p++ = '\''; break;
1580 case '\"': *p++ = '\"'; break;
1581 case 'b': *p++ = '\b'; break;
1582 case 'f': *p++ = '\014'; break; /* FF */
1583 case 't': *p++ = '\t'; break;
1584 case 'n': *p++ = '\n'; break;
1585 case 'r': *p++ = '\r'; break;
1586 case 'v': *p++ = '\013'; break; /* VT */
1587 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1588
1589 /* \OOO (octal) escapes */
1590 case '0': case '1': case '2': case '3':
1591 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001592 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001594 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001595 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001596 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001597 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001598 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001599 break;
1600
Fredrik Lundhccc74732001-02-18 22:13:49 +00001601 /* hex escapes */
1602 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001603 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001604 digits = 2;
1605 message = "truncated \\xXX escape";
1606 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001607
Fredrik Lundhccc74732001-02-18 22:13:49 +00001608 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001609 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001610 digits = 4;
1611 message = "truncated \\uXXXX escape";
1612 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001613
Fredrik Lundhccc74732001-02-18 22:13:49 +00001614 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001615 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001616 digits = 8;
1617 message = "truncated \\UXXXXXXXX escape";
1618 hexescape:
1619 chr = 0;
1620 for (i = 0; i < digits; i++) {
1621 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001622 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001623 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001624 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001625 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001626 i++;
1627 break;
1628 }
1629 chr = (chr<<4) & ~0xF;
1630 if (c >= '0' && c <= '9')
1631 chr += c - '0';
1632 else if (c >= 'a' && c <= 'f')
1633 chr += 10 + c - 'a';
1634 else
1635 chr += 10 + c - 'A';
1636 }
1637 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001638 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001639 /* when we get here, chr is a 32-bit unicode character */
1640 if (chr <= 0xffff)
1641 /* UCS-2 character */
1642 *p++ = (Py_UNICODE) chr;
1643 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001644 /* UCS-4 character. Either store directly, or as
1645 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001646#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001647 *p++ = chr;
1648#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001649 chr -= 0x10000L;
1650 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001651 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001652#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001653 } else {
1654 if (unicodeescape_decoding_error(
1655 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001656 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001657 )
1658 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001659 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001660 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001661 break;
1662
1663 /* \N{name} */
1664 case 'N':
1665 message = "malformed \\N character escape";
1666 if (ucnhash_CAPI == NULL) {
1667 /* load the unicode data module */
1668 PyObject *m, *v;
1669 m = PyImport_ImportModule("unicodedata");
1670 if (m == NULL)
1671 goto ucnhashError;
1672 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1673 Py_DECREF(m);
1674 if (v == NULL)
1675 goto ucnhashError;
1676 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1677 Py_DECREF(v);
1678 if (ucnhash_CAPI == NULL)
1679 goto ucnhashError;
1680 }
1681 if (*s == '{') {
1682 const char *start = s+1;
1683 /* look for the closing brace */
1684 while (*s != '}' && s < end)
1685 s++;
1686 if (s > start && s < end && *s == '}') {
1687 /* found a name. look it up in the unicode database */
1688 message = "unknown Unicode character name";
1689 s++;
1690 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1691 goto store;
1692 }
1693 }
1694 if (unicodeescape_decoding_error(&s, &x, errors, message))
1695 goto onError;
1696 *p++ = x;
1697 break;
1698
1699 default:
1700 *p++ = '\\';
1701 *p++ = (unsigned char)s[-1];
1702 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001703 }
1704 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001705 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001706 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707 return (PyObject *)v;
1708
Fredrik Lundhccc74732001-02-18 22:13:49 +00001709ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001710 PyErr_SetString(
1711 PyExc_UnicodeError,
1712 "\\N escapes not supported (can't load unicodedata module)"
1713 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001714 return NULL;
1715
Fredrik Lundhccc74732001-02-18 22:13:49 +00001716onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001717 Py_XDECREF(v);
1718 return NULL;
1719}
1720
1721/* Return a Unicode-Escape string version of the Unicode object.
1722
1723 If quotes is true, the string is enclosed in u"" or u'' quotes as
1724 appropriate.
1725
1726*/
1727
Barry Warsaw51ac5802000-03-20 16:36:48 +00001728static const Py_UNICODE *findchar(const Py_UNICODE *s,
1729 int size,
1730 Py_UNICODE ch);
1731
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732static
1733PyObject *unicodeescape_string(const Py_UNICODE *s,
1734 int size,
1735 int quotes)
1736{
1737 PyObject *repr;
1738 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001740 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741
1742 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1743 if (repr == NULL)
1744 return NULL;
1745
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001746 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747
1748 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749 *p++ = 'u';
1750 *p++ = (findchar(s, size, '\'') &&
1751 !findchar(s, size, '"')) ? '"' : '\'';
1752 }
1753 while (size-- > 0) {
1754 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001755
Guido van Rossumd57fd912000-03-10 22:53:23 +00001756 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001757 if (quotes &&
1758 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759 *p++ = '\\';
1760 *p++ = (char) ch;
1761 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001762
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001763#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001764 /* Map 21-bit characters to '\U00xxxxxx' */
1765 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001766 int offset = p - PyString_AS_STRING(repr);
1767
1768 /* Resize the string if necessary */
1769 if (offset + 12 > PyString_GET_SIZE(repr)) {
1770 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1771 goto onError;
1772 p = PyString_AS_STRING(repr) + offset;
1773 }
1774
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001775 *p++ = '\\';
1776 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001777 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1778 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1779 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1780 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1781 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1782 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1783 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001784 *p++ = hexdigit[ch & 0x0000000F];
1785 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001786 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001787#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001788 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1789 else if (ch >= 0xD800 && ch < 0xDC00) {
1790 Py_UNICODE ch2;
1791 Py_UCS4 ucs;
1792
1793 ch2 = *s++;
1794 size--;
1795 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1796 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1797 *p++ = '\\';
1798 *p++ = 'U';
1799 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1800 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1801 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1802 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1803 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1804 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1805 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1806 *p++ = hexdigit[ucs & 0x0000000F];
1807 continue;
1808 }
1809 /* Fall through: isolated surrogates are copied as-is */
1810 s--;
1811 size++;
1812 }
1813
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001815 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001816 *p++ = '\\';
1817 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001818 *p++ = hexdigit[(ch >> 12) & 0x000F];
1819 *p++ = hexdigit[(ch >> 8) & 0x000F];
1820 *p++ = hexdigit[(ch >> 4) & 0x000F];
1821 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001822 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001823
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001824 /* Map special whitespace to '\t', \n', '\r' */
1825 else if (ch == '\t') {
1826 *p++ = '\\';
1827 *p++ = 't';
1828 }
1829 else if (ch == '\n') {
1830 *p++ = '\\';
1831 *p++ = 'n';
1832 }
1833 else if (ch == '\r') {
1834 *p++ = '\\';
1835 *p++ = 'r';
1836 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001837
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001838 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 else if (ch < ' ' || ch >= 128) {
1840 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001841 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001842 *p++ = hexdigit[(ch >> 4) & 0x000F];
1843 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001845
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 /* Copy everything else as-is */
1847 else
1848 *p++ = (char) ch;
1849 }
1850 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001851 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001852
1853 *p = '\0';
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001854 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001855 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856
1857 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001858
1859 onError:
1860 Py_DECREF(repr);
1861 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862}
1863
1864PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1865 int size)
1866{
1867 return unicodeescape_string(s, size, 0);
1868}
1869
1870PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1871{
1872 if (!PyUnicode_Check(unicode)) {
1873 PyErr_BadArgument();
1874 return NULL;
1875 }
1876 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1877 PyUnicode_GET_SIZE(unicode));
1878}
1879
1880/* --- Raw Unicode Escape Codec ------------------------------------------- */
1881
1882PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1883 int size,
1884 const char *errors)
1885{
1886 PyUnicodeObject *v;
1887 Py_UNICODE *p, *buf;
1888 const char *end;
1889 const char *bs;
1890
1891 /* Escaped strings will always be longer than the resulting
1892 Unicode string, so we start with size here and then reduce the
1893 length after conversion to the true value. */
1894 v = _PyUnicode_New(size);
1895 if (v == NULL)
1896 goto onError;
1897 if (size == 0)
1898 return (PyObject *)v;
1899 p = buf = PyUnicode_AS_UNICODE(v);
1900 end = s + size;
1901 while (s < end) {
1902 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001903 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904 int i;
1905
1906 /* Non-escape characters are interpreted as Unicode ordinals */
1907 if (*s != '\\') {
1908 *p++ = (unsigned char)*s++;
1909 continue;
1910 }
1911
1912 /* \u-escapes are only interpreted iff the number of leading
1913 backslashes if odd */
1914 bs = s;
1915 for (;s < end;) {
1916 if (*s != '\\')
1917 break;
1918 *p++ = (unsigned char)*s++;
1919 }
1920 if (((s - bs) & 1) == 0 ||
1921 s >= end ||
1922 *s != 'u') {
1923 continue;
1924 }
1925 p--;
1926 s++;
1927
1928 /* \uXXXX with 4 hex digits */
1929 for (x = 0, i = 0; i < 4; i++) {
1930 c = (unsigned char)s[i];
1931 if (!isxdigit(c)) {
1932 if (unicodeescape_decoding_error(&s, &x, errors,
1933 "truncated \\uXXXX"))
1934 goto onError;
1935 i++;
1936 break;
1937 }
1938 x = (x<<4) & ~0xF;
1939 if (c >= '0' && c <= '9')
1940 x += c - '0';
1941 else if (c >= 'a' && c <= 'f')
1942 x += 10 + c - 'a';
1943 else
1944 x += 10 + c - 'A';
1945 }
1946 s += i;
1947 *p++ = x;
1948 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001949 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001950 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951 return (PyObject *)v;
1952
1953 onError:
1954 Py_XDECREF(v);
1955 return NULL;
1956}
1957
1958PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1959 int size)
1960{
1961 PyObject *repr;
1962 char *p;
1963 char *q;
1964
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001965 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966
1967 repr = PyString_FromStringAndSize(NULL, 6 * size);
1968 if (repr == NULL)
1969 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001970 if (size == 0)
1971 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972
1973 p = q = PyString_AS_STRING(repr);
1974 while (size-- > 0) {
1975 Py_UNICODE ch = *s++;
1976 /* Map 16-bit characters to '\uxxxx' */
1977 if (ch >= 256) {
1978 *p++ = '\\';
1979 *p++ = 'u';
1980 *p++ = hexdigit[(ch >> 12) & 0xf];
1981 *p++ = hexdigit[(ch >> 8) & 0xf];
1982 *p++ = hexdigit[(ch >> 4) & 0xf];
1983 *p++ = hexdigit[ch & 15];
1984 }
1985 /* Copy everything else as-is */
1986 else
1987 *p++ = (char) ch;
1988 }
1989 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001990 if (_PyString_Resize(&repr, p - q))
1991 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992
1993 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001994
1995 onError:
1996 Py_DECREF(repr);
1997 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001998}
1999
2000PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2001{
2002 if (!PyUnicode_Check(unicode)) {
2003 PyErr_BadArgument();
2004 return NULL;
2005 }
2006 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2007 PyUnicode_GET_SIZE(unicode));
2008}
2009
2010/* --- Latin-1 Codec ------------------------------------------------------ */
2011
2012PyObject *PyUnicode_DecodeLatin1(const char *s,
2013 int size,
2014 const char *errors)
2015{
2016 PyUnicodeObject *v;
2017 Py_UNICODE *p;
2018
2019 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002020 if (size == 1 && *(unsigned char*)s < 256) {
2021 Py_UNICODE r = *(unsigned char*)s;
2022 return PyUnicode_FromUnicode(&r, 1);
2023 }
2024
Guido van Rossumd57fd912000-03-10 22:53:23 +00002025 v = _PyUnicode_New(size);
2026 if (v == NULL)
2027 goto onError;
2028 if (size == 0)
2029 return (PyObject *)v;
2030 p = PyUnicode_AS_UNICODE(v);
2031 while (size-- > 0)
2032 *p++ = (unsigned char)*s++;
2033 return (PyObject *)v;
2034
2035 onError:
2036 Py_XDECREF(v);
2037 return NULL;
2038}
2039
2040static
2041int latin1_encoding_error(const Py_UNICODE **source,
2042 char **dest,
2043 const char *errors,
2044 const char *details)
2045{
2046 if ((errors == NULL) ||
2047 (strcmp(errors,"strict") == 0)) {
2048 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002049 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050 details);
2051 return -1;
2052 }
2053 else if (strcmp(errors,"ignore") == 0) {
2054 return 0;
2055 }
2056 else if (strcmp(errors,"replace") == 0) {
2057 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002058 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059 return 0;
2060 }
2061 else {
2062 PyErr_Format(PyExc_ValueError,
2063 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002064 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 errors);
2066 return -1;
2067 }
2068}
2069
2070PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2071 int size,
2072 const char *errors)
2073{
2074 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002075 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002076
Guido van Rossumd57fd912000-03-10 22:53:23 +00002077 repr = PyString_FromStringAndSize(NULL, size);
2078 if (repr == NULL)
2079 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002080 if (size == 0)
2081 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082
2083 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002084 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 while (size-- > 0) {
2086 Py_UNICODE ch = *p++;
2087 if (ch >= 256) {
2088 if (latin1_encoding_error(&p, &s, errors,
2089 "ordinal not in range(256)"))
2090 goto onError;
2091 }
2092 else
2093 *s++ = (char)ch;
2094 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002095 /* Resize if error handling skipped some characters */
2096 if (s - start < PyString_GET_SIZE(repr))
2097 if (_PyString_Resize(&repr, s - start))
2098 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099 return repr;
2100
2101 onError:
2102 Py_DECREF(repr);
2103 return NULL;
2104}
2105
2106PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2107{
2108 if (!PyUnicode_Check(unicode)) {
2109 PyErr_BadArgument();
2110 return NULL;
2111 }
2112 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2113 PyUnicode_GET_SIZE(unicode),
2114 NULL);
2115}
2116
2117/* --- 7-bit ASCII Codec -------------------------------------------------- */
2118
2119static
2120int ascii_decoding_error(const char **source,
2121 Py_UNICODE **dest,
2122 const char *errors,
2123 const char *details)
2124{
2125 if ((errors == NULL) ||
2126 (strcmp(errors,"strict") == 0)) {
2127 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002128 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002129 details);
2130 return -1;
2131 }
2132 else if (strcmp(errors,"ignore") == 0) {
2133 return 0;
2134 }
2135 else if (strcmp(errors,"replace") == 0) {
2136 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2137 (*dest)++;
2138 return 0;
2139 }
2140 else {
2141 PyErr_Format(PyExc_ValueError,
2142 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002143 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 errors);
2145 return -1;
2146 }
2147}
2148
2149PyObject *PyUnicode_DecodeASCII(const char *s,
2150 int size,
2151 const char *errors)
2152{
2153 PyUnicodeObject *v;
2154 Py_UNICODE *p;
2155
2156 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002157 if (size == 1 && *(unsigned char*)s < 128) {
2158 Py_UNICODE r = *(unsigned char*)s;
2159 return PyUnicode_FromUnicode(&r, 1);
2160 }
2161
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162 v = _PyUnicode_New(size);
2163 if (v == NULL)
2164 goto onError;
2165 if (size == 0)
2166 return (PyObject *)v;
2167 p = PyUnicode_AS_UNICODE(v);
2168 while (size-- > 0) {
2169 register unsigned char c;
2170
2171 c = (unsigned char)*s++;
2172 if (c < 128)
2173 *p++ = c;
2174 else if (ascii_decoding_error(&s, &p, errors,
2175 "ordinal not in range(128)"))
2176 goto onError;
2177 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002178 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002179 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002180 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181 return (PyObject *)v;
2182
2183 onError:
2184 Py_XDECREF(v);
2185 return NULL;
2186}
2187
2188static
2189int ascii_encoding_error(const Py_UNICODE **source,
2190 char **dest,
2191 const char *errors,
2192 const char *details)
2193{
2194 if ((errors == NULL) ||
2195 (strcmp(errors,"strict") == 0)) {
2196 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002197 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002198 details);
2199 return -1;
2200 }
2201 else if (strcmp(errors,"ignore") == 0) {
2202 return 0;
2203 }
2204 else if (strcmp(errors,"replace") == 0) {
2205 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002206 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 return 0;
2208 }
2209 else {
2210 PyErr_Format(PyExc_ValueError,
2211 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002212 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213 errors);
2214 return -1;
2215 }
2216}
2217
2218PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2219 int size,
2220 const char *errors)
2221{
2222 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002223 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002224
Guido van Rossumd57fd912000-03-10 22:53:23 +00002225 repr = PyString_FromStringAndSize(NULL, size);
2226 if (repr == NULL)
2227 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002228 if (size == 0)
2229 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230
2231 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002232 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233 while (size-- > 0) {
2234 Py_UNICODE ch = *p++;
2235 if (ch >= 128) {
2236 if (ascii_encoding_error(&p, &s, errors,
2237 "ordinal not in range(128)"))
2238 goto onError;
2239 }
2240 else
2241 *s++ = (char)ch;
2242 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002243 /* Resize if error handling skipped some characters */
2244 if (s - start < PyString_GET_SIZE(repr))
2245 if (_PyString_Resize(&repr, s - start))
2246 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247 return repr;
2248
2249 onError:
2250 Py_DECREF(repr);
2251 return NULL;
2252}
2253
2254PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2255{
2256 if (!PyUnicode_Check(unicode)) {
2257 PyErr_BadArgument();
2258 return NULL;
2259 }
2260 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2261 PyUnicode_GET_SIZE(unicode),
2262 NULL);
2263}
2264
Fredrik Lundh30831632001-06-26 15:11:00 +00002265#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002266
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002267/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002268
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002269PyObject *PyUnicode_DecodeMBCS(const char *s,
2270 int size,
2271 const char *errors)
2272{
2273 PyUnicodeObject *v;
2274 Py_UNICODE *p;
2275
2276 /* First get the size of the result */
2277 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002278 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002279 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2280
2281 v = _PyUnicode_New(usize);
2282 if (v == NULL)
2283 return NULL;
2284 if (usize == 0)
2285 return (PyObject *)v;
2286 p = PyUnicode_AS_UNICODE(v);
2287 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2288 Py_DECREF(v);
2289 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2290 }
2291
2292 return (PyObject *)v;
2293}
2294
2295PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2296 int size,
2297 const char *errors)
2298{
2299 PyObject *repr;
2300 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002301 DWORD mbcssize;
2302
2303 /* If there are no characters, bail now! */
2304 if (size==0)
2305 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002306
2307 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002308 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002309 if (mbcssize==0)
2310 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2311
2312 repr = PyString_FromStringAndSize(NULL, mbcssize);
2313 if (repr == NULL)
2314 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002315 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002316 return repr;
2317
2318 /* Do the conversion */
2319 s = PyString_AS_STRING(repr);
2320 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2321 Py_DECREF(repr);
2322 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2323 }
2324 return repr;
2325}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002326
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002327#endif /* MS_WIN32 */
2328
Guido van Rossumd57fd912000-03-10 22:53:23 +00002329/* --- Character Mapping Codec -------------------------------------------- */
2330
2331static
2332int charmap_decoding_error(const char **source,
2333 Py_UNICODE **dest,
2334 const char *errors,
2335 const char *details)
2336{
2337 if ((errors == NULL) ||
2338 (strcmp(errors,"strict") == 0)) {
2339 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002340 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002341 details);
2342 return -1;
2343 }
2344 else if (strcmp(errors,"ignore") == 0) {
2345 return 0;
2346 }
2347 else if (strcmp(errors,"replace") == 0) {
2348 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2349 (*dest)++;
2350 return 0;
2351 }
2352 else {
2353 PyErr_Format(PyExc_ValueError,
2354 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002355 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002356 errors);
2357 return -1;
2358 }
2359}
2360
2361PyObject *PyUnicode_DecodeCharmap(const char *s,
2362 int size,
2363 PyObject *mapping,
2364 const char *errors)
2365{
2366 PyUnicodeObject *v;
2367 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002368 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002369
2370 /* Default to Latin-1 */
2371 if (mapping == NULL)
2372 return PyUnicode_DecodeLatin1(s, size, errors);
2373
2374 v = _PyUnicode_New(size);
2375 if (v == NULL)
2376 goto onError;
2377 if (size == 0)
2378 return (PyObject *)v;
2379 p = PyUnicode_AS_UNICODE(v);
2380 while (size-- > 0) {
2381 unsigned char ch = *s++;
2382 PyObject *w, *x;
2383
2384 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2385 w = PyInt_FromLong((long)ch);
2386 if (w == NULL)
2387 goto onError;
2388 x = PyObject_GetItem(mapping, w);
2389 Py_DECREF(w);
2390 if (x == NULL) {
2391 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002392 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002393 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002394 x = Py_None;
2395 Py_INCREF(x);
2396 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002397 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002398 }
2399
2400 /* Apply mapping */
2401 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002402 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002403 if (value < 0 || value > 65535) {
2404 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002405 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002406 Py_DECREF(x);
2407 goto onError;
2408 }
2409 *p++ = (Py_UNICODE)value;
2410 }
2411 else if (x == Py_None) {
2412 /* undefined mapping */
2413 if (charmap_decoding_error(&s, &p, errors,
2414 "character maps to <undefined>")) {
2415 Py_DECREF(x);
2416 goto onError;
2417 }
2418 }
2419 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002420 int targetsize = PyUnicode_GET_SIZE(x);
2421
2422 if (targetsize == 1)
2423 /* 1-1 mapping */
2424 *p++ = *PyUnicode_AS_UNICODE(x);
2425
2426 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002427 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002428 if (targetsize > extrachars) {
2429 /* resize first */
2430 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2431 int needed = (targetsize - extrachars) + \
2432 (targetsize << 2);
2433 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002434 if (_PyUnicode_Resize(&v,
2435 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002436 Py_DECREF(x);
2437 goto onError;
2438 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002439 p = PyUnicode_AS_UNICODE(v) + oldpos;
2440 }
2441 Py_UNICODE_COPY(p,
2442 PyUnicode_AS_UNICODE(x),
2443 targetsize);
2444 p += targetsize;
2445 extrachars -= targetsize;
2446 }
2447 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002448 }
2449 else {
2450 /* wrong return value */
2451 PyErr_SetString(PyExc_TypeError,
2452 "character mapping must return integer, None or unicode");
2453 Py_DECREF(x);
2454 goto onError;
2455 }
2456 Py_DECREF(x);
2457 }
2458 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002459 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002460 goto onError;
2461 return (PyObject *)v;
2462
2463 onError:
2464 Py_XDECREF(v);
2465 return NULL;
2466}
2467
2468static
2469int charmap_encoding_error(const Py_UNICODE **source,
2470 char **dest,
2471 const char *errors,
2472 const char *details)
2473{
2474 if ((errors == NULL) ||
2475 (strcmp(errors,"strict") == 0)) {
2476 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002477 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478 details);
2479 return -1;
2480 }
2481 else if (strcmp(errors,"ignore") == 0) {
2482 return 0;
2483 }
2484 else if (strcmp(errors,"replace") == 0) {
2485 **dest = '?';
2486 (*dest)++;
2487 return 0;
2488 }
2489 else {
2490 PyErr_Format(PyExc_ValueError,
2491 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002492 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002493 errors);
2494 return -1;
2495 }
2496}
2497
2498PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2499 int size,
2500 PyObject *mapping,
2501 const char *errors)
2502{
2503 PyObject *v;
2504 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002505 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506
2507 /* Default to Latin-1 */
2508 if (mapping == NULL)
2509 return PyUnicode_EncodeLatin1(p, size, errors);
2510
2511 v = PyString_FromStringAndSize(NULL, size);
2512 if (v == NULL)
2513 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002514 if (size == 0)
2515 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002516 s = PyString_AS_STRING(v);
2517 while (size-- > 0) {
2518 Py_UNICODE ch = *p++;
2519 PyObject *w, *x;
2520
2521 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2522 w = PyInt_FromLong((long)ch);
2523 if (w == NULL)
2524 goto onError;
2525 x = PyObject_GetItem(mapping, w);
2526 Py_DECREF(w);
2527 if (x == NULL) {
2528 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002529 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002531 x = Py_None;
2532 Py_INCREF(x);
2533 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002534 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535 }
2536
2537 /* Apply mapping */
2538 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002539 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002540 if (value < 0 || value > 255) {
2541 PyErr_SetString(PyExc_TypeError,
2542 "character mapping must be in range(256)");
2543 Py_DECREF(x);
2544 goto onError;
2545 }
2546 *s++ = (char)value;
2547 }
2548 else if (x == Py_None) {
2549 /* undefined mapping */
2550 if (charmap_encoding_error(&p, &s, errors,
2551 "character maps to <undefined>")) {
2552 Py_DECREF(x);
2553 goto onError;
2554 }
2555 }
2556 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002557 int targetsize = PyString_GET_SIZE(x);
2558
2559 if (targetsize == 1)
2560 /* 1-1 mapping */
2561 *s++ = *PyString_AS_STRING(x);
2562
2563 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002564 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002565 if (targetsize > extrachars) {
2566 /* resize first */
2567 int oldpos = (int)(s - PyString_AS_STRING(v));
2568 int needed = (targetsize - extrachars) + \
2569 (targetsize << 2);
2570 extrachars += needed;
2571 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002572 Py_DECREF(x);
2573 goto onError;
2574 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002575 s = PyString_AS_STRING(v) + oldpos;
2576 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002577 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002578 s += targetsize;
2579 extrachars -= targetsize;
2580 }
2581 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582 }
2583 else {
2584 /* wrong return value */
2585 PyErr_SetString(PyExc_TypeError,
2586 "character mapping must return integer, None or unicode");
2587 Py_DECREF(x);
2588 goto onError;
2589 }
2590 Py_DECREF(x);
2591 }
2592 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2593 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2594 goto onError;
2595 return v;
2596
2597 onError:
2598 Py_DECREF(v);
2599 return NULL;
2600}
2601
2602PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2603 PyObject *mapping)
2604{
2605 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2606 PyErr_BadArgument();
2607 return NULL;
2608 }
2609 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2610 PyUnicode_GET_SIZE(unicode),
2611 mapping,
2612 NULL);
2613}
2614
2615static
2616int translate_error(const Py_UNICODE **source,
2617 Py_UNICODE **dest,
2618 const char *errors,
2619 const char *details)
2620{
2621 if ((errors == NULL) ||
2622 (strcmp(errors,"strict") == 0)) {
2623 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002624 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002625 details);
2626 return -1;
2627 }
2628 else if (strcmp(errors,"ignore") == 0) {
2629 return 0;
2630 }
2631 else if (strcmp(errors,"replace") == 0) {
2632 **dest = '?';
2633 (*dest)++;
2634 return 0;
2635 }
2636 else {
2637 PyErr_Format(PyExc_ValueError,
2638 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002639 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002640 errors);
2641 return -1;
2642 }
2643}
2644
2645PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2646 int size,
2647 PyObject *mapping,
2648 const char *errors)
2649{
2650 PyUnicodeObject *v;
2651 Py_UNICODE *p;
2652
2653 if (mapping == NULL) {
2654 PyErr_BadArgument();
2655 return NULL;
2656 }
2657
2658 /* Output will never be longer than input */
2659 v = _PyUnicode_New(size);
2660 if (v == NULL)
2661 goto onError;
2662 if (size == 0)
2663 goto done;
2664 p = PyUnicode_AS_UNICODE(v);
2665 while (size-- > 0) {
2666 Py_UNICODE ch = *s++;
2667 PyObject *w, *x;
2668
2669 /* Get mapping */
2670 w = PyInt_FromLong(ch);
2671 if (w == NULL)
2672 goto onError;
2673 x = PyObject_GetItem(mapping, w);
2674 Py_DECREF(w);
2675 if (x == NULL) {
2676 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2677 /* No mapping found: default to 1-1 mapping */
2678 PyErr_Clear();
2679 *p++ = ch;
2680 continue;
2681 }
2682 goto onError;
2683 }
2684
2685 /* Apply mapping */
2686 if (PyInt_Check(x))
2687 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2688 else if (x == Py_None) {
2689 /* undefined mapping */
2690 if (translate_error(&s, &p, errors,
2691 "character maps to <undefined>")) {
2692 Py_DECREF(x);
2693 goto onError;
2694 }
2695 }
2696 else if (PyUnicode_Check(x)) {
2697 if (PyUnicode_GET_SIZE(x) != 1) {
2698 /* 1-n mapping */
2699 PyErr_SetString(PyExc_NotImplementedError,
2700 "1-n mappings are currently not implemented");
2701 Py_DECREF(x);
2702 goto onError;
2703 }
2704 *p++ = *PyUnicode_AS_UNICODE(x);
2705 }
2706 else {
2707 /* wrong return value */
2708 PyErr_SetString(PyExc_TypeError,
2709 "translate mapping must return integer, None or unicode");
2710 Py_DECREF(x);
2711 goto onError;
2712 }
2713 Py_DECREF(x);
2714 }
2715 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002716 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002717 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718
2719 done:
2720 return (PyObject *)v;
2721
2722 onError:
2723 Py_XDECREF(v);
2724 return NULL;
2725}
2726
2727PyObject *PyUnicode_Translate(PyObject *str,
2728 PyObject *mapping,
2729 const char *errors)
2730{
2731 PyObject *result;
2732
2733 str = PyUnicode_FromObject(str);
2734 if (str == NULL)
2735 goto onError;
2736 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2737 PyUnicode_GET_SIZE(str),
2738 mapping,
2739 errors);
2740 Py_DECREF(str);
2741 return result;
2742
2743 onError:
2744 Py_XDECREF(str);
2745 return NULL;
2746}
2747
Guido van Rossum9e896b32000-04-05 20:11:21 +00002748/* --- Decimal Encoder ---------------------------------------------------- */
2749
2750int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2751 int length,
2752 char *output,
2753 const char *errors)
2754{
2755 Py_UNICODE *p, *end;
2756
2757 if (output == NULL) {
2758 PyErr_BadArgument();
2759 return -1;
2760 }
2761
2762 p = s;
2763 end = s + length;
2764 while (p < end) {
2765 register Py_UNICODE ch = *p++;
2766 int decimal;
2767
2768 if (Py_UNICODE_ISSPACE(ch)) {
2769 *output++ = ' ';
2770 continue;
2771 }
2772 decimal = Py_UNICODE_TODECIMAL(ch);
2773 if (decimal >= 0) {
2774 *output++ = '0' + decimal;
2775 continue;
2776 }
Guido van Rossumba477042000-04-06 18:18:10 +00002777 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002778 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002779 continue;
2780 }
2781 /* All other characters are considered invalid */
2782 if (errors == NULL || strcmp(errors, "strict") == 0) {
2783 PyErr_SetString(PyExc_ValueError,
2784 "invalid decimal Unicode string");
2785 goto onError;
2786 }
2787 else if (strcmp(errors, "ignore") == 0)
2788 continue;
2789 else if (strcmp(errors, "replace") == 0) {
2790 *output++ = '?';
2791 continue;
2792 }
2793 }
2794 /* 0-terminate the output string */
2795 *output++ = '\0';
2796 return 0;
2797
2798 onError:
2799 return -1;
2800}
2801
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802/* --- Helpers ------------------------------------------------------------ */
2803
2804static
2805int count(PyUnicodeObject *self,
2806 int start,
2807 int end,
2808 PyUnicodeObject *substring)
2809{
2810 int count = 0;
2811
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002812 if (start < 0)
2813 start += self->length;
2814 if (start < 0)
2815 start = 0;
2816 if (end > self->length)
2817 end = self->length;
2818 if (end < 0)
2819 end += self->length;
2820 if (end < 0)
2821 end = 0;
2822
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002823 if (substring->length == 0)
2824 return (end - start + 1);
2825
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 end -= substring->length;
2827
2828 while (start <= end)
2829 if (Py_UNICODE_MATCH(self, start, substring)) {
2830 count++;
2831 start += substring->length;
2832 } else
2833 start++;
2834
2835 return count;
2836}
2837
2838int PyUnicode_Count(PyObject *str,
2839 PyObject *substr,
2840 int start,
2841 int end)
2842{
2843 int result;
2844
2845 str = PyUnicode_FromObject(str);
2846 if (str == NULL)
2847 return -1;
2848 substr = PyUnicode_FromObject(substr);
2849 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002850 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851 return -1;
2852 }
2853
2854 result = count((PyUnicodeObject *)str,
2855 start, end,
2856 (PyUnicodeObject *)substr);
2857
2858 Py_DECREF(str);
2859 Py_DECREF(substr);
2860 return result;
2861}
2862
2863static
2864int findstring(PyUnicodeObject *self,
2865 PyUnicodeObject *substring,
2866 int start,
2867 int end,
2868 int direction)
2869{
2870 if (start < 0)
2871 start += self->length;
2872 if (start < 0)
2873 start = 0;
2874
2875 if (substring->length == 0)
2876 return start;
2877
2878 if (end > self->length)
2879 end = self->length;
2880 if (end < 0)
2881 end += self->length;
2882 if (end < 0)
2883 end = 0;
2884
2885 end -= substring->length;
2886
2887 if (direction < 0) {
2888 for (; end >= start; end--)
2889 if (Py_UNICODE_MATCH(self, end, substring))
2890 return end;
2891 } else {
2892 for (; start <= end; start++)
2893 if (Py_UNICODE_MATCH(self, start, substring))
2894 return start;
2895 }
2896
2897 return -1;
2898}
2899
2900int PyUnicode_Find(PyObject *str,
2901 PyObject *substr,
2902 int start,
2903 int end,
2904 int direction)
2905{
2906 int result;
2907
2908 str = PyUnicode_FromObject(str);
2909 if (str == NULL)
2910 return -1;
2911 substr = PyUnicode_FromObject(substr);
2912 if (substr == NULL) {
2913 Py_DECREF(substr);
2914 return -1;
2915 }
2916
2917 result = findstring((PyUnicodeObject *)str,
2918 (PyUnicodeObject *)substr,
2919 start, end, direction);
2920 Py_DECREF(str);
2921 Py_DECREF(substr);
2922 return result;
2923}
2924
2925static
2926int tailmatch(PyUnicodeObject *self,
2927 PyUnicodeObject *substring,
2928 int start,
2929 int end,
2930 int direction)
2931{
2932 if (start < 0)
2933 start += self->length;
2934 if (start < 0)
2935 start = 0;
2936
2937 if (substring->length == 0)
2938 return 1;
2939
2940 if (end > self->length)
2941 end = self->length;
2942 if (end < 0)
2943 end += self->length;
2944 if (end < 0)
2945 end = 0;
2946
2947 end -= substring->length;
2948 if (end < start)
2949 return 0;
2950
2951 if (direction > 0) {
2952 if (Py_UNICODE_MATCH(self, end, substring))
2953 return 1;
2954 } else {
2955 if (Py_UNICODE_MATCH(self, start, substring))
2956 return 1;
2957 }
2958
2959 return 0;
2960}
2961
2962int PyUnicode_Tailmatch(PyObject *str,
2963 PyObject *substr,
2964 int start,
2965 int end,
2966 int direction)
2967{
2968 int result;
2969
2970 str = PyUnicode_FromObject(str);
2971 if (str == NULL)
2972 return -1;
2973 substr = PyUnicode_FromObject(substr);
2974 if (substr == NULL) {
2975 Py_DECREF(substr);
2976 return -1;
2977 }
2978
2979 result = tailmatch((PyUnicodeObject *)str,
2980 (PyUnicodeObject *)substr,
2981 start, end, direction);
2982 Py_DECREF(str);
2983 Py_DECREF(substr);
2984 return result;
2985}
2986
2987static
2988const Py_UNICODE *findchar(const Py_UNICODE *s,
2989 int size,
2990 Py_UNICODE ch)
2991{
2992 /* like wcschr, but doesn't stop at NULL characters */
2993
2994 while (size-- > 0) {
2995 if (*s == ch)
2996 return s;
2997 s++;
2998 }
2999
3000 return NULL;
3001}
3002
3003/* Apply fixfct filter to the Unicode object self and return a
3004 reference to the modified object */
3005
3006static
3007PyObject *fixup(PyUnicodeObject *self,
3008 int (*fixfct)(PyUnicodeObject *s))
3009{
3010
3011 PyUnicodeObject *u;
3012
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003013 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003014 if (u == NULL)
3015 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003016
3017 Py_UNICODE_COPY(u->str, self->str, self->length);
3018
Tim Peters7a29bd52001-09-12 03:03:31 +00003019 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020 /* fixfct should return TRUE if it modified the buffer. If
3021 FALSE, return a reference to the original buffer instead
3022 (to save space, not time) */
3023 Py_INCREF(self);
3024 Py_DECREF(u);
3025 return (PyObject*) self;
3026 }
3027 return (PyObject*) u;
3028}
3029
3030static
3031int fixupper(PyUnicodeObject *self)
3032{
3033 int len = self->length;
3034 Py_UNICODE *s = self->str;
3035 int status = 0;
3036
3037 while (len-- > 0) {
3038 register Py_UNICODE ch;
3039
3040 ch = Py_UNICODE_TOUPPER(*s);
3041 if (ch != *s) {
3042 status = 1;
3043 *s = ch;
3044 }
3045 s++;
3046 }
3047
3048 return status;
3049}
3050
3051static
3052int fixlower(PyUnicodeObject *self)
3053{
3054 int len = self->length;
3055 Py_UNICODE *s = self->str;
3056 int status = 0;
3057
3058 while (len-- > 0) {
3059 register Py_UNICODE ch;
3060
3061 ch = Py_UNICODE_TOLOWER(*s);
3062 if (ch != *s) {
3063 status = 1;
3064 *s = ch;
3065 }
3066 s++;
3067 }
3068
3069 return status;
3070}
3071
3072static
3073int fixswapcase(PyUnicodeObject *self)
3074{
3075 int len = self->length;
3076 Py_UNICODE *s = self->str;
3077 int status = 0;
3078
3079 while (len-- > 0) {
3080 if (Py_UNICODE_ISUPPER(*s)) {
3081 *s = Py_UNICODE_TOLOWER(*s);
3082 status = 1;
3083 } else if (Py_UNICODE_ISLOWER(*s)) {
3084 *s = Py_UNICODE_TOUPPER(*s);
3085 status = 1;
3086 }
3087 s++;
3088 }
3089
3090 return status;
3091}
3092
3093static
3094int fixcapitalize(PyUnicodeObject *self)
3095{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003096 int len = self->length;
3097 Py_UNICODE *s = self->str;
3098 int status = 0;
3099
3100 if (len == 0)
3101 return 0;
3102 if (Py_UNICODE_ISLOWER(*s)) {
3103 *s = Py_UNICODE_TOUPPER(*s);
3104 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003106 s++;
3107 while (--len > 0) {
3108 if (Py_UNICODE_ISUPPER(*s)) {
3109 *s = Py_UNICODE_TOLOWER(*s);
3110 status = 1;
3111 }
3112 s++;
3113 }
3114 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003115}
3116
3117static
3118int fixtitle(PyUnicodeObject *self)
3119{
3120 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3121 register Py_UNICODE *e;
3122 int previous_is_cased;
3123
3124 /* Shortcut for single character strings */
3125 if (PyUnicode_GET_SIZE(self) == 1) {
3126 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3127 if (*p != ch) {
3128 *p = ch;
3129 return 1;
3130 }
3131 else
3132 return 0;
3133 }
3134
3135 e = p + PyUnicode_GET_SIZE(self);
3136 previous_is_cased = 0;
3137 for (; p < e; p++) {
3138 register const Py_UNICODE ch = *p;
3139
3140 if (previous_is_cased)
3141 *p = Py_UNICODE_TOLOWER(ch);
3142 else
3143 *p = Py_UNICODE_TOTITLE(ch);
3144
3145 if (Py_UNICODE_ISLOWER(ch) ||
3146 Py_UNICODE_ISUPPER(ch) ||
3147 Py_UNICODE_ISTITLE(ch))
3148 previous_is_cased = 1;
3149 else
3150 previous_is_cased = 0;
3151 }
3152 return 1;
3153}
3154
3155PyObject *PyUnicode_Join(PyObject *separator,
3156 PyObject *seq)
3157{
3158 Py_UNICODE *sep;
3159 int seplen;
3160 PyUnicodeObject *res = NULL;
3161 int reslen = 0;
3162 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003163 int sz = 100;
3164 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003165 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166
Tim Peters2cfe3682001-05-05 05:36:48 +00003167 it = PyObject_GetIter(seq);
3168 if (it == NULL)
3169 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170
3171 if (separator == NULL) {
3172 Py_UNICODE blank = ' ';
3173 sep = &blank;
3174 seplen = 1;
3175 }
3176 else {
3177 separator = PyUnicode_FromObject(separator);
3178 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003179 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003180 sep = PyUnicode_AS_UNICODE(separator);
3181 seplen = PyUnicode_GET_SIZE(separator);
3182 }
3183
3184 res = _PyUnicode_New(sz);
3185 if (res == NULL)
3186 goto onError;
3187 p = PyUnicode_AS_UNICODE(res);
3188 reslen = 0;
3189
Tim Peters2cfe3682001-05-05 05:36:48 +00003190 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003192 PyObject *item = PyIter_Next(it);
3193 if (item == NULL) {
3194 if (PyErr_Occurred())
3195 goto onError;
3196 break;
3197 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198 if (!PyUnicode_Check(item)) {
3199 PyObject *v;
3200 v = PyUnicode_FromObject(item);
3201 Py_DECREF(item);
3202 item = v;
3203 if (item == NULL)
3204 goto onError;
3205 }
3206 itemlen = PyUnicode_GET_SIZE(item);
3207 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003208 if (_PyUnicode_Resize(&res, sz*2))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 goto onError;
3210 sz *= 2;
3211 p = PyUnicode_AS_UNICODE(res) + reslen;
3212 }
3213 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003214 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 p += seplen;
3216 reslen += seplen;
3217 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003218 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003219 p += itemlen;
3220 reslen += itemlen;
3221 Py_DECREF(item);
3222 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003223 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003224 goto onError;
3225
3226 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003227 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003228 return (PyObject *)res;
3229
3230 onError:
3231 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003232 Py_XDECREF(res);
3233 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 return NULL;
3235}
3236
3237static
3238PyUnicodeObject *pad(PyUnicodeObject *self,
3239 int left,
3240 int right,
3241 Py_UNICODE fill)
3242{
3243 PyUnicodeObject *u;
3244
3245 if (left < 0)
3246 left = 0;
3247 if (right < 0)
3248 right = 0;
3249
Tim Peters7a29bd52001-09-12 03:03:31 +00003250 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003251 Py_INCREF(self);
3252 return self;
3253 }
3254
3255 u = _PyUnicode_New(left + self->length + right);
3256 if (u) {
3257 if (left)
3258 Py_UNICODE_FILL(u->str, fill, left);
3259 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3260 if (right)
3261 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3262 }
3263
3264 return u;
3265}
3266
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Relies on the enclosing function providing the locals `str` and
   `list` and an `onError` label; control jumps to onError if the slice
   cannot be created or appended.  The do/while(0) wrapper makes the
   expansion behave as a single statement. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
3277
3278static
3279PyObject *split_whitespace(PyUnicodeObject *self,
3280 PyObject *list,
3281 int maxcount)
3282{
3283 register int i;
3284 register int j;
3285 int len = self->length;
3286 PyObject *str;
3287
3288 for (i = j = 0; i < len; ) {
3289 /* find a token */
3290 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3291 i++;
3292 j = i;
3293 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3294 i++;
3295 if (j < i) {
3296 if (maxcount-- <= 0)
3297 break;
3298 SPLIT_APPEND(self->str, j, i);
3299 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3300 i++;
3301 j = i;
3302 }
3303 }
3304 if (j < len) {
3305 SPLIT_APPEND(self->str, j, len);
3306 }
3307 return list;
3308
3309 onError:
3310 Py_DECREF(list);
3311 return NULL;
3312}
3313
3314PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003315 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316{
3317 register int i;
3318 register int j;
3319 int len;
3320 PyObject *list;
3321 PyObject *str;
3322 Py_UNICODE *data;
3323
3324 string = PyUnicode_FromObject(string);
3325 if (string == NULL)
3326 return NULL;
3327 data = PyUnicode_AS_UNICODE(string);
3328 len = PyUnicode_GET_SIZE(string);
3329
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330 list = PyList_New(0);
3331 if (!list)
3332 goto onError;
3333
3334 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003335 int eol;
3336
Guido van Rossumd57fd912000-03-10 22:53:23 +00003337 /* Find a line and append it */
3338 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3339 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340
3341 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003342 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343 if (i < len) {
3344 if (data[i] == '\r' && i + 1 < len &&
3345 data[i+1] == '\n')
3346 i += 2;
3347 else
3348 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003349 if (keepends)
3350 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003351 }
Guido van Rossum86662912000-04-11 15:38:46 +00003352 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353 j = i;
3354 }
3355 if (j < len) {
3356 SPLIT_APPEND(data, j, len);
3357 }
3358
3359 Py_DECREF(string);
3360 return list;
3361
3362 onError:
3363 Py_DECREF(list);
3364 Py_DECREF(string);
3365 return NULL;
3366}
3367
3368static
3369PyObject *split_char(PyUnicodeObject *self,
3370 PyObject *list,
3371 Py_UNICODE ch,
3372 int maxcount)
3373{
3374 register int i;
3375 register int j;
3376 int len = self->length;
3377 PyObject *str;
3378
3379 for (i = j = 0; i < len; ) {
3380 if (self->str[i] == ch) {
3381 if (maxcount-- <= 0)
3382 break;
3383 SPLIT_APPEND(self->str, j, i);
3384 i = j = i + 1;
3385 } else
3386 i++;
3387 }
3388 if (j <= len) {
3389 SPLIT_APPEND(self->str, j, len);
3390 }
3391 return list;
3392
3393 onError:
3394 Py_DECREF(list);
3395 return NULL;
3396}
3397
3398static
3399PyObject *split_substring(PyUnicodeObject *self,
3400 PyObject *list,
3401 PyUnicodeObject *substring,
3402 int maxcount)
3403{
3404 register int i;
3405 register int j;
3406 int len = self->length;
3407 int sublen = substring->length;
3408 PyObject *str;
3409
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003410 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003411 if (Py_UNICODE_MATCH(self, i, substring)) {
3412 if (maxcount-- <= 0)
3413 break;
3414 SPLIT_APPEND(self->str, j, i);
3415 i = j = i + sublen;
3416 } else
3417 i++;
3418 }
3419 if (j <= len) {
3420 SPLIT_APPEND(self->str, j, len);
3421 }
3422 return list;
3423
3424 onError:
3425 Py_DECREF(list);
3426 return NULL;
3427}
3428
3429#undef SPLIT_APPEND
3430
3431static
3432PyObject *split(PyUnicodeObject *self,
3433 PyUnicodeObject *substring,
3434 int maxcount)
3435{
3436 PyObject *list;
3437
3438 if (maxcount < 0)
3439 maxcount = INT_MAX;
3440
3441 list = PyList_New(0);
3442 if (!list)
3443 return NULL;
3444
3445 if (substring == NULL)
3446 return split_whitespace(self,list,maxcount);
3447
3448 else if (substring->length == 1)
3449 return split_char(self,list,substring->str[0],maxcount);
3450
3451 else if (substring->length == 0) {
3452 Py_DECREF(list);
3453 PyErr_SetString(PyExc_ValueError, "empty separator");
3454 return NULL;
3455 }
3456 else
3457 return split_substring(self,list,substring,maxcount);
3458}
3459
3460static
3461PyObject *strip(PyUnicodeObject *self,
3462 int left,
3463 int right)
3464{
3465 Py_UNICODE *p = self->str;
3466 int start = 0;
3467 int end = self->length;
3468
3469 if (left)
3470 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3471 start++;
3472
3473 if (right)
3474 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3475 end--;
3476
Tim Peters7a29bd52001-09-12 03:03:31 +00003477 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003478 /* couldn't strip anything off, return original string */
3479 Py_INCREF(self);
3480 return (PyObject*) self;
3481 }
3482
3483 return (PyObject*) PyUnicode_FromUnicode(
3484 self->str + start,
3485 end - start
3486 );
3487}
3488
3489static
3490PyObject *replace(PyUnicodeObject *self,
3491 PyUnicodeObject *str1,
3492 PyUnicodeObject *str2,
3493 int maxcount)
3494{
3495 PyUnicodeObject *u;
3496
3497 if (maxcount < 0)
3498 maxcount = INT_MAX;
3499
3500 if (str1->length == 1 && str2->length == 1) {
3501 int i;
3502
3503 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003504 if (!findchar(self->str, self->length, str1->str[0]) &&
3505 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003506 /* nothing to replace, return original string */
3507 Py_INCREF(self);
3508 u = self;
3509 } else {
3510 Py_UNICODE u1 = str1->str[0];
3511 Py_UNICODE u2 = str2->str[0];
3512
3513 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003514 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515 self->length
3516 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003517 if (u != NULL) {
3518 Py_UNICODE_COPY(u->str, self->str,
3519 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520 for (i = 0; i < u->length; i++)
3521 if (u->str[i] == u1) {
3522 if (--maxcount < 0)
3523 break;
3524 u->str[i] = u2;
3525 }
3526 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003527 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003528
3529 } else {
3530 int n, i;
3531 Py_UNICODE *p;
3532
3533 /* replace strings */
3534 n = count(self, 0, self->length, str1);
3535 if (n > maxcount)
3536 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003537 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003538 /* nothing to replace, return original string */
3539 Py_INCREF(self);
3540 u = self;
3541 } else {
3542 u = _PyUnicode_New(
3543 self->length + n * (str2->length - str1->length));
3544 if (u) {
3545 i = 0;
3546 p = u->str;
3547 while (i <= self->length - str1->length)
3548 if (Py_UNICODE_MATCH(self, i, str1)) {
3549 /* replace string segment */
3550 Py_UNICODE_COPY(p, str2->str, str2->length);
3551 p += str2->length;
3552 i += str1->length;
3553 if (--n <= 0) {
3554 /* copy remaining part */
3555 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3556 break;
3557 }
3558 } else
3559 *p++ = self->str[i++];
3560 }
3561 }
3562 }
3563
3564 return (PyObject *) u;
3565}
3566
/* --- Unicode Object Methods --------------------------------------------- */
3568
3569static char title__doc__[] =
3570"S.title() -> unicode\n\
3571\n\
3572Return a titlecased version of S, i.e. words start with title case\n\
3573characters, all remaining cased characters have lower case.";
3574
3575static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003576unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003577{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003578 return fixup(self, fixtitle);
3579}
3580
3581static char capitalize__doc__[] =
3582"S.capitalize() -> unicode\n\
3583\n\
3584Return a capitalized version of S, i.e. make the first character\n\
3585have upper case.";
3586
3587static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003588unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 return fixup(self, fixcapitalize);
3591}
3592
3593#if 0
3594static char capwords__doc__[] =
3595"S.capwords() -> unicode\n\
3596\n\
3597Apply .capitalize() to all words in S and return the result with\n\
3598normalized whitespace (all whitespace strings are replaced by ' ').";
3599
3600static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003601unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003602{
3603 PyObject *list;
3604 PyObject *item;
3605 int i;
3606
Guido van Rossumd57fd912000-03-10 22:53:23 +00003607 /* Split into words */
3608 list = split(self, NULL, -1);
3609 if (!list)
3610 return NULL;
3611
3612 /* Capitalize each word */
3613 for (i = 0; i < PyList_GET_SIZE(list); i++) {
3614 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
3615 fixcapitalize);
3616 if (item == NULL)
3617 goto onError;
3618 Py_DECREF(PyList_GET_ITEM(list, i));
3619 PyList_SET_ITEM(list, i, item);
3620 }
3621
3622 /* Join the words to form a new string */
3623 item = PyUnicode_Join(NULL, list);
3624
3625onError:
3626 Py_DECREF(list);
3627 return (PyObject *)item;
3628}
3629#endif
3630
3631static char center__doc__[] =
3632"S.center(width) -> unicode\n\
3633\n\
3634Return S centered in a Unicode string of length width. Padding is done\n\
3635using spaces.";
3636
3637static PyObject *
3638unicode_center(PyUnicodeObject *self, PyObject *args)
3639{
3640 int marg, left;
3641 int width;
3642
3643 if (!PyArg_ParseTuple(args, "i:center", &width))
3644 return NULL;
3645
Tim Peters7a29bd52001-09-12 03:03:31 +00003646 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003647 Py_INCREF(self);
3648 return (PyObject*) self;
3649 }
3650
3651 marg = width - self->length;
3652 left = marg / 2 + (marg & width & 1);
3653
3654 return (PyObject*) pad(self, left, marg - left, ' ');
3655}
3656
Marc-André Lemburge5034372000-08-08 08:04:29 +00003657#if 0
3658
3659/* This code should go into some future Unicode collation support
3660 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003661 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003662
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003663/* speedy UTF-16 code point order comparison */
3664/* gleaned from: */
3665/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3666
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003667static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003668{
3669 0, 0, 0, 0, 0, 0, 0, 0,
3670 0, 0, 0, 0, 0, 0, 0, 0,
3671 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003672 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003673};
3674
Guido van Rossumd57fd912000-03-10 22:53:23 +00003675static int
3676unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3677{
3678 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003679
Guido van Rossumd57fd912000-03-10 22:53:23 +00003680 Py_UNICODE *s1 = str1->str;
3681 Py_UNICODE *s2 = str2->str;
3682
3683 len1 = str1->length;
3684 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003685
Guido van Rossumd57fd912000-03-10 22:53:23 +00003686 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003687 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003688
3689 c1 = *s1++;
3690 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003691
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003692 if (c1 > (1<<11) * 26)
3693 c1 += utf16Fixup[c1>>11];
3694 if (c2 > (1<<11) * 26)
3695 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003696 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003697
3698 if (c1 != c2)
3699 return (c1 < c2) ? -1 : 1;
3700
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003701 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003702 }
3703
3704 return (len1 < len2) ? -1 : (len1 != len2);
3705}
3706
Marc-André Lemburge5034372000-08-08 08:04:29 +00003707#else
3708
3709static int
3710unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3711{
3712 register int len1, len2;
3713
3714 Py_UNICODE *s1 = str1->str;
3715 Py_UNICODE *s2 = str2->str;
3716
3717 len1 = str1->length;
3718 len2 = str2->length;
3719
3720 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003721 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003722
Fredrik Lundh45714e92001-06-26 16:39:36 +00003723 c1 = *s1++;
3724 c2 = *s2++;
3725
3726 if (c1 != c2)
3727 return (c1 < c2) ? -1 : 1;
3728
Marc-André Lemburge5034372000-08-08 08:04:29 +00003729 len1--; len2--;
3730 }
3731
3732 return (len1 < len2) ? -1 : (len1 != len2);
3733}
3734
3735#endif
3736
Guido van Rossumd57fd912000-03-10 22:53:23 +00003737int PyUnicode_Compare(PyObject *left,
3738 PyObject *right)
3739{
3740 PyUnicodeObject *u = NULL, *v = NULL;
3741 int result;
3742
3743 /* Coerce the two arguments */
3744 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3745 if (u == NULL)
3746 goto onError;
3747 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3748 if (v == NULL)
3749 goto onError;
3750
Thomas Wouters7e474022000-07-16 12:04:32 +00003751 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752 if (v == u) {
3753 Py_DECREF(u);
3754 Py_DECREF(v);
3755 return 0;
3756 }
3757
3758 result = unicode_compare(u, v);
3759
3760 Py_DECREF(u);
3761 Py_DECREF(v);
3762 return result;
3763
3764onError:
3765 Py_XDECREF(u);
3766 Py_XDECREF(v);
3767 return -1;
3768}
3769
Guido van Rossum403d68b2000-03-13 15:55:09 +00003770int PyUnicode_Contains(PyObject *container,
3771 PyObject *element)
3772{
3773 PyUnicodeObject *u = NULL, *v = NULL;
3774 int result;
3775 register const Py_UNICODE *p, *e;
3776 register Py_UNICODE ch;
3777
3778 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003779 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003780 if (v == NULL) {
3781 PyErr_SetString(PyExc_TypeError,
3782 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003783 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003784 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003785 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3786 if (u == NULL) {
3787 Py_DECREF(v);
3788 goto onError;
3789 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003790
3791 /* Check v in u */
3792 if (PyUnicode_GET_SIZE(v) != 1) {
3793 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003794 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003795 goto onError;
3796 }
3797 ch = *PyUnicode_AS_UNICODE(v);
3798 p = PyUnicode_AS_UNICODE(u);
3799 e = p + PyUnicode_GET_SIZE(u);
3800 result = 0;
3801 while (p < e) {
3802 if (*p++ == ch) {
3803 result = 1;
3804 break;
3805 }
3806 }
3807
3808 Py_DECREF(u);
3809 Py_DECREF(v);
3810 return result;
3811
3812onError:
3813 Py_XDECREF(u);
3814 Py_XDECREF(v);
3815 return -1;
3816}
3817
Guido van Rossumd57fd912000-03-10 22:53:23 +00003818/* Concat to string or Unicode object giving a new Unicode object. */
3819
3820PyObject *PyUnicode_Concat(PyObject *left,
3821 PyObject *right)
3822{
3823 PyUnicodeObject *u = NULL, *v = NULL, *w;
3824
3825 /* Coerce the two arguments */
3826 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3827 if (u == NULL)
3828 goto onError;
3829 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3830 if (v == NULL)
3831 goto onError;
3832
3833 /* Shortcuts */
3834 if (v == unicode_empty) {
3835 Py_DECREF(v);
3836 return (PyObject *)u;
3837 }
3838 if (u == unicode_empty) {
3839 Py_DECREF(u);
3840 return (PyObject *)v;
3841 }
3842
3843 /* Concat the two Unicode strings */
3844 w = _PyUnicode_New(u->length + v->length);
3845 if (w == NULL)
3846 goto onError;
3847 Py_UNICODE_COPY(w->str, u->str, u->length);
3848 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3849
3850 Py_DECREF(u);
3851 Py_DECREF(v);
3852 return (PyObject *)w;
3853
3854onError:
3855 Py_XDECREF(u);
3856 Py_XDECREF(v);
3857 return NULL;
3858}
3859
3860static char count__doc__[] =
3861"S.count(sub[, start[, end]]) -> int\n\
3862\n\
3863Return the number of occurrences of substring sub in Unicode string\n\
3864S[start:end]. Optional arguments start and end are\n\
3865interpreted as in slice notation.";
3866
3867static PyObject *
3868unicode_count(PyUnicodeObject *self, PyObject *args)
3869{
3870 PyUnicodeObject *substring;
3871 int start = 0;
3872 int end = INT_MAX;
3873 PyObject *result;
3874
Guido van Rossumb8872e62000-05-09 14:14:27 +00003875 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3876 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003877 return NULL;
3878
3879 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3880 (PyObject *)substring);
3881 if (substring == NULL)
3882 return NULL;
3883
Guido van Rossumd57fd912000-03-10 22:53:23 +00003884 if (start < 0)
3885 start += self->length;
3886 if (start < 0)
3887 start = 0;
3888 if (end > self->length)
3889 end = self->length;
3890 if (end < 0)
3891 end += self->length;
3892 if (end < 0)
3893 end = 0;
3894
3895 result = PyInt_FromLong((long) count(self, start, end, substring));
3896
3897 Py_DECREF(substring);
3898 return result;
3899}
3900
3901static char encode__doc__[] =
3902"S.encode([encoding[,errors]]) -> string\n\
3903\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003904Return an encoded string version of S. Default encoding is the current\n\
3905default string encoding. errors may be given to set a different error\n\
3906handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3907a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003908
3909static PyObject *
3910unicode_encode(PyUnicodeObject *self, PyObject *args)
3911{
3912 char *encoding = NULL;
3913 char *errors = NULL;
3914 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3915 return NULL;
3916 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3917}
3918
3919static char expandtabs__doc__[] =
3920"S.expandtabs([tabsize]) -> unicode\n\
3921\n\
3922Return a copy of S where all tab characters are expanded using spaces.\n\
3923If tabsize is not given, a tab size of 8 characters is assumed.";
3924
3925static PyObject*
3926unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3927{
3928 Py_UNICODE *e;
3929 Py_UNICODE *p;
3930 Py_UNICODE *q;
3931 int i, j;
3932 PyUnicodeObject *u;
3933 int tabsize = 8;
3934
3935 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3936 return NULL;
3937
Thomas Wouters7e474022000-07-16 12:04:32 +00003938 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003939 i = j = 0;
3940 e = self->str + self->length;
3941 for (p = self->str; p < e; p++)
3942 if (*p == '\t') {
3943 if (tabsize > 0)
3944 j += tabsize - (j % tabsize);
3945 }
3946 else {
3947 j++;
3948 if (*p == '\n' || *p == '\r') {
3949 i += j;
3950 j = 0;
3951 }
3952 }
3953
3954 /* Second pass: create output string and fill it */
3955 u = _PyUnicode_New(i + j);
3956 if (!u)
3957 return NULL;
3958
3959 j = 0;
3960 q = u->str;
3961
3962 for (p = self->str; p < e; p++)
3963 if (*p == '\t') {
3964 if (tabsize > 0) {
3965 i = tabsize - (j % tabsize);
3966 j += i;
3967 while (i--)
3968 *q++ = ' ';
3969 }
3970 }
3971 else {
3972 j++;
3973 *q++ = *p;
3974 if (*p == '\n' || *p == '\r')
3975 j = 0;
3976 }
3977
3978 return (PyObject*) u;
3979}
3980
3981static char find__doc__[] =
3982"S.find(sub [,start [,end]]) -> int\n\
3983\n\
3984Return the lowest index in S where substring sub is found,\n\
3985such that sub is contained within s[start,end]. Optional\n\
3986arguments start and end are interpreted as in slice notation.\n\
3987\n\
3988Return -1 on failure.";
3989
3990static PyObject *
3991unicode_find(PyUnicodeObject *self, PyObject *args)
3992{
3993 PyUnicodeObject *substring;
3994 int start = 0;
3995 int end = INT_MAX;
3996 PyObject *result;
3997
Guido van Rossumb8872e62000-05-09 14:14:27 +00003998 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3999 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004000 return NULL;
4001 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4002 (PyObject *)substring);
4003 if (substring == NULL)
4004 return NULL;
4005
4006 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4007
4008 Py_DECREF(substring);
4009 return result;
4010}
4011
4012static PyObject *
4013unicode_getitem(PyUnicodeObject *self, int index)
4014{
4015 if (index < 0 || index >= self->length) {
4016 PyErr_SetString(PyExc_IndexError, "string index out of range");
4017 return NULL;
4018 }
4019
4020 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4021}
4022
4023static long
4024unicode_hash(PyUnicodeObject *self)
4025{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004026 /* Since Unicode objects compare equal to their ASCII string
4027 counterparts, they should use the individual character values
4028 as basis for their hash value. This is needed to assure that
4029 strings and Unicode objects behave in the same way as
4030 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031
Fredrik Lundhdde61642000-07-10 18:27:47 +00004032 register int len;
4033 register Py_UNICODE *p;
4034 register long x;
4035
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 if (self->hash != -1)
4037 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004038 len = PyUnicode_GET_SIZE(self);
4039 p = PyUnicode_AS_UNICODE(self);
4040 x = *p << 7;
4041 while (--len >= 0)
4042 x = (1000003*x) ^ *p++;
4043 x ^= PyUnicode_GET_SIZE(self);
4044 if (x == -1)
4045 x = -2;
4046 self->hash = x;
4047 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004048}
4049
4050static char index__doc__[] =
4051"S.index(sub [,start [,end]]) -> int\n\
4052\n\
4053Like S.find() but raise ValueError when the substring is not found.";
4054
4055static PyObject *
4056unicode_index(PyUnicodeObject *self, PyObject *args)
4057{
4058 int result;
4059 PyUnicodeObject *substring;
4060 int start = 0;
4061 int end = INT_MAX;
4062
Guido van Rossumb8872e62000-05-09 14:14:27 +00004063 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4064 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065 return NULL;
4066
4067 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4068 (PyObject *)substring);
4069 if (substring == NULL)
4070 return NULL;
4071
4072 result = findstring(self, substring, start, end, 1);
4073
4074 Py_DECREF(substring);
4075 if (result < 0) {
4076 PyErr_SetString(PyExc_ValueError, "substring not found");
4077 return NULL;
4078 }
4079 return PyInt_FromLong(result);
4080}
4081
4082static char islower__doc__[] =
4083"S.islower() -> int\n\
4084\n\
4085Return 1 if all cased characters in S are lowercase and there is\n\
4086at least one cased character in S, 0 otherwise.";
4087
4088static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004089unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090{
4091 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4092 register const Py_UNICODE *e;
4093 int cased;
4094
Guido van Rossumd57fd912000-03-10 22:53:23 +00004095 /* Shortcut for single character strings */
4096 if (PyUnicode_GET_SIZE(self) == 1)
4097 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
4098
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004099 /* Special case for empty strings */
4100 if (PyString_GET_SIZE(self) == 0)
4101 return PyInt_FromLong(0);
4102
Guido van Rossumd57fd912000-03-10 22:53:23 +00004103 e = p + PyUnicode_GET_SIZE(self);
4104 cased = 0;
4105 for (; p < e; p++) {
4106 register const Py_UNICODE ch = *p;
4107
4108 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4109 return PyInt_FromLong(0);
4110 else if (!cased && Py_UNICODE_ISLOWER(ch))
4111 cased = 1;
4112 }
4113 return PyInt_FromLong(cased);
4114}
4115
4116static char isupper__doc__[] =
4117"S.isupper() -> int\n\
4118\n\
4119Return 1 if all cased characters in S are uppercase and there is\n\
4120at least one cased character in S, 0 otherwise.";
4121
4122static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004123unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124{
4125 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4126 register const Py_UNICODE *e;
4127 int cased;
4128
Guido van Rossumd57fd912000-03-10 22:53:23 +00004129 /* Shortcut for single character strings */
4130 if (PyUnicode_GET_SIZE(self) == 1)
4131 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4132
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004133 /* Special case for empty strings */
4134 if (PyString_GET_SIZE(self) == 0)
4135 return PyInt_FromLong(0);
4136
Guido van Rossumd57fd912000-03-10 22:53:23 +00004137 e = p + PyUnicode_GET_SIZE(self);
4138 cased = 0;
4139 for (; p < e; p++) {
4140 register const Py_UNICODE ch = *p;
4141
4142 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4143 return PyInt_FromLong(0);
4144 else if (!cased && Py_UNICODE_ISUPPER(ch))
4145 cased = 1;
4146 }
4147 return PyInt_FromLong(cased);
4148}
4149
4150static char istitle__doc__[] =
4151"S.istitle() -> int\n\
4152\n\
4153Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4154may only follow uncased characters and lowercase characters only cased\n\
4155ones. Return 0 otherwise.";
4156
4157static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004158unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004159{
4160 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4161 register const Py_UNICODE *e;
4162 int cased, previous_is_cased;
4163
Guido van Rossumd57fd912000-03-10 22:53:23 +00004164 /* Shortcut for single character strings */
4165 if (PyUnicode_GET_SIZE(self) == 1)
4166 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4167 (Py_UNICODE_ISUPPER(*p) != 0));
4168
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004169 /* Special case for empty strings */
4170 if (PyString_GET_SIZE(self) == 0)
4171 return PyInt_FromLong(0);
4172
Guido van Rossumd57fd912000-03-10 22:53:23 +00004173 e = p + PyUnicode_GET_SIZE(self);
4174 cased = 0;
4175 previous_is_cased = 0;
4176 for (; p < e; p++) {
4177 register const Py_UNICODE ch = *p;
4178
4179 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4180 if (previous_is_cased)
4181 return PyInt_FromLong(0);
4182 previous_is_cased = 1;
4183 cased = 1;
4184 }
4185 else if (Py_UNICODE_ISLOWER(ch)) {
4186 if (!previous_is_cased)
4187 return PyInt_FromLong(0);
4188 previous_is_cased = 1;
4189 cased = 1;
4190 }
4191 else
4192 previous_is_cased = 0;
4193 }
4194 return PyInt_FromLong(cased);
4195}
4196
4197static char isspace__doc__[] =
4198"S.isspace() -> int\n\
4199\n\
4200Return 1 if there are only whitespace characters in S,\n\
42010 otherwise.";
4202
4203static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004204unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205{
4206 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4207 register const Py_UNICODE *e;
4208
Guido van Rossumd57fd912000-03-10 22:53:23 +00004209 /* Shortcut for single character strings */
4210 if (PyUnicode_GET_SIZE(self) == 1 &&
4211 Py_UNICODE_ISSPACE(*p))
4212 return PyInt_FromLong(1);
4213
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004214 /* Special case for empty strings */
4215 if (PyString_GET_SIZE(self) == 0)
4216 return PyInt_FromLong(0);
4217
Guido van Rossumd57fd912000-03-10 22:53:23 +00004218 e = p + PyUnicode_GET_SIZE(self);
4219 for (; p < e; p++) {
4220 if (!Py_UNICODE_ISSPACE(*p))
4221 return PyInt_FromLong(0);
4222 }
4223 return PyInt_FromLong(1);
4224}
4225
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004226static char isalpha__doc__[] =
4227"S.isalpha() -> int\n\
4228\n\
4229Return 1 if all characters in S are alphabetic\n\
4230and there is at least one character in S, 0 otherwise.";
4231
4232static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004233unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004234{
4235 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4236 register const Py_UNICODE *e;
4237
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004238 /* Shortcut for single character strings */
4239 if (PyUnicode_GET_SIZE(self) == 1 &&
4240 Py_UNICODE_ISALPHA(*p))
4241 return PyInt_FromLong(1);
4242
4243 /* Special case for empty strings */
4244 if (PyString_GET_SIZE(self) == 0)
4245 return PyInt_FromLong(0);
4246
4247 e = p + PyUnicode_GET_SIZE(self);
4248 for (; p < e; p++) {
4249 if (!Py_UNICODE_ISALPHA(*p))
4250 return PyInt_FromLong(0);
4251 }
4252 return PyInt_FromLong(1);
4253}
4254
4255static char isalnum__doc__[] =
4256"S.isalnum() -> int\n\
4257\n\
4258Return 1 if all characters in S are alphanumeric\n\
4259and there is at least one character in S, 0 otherwise.";
4260
4261static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004262unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004263{
4264 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4265 register const Py_UNICODE *e;
4266
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004267 /* Shortcut for single character strings */
4268 if (PyUnicode_GET_SIZE(self) == 1 &&
4269 Py_UNICODE_ISALNUM(*p))
4270 return PyInt_FromLong(1);
4271
4272 /* Special case for empty strings */
4273 if (PyString_GET_SIZE(self) == 0)
4274 return PyInt_FromLong(0);
4275
4276 e = p + PyUnicode_GET_SIZE(self);
4277 for (; p < e; p++) {
4278 if (!Py_UNICODE_ISALNUM(*p))
4279 return PyInt_FromLong(0);
4280 }
4281 return PyInt_FromLong(1);
4282}
4283
Guido van Rossumd57fd912000-03-10 22:53:23 +00004284static char isdecimal__doc__[] =
4285"S.isdecimal() -> int\n\
4286\n\
4287Return 1 if there are only decimal characters in S,\n\
42880 otherwise.";
4289
4290static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004291unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292{
4293 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4294 register const Py_UNICODE *e;
4295
Guido van Rossumd57fd912000-03-10 22:53:23 +00004296 /* Shortcut for single character strings */
4297 if (PyUnicode_GET_SIZE(self) == 1 &&
4298 Py_UNICODE_ISDECIMAL(*p))
4299 return PyInt_FromLong(1);
4300
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004301 /* Special case for empty strings */
4302 if (PyString_GET_SIZE(self) == 0)
4303 return PyInt_FromLong(0);
4304
Guido van Rossumd57fd912000-03-10 22:53:23 +00004305 e = p + PyUnicode_GET_SIZE(self);
4306 for (; p < e; p++) {
4307 if (!Py_UNICODE_ISDECIMAL(*p))
4308 return PyInt_FromLong(0);
4309 }
4310 return PyInt_FromLong(1);
4311}
4312
4313static char isdigit__doc__[] =
4314"S.isdigit() -> int\n\
4315\n\
4316Return 1 if there are only digit characters in S,\n\
43170 otherwise.";
4318
4319static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004320unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004321{
4322 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4323 register const Py_UNICODE *e;
4324
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325 /* Shortcut for single character strings */
4326 if (PyUnicode_GET_SIZE(self) == 1 &&
4327 Py_UNICODE_ISDIGIT(*p))
4328 return PyInt_FromLong(1);
4329
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004330 /* Special case for empty strings */
4331 if (PyString_GET_SIZE(self) == 0)
4332 return PyInt_FromLong(0);
4333
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334 e = p + PyUnicode_GET_SIZE(self);
4335 for (; p < e; p++) {
4336 if (!Py_UNICODE_ISDIGIT(*p))
4337 return PyInt_FromLong(0);
4338 }
4339 return PyInt_FromLong(1);
4340}
4341
4342static char isnumeric__doc__[] =
4343"S.isnumeric() -> int\n\
4344\n\
4345Return 1 if there are only numeric characters in S,\n\
43460 otherwise.";
4347
4348static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004349unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004350{
4351 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4352 register const Py_UNICODE *e;
4353
Guido van Rossumd57fd912000-03-10 22:53:23 +00004354 /* Shortcut for single character strings */
4355 if (PyUnicode_GET_SIZE(self) == 1 &&
4356 Py_UNICODE_ISNUMERIC(*p))
4357 return PyInt_FromLong(1);
4358
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004359 /* Special case for empty strings */
4360 if (PyString_GET_SIZE(self) == 0)
4361 return PyInt_FromLong(0);
4362
Guido van Rossumd57fd912000-03-10 22:53:23 +00004363 e = p + PyUnicode_GET_SIZE(self);
4364 for (; p < e; p++) {
4365 if (!Py_UNICODE_ISNUMERIC(*p))
4366 return PyInt_FromLong(0);
4367 }
4368 return PyInt_FromLong(1);
4369}
4370
4371static char join__doc__[] =
4372"S.join(sequence) -> unicode\n\
4373\n\
4374Return a string which is the concatenation of the strings in the\n\
4375sequence. The separator between elements is S.";
4376
4377static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004378unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004379{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004380 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004381}
4382
4383static int
4384unicode_length(PyUnicodeObject *self)
4385{
4386 return self->length;
4387}
4388
4389static char ljust__doc__[] =
4390"S.ljust(width) -> unicode\n\
4391\n\
4392Return S left justified in a Unicode string of length width. Padding is\n\
4393done using spaces.";
4394
4395static PyObject *
4396unicode_ljust(PyUnicodeObject *self, PyObject *args)
4397{
4398 int width;
4399 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4400 return NULL;
4401
Tim Peters7a29bd52001-09-12 03:03:31 +00004402 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004403 Py_INCREF(self);
4404 return (PyObject*) self;
4405 }
4406
4407 return (PyObject*) pad(self, 0, width - self->length, ' ');
4408}
4409
4410static char lower__doc__[] =
4411"S.lower() -> unicode\n\
4412\n\
4413Return a copy of the string S converted to lowercase.";
4414
4415static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004416unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004417{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004418 return fixup(self, fixlower);
4419}
4420
4421static char lstrip__doc__[] =
4422"S.lstrip() -> unicode\n\
4423\n\
4424Return a copy of the string S with leading whitespace removed.";
4425
4426static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004427unicode_lstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004428{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004429 return strip(self, 1, 0);
4430}
4431
4432static PyObject*
4433unicode_repeat(PyUnicodeObject *str, int len)
4434{
4435 PyUnicodeObject *u;
4436 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004437 int nchars;
4438 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439
4440 if (len < 0)
4441 len = 0;
4442
Tim Peters7a29bd52001-09-12 03:03:31 +00004443 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004444 /* no repeat, return original string */
4445 Py_INCREF(str);
4446 return (PyObject*) str;
4447 }
Tim Peters8f422462000-09-09 06:13:41 +00004448
4449 /* ensure # of chars needed doesn't overflow int and # of bytes
4450 * needed doesn't overflow size_t
4451 */
4452 nchars = len * str->length;
4453 if (len && nchars / len != str->length) {
4454 PyErr_SetString(PyExc_OverflowError,
4455 "repeated string is too long");
4456 return NULL;
4457 }
4458 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4459 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4460 PyErr_SetString(PyExc_OverflowError,
4461 "repeated string is too long");
4462 return NULL;
4463 }
4464 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004465 if (!u)
4466 return NULL;
4467
4468 p = u->str;
4469
4470 while (len-- > 0) {
4471 Py_UNICODE_COPY(p, str->str, str->length);
4472 p += str->length;
4473 }
4474
4475 return (PyObject*) u;
4476}
4477
4478PyObject *PyUnicode_Replace(PyObject *obj,
4479 PyObject *subobj,
4480 PyObject *replobj,
4481 int maxcount)
4482{
4483 PyObject *self;
4484 PyObject *str1;
4485 PyObject *str2;
4486 PyObject *result;
4487
4488 self = PyUnicode_FromObject(obj);
4489 if (self == NULL)
4490 return NULL;
4491 str1 = PyUnicode_FromObject(subobj);
4492 if (str1 == NULL) {
4493 Py_DECREF(self);
4494 return NULL;
4495 }
4496 str2 = PyUnicode_FromObject(replobj);
4497 if (str2 == NULL) {
4498 Py_DECREF(self);
4499 Py_DECREF(str1);
4500 return NULL;
4501 }
4502 result = replace((PyUnicodeObject *)self,
4503 (PyUnicodeObject *)str1,
4504 (PyUnicodeObject *)str2,
4505 maxcount);
4506 Py_DECREF(self);
4507 Py_DECREF(str1);
4508 Py_DECREF(str2);
4509 return result;
4510}
4511
4512static char replace__doc__[] =
4513"S.replace (old, new[, maxsplit]) -> unicode\n\
4514\n\
4515Return a copy of S with all occurrences of substring\n\
4516old replaced by new. If the optional argument maxsplit is\n\
4517given, only the first maxsplit occurrences are replaced.";
4518
4519static PyObject*
4520unicode_replace(PyUnicodeObject *self, PyObject *args)
4521{
4522 PyUnicodeObject *str1;
4523 PyUnicodeObject *str2;
4524 int maxcount = -1;
4525 PyObject *result;
4526
4527 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4528 return NULL;
4529 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4530 if (str1 == NULL)
4531 return NULL;
4532 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4533 if (str2 == NULL)
4534 return NULL;
4535
4536 result = replace(self, str1, str2, maxcount);
4537
4538 Py_DECREF(str1);
4539 Py_DECREF(str2);
4540 return result;
4541}
4542
4543static
4544PyObject *unicode_repr(PyObject *unicode)
4545{
4546 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4547 PyUnicode_GET_SIZE(unicode),
4548 1);
4549}
4550
4551static char rfind__doc__[] =
4552"S.rfind(sub [,start [,end]]) -> int\n\
4553\n\
4554Return the highest index in S where substring sub is found,\n\
4555such that sub is contained within s[start,end]. Optional\n\
4556arguments start and end are interpreted as in slice notation.\n\
4557\n\
4558Return -1 on failure.";
4559
4560static PyObject *
4561unicode_rfind(PyUnicodeObject *self, PyObject *args)
4562{
4563 PyUnicodeObject *substring;
4564 int start = 0;
4565 int end = INT_MAX;
4566 PyObject *result;
4567
Guido van Rossumb8872e62000-05-09 14:14:27 +00004568 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4569 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004570 return NULL;
4571 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4572 (PyObject *)substring);
4573 if (substring == NULL)
4574 return NULL;
4575
4576 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4577
4578 Py_DECREF(substring);
4579 return result;
4580}
4581
4582static char rindex__doc__[] =
4583"S.rindex(sub [,start [,end]]) -> int\n\
4584\n\
4585Like S.rfind() but raise ValueError when the substring is not found.";
4586
4587static PyObject *
4588unicode_rindex(PyUnicodeObject *self, PyObject *args)
4589{
4590 int result;
4591 PyUnicodeObject *substring;
4592 int start = 0;
4593 int end = INT_MAX;
4594
Guido van Rossumb8872e62000-05-09 14:14:27 +00004595 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4596 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597 return NULL;
4598 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4599 (PyObject *)substring);
4600 if (substring == NULL)
4601 return NULL;
4602
4603 result = findstring(self, substring, start, end, -1);
4604
4605 Py_DECREF(substring);
4606 if (result < 0) {
4607 PyErr_SetString(PyExc_ValueError, "substring not found");
4608 return NULL;
4609 }
4610 return PyInt_FromLong(result);
4611}
4612
4613static char rjust__doc__[] =
4614"S.rjust(width) -> unicode\n\
4615\n\
4616Return S right justified in a Unicode string of length width. Padding is\n\
4617done using spaces.";
4618
4619static PyObject *
4620unicode_rjust(PyUnicodeObject *self, PyObject *args)
4621{
4622 int width;
4623 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4624 return NULL;
4625
Tim Peters7a29bd52001-09-12 03:03:31 +00004626 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627 Py_INCREF(self);
4628 return (PyObject*) self;
4629 }
4630
4631 return (PyObject*) pad(self, width - self->length, 0, ' ');
4632}
4633
4634static char rstrip__doc__[] =
4635"S.rstrip() -> unicode\n\
4636\n\
4637Return a copy of the string S with trailing whitespace removed.";
4638
4639static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004640unicode_rstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004641{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004642 return strip(self, 0, 1);
4643}
4644
4645static PyObject*
4646unicode_slice(PyUnicodeObject *self, int start, int end)
4647{
4648 /* standard clamping */
4649 if (start < 0)
4650 start = 0;
4651 if (end < 0)
4652 end = 0;
4653 if (end > self->length)
4654 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004655 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004656 /* full slice, return original string */
4657 Py_INCREF(self);
4658 return (PyObject*) self;
4659 }
4660 if (start > end)
4661 start = end;
4662 /* copy slice */
4663 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4664 end - start);
4665}
4666
4667PyObject *PyUnicode_Split(PyObject *s,
4668 PyObject *sep,
4669 int maxsplit)
4670{
4671 PyObject *result;
4672
4673 s = PyUnicode_FromObject(s);
4674 if (s == NULL)
4675 return NULL;
4676 if (sep != NULL) {
4677 sep = PyUnicode_FromObject(sep);
4678 if (sep == NULL) {
4679 Py_DECREF(s);
4680 return NULL;
4681 }
4682 }
4683
4684 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4685
4686 Py_DECREF(s);
4687 Py_XDECREF(sep);
4688 return result;
4689}
4690
4691static char split__doc__[] =
4692"S.split([sep [,maxsplit]]) -> list of strings\n\
4693\n\
4694Return a list of the words in S, using sep as the\n\
4695delimiter string. If maxsplit is given, at most maxsplit\n\
4696splits are done. If sep is not specified, any whitespace string\n\
4697is a separator.";
4698
4699static PyObject*
4700unicode_split(PyUnicodeObject *self, PyObject *args)
4701{
4702 PyObject *substring = Py_None;
4703 int maxcount = -1;
4704
4705 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4706 return NULL;
4707
4708 if (substring == Py_None)
4709 return split(self, NULL, maxcount);
4710 else if (PyUnicode_Check(substring))
4711 return split(self, (PyUnicodeObject *)substring, maxcount);
4712 else
4713 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4714}
4715
4716static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004717"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718\n\
4719Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004720Line breaks are not included in the resulting list unless keepends\n\
4721is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004722
4723static PyObject*
4724unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4725{
Guido van Rossum86662912000-04-11 15:38:46 +00004726 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004727
Guido van Rossum86662912000-04-11 15:38:46 +00004728 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004729 return NULL;
4730
Guido van Rossum86662912000-04-11 15:38:46 +00004731 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732}
4733
4734static
4735PyObject *unicode_str(PyUnicodeObject *self)
4736{
Fred Drakee4315f52000-05-09 19:53:39 +00004737 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738}
4739
4740static char strip__doc__[] =
4741"S.strip() -> unicode\n\
4742\n\
4743Return a copy of S with leading and trailing whitespace removed.";
4744
4745static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004746unicode_strip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748 return strip(self, 1, 1);
4749}
4750
4751static char swapcase__doc__[] =
4752"S.swapcase() -> unicode\n\
4753\n\
4754Return a copy of S with uppercase characters converted to lowercase\n\
4755and vice versa.";
4756
4757static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004758unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760 return fixup(self, fixswapcase);
4761}
4762
4763static char translate__doc__[] =
4764"S.translate(table) -> unicode\n\
4765\n\
4766Return a copy of the string S, where all characters have been mapped\n\
4767through the given translation table, which must be a mapping of\n\
4768Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4769are left untouched. Characters mapped to None are deleted.";
4770
4771static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004772unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774 return PyUnicode_TranslateCharmap(self->str,
4775 self->length,
4776 table,
4777 "ignore");
4778}
4779
4780static char upper__doc__[] =
4781"S.upper() -> unicode\n\
4782\n\
4783Return a copy of S converted to uppercase.";
4784
4785static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004786unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004787{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788 return fixup(self, fixupper);
4789}
4790
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n"
"\n"
"Pad a numeric string x with zeros on the left, to fill a field\n"
"of the specified width. The string x is never truncated.";

/* S.zfill(width) -- currently compiled out.
   Left-pads S with '0' up to width via pad().  If the first original
   character is '+' or '-', it is moved in front of the padding.
   Returns S itself (new reference) when it is already wide enough,
   or NULL when argument parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');

    /* The result is dereferenced below, so bail out if pad() failed. */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4826
#if 0
/* Debugging aid (compiled out): expose the current value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long size = unicode_freelist_size;

    return PyInt_FromLong(size);
}
#endif
4834
4835static char startswith__doc__[] =
4836"S.startswith(prefix[, start[, end]]) -> int\n\
4837\n\
4838Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4839optional start, test S beginning at that position. With optional end, stop\n\
4840comparing S at that position.";
4841
4842static PyObject *
4843unicode_startswith(PyUnicodeObject *self,
4844 PyObject *args)
4845{
4846 PyUnicodeObject *substring;
4847 int start = 0;
4848 int end = INT_MAX;
4849 PyObject *result;
4850
Guido van Rossumb8872e62000-05-09 14:14:27 +00004851 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4852 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004853 return NULL;
4854 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4855 (PyObject *)substring);
4856 if (substring == NULL)
4857 return NULL;
4858
4859 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4860
4861 Py_DECREF(substring);
4862 return result;
4863}
4864
4865
4866static char endswith__doc__[] =
4867"S.endswith(suffix[, start[, end]]) -> int\n\
4868\n\
4869Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4870optional start, test S beginning at that position. With optional end, stop\n\
4871comparing S at that position.";
4872
4873static PyObject *
4874unicode_endswith(PyUnicodeObject *self,
4875 PyObject *args)
4876{
4877 PyUnicodeObject *substring;
4878 int start = 0;
4879 int end = INT_MAX;
4880 PyObject *result;
4881
Guido van Rossumb8872e62000-05-09 14:14:27 +00004882 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4883 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884 return NULL;
4885 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4886 (PyObject *)substring);
4887 if (substring == NULL)
4888 return NULL;
4889
4890 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4891
4892 Py_DECREF(substring);
4893 return result;
4894}
4895
4896
4897static PyMethodDef unicode_methods[] = {
4898
4899 /* Order is according to common usage: often used methods should
4900 appear first, since lookup is done sequentially. */
4901
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004902 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4903 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4904 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4905 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4906 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4907 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4908 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4909 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4910 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4911 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4912 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4913 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4914 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4915 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4916/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4917 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4918 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4919 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4920 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4921 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4922 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4923 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4924 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4925 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4926 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4927 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4928 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4929 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4930 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4931 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4932 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4933 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4934 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4935 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4936 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004937#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004938 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
4939 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004940#endif
4941
4942#if 0
4943 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004944 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004945#endif
4946
4947 {NULL, NULL}
4948};
4949
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950static PySequenceMethods unicode_as_sequence = {
4951 (inquiry) unicode_length, /* sq_length */
4952 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4953 (intargfunc) unicode_repeat, /* sq_repeat */
4954 (intargfunc) unicode_getitem, /* sq_item */
4955 (intintargfunc) unicode_slice, /* sq_slice */
4956 0, /* sq_ass_item */
4957 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004958 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959};
4960
4961static int
4962unicode_buffer_getreadbuf(PyUnicodeObject *self,
4963 int index,
4964 const void **ptr)
4965{
4966 if (index != 0) {
4967 PyErr_SetString(PyExc_SystemError,
4968 "accessing non-existent unicode segment");
4969 return -1;
4970 }
4971 *ptr = (void *) self->str;
4972 return PyUnicode_GET_DATA_SIZE(self);
4973}
4974
4975static int
4976unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4977 const void **ptr)
4978{
4979 PyErr_SetString(PyExc_TypeError,
4980 "cannot use unicode as modifyable buffer");
4981 return -1;
4982}
4983
4984static int
4985unicode_buffer_getsegcount(PyUnicodeObject *self,
4986 int *lenp)
4987{
4988 if (lenp)
4989 *lenp = PyUnicode_GET_DATA_SIZE(self);
4990 return 1;
4991}
4992
4993static int
4994unicode_buffer_getcharbuf(PyUnicodeObject *self,
4995 int index,
4996 const void **ptr)
4997{
4998 PyObject *str;
4999
5000 if (index != 0) {
5001 PyErr_SetString(PyExc_SystemError,
5002 "accessing non-existent unicode segment");
5003 return -1;
5004 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005005 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006 if (str == NULL)
5007 return -1;
5008 *ptr = (void *) PyString_AS_STRING(str);
5009 return PyString_GET_SIZE(str);
5010}
5011
5012/* Helpers for PyUnicode_Format() */
5013
5014static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005015getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016{
5017 int argidx = *p_argidx;
5018 if (argidx < arglen) {
5019 (*p_argidx)++;
5020 if (arglen < 0)
5021 return args;
5022 else
5023 return PyTuple_GetItem(args, argidx);
5024 }
5025 PyErr_SetString(PyExc_TypeError,
5026 "not enough arguments for format string");
5027 return NULL;
5028}
5029
/* Bit flags collected while parsing a format directive's flag
   characters ('-', '+', ' ', '#', '0' respectively). */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
5035
5036static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005038{
5039 register int i;
5040 int len;
5041 va_list va;
5042 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005043 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005044
5045 /* First, format the string as char array, then expand to Py_UNICODE
5046 array. */
5047 charbuffer = (char *)buffer;
5048 len = vsprintf(charbuffer, format, va);
5049 for (i = len - 1; i >= 0; i--)
5050 buffer[i] = (Py_UNICODE) charbuffer[i];
5051
5052 va_end(va);
5053 return len;
5054}
5055
5056static int
5057formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005058 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005059 int flags,
5060 int prec,
5061 int type,
5062 PyObject *v)
5063{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005064 /* fmt = '%#.' + `prec` + `type`
5065 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066 char fmt[20];
5067 double x;
5068
5069 x = PyFloat_AsDouble(v);
5070 if (x == -1.0 && PyErr_Occurred())
5071 return -1;
5072 if (prec < 0)
5073 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5075 type = 'g';
5076 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005077 /* worst case length calc to ensure no buffer overrun:
5078 fmt = %#.<prec>g
5079 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5080 for any double rep.)
5081 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5082 If prec=0 the effective precision is 1 (the leading digit is
5083 always given), therefore increase by one to 10+prec. */
5084 if (buflen <= (size_t)10 + (size_t)prec) {
5085 PyErr_SetString(PyExc_OverflowError,
5086 "formatted float is too long (precision too long?)");
5087 return -1;
5088 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005089 return usprintf(buf, fmt, x);
5090}
5091
Tim Peters38fd5b62000-09-21 05:43:11 +00005092static PyObject*
5093formatlong(PyObject *val, int flags, int prec, int type)
5094{
5095 char *buf;
5096 int i, len;
5097 PyObject *str; /* temporary string object. */
5098 PyUnicodeObject *result;
5099
5100 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5101 if (!str)
5102 return NULL;
5103 result = _PyUnicode_New(len);
5104 for (i = 0; i < len; i++)
5105 result->str[i] = buf[i];
5106 result->str[len] = 0;
5107 Py_DECREF(str);
5108 return (PyObject*)result;
5109}
5110
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111static int
5112formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005113 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114 int flags,
5115 int prec,
5116 int type,
5117 PyObject *v)
5118{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005119 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00005120 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5121 + 1 + 1 = 24*/
5122 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005123 long x;
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005124 int use_native_c_format = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005125
5126 x = PyInt_AsLong(v);
5127 if (x == -1 && PyErr_Occurred())
5128 return -1;
5129 if (prec < 0)
5130 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005131 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5132 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
5133 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
5134 PyErr_SetString(PyExc_OverflowError,
5135 "formatted integer is too long (precision too long?)");
5136 return -1;
5137 }
Tim Petersfff53252001-04-12 18:38:48 +00005138 /* When converting 0 under %#x or %#X, C leaves off the base marker,
5139 * but we want it (for consistency with other %#x conversions, and
5140 * for consistency with Python's hex() function).
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005141 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
5142 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
5143 * So add it only if the platform doesn't already.
Tim Petersfff53252001-04-12 18:38:48 +00005144 */
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005145 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
5146 /* Only way to know what the platform does is to try it. */
5147 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
5148 if (fmt[1] != (char)type) {
5149 /* Supply our own leading 0x/0X -- needed under std C */
5150 use_native_c_format = 0;
5151 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
5152 }
5153 }
5154 if (use_native_c_format)
5155 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156 return usprintf(buf, fmt, x);
5157}
5158
5159static int
5160formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005161 size_t buflen,
5162 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005164 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005165 if (PyUnicode_Check(v)) {
5166 if (PyUnicode_GET_SIZE(v) != 1)
5167 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005171 else if (PyString_Check(v)) {
5172 if (PyString_GET_SIZE(v) != 1)
5173 goto onError;
5174 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5175 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176
5177 else {
5178 /* Integer input truncated to a character */
5179 long x;
5180 x = PyInt_AsLong(v);
5181 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005182 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 buf[0] = (char) x;
5184 }
5185 buf[1] = '\0';
5186 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005187
5188 onError:
5189 PyErr_SetString(PyExc_TypeError,
5190 "%c requires int or char");
5191 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192}
5193
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that it behaves as a single
   operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
5203
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204PyObject *PyUnicode_Format(PyObject *format,
5205 PyObject *args)
5206{
5207 Py_UNICODE *fmt, *res;
5208 int fmtcnt, rescnt, reslen, arglen, argidx;
5209 int args_owned = 0;
5210 PyUnicodeObject *result = NULL;
5211 PyObject *dict = NULL;
5212 PyObject *uformat;
5213
5214 if (format == NULL || args == NULL) {
5215 PyErr_BadInternalCall();
5216 return NULL;
5217 }
5218 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005219 if (uformat == NULL)
5220 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005221 fmt = PyUnicode_AS_UNICODE(uformat);
5222 fmtcnt = PyUnicode_GET_SIZE(uformat);
5223
5224 reslen = rescnt = fmtcnt + 100;
5225 result = _PyUnicode_New(reslen);
5226 if (result == NULL)
5227 goto onError;
5228 res = PyUnicode_AS_UNICODE(result);
5229
5230 if (PyTuple_Check(args)) {
5231 arglen = PyTuple_Size(args);
5232 argidx = 0;
5233 }
5234 else {
5235 arglen = -1;
5236 argidx = -2;
5237 }
5238 if (args->ob_type->tp_as_mapping)
5239 dict = args;
5240
5241 while (--fmtcnt >= 0) {
5242 if (*fmt != '%') {
5243 if (--rescnt < 0) {
5244 rescnt = fmtcnt + 100;
5245 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005246 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005247 return NULL;
5248 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5249 --rescnt;
5250 }
5251 *res++ = *fmt++;
5252 }
5253 else {
5254 /* Got a format specifier */
5255 int flags = 0;
5256 int width = -1;
5257 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258 Py_UNICODE c = '\0';
5259 Py_UNICODE fill;
5260 PyObject *v = NULL;
5261 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005262 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263 Py_UNICODE sign;
5264 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005265 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266
5267 fmt++;
5268 if (*fmt == '(') {
5269 Py_UNICODE *keystart;
5270 int keylen;
5271 PyObject *key;
5272 int pcount = 1;
5273
5274 if (dict == NULL) {
5275 PyErr_SetString(PyExc_TypeError,
5276 "format requires a mapping");
5277 goto onError;
5278 }
5279 ++fmt;
5280 --fmtcnt;
5281 keystart = fmt;
5282 /* Skip over balanced parentheses */
5283 while (pcount > 0 && --fmtcnt >= 0) {
5284 if (*fmt == ')')
5285 --pcount;
5286 else if (*fmt == '(')
5287 ++pcount;
5288 fmt++;
5289 }
5290 keylen = fmt - keystart - 1;
5291 if (fmtcnt < 0 || pcount > 0) {
5292 PyErr_SetString(PyExc_ValueError,
5293 "incomplete format key");
5294 goto onError;
5295 }
Fred Drakee4315f52000-05-09 19:53:39 +00005296 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005297 then looked up since Python uses strings to hold
5298 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005299 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005300 key = PyUnicode_EncodeUTF8(keystart,
5301 keylen,
5302 NULL);
5303 if (key == NULL)
5304 goto onError;
5305 if (args_owned) {
5306 Py_DECREF(args);
5307 args_owned = 0;
5308 }
5309 args = PyObject_GetItem(dict, key);
5310 Py_DECREF(key);
5311 if (args == NULL) {
5312 goto onError;
5313 }
5314 args_owned = 1;
5315 arglen = -1;
5316 argidx = -2;
5317 }
5318 while (--fmtcnt >= 0) {
5319 switch (c = *fmt++) {
5320 case '-': flags |= F_LJUST; continue;
5321 case '+': flags |= F_SIGN; continue;
5322 case ' ': flags |= F_BLANK; continue;
5323 case '#': flags |= F_ALT; continue;
5324 case '0': flags |= F_ZERO; continue;
5325 }
5326 break;
5327 }
5328 if (c == '*') {
5329 v = getnextarg(args, arglen, &argidx);
5330 if (v == NULL)
5331 goto onError;
5332 if (!PyInt_Check(v)) {
5333 PyErr_SetString(PyExc_TypeError,
5334 "* wants int");
5335 goto onError;
5336 }
5337 width = PyInt_AsLong(v);
5338 if (width < 0) {
5339 flags |= F_LJUST;
5340 width = -width;
5341 }
5342 if (--fmtcnt >= 0)
5343 c = *fmt++;
5344 }
5345 else if (c >= '0' && c <= '9') {
5346 width = c - '0';
5347 while (--fmtcnt >= 0) {
5348 c = *fmt++;
5349 if (c < '0' || c > '9')
5350 break;
5351 if ((width*10) / 10 != width) {
5352 PyErr_SetString(PyExc_ValueError,
5353 "width too big");
5354 goto onError;
5355 }
5356 width = width*10 + (c - '0');
5357 }
5358 }
5359 if (c == '.') {
5360 prec = 0;
5361 if (--fmtcnt >= 0)
5362 c = *fmt++;
5363 if (c == '*') {
5364 v = getnextarg(args, arglen, &argidx);
5365 if (v == NULL)
5366 goto onError;
5367 if (!PyInt_Check(v)) {
5368 PyErr_SetString(PyExc_TypeError,
5369 "* wants int");
5370 goto onError;
5371 }
5372 prec = PyInt_AsLong(v);
5373 if (prec < 0)
5374 prec = 0;
5375 if (--fmtcnt >= 0)
5376 c = *fmt++;
5377 }
5378 else if (c >= '0' && c <= '9') {
5379 prec = c - '0';
5380 while (--fmtcnt >= 0) {
5381 c = Py_CHARMASK(*fmt++);
5382 if (c < '0' || c > '9')
5383 break;
5384 if ((prec*10) / 10 != prec) {
5385 PyErr_SetString(PyExc_ValueError,
5386 "prec too big");
5387 goto onError;
5388 }
5389 prec = prec*10 + (c - '0');
5390 }
5391 }
5392 } /* prec */
5393 if (fmtcnt >= 0) {
5394 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005395 if (--fmtcnt >= 0)
5396 c = *fmt++;
5397 }
5398 }
5399 if (fmtcnt < 0) {
5400 PyErr_SetString(PyExc_ValueError,
5401 "incomplete format");
5402 goto onError;
5403 }
5404 if (c != '%') {
5405 v = getnextarg(args, arglen, &argidx);
5406 if (v == NULL)
5407 goto onError;
5408 }
5409 sign = 0;
5410 fill = ' ';
5411 switch (c) {
5412
5413 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005414 pbuf = formatbuf;
5415 /* presume that buffer length is at least 1 */
5416 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005417 len = 1;
5418 break;
5419
5420 case 's':
5421 case 'r':
5422 if (PyUnicode_Check(v) && c == 's') {
5423 temp = v;
5424 Py_INCREF(temp);
5425 }
5426 else {
5427 PyObject *unicode;
5428 if (c == 's')
5429 temp = PyObject_Str(v);
5430 else
5431 temp = PyObject_Repr(v);
5432 if (temp == NULL)
5433 goto onError;
5434 if (!PyString_Check(temp)) {
5435 /* XXX Note: this should never happen, since
5436 PyObject_Repr() and PyObject_Str() assure
5437 this */
5438 Py_DECREF(temp);
5439 PyErr_SetString(PyExc_TypeError,
5440 "%s argument has non-string str()");
5441 goto onError;
5442 }
Fred Drakee4315f52000-05-09 19:53:39 +00005443 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005444 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005445 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005446 "strict");
5447 Py_DECREF(temp);
5448 temp = unicode;
5449 if (temp == NULL)
5450 goto onError;
5451 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005452 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453 len = PyUnicode_GET_SIZE(temp);
5454 if (prec >= 0 && len > prec)
5455 len = prec;
5456 break;
5457
5458 case 'i':
5459 case 'd':
5460 case 'u':
5461 case 'o':
5462 case 'x':
5463 case 'X':
5464 if (c == 'i')
5465 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005466 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005467 temp = formatlong(v, flags, prec, c);
5468 if (!temp)
5469 goto onError;
5470 pbuf = PyUnicode_AS_UNICODE(temp);
5471 len = PyUnicode_GET_SIZE(temp);
5472 /* unbounded ints can always produce
5473 a sign character! */
5474 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005475 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005476 else {
5477 pbuf = formatbuf;
5478 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5479 flags, prec, c, v);
5480 if (len < 0)
5481 goto onError;
5482 /* only d conversion is signed */
5483 sign = c == 'd';
5484 }
5485 if (flags & F_ZERO)
5486 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005487 break;
5488
5489 case 'e':
5490 case 'E':
5491 case 'f':
5492 case 'g':
5493 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005494 pbuf = formatbuf;
5495 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5496 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497 if (len < 0)
5498 goto onError;
5499 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005500 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005501 fill = '0';
5502 break;
5503
5504 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005505 pbuf = formatbuf;
5506 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507 if (len < 0)
5508 goto onError;
5509 break;
5510
5511 default:
5512 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005513 "unsupported format character '%c' (0x%x) "
5514 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005515 (31<=c && c<=126) ? c : '?',
5516 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 goto onError;
5518 }
5519 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005520 if (*pbuf == '-' || *pbuf == '+') {
5521 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005522 len--;
5523 }
5524 else if (flags & F_SIGN)
5525 sign = '+';
5526 else if (flags & F_BLANK)
5527 sign = ' ';
5528 else
5529 sign = 0;
5530 }
5531 if (width < len)
5532 width = len;
5533 if (rescnt < width + (sign != 0)) {
5534 reslen -= rescnt;
5535 rescnt = width + fmtcnt + 100;
5536 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005537 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005538 return NULL;
5539 res = PyUnicode_AS_UNICODE(result)
5540 + reslen - rescnt;
5541 }
5542 if (sign) {
5543 if (fill != ' ')
5544 *res++ = sign;
5545 rescnt--;
5546 if (width > len)
5547 width--;
5548 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005549 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5550 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005551 assert(pbuf[1] == c);
5552 if (fill != ' ') {
5553 *res++ = *pbuf++;
5554 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005555 }
Tim Petersfff53252001-04-12 18:38:48 +00005556 rescnt -= 2;
5557 width -= 2;
5558 if (width < 0)
5559 width = 0;
5560 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005561 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005562 if (width > len && !(flags & F_LJUST)) {
5563 do {
5564 --rescnt;
5565 *res++ = fill;
5566 } while (--width > len);
5567 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005568 if (fill == ' ') {
5569 if (sign)
5570 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005571 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005572 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005573 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005574 *res++ = *pbuf++;
5575 *res++ = *pbuf++;
5576 }
5577 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005578 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005579 res += len;
5580 rescnt -= len;
5581 while (--width >= len) {
5582 --rescnt;
5583 *res++ = ' ';
5584 }
5585 if (dict && (argidx < arglen) && c != '%') {
5586 PyErr_SetString(PyExc_TypeError,
5587 "not all arguments converted");
5588 goto onError;
5589 }
5590 Py_XDECREF(temp);
5591 } /* '%' */
5592 } /* until end */
5593 if (argidx < arglen && !dict) {
5594 PyErr_SetString(PyExc_TypeError,
5595 "not all arguments converted");
5596 goto onError;
5597 }
5598
5599 if (args_owned) {
5600 Py_DECREF(args);
5601 }
5602 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005603 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005604 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005605 return (PyObject *)result;
5606
5607 onError:
5608 Py_XDECREF(result);
5609 Py_DECREF(uformat);
5610 if (args_owned) {
5611 Py_DECREF(args);
5612 }
5613 return NULL;
5614}
5615
5616static PyBufferProcs unicode_as_buffer = {
5617 (getreadbufferproc) unicode_buffer_getreadbuf,
5618 (getwritebufferproc) unicode_buffer_getwritebuf,
5619 (getsegcountproc) unicode_buffer_getsegcount,
5620 (getcharbufferproc) unicode_buffer_getcharbuf,
5621};
5622
Guido van Rossume023fe02001-08-30 03:12:59 +00005623staticforward PyObject *
5624unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5625
Tim Peters6d6c1a32001-08-02 04:15:00 +00005626static PyObject *
5627unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5628{
5629 PyObject *x = NULL;
5630 static char *kwlist[] = {"string", "encoding", "errors", 0};
5631 char *encoding = NULL;
5632 char *errors = NULL;
5633
Guido van Rossume023fe02001-08-30 03:12:59 +00005634 if (type != &PyUnicode_Type)
5635 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005636 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5637 kwlist, &x, &encoding, &errors))
5638 return NULL;
5639 if (x == NULL)
5640 return (PyObject *)_PyUnicode_New(0);
5641 return PyUnicode_FromEncodedObject(x, encoding, errors);
5642}
5643
Guido van Rossume023fe02001-08-30 03:12:59 +00005644static PyObject *
5645unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5646{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005647 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005648 int n;
5649
5650 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5651 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5652 if (tmp == NULL)
5653 return NULL;
5654 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005655 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5656 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005657 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005658 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5659 if (pnew->str == NULL) {
5660 _Py_ForgetReference((PyObject *)pnew);
5661 PyObject_DEL(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005662 return NULL;
5663 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005664 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5665 pnew->length = n;
5666 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005667 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005668 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005669}
5670
/* Docstring installed in PyUnicode_Type.tp_doc. */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5677
Guido van Rossumd57fd912000-03-10 22:53:23 +00005678PyTypeObject PyUnicode_Type = {
5679 PyObject_HEAD_INIT(&PyType_Type)
5680 0, /* ob_size */
5681 "unicode", /* tp_name */
5682 sizeof(PyUnicodeObject), /* tp_size */
5683 0, /* tp_itemsize */
5684 /* Slots */
5685 (destructor)_PyUnicode_Free, /* tp_dealloc */
5686 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005687 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688 0, /* tp_setattr */
5689 (cmpfunc) unicode_compare, /* tp_compare */
5690 (reprfunc) unicode_repr, /* tp_repr */
5691 0, /* tp_as_number */
5692 &unicode_as_sequence, /* tp_as_sequence */
5693 0, /* tp_as_mapping */
5694 (hashfunc) unicode_hash, /* tp_hash*/
5695 0, /* tp_call*/
5696 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005697 PyObject_GenericGetAttr, /* tp_getattro */
5698 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005699 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005700 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005701 unicode_doc, /* tp_doc */
5702 0, /* tp_traverse */
5703 0, /* tp_clear */
5704 0, /* tp_richcompare */
5705 0, /* tp_weaklistoffset */
5706 0, /* tp_iter */
5707 0, /* tp_iternext */
5708 unicode_methods, /* tp_methods */
5709 0, /* tp_members */
5710 0, /* tp_getset */
5711 0, /* tp_base */
5712 0, /* tp_dict */
5713 0, /* tp_descr_get */
5714 0, /* tp_descr_set */
5715 0, /* tp_dictoffset */
5716 0, /* tp_init */
5717 0, /* tp_alloc */
5718 unicode_new, /* tp_new */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005719};
5720
5721/* Initialize the Unicode implementation */
5722
Thomas Wouters78890102000-07-22 19:25:51 +00005723void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005724{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005725 int i;
5726
Fred Drakee4315f52000-05-09 19:53:39 +00005727 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005728 unicode_freelist = NULL;
5729 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005731 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005732 for (i = 0; i < 256; i++)
5733 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734}
5735
5736/* Finalize the Unicode implementation */
5737
5738void
Thomas Wouters78890102000-07-22 19:25:51 +00005739_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005741 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005742 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005743
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005744 Py_XDECREF(unicode_empty);
5745 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005746
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005747 for (i = 0; i < 256; i++) {
5748 if (unicode_latin1[i]) {
5749 Py_DECREF(unicode_latin1[i]);
5750 unicode_latin1[i] = NULL;
5751 }
5752 }
5753
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005754 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005755 PyUnicodeObject *v = u;
5756 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005757 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005758 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005759 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005760 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005761 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005762 unicode_freelist = NULL;
5763 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005764}