/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

    Copyright (c) 1999 by Secret Labs AB
    Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for objects parked on the free list.

   An object whose length is below this limit keeps its character
   buffer while it sits on the free list, so that reusing the object
   does not need another malloc().  Longer buffers are released as soon
   as the object is put on the list.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the optimization off.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is defined. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000222 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
227void _PyUnicode_Free(register PyUnicodeObject *unicode)
228{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000230 /* Keep-Alive optimization */
231 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000232 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 unicode->str = NULL;
234 unicode->length = 0;
235 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000236 if (unicode->defenc) {
237 Py_DECREF(unicode->defenc);
238 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 }
240 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 *(PyUnicodeObject **)unicode = unicode_freelist;
242 unicode_freelist = unicode;
243 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244 }
245 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000247 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249 }
250}
251
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252int PyUnicode_Resize(PyObject **unicode,
253 int length)
254{
255 register PyUnicodeObject *v;
256
257 /* Argument checks */
258 if (unicode == NULL) {
259 PyErr_BadInternalCall();
260 return -1;
261 }
262 v = (PyUnicodeObject *)*unicode;
263 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
264 PyErr_BadInternalCall();
265 return -1;
266 }
267
268 /* Resizing unicode_empty and single character objects is not
269 possible since these are being shared. We simply return a fresh
270 copy with the same Unicode content. */
271 if (v->length != length &&
272 (v == unicode_empty || v->length == 1)) {
273 PyUnicodeObject *w = _PyUnicode_New(length);
274 if (w == NULL)
275 return -1;
276 Py_UNICODE_COPY(w->str, v->str,
277 length < v->length ? length : v->length);
278 *unicode = (PyObject *)w;
279 return 0;
280 }
281
282 /* Note that we don't have to modify *unicode for unshared Unicode
283 objects, since we can modify them in-place. */
284 return unicode_resize(v, length);
285}
286
/* For use inside unicodeobject.c only: calls PyUnicode_Resize() on a
   variable address after casting it to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize(((PyObject **)(unicodevar)), (length))
290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
292 int size)
293{
294 PyUnicodeObject *unicode;
295
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000296 /* If the Unicode data is known at construction time, we can apply
297 some optimizations which share commonly used objects. */
298 if (u != NULL) {
299
300 /* Optimization for empty strings */
301 if (size == 0 && unicode_empty != NULL) {
302 Py_INCREF(unicode_empty);
303 return (PyObject *)unicode_empty;
304 }
305
306 /* Single character Unicode objects in the Latin-1 range are
307 shared when using this constructor */
308 if (size == 1 && *u < 256) {
309 unicode = unicode_latin1[*u];
310 if (!unicode) {
311 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000312 if (!unicode)
313 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000314 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000315 unicode_latin1[*u] = unicode;
316 }
317 Py_INCREF(unicode);
318 return (PyObject *)unicode;
319 }
320 }
321
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 unicode = _PyUnicode_New(size);
323 if (!unicode)
324 return NULL;
325
326 /* Copy the Unicode data into the new object */
327 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000328 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329
330 return (PyObject *)unicode;
331}
332
333#ifdef HAVE_WCHAR_H
334
335PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
336 int size)
337{
338 PyUnicodeObject *unicode;
339
340 if (w == NULL) {
341 PyErr_BadInternalCall();
342 return NULL;
343 }
344
345 unicode = _PyUnicode_New(size);
346 if (!unicode)
347 return NULL;
348
349 /* Copy the wchar_t data into the new object */
350#ifdef HAVE_USABLE_WCHAR_T
351 memcpy(unicode->str, w, size * sizeof(wchar_t));
352#else
353 {
354 register Py_UNICODE *u;
355 register int i;
356 u = PyUnicode_AS_UNICODE(unicode);
357 for (i = size; i >= 0; i--)
358 *u++ = *w++;
359 }
360#endif
361
362 return (PyObject *)unicode;
363}
364
365int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
366 register wchar_t *w,
367 int size)
368{
369 if (unicode == NULL) {
370 PyErr_BadInternalCall();
371 return -1;
372 }
373 if (size > PyUnicode_GET_SIZE(unicode))
374 size = PyUnicode_GET_SIZE(unicode);
375#ifdef HAVE_USABLE_WCHAR_T
376 memcpy(w, unicode->str, size * sizeof(wchar_t));
377#else
378 {
379 register Py_UNICODE *u;
380 register int i;
381 u = PyUnicode_AS_UNICODE(unicode);
382 for (i = size; i >= 0; i--)
383 *w++ = *u++;
384 }
385#endif
386
387 return size;
388}
389
390#endif
391
392PyObject *PyUnicode_FromObject(register PyObject *obj)
393{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000394 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
395}
396
397PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
398 const char *encoding,
399 const char *errors)
400{
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000401 const char *s = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000403 int owned = 0;
404 PyObject *v;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000405 int reclevel;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000406
407 if (obj == NULL) {
408 PyErr_BadInternalCall();
409 return NULL;
410 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000411
412 /* Coerce object */
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000413 for (reclevel = 0; reclevel < 2; reclevel++) {
414
415 if (PyUnicode_Check(obj)) {
416 if (encoding) {
417 PyErr_SetString(PyExc_TypeError,
418 "decoding Unicode is not supported");
419 goto onError;
420 }
421 if (PyUnicode_CheckExact(obj)) {
422 Py_INCREF(obj);
423 v = obj;
424 }
425 else {
426 /* For a subclass of unicode, return a true unicode object
427 with the same string value. */
428 v = PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
429 PyUnicode_GET_SIZE(obj));
430 }
431 goto done;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000432 }
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000433 else if (PyString_Check(obj)) {
434 s = PyString_AS_STRING(obj);
435 len = PyString_GET_SIZE(obj);
436 break;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000437 }
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000438 else {
439 PyObject *w;
440
441 /* Try char buffer interface */
442 if (PyObject_AsCharBuffer(obj, &s, &len))
443 PyErr_Clear();
444 else
445 break;
446
447 /* Mimic the behaviour of str(object) if everything else
448 fails (see PyObject_Str()); this also covers instances
449 which implement __str__. */
450 if (obj->ob_type->tp_str == NULL)
451 w = PyObject_Repr(obj);
452 else
453 w = (*obj->ob_type->tp_str)(obj);
454 if (w == NULL)
455 goto onError;
456 if (owned) {
457 Py_DECREF(obj);
458 }
459 obj = w;
460 owned = 1;
Tim Peters78e0fc72001-09-11 03:07:38 +0000461 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000462 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000463
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +0000464 if (s == NULL) {
465 PyErr_Format(PyExc_TypeError,
466 "coercing to Unicode: __str__ recursion limit exceeded "
467 "(last type: %.80s)",
468 obj->ob_type->tp_name);
469 goto onError;
470 }
471
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000472 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000473 if (len == 0) {
474 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000475 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000476 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000477 else
478 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000479
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000480 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000481 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000482 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000483 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000484 return v;
485
486 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000487 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000488 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000489 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000490 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000491}
492
493PyObject *PyUnicode_Decode(const char *s,
494 int size,
495 const char *encoding,
496 const char *errors)
497{
498 PyObject *buffer = NULL, *unicode;
499
Fred Drakee4315f52000-05-09 19:53:39 +0000500 if (encoding == NULL)
501 encoding = PyUnicode_GetDefaultEncoding();
502
503 /* Shortcuts for common default encodings */
504 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000505 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000506 else if (strcmp(encoding, "latin-1") == 0)
507 return PyUnicode_DecodeLatin1(s, size, errors);
508 else if (strcmp(encoding, "ascii") == 0)
509 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000510
511 /* Decode via the codec registry */
512 buffer = PyBuffer_FromMemory((void *)s, size);
513 if (buffer == NULL)
514 goto onError;
515 unicode = PyCodec_Decode(buffer, encoding, errors);
516 if (unicode == NULL)
517 goto onError;
518 if (!PyUnicode_Check(unicode)) {
519 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000520 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000521 unicode->ob_type->tp_name);
522 Py_DECREF(unicode);
523 goto onError;
524 }
525 Py_DECREF(buffer);
526 return unicode;
527
528 onError:
529 Py_XDECREF(buffer);
530 return NULL;
531}
532
533PyObject *PyUnicode_Encode(const Py_UNICODE *s,
534 int size,
535 const char *encoding,
536 const char *errors)
537{
538 PyObject *v, *unicode;
539
540 unicode = PyUnicode_FromUnicode(s, size);
541 if (unicode == NULL)
542 return NULL;
543 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
544 Py_DECREF(unicode);
545 return v;
546}
547
548PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
549 const char *encoding,
550 const char *errors)
551{
552 PyObject *v;
553
554 if (!PyUnicode_Check(unicode)) {
555 PyErr_BadArgument();
556 goto onError;
557 }
Fred Drakee4315f52000-05-09 19:53:39 +0000558
559 if (encoding == NULL)
560 encoding = PyUnicode_GetDefaultEncoding();
561
562 /* Shortcuts for common default encodings */
563 if (errors == NULL) {
564 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000565 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000566 else if (strcmp(encoding, "latin-1") == 0)
567 return PyUnicode_AsLatin1String(unicode);
568 else if (strcmp(encoding, "ascii") == 0)
569 return PyUnicode_AsASCIIString(unicode);
570 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000571
572 /* Encode via the codec registry */
573 v = PyCodec_Encode(unicode, encoding, errors);
574 if (v == NULL)
575 goto onError;
576 /* XXX Should we really enforce this ? */
577 if (!PyString_Check(v)) {
578 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000579 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000580 v->ob_type->tp_name);
581 Py_DECREF(v);
582 goto onError;
583 }
584 return v;
585
586 onError:
587 return NULL;
588}
589
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000590PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
591 const char *errors)
592{
593 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
594
595 if (v)
596 return v;
597 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
598 if (v && errors == NULL)
599 ((PyUnicodeObject *)unicode)->defenc = v;
600 return v;
601}
602
Guido van Rossumd57fd912000-03-10 22:53:23 +0000603Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
604{
605 if (!PyUnicode_Check(unicode)) {
606 PyErr_BadArgument();
607 goto onError;
608 }
609 return PyUnicode_AS_UNICODE(unicode);
610
611 onError:
612 return NULL;
613}
614
615int PyUnicode_GetSize(PyObject *unicode)
616{
617 if (!PyUnicode_Check(unicode)) {
618 PyErr_BadArgument();
619 goto onError;
620 }
621 return PyUnicode_GET_SIZE(unicode);
622
623 onError:
624 return -1;
625}
626
Thomas Wouters78890102000-07-22 19:25:51 +0000627const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000628{
629 return unicode_default_encoding;
630}
631
632int PyUnicode_SetDefaultEncoding(const char *encoding)
633{
634 PyObject *v;
635
636 /* Make sure the encoding is valid. As side effect, this also
637 loads the encoding into the codec registry cache. */
638 v = _PyCodec_Lookup(encoding);
639 if (v == NULL)
640 goto onError;
641 Py_DECREF(v);
642 strncpy(unicode_default_encoding,
643 encoding,
644 sizeof(unicode_default_encoding));
645 return 0;
646
647 onError:
648 return -1;
649}
650
/* --- UTF-7 Codec -------------------------------------------------------- */

/* see RFC2152 for details */

/* Classification of the 128 ASCII characters for the UTF-7 codec,
   i.e. whether a character is special and cannot be directly encoded:

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   Each row below covers 16 consecutive character codes. */
static
char utf7_special[128] = {
    /* 0x00 - 0x0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
673
/* SPECIAL(c, encodeO, encodeWS): true if character c cannot be written
   directly in UTF-7.  That is the case for c > 127, for class 1 in
   utf7_special, for class 2 (whitespace) when encodeWS is set and for
   class 3 (Set O) when encodeO is set. */
#define SPECIAL(c, encodeO, encodeWS) \
    (((c) > 127 || utf7_special[(c)] == 1) || \
     ((encodeWS) && (utf7_special[(c)] == 2)) || \
     ((encodeO) && (utf7_special[(c)] == 3)))

/* B64(n): base64 digit for the low six bits of n. */
#define B64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* B64CHAR(c): true if c is a base64 digit.  The range test keeps
   values outside 0..127 (wide characters, sign-extended bytes) away
   from isalnum(), whose argument must fit in an unsigned char. */
#define B64CHAR(c) \
    (((unsigned long)(c) < 128 && isalnum((int)(c))) || \
     (c) == '+' || (c) == '/')

/* UB64(c): value (0..63) of the base64 digit c; only meaningful when
   B64CHAR(c) is true. */
#define UB64(c) \
    ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
     (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* ENCODE(out, ch, bits): while at least six of the `bits` low bits of
   `ch` are pending, emit the topmost six as a base64 digit through
   `out` and decrease `bits` accordingly. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits) - 6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* DECODE(out, ch, bits, surrogate): while at least sixteen of the
   `bits` low bits of `ch` are pending, take the topmost sixteen as one
   character and store it through `out`.

   Surrogate pairs cannot be represented in a 16-bit character: a high
   surrogate (0xD800-0xDBFF, the first half of a pair) sets `surrogate`
   and `errmsg` and jumps to the utf7Error label of the enclosing
   function.  The next 16-bit unit is then dropped without checking
   whether it is a correct low surrogate, because the error has already
   been reported.

   Requires `errmsg` and a `utf7Error` label in the calling scope. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                (surrogate) = 0; \
            } else if (0xD800 <= outCh && outCh <= 0xDBFF) { \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)

709static
710int utf7_decoding_error(Py_UNICODE **dest,
711 const char *errors,
712 const char *details)
713{
714 if ((errors == NULL) ||
715 (strcmp(errors,"strict") == 0)) {
716 PyErr_Format(PyExc_UnicodeError,
717 "UTF-7 decoding error: %.400s",
718 details);
719 return -1;
720 }
721 else if (strcmp(errors,"ignore") == 0) {
722 return 0;
723 }
724 else if (strcmp(errors,"replace") == 0) {
725 if (dest != NULL) {
726 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
727 (*dest)++;
728 }
729 return 0;
730 }
731 else {
732 PyErr_Format(PyExc_ValueError,
733 "UTF-7 decoding error; unknown error handling code: %.400s",
734 errors);
735 return -1;
736 }
737}
738
739PyObject *PyUnicode_DecodeUTF7(const char *s,
740 int size,
741 const char *errors)
742{
743 const char *e;
744 PyUnicodeObject *unicode;
745 Py_UNICODE *p;
746 const char *errmsg = "";
747 int inShift = 0;
748 unsigned int bitsleft = 0;
749 unsigned long charsleft = 0;
750 int surrogate = 0;
751
752 unicode = _PyUnicode_New(size);
753 if (!unicode)
754 return NULL;
755 if (size == 0)
756 return (PyObject *)unicode;
757
758 p = unicode->str;
759 e = s + size;
760
761 while (s < e) {
762 Py_UNICODE ch = *s;
763
764 if (inShift) {
765 if ((ch == '-') || !B64CHAR(ch)) {
766 inShift = 0;
767 s++;
768
769 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
770 if (bitsleft >= 6) {
771 /* The shift sequence has a partial character in it. If
772 bitsleft < 6 then we could just classify it as padding
773 but that is not the case here */
774
775 errmsg = "partial character in shift sequence";
776 goto utf7Error;
777 }
778 /* According to RFC2152 the remaining bits should be zero. We
779 choose to signal an error/insert a replacement character
780 here so indicate the potential of a misencoded character. */
781
782 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
783 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
784 errmsg = "non-zero padding bits in shift sequence";
785 goto utf7Error;
786 }
787
788 if (ch == '-') {
789 if ((s < e) && (*(s) == '-')) {
790 *p++ = '-';
791 inShift = 1;
792 }
793 } else if (SPECIAL(ch,0,0)) {
794 errmsg = "unexpected special character";
795 goto utf7Error;
796 } else {
797 *p++ = ch;
798 }
799 } else {
800 charsleft = (charsleft << 6) | UB64(ch);
801 bitsleft += 6;
802 s++;
803 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
804 }
805 }
806 else if ( ch == '+' ) {
807 s++;
808 if (s < e && *s == '-') {
809 s++;
810 *p++ = '+';
811 } else
812 {
813 inShift = 1;
814 bitsleft = 0;
815 }
816 }
817 else if (SPECIAL(ch,0,0)) {
818 errmsg = "unexpected special character";
819 s++;
820 goto utf7Error;
821 }
822 else {
823 *p++ = ch;
824 s++;
825 }
826 continue;
827 utf7Error:
828 if (utf7_decoding_error(&p, errors, errmsg))
829 goto onError;
830 }
831
832 if (inShift) {
833 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
834 goto onError;
835 }
836
837 if (_PyUnicode_Resize(&unicode, p - unicode->str))
838 goto onError;
839
840 return (PyObject *)unicode;
841
842onError:
843 Py_DECREF(unicode);
844 return NULL;
845}
846
847
848PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
849 int size,
850 int encodeSetO,
851 int encodeWhiteSpace,
852 const char *errors)
853{
854 PyObject *v;
855 /* It might be possible to tighten this worst case */
856 unsigned int cbAllocated = 5 * size;
857 int inShift = 0;
858 int i = 0;
859 unsigned int bitsleft = 0;
860 unsigned long charsleft = 0;
861 char * out;
862 char * start;
863
864 if (size == 0)
865 return PyString_FromStringAndSize(NULL, 0);
866
867 v = PyString_FromStringAndSize(NULL, cbAllocated);
868 if (v == NULL)
869 return NULL;
870
871 start = out = PyString_AS_STRING(v);
872 for (;i < size; ++i) {
873 Py_UNICODE ch = s[i];
874
875 if (!inShift) {
876 if (ch == '+') {
877 *out++ = '+';
878 *out++ = '-';
879 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
880 charsleft = ch;
881 bitsleft = 16;
882 *out++ = '+';
883 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
884 inShift = bitsleft > 0;
885 } else {
886 *out++ = (char) ch;
887 }
888 } else {
889 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
890 *out++ = B64(charsleft << (6-bitsleft));
891 charsleft = 0;
892 bitsleft = 0;
893 /* Characters not in the BASE64 set implicitly unshift the sequence
894 so no '-' is required, except if the character is itself a '-' */
895 if (B64CHAR(ch) || ch == '-') {
896 *out++ = '-';
897 }
898 inShift = 0;
899 *out++ = (char) ch;
900 } else {
901 bitsleft += 16;
902 charsleft = (charsleft << 16) | ch;
903 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
904
905 /* If the next character is special then we dont' need to terminate
906 the shift sequence. If the next character is not a BASE64 character
907 or '-' then the shift sequence will be terminated implicitly and we
908 don't have to insert a '-'. */
909
910 if (bitsleft == 0) {
911 if (i + 1 < size) {
912 Py_UNICODE ch2 = s[i+1];
913
914 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
915
916 } else if (B64CHAR(ch2) || ch2 == '-') {
917 *out++ = '-';
918 inShift = 0;
919 } else {
920 inShift = 0;
921 }
922
923 }
924 else {
925 *out++ = '-';
926 inShift = 0;
927 }
928 }
929 }
930 }
931 }
932 if (bitsleft) {
933 *out++= B64(charsleft << (6-bitsleft) );
934 *out++ = '-';
935 }
936
937 if (_PyString_Resize(&v, out - start)) {
938 Py_DECREF(v);
939 return NULL;
940 }
941 return v;
942}
943
/* The UTF-7 helper macros are not needed beyond this point. */
#undef B64
#undef B64CHAR
#undef DECODE
#undef ENCODE
#undef SPECIAL
#undef UB64
950
/* --- UTF-8 Codec -------------------------------------------------------- */

/* Map a UTF-8 encoded prefix byte to the length of the sequence it
   starts; zero means illegal prefix.  See RFC 2279 for details.
   Each row below covers 16 consecutive byte values. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
974
975static
976int utf8_decoding_error(const char **source,
977 Py_UNICODE **dest,
978 const char *errors,
979 const char *details)
980{
981 if ((errors == NULL) ||
982 (strcmp(errors,"strict") == 0)) {
983 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000984 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000985 details);
986 return -1;
987 }
988 else if (strcmp(errors,"ignore") == 0) {
989 (*source)++;
990 return 0;
991 }
992 else if (strcmp(errors,"replace") == 0) {
993 (*source)++;
994 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
995 (*dest)++;
996 return 0;
997 }
998 else {
999 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001000 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001001 errors);
1002 return -1;
1003 }
1004}
1005
Guido van Rossumd57fd912000-03-10 22:53:23 +00001006PyObject *PyUnicode_DecodeUTF8(const char *s,
1007 int size,
1008 const char *errors)
1009{
1010 int n;
1011 const char *e;
1012 PyUnicodeObject *unicode;
1013 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001014 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001015
1016 /* Note: size will always be longer than the resulting Unicode
1017 character count */
1018 unicode = _PyUnicode_New(size);
1019 if (!unicode)
1020 return NULL;
1021 if (size == 0)
1022 return (PyObject *)unicode;
1023
1024 /* Unpack UTF-8 encoded data */
1025 p = unicode->str;
1026 e = s + size;
1027
1028 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001029 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001030
1031 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001032 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001033 s++;
1034 continue;
1035 }
1036
1037 n = utf8_code_length[ch];
1038
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001039 if (s + n > e) {
1040 errmsg = "unexpected end of data";
1041 goto utf8Error;
1042 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043
1044 switch (n) {
1045
1046 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001047 errmsg = "unexpected code byte";
1048 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001049
1050 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001051 errmsg = "internal error";
1052 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001053
1054 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001055 if ((s[1] & 0xc0) != 0x80) {
1056 errmsg = "invalid data";
1057 goto utf8Error;
1058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001059 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001060 if (ch < 0x80) {
1061 errmsg = "illegal encoding";
1062 goto utf8Error;
1063 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001064 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001065 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001066 break;
1067
1068 case 3:
1069 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001070 (s[2] & 0xc0) != 0x80) {
1071 errmsg = "invalid data";
1072 goto utf8Error;
1073 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001075 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
1076 errmsg = "illegal encoding";
1077 goto utf8Error;
1078 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001080 *p++ = (Py_UNICODE)ch;
1081 break;
1082
1083 case 4:
1084 if ((s[1] & 0xc0) != 0x80 ||
1085 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001086 (s[3] & 0xc0) != 0x80) {
1087 errmsg = "invalid data";
1088 goto utf8Error;
1089 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001090 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1091 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1092 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001093 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001094 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001095 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001096 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001097 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001098 errmsg = "illegal encoding";
1099 goto utf8Error;
1100 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001101#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001102 *p++ = (Py_UNICODE)ch;
1103#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001104 /* compute and append the two surrogates: */
1105
1106 /* translate from 10000..10FFFF to 0..FFFF */
1107 ch -= 0x10000;
1108
1109 /* high surrogate = top 10 bits added to D800 */
1110 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1111
1112 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001113 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001114#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 break;
1116
1117 default:
1118 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001119 errmsg = "unsupported Unicode code range";
1120 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 }
1122 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001123 continue;
1124
1125 utf8Error:
1126 if (utf8_decoding_error(&s, &p, errors, errmsg))
1127 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128 }
1129
1130 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001131 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132 goto onError;
1133
1134 return (PyObject *)unicode;
1135
1136onError:
1137 Py_DECREF(unicode);
1138 return NULL;
1139}
1140
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the error policy named by errors to a UTF-8 encoding problem.

   NULL or "strict" raises UnicodeError with details and returns -1;
   "ignore" returns 0 without output; "replace" writes '?' through
   *dest, advances it and returns 0; any other policy raises ValueError
   and returns -1.  source is not touched. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || !strcmp(errors, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (!strcmp(errors, "ignore"))
        return 0;

    if (!strcmp(errors, "replace")) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001174
1175PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1176 int size,
1177 const char *errors)
1178{
1179 PyObject *v;
1180 char *p;
1181 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001182 Py_UCS4 ch2;
1183 unsigned int cbAllocated = 3 * size;
1184 unsigned int cbWritten = 0;
1185 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001186
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001187 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188 if (v == NULL)
1189 return NULL;
1190 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001191 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001192
1193 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001194 while (i < size) {
1195 Py_UCS4 ch = s[i++];
1196 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001198 cbWritten++;
1199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200 else if (ch < 0x0800) {
1201 *p++ = 0xc0 | (ch >> 6);
1202 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001203 cbWritten += 2;
1204 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001205 else if (ch < 0x10000) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001206 /* Check for high surrogate */
1207 if (0xD800 <= ch && ch <= 0xDBFF) {
1208 if (i != size) {
1209 ch2 = s[i];
1210 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1211
1212 if (cbWritten >= (cbAllocated - 4)) {
1213 /* Provide enough room for some more
1214 surrogates */
1215 cbAllocated += 4*10;
1216 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001217 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001218 }
1219
1220 /* combine the two values */
1221 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
1222
1223 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +00001224 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001225 i++;
1226 cbWritten += 4;
1227 }
1228 }
1229 }
1230 else {
1231 *p++ = (char)(0xe0 | (ch >> 12));
1232 cbWritten += 3;
1233 }
1234 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1235 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001236 } else {
1237 *p++ = 0xf0 | (ch>>18);
1238 *p++ = 0x80 | ((ch>>12) & 0x3f);
1239 *p++ = 0x80 | ((ch>>6) & 0x3f);
1240 *p++ = 0x80 | (ch & 0x3f);
1241 cbWritten += 4;
1242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243 }
1244 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001245 if (_PyString_Resize(&v, p - q))
1246 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 return v;
1248
1249 onError:
1250 Py_DECREF(v);
1251 return NULL;
1252}
1253
Guido van Rossumd57fd912000-03-10 22:53:23 +00001254PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1255{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 if (!PyUnicode_Check(unicode)) {
1257 PyErr_BadArgument();
1258 return NULL;
1259 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001260 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1261 PyUnicode_GET_SIZE(unicode),
1262 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001263}
1264
1265/* --- UTF-16 Codec ------------------------------------------------------- */
1266
1267static
Tim Peters772747b2001-08-09 22:21:55 +00001268int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001269 const char *errors,
1270 const char *details)
1271{
1272 if ((errors == NULL) ||
1273 (strcmp(errors,"strict") == 0)) {
1274 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001275 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 details);
1277 return -1;
1278 }
1279 else if (strcmp(errors,"ignore") == 0) {
1280 return 0;
1281 }
1282 else if (strcmp(errors,"replace") == 0) {
1283 if (dest) {
1284 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1285 (*dest)++;
1286 }
1287 return 0;
1288 }
1289 else {
1290 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001291 "UTF-16 decoding error; "
1292 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 errors);
1294 return -1;
1295 }
1296}
1297
Tim Peters772747b2001-08-09 22:21:55 +00001298PyObject *
1299PyUnicode_DecodeUTF16(const char *s,
1300 int size,
1301 const char *errors,
1302 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303{
1304 PyUnicodeObject *unicode;
1305 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001306 const unsigned char *q, *e;
1307 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001308 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001309 /* Offsets from q for retrieving byte pairs in the right order. */
1310#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1311 int ihi = 1, ilo = 0;
1312#else
1313 int ihi = 0, ilo = 1;
1314#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001315
1316 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001317 if (size & 1) {
1318 if (utf16_decoding_error(NULL, errors, "truncated data"))
1319 return NULL;
1320 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321 }
1322
1323 /* Note: size will always be longer than the resulting Unicode
1324 character count */
1325 unicode = _PyUnicode_New(size);
1326 if (!unicode)
1327 return NULL;
1328 if (size == 0)
1329 return (PyObject *)unicode;
1330
1331 /* Unpack UTF-16 encoded data */
1332 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001333 q = (unsigned char *)s;
1334 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001335
1336 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001337 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001338
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001339 /* Check for BOM marks (U+FEFF) in the input and adjust current
1340 byte order setting accordingly. In native mode, the leading BOM
1341 mark is skipped, in all other modes, it is copied to the output
1342 stream as-is (giving a ZWNBSP character). */
1343 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001344 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001345#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001346 if (bom == 0xFEFF) {
1347 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001348 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001349 }
1350 else if (bom == 0xFFFE) {
1351 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001352 bo = 1;
1353 }
1354#else
Tim Peters772747b2001-08-09 22:21:55 +00001355 if (bom == 0xFEFF) {
1356 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001357 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001358 }
1359 else if (bom == 0xFFFE) {
1360 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001361 bo = -1;
1362 }
1363#endif
1364 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365
Tim Peters772747b2001-08-09 22:21:55 +00001366 if (bo == -1) {
1367 /* force LE */
1368 ihi = 1;
1369 ilo = 0;
1370 }
1371 else if (bo == 1) {
1372 /* force BE */
1373 ihi = 0;
1374 ilo = 1;
1375 }
1376
1377 while (q < e) {
1378 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1379 q += 2;
1380
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 if (ch < 0xD800 || ch > 0xDFFF) {
1382 *p++ = ch;
1383 continue;
1384 }
1385
1386 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001387 if (q >= e) {
1388 errmsg = "unexpected end of data";
1389 goto utf16Error;
1390 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001391 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001392 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1393 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001394 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001395#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001396 *p++ = ch;
1397 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001398#else
1399 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001400#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001401 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001402 }
1403 else {
1404 errmsg = "illegal UTF-16 surrogate";
1405 goto utf16Error;
1406 }
1407
Guido van Rossumd57fd912000-03-10 22:53:23 +00001408 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001409 errmsg = "illegal encoding";
1410 /* Fall through to report the error */
1411
1412 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001413 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001414 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001415 }
1416
1417 if (byteorder)
1418 *byteorder = bo;
1419
1420 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001421 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001422 goto onError;
1423
1424 return (PyObject *)unicode;
1425
1426onError:
1427 Py_DECREF(unicode);
1428 return NULL;
1429}
1430
Tim Peters772747b2001-08-09 22:21:55 +00001431PyObject *
1432PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1433 int size,
1434 const char *errors,
1435 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001436{
1437 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001438 unsigned char *p;
1439 int i, pairs;
1440 /* Offsets from p for storing byte pairs in the right order. */
1441#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1442 int ihi = 1, ilo = 0;
1443#else
1444 int ihi = 0, ilo = 1;
1445#endif
1446
1447#define STORECHAR(CH) \
1448 do { \
1449 p[ihi] = ((CH) >> 8) & 0xff; \
1450 p[ilo] = (CH) & 0xff; \
1451 p += 2; \
1452 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001453
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001454 for (i = pairs = 0; i < size; i++)
1455 if (s[i] >= 0x10000)
1456 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001457 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001458 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001459 if (v == NULL)
1460 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001461
Tim Peters772747b2001-08-09 22:21:55 +00001462 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001463 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001464 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001465 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001466 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001467
1468 if (byteorder == -1) {
1469 /* force LE */
1470 ihi = 1;
1471 ilo = 0;
1472 }
1473 else if (byteorder == 1) {
1474 /* force BE */
1475 ihi = 0;
1476 ilo = 1;
1477 }
1478
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001479 while (size-- > 0) {
1480 Py_UNICODE ch = *s++;
1481 Py_UNICODE ch2 = 0;
1482 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001483 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1484 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001485 }
Tim Peters772747b2001-08-09 22:21:55 +00001486 STORECHAR(ch);
1487 if (ch2)
1488 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001489 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001490 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001491#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001492}
1493
1494PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1495{
1496 if (!PyUnicode_Check(unicode)) {
1497 PyErr_BadArgument();
1498 return NULL;
1499 }
1500 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1501 PyUnicode_GET_SIZE(unicode),
1502 NULL,
1503 0);
1504}
1505
1506/* --- Unicode Escape Codec ----------------------------------------------- */
1507
1508static
1509int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001510 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001511 const char *errors,
1512 const char *details)
1513{
1514 if ((errors == NULL) ||
1515 (strcmp(errors,"strict") == 0)) {
1516 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001517 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001518 details);
1519 return -1;
1520 }
1521 else if (strcmp(errors,"ignore") == 0) {
1522 return 0;
1523 }
1524 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001525 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 return 0;
1527 }
1528 else {
1529 PyErr_Format(PyExc_ValueError,
1530 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001531 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001532 errors);
1533 return -1;
1534 }
1535}
1536
Fredrik Lundh06d12682001-01-24 07:59:11 +00001537static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001538
Guido van Rossumd57fd912000-03-10 22:53:23 +00001539PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1540 int size,
1541 const char *errors)
1542{
1543 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001544 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001545 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001546 char* message;
1547 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1548
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549 /* Escaped strings will always be longer than the resulting
1550 Unicode string, so we start with size here and then reduce the
1551 length after conversion to the true value. */
1552 v = _PyUnicode_New(size);
1553 if (v == NULL)
1554 goto onError;
1555 if (size == 0)
1556 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001557
Guido van Rossumd57fd912000-03-10 22:53:23 +00001558 p = buf = PyUnicode_AS_UNICODE(v);
1559 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001560
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561 while (s < end) {
1562 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001563 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001564 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001565
1566 /* Non-escape characters are interpreted as Unicode ordinals */
1567 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001568 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001569 continue;
1570 }
1571
1572 /* \ - Escapes */
1573 s++;
1574 switch (*s++) {
1575
1576 /* \x escapes */
1577 case '\n': break;
1578 case '\\': *p++ = '\\'; break;
1579 case '\'': *p++ = '\''; break;
1580 case '\"': *p++ = '\"'; break;
1581 case 'b': *p++ = '\b'; break;
1582 case 'f': *p++ = '\014'; break; /* FF */
1583 case 't': *p++ = '\t'; break;
1584 case 'n': *p++ = '\n'; break;
1585 case 'r': *p++ = '\r'; break;
1586 case 'v': *p++ = '\013'; break; /* VT */
1587 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1588
1589 /* \OOO (octal) escapes */
1590 case '0': case '1': case '2': case '3':
1591 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001592 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001594 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001595 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001596 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001597 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001598 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001599 break;
1600
Fredrik Lundhccc74732001-02-18 22:13:49 +00001601 /* hex escapes */
1602 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001603 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001604 digits = 2;
1605 message = "truncated \\xXX escape";
1606 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001607
Fredrik Lundhccc74732001-02-18 22:13:49 +00001608 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001609 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001610 digits = 4;
1611 message = "truncated \\uXXXX escape";
1612 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001613
Fredrik Lundhccc74732001-02-18 22:13:49 +00001614 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001615 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001616 digits = 8;
1617 message = "truncated \\UXXXXXXXX escape";
1618 hexescape:
1619 chr = 0;
1620 for (i = 0; i < digits; i++) {
1621 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001622 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001623 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001624 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001625 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001626 i++;
1627 break;
1628 }
1629 chr = (chr<<4) & ~0xF;
1630 if (c >= '0' && c <= '9')
1631 chr += c - '0';
1632 else if (c >= 'a' && c <= 'f')
1633 chr += 10 + c - 'a';
1634 else
1635 chr += 10 + c - 'A';
1636 }
1637 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001638 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001639 /* when we get here, chr is a 32-bit unicode character */
1640 if (chr <= 0xffff)
1641 /* UCS-2 character */
1642 *p++ = (Py_UNICODE) chr;
1643 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001644 /* UCS-4 character. Either store directly, or as
1645 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001646#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001647 *p++ = chr;
1648#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001649 chr -= 0x10000L;
1650 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001651 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001652#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001653 } else {
1654 if (unicodeescape_decoding_error(
1655 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001656 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001657 )
1658 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001659 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001660 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001661 break;
1662
1663 /* \N{name} */
1664 case 'N':
1665 message = "malformed \\N character escape";
1666 if (ucnhash_CAPI == NULL) {
1667 /* load the unicode data module */
1668 PyObject *m, *v;
1669 m = PyImport_ImportModule("unicodedata");
1670 if (m == NULL)
1671 goto ucnhashError;
1672 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1673 Py_DECREF(m);
1674 if (v == NULL)
1675 goto ucnhashError;
1676 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1677 Py_DECREF(v);
1678 if (ucnhash_CAPI == NULL)
1679 goto ucnhashError;
1680 }
1681 if (*s == '{') {
1682 const char *start = s+1;
1683 /* look for the closing brace */
1684 while (*s != '}' && s < end)
1685 s++;
1686 if (s > start && s < end && *s == '}') {
1687 /* found a name. look it up in the unicode database */
1688 message = "unknown Unicode character name";
1689 s++;
1690 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1691 goto store;
1692 }
1693 }
1694 if (unicodeescape_decoding_error(&s, &x, errors, message))
1695 goto onError;
1696 *p++ = x;
1697 break;
1698
1699 default:
1700 *p++ = '\\';
1701 *p++ = (unsigned char)s[-1];
1702 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001703 }
1704 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001705 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001706 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707 return (PyObject *)v;
1708
Fredrik Lundhccc74732001-02-18 22:13:49 +00001709ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001710 PyErr_SetString(
1711 PyExc_UnicodeError,
1712 "\\N escapes not supported (can't load unicodedata module)"
1713 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001714 return NULL;
1715
Fredrik Lundhccc74732001-02-18 22:13:49 +00001716onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001717 Py_XDECREF(v);
1718 return NULL;
1719}
1720
1721/* Return a Unicode-Escape string version of the Unicode object.
1722
1723 If quotes is true, the string is enclosed in u"" or u'' quotes as
1724 appropriate.
1725
1726*/
1727
Barry Warsaw51ac5802000-03-20 16:36:48 +00001728static const Py_UNICODE *findchar(const Py_UNICODE *s,
1729 int size,
1730 Py_UNICODE ch);
1731
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732static
1733PyObject *unicodeescape_string(const Py_UNICODE *s,
1734 int size,
1735 int quotes)
1736{
1737 PyObject *repr;
1738 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001740 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741
1742 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1743 if (repr == NULL)
1744 return NULL;
1745
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001746 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747
1748 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749 *p++ = 'u';
1750 *p++ = (findchar(s, size, '\'') &&
1751 !findchar(s, size, '"')) ? '"' : '\'';
1752 }
1753 while (size-- > 0) {
1754 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001755
Guido van Rossumd57fd912000-03-10 22:53:23 +00001756 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001757 if (quotes &&
1758 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759 *p++ = '\\';
1760 *p++ = (char) ch;
1761 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001762
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001763#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001764 /* Map 21-bit characters to '\U00xxxxxx' */
1765 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001766 int offset = p - PyString_AS_STRING(repr);
1767
1768 /* Resize the string if necessary */
1769 if (offset + 12 > PyString_GET_SIZE(repr)) {
1770 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1771 goto onError;
1772 p = PyString_AS_STRING(repr) + offset;
1773 }
1774
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001775 *p++ = '\\';
1776 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001777 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1778 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1779 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1780 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1781 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1782 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1783 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001784 *p++ = hexdigit[ch & 0x0000000F];
1785 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001786 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001787#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001788 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1789 else if (ch >= 0xD800 && ch < 0xDC00) {
1790 Py_UNICODE ch2;
1791 Py_UCS4 ucs;
1792
1793 ch2 = *s++;
1794 size--;
1795 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1796 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1797 *p++ = '\\';
1798 *p++ = 'U';
1799 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1800 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1801 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1802 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1803 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1804 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1805 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1806 *p++ = hexdigit[ucs & 0x0000000F];
1807 continue;
1808 }
1809 /* Fall through: isolated surrogates are copied as-is */
1810 s--;
1811 size++;
1812 }
1813
Guido van Rossumd57fd912000-03-10 22:53:23 +00001814 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001815 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001816 *p++ = '\\';
1817 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001818 *p++ = hexdigit[(ch >> 12) & 0x000F];
1819 *p++ = hexdigit[(ch >> 8) & 0x000F];
1820 *p++ = hexdigit[(ch >> 4) & 0x000F];
1821 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001822 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001823
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001824 /* Map special whitespace to '\t', \n', '\r' */
1825 else if (ch == '\t') {
1826 *p++ = '\\';
1827 *p++ = 't';
1828 }
1829 else if (ch == '\n') {
1830 *p++ = '\\';
1831 *p++ = 'n';
1832 }
1833 else if (ch == '\r') {
1834 *p++ = '\\';
1835 *p++ = 'r';
1836 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001837
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001838 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 else if (ch < ' ' || ch >= 128) {
1840 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001841 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001842 *p++ = hexdigit[(ch >> 4) & 0x000F];
1843 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001845
Guido van Rossumd57fd912000-03-10 22:53:23 +00001846 /* Copy everything else as-is */
1847 else
1848 *p++ = (char) ch;
1849 }
1850 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001851 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001852
1853 *p = '\0';
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001854 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001855 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001856
1857 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001858
1859 onError:
1860 Py_DECREF(repr);
1861 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862}
1863
1864PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1865 int size)
1866{
1867 return unicodeescape_string(s, size, 0);
1868}
1869
1870PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1871{
1872 if (!PyUnicode_Check(unicode)) {
1873 PyErr_BadArgument();
1874 return NULL;
1875 }
1876 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1877 PyUnicode_GET_SIZE(unicode));
1878}
1879
1880/* --- Raw Unicode Escape Codec ------------------------------------------- */
1881
1882PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1883 int size,
1884 const char *errors)
1885{
1886 PyUnicodeObject *v;
1887 Py_UNICODE *p, *buf;
1888 const char *end;
1889 const char *bs;
1890
1891 /* Escaped strings will always be longer than the resulting
1892 Unicode string, so we start with size here and then reduce the
1893 length after conversion to the true value. */
1894 v = _PyUnicode_New(size);
1895 if (v == NULL)
1896 goto onError;
1897 if (size == 0)
1898 return (PyObject *)v;
1899 p = buf = PyUnicode_AS_UNICODE(v);
1900 end = s + size;
1901 while (s < end) {
1902 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001903 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904 int i;
1905
1906 /* Non-escape characters are interpreted as Unicode ordinals */
1907 if (*s != '\\') {
1908 *p++ = (unsigned char)*s++;
1909 continue;
1910 }
1911
1912 /* \u-escapes are only interpreted iff the number of leading
1913 backslashes if odd */
1914 bs = s;
1915 for (;s < end;) {
1916 if (*s != '\\')
1917 break;
1918 *p++ = (unsigned char)*s++;
1919 }
1920 if (((s - bs) & 1) == 0 ||
1921 s >= end ||
1922 *s != 'u') {
1923 continue;
1924 }
1925 p--;
1926 s++;
1927
1928 /* \uXXXX with 4 hex digits */
1929 for (x = 0, i = 0; i < 4; i++) {
1930 c = (unsigned char)s[i];
1931 if (!isxdigit(c)) {
1932 if (unicodeescape_decoding_error(&s, &x, errors,
1933 "truncated \\uXXXX"))
1934 goto onError;
1935 i++;
1936 break;
1937 }
1938 x = (x<<4) & ~0xF;
1939 if (c >= '0' && c <= '9')
1940 x += c - '0';
1941 else if (c >= 'a' && c <= 'f')
1942 x += 10 + c - 'a';
1943 else
1944 x += 10 + c - 'A';
1945 }
1946 s += i;
1947 *p++ = x;
1948 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001949 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001950 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951 return (PyObject *)v;
1952
1953 onError:
1954 Py_XDECREF(v);
1955 return NULL;
1956}
1957
1958PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1959 int size)
1960{
1961 PyObject *repr;
1962 char *p;
1963 char *q;
1964
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001965 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966
1967 repr = PyString_FromStringAndSize(NULL, 6 * size);
1968 if (repr == NULL)
1969 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001970 if (size == 0)
1971 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972
1973 p = q = PyString_AS_STRING(repr);
1974 while (size-- > 0) {
1975 Py_UNICODE ch = *s++;
1976 /* Map 16-bit characters to '\uxxxx' */
1977 if (ch >= 256) {
1978 *p++ = '\\';
1979 *p++ = 'u';
1980 *p++ = hexdigit[(ch >> 12) & 0xf];
1981 *p++ = hexdigit[(ch >> 8) & 0xf];
1982 *p++ = hexdigit[(ch >> 4) & 0xf];
1983 *p++ = hexdigit[ch & 15];
1984 }
1985 /* Copy everything else as-is */
1986 else
1987 *p++ = (char) ch;
1988 }
1989 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001990 if (_PyString_Resize(&repr, p - q))
1991 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001992
1993 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001994
1995 onError:
1996 Py_DECREF(repr);
1997 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001998}
1999
2000PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
2001{
2002 if (!PyUnicode_Check(unicode)) {
2003 PyErr_BadArgument();
2004 return NULL;
2005 }
2006 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
2007 PyUnicode_GET_SIZE(unicode));
2008}
2009
2010/* --- Latin-1 Codec ------------------------------------------------------ */
2011
2012PyObject *PyUnicode_DecodeLatin1(const char *s,
2013 int size,
2014 const char *errors)
2015{
2016 PyUnicodeObject *v;
2017 Py_UNICODE *p;
2018
2019 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002020 if (size == 1 && *(unsigned char*)s < 256) {
2021 Py_UNICODE r = *(unsigned char*)s;
2022 return PyUnicode_FromUnicode(&r, 1);
2023 }
2024
Guido van Rossumd57fd912000-03-10 22:53:23 +00002025 v = _PyUnicode_New(size);
2026 if (v == NULL)
2027 goto onError;
2028 if (size == 0)
2029 return (PyObject *)v;
2030 p = PyUnicode_AS_UNICODE(v);
2031 while (size-- > 0)
2032 *p++ = (unsigned char)*s++;
2033 return (PyObject *)v;
2034
2035 onError:
2036 Py_XDECREF(v);
2037 return NULL;
2038}
2039
2040static
2041int latin1_encoding_error(const Py_UNICODE **source,
2042 char **dest,
2043 const char *errors,
2044 const char *details)
2045{
2046 if ((errors == NULL) ||
2047 (strcmp(errors,"strict") == 0)) {
2048 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002049 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002050 details);
2051 return -1;
2052 }
2053 else if (strcmp(errors,"ignore") == 0) {
2054 return 0;
2055 }
2056 else if (strcmp(errors,"replace") == 0) {
2057 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002058 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002059 return 0;
2060 }
2061 else {
2062 PyErr_Format(PyExc_ValueError,
2063 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002064 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 errors);
2066 return -1;
2067 }
2068}
2069
2070PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2071 int size,
2072 const char *errors)
2073{
2074 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002075 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002076
Guido van Rossumd57fd912000-03-10 22:53:23 +00002077 repr = PyString_FromStringAndSize(NULL, size);
2078 if (repr == NULL)
2079 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002080 if (size == 0)
2081 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002082
2083 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002084 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 while (size-- > 0) {
2086 Py_UNICODE ch = *p++;
2087 if (ch >= 256) {
2088 if (latin1_encoding_error(&p, &s, errors,
2089 "ordinal not in range(256)"))
2090 goto onError;
2091 }
2092 else
2093 *s++ = (char)ch;
2094 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002095 /* Resize if error handling skipped some characters */
2096 if (s - start < PyString_GET_SIZE(repr))
2097 if (_PyString_Resize(&repr, s - start))
2098 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099 return repr;
2100
2101 onError:
2102 Py_DECREF(repr);
2103 return NULL;
2104}
2105
2106PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2107{
2108 if (!PyUnicode_Check(unicode)) {
2109 PyErr_BadArgument();
2110 return NULL;
2111 }
2112 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2113 PyUnicode_GET_SIZE(unicode),
2114 NULL);
2115}
2116
2117/* --- 7-bit ASCII Codec -------------------------------------------------- */
2118
2119static
2120int ascii_decoding_error(const char **source,
2121 Py_UNICODE **dest,
2122 const char *errors,
2123 const char *details)
2124{
2125 if ((errors == NULL) ||
2126 (strcmp(errors,"strict") == 0)) {
2127 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002128 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002129 details);
2130 return -1;
2131 }
2132 else if (strcmp(errors,"ignore") == 0) {
2133 return 0;
2134 }
2135 else if (strcmp(errors,"replace") == 0) {
2136 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2137 (*dest)++;
2138 return 0;
2139 }
2140 else {
2141 PyErr_Format(PyExc_ValueError,
2142 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002143 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002144 errors);
2145 return -1;
2146 }
2147}
2148
2149PyObject *PyUnicode_DecodeASCII(const char *s,
2150 int size,
2151 const char *errors)
2152{
2153 PyUnicodeObject *v;
2154 Py_UNICODE *p;
2155
2156 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002157 if (size == 1 && *(unsigned char*)s < 128) {
2158 Py_UNICODE r = *(unsigned char*)s;
2159 return PyUnicode_FromUnicode(&r, 1);
2160 }
2161
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162 v = _PyUnicode_New(size);
2163 if (v == NULL)
2164 goto onError;
2165 if (size == 0)
2166 return (PyObject *)v;
2167 p = PyUnicode_AS_UNICODE(v);
2168 while (size-- > 0) {
2169 register unsigned char c;
2170
2171 c = (unsigned char)*s++;
2172 if (c < 128)
2173 *p++ = c;
2174 else if (ascii_decoding_error(&s, &p, errors,
2175 "ordinal not in range(128)"))
2176 goto onError;
2177 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002178 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002179 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002180 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002181 return (PyObject *)v;
2182
2183 onError:
2184 Py_XDECREF(v);
2185 return NULL;
2186}
2187
2188static
2189int ascii_encoding_error(const Py_UNICODE **source,
2190 char **dest,
2191 const char *errors,
2192 const char *details)
2193{
2194 if ((errors == NULL) ||
2195 (strcmp(errors,"strict") == 0)) {
2196 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002197 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002198 details);
2199 return -1;
2200 }
2201 else if (strcmp(errors,"ignore") == 0) {
2202 return 0;
2203 }
2204 else if (strcmp(errors,"replace") == 0) {
2205 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002206 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002207 return 0;
2208 }
2209 else {
2210 PyErr_Format(PyExc_ValueError,
2211 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002212 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213 errors);
2214 return -1;
2215 }
2216}
2217
2218PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2219 int size,
2220 const char *errors)
2221{
2222 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002223 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002224
Guido van Rossumd57fd912000-03-10 22:53:23 +00002225 repr = PyString_FromStringAndSize(NULL, size);
2226 if (repr == NULL)
2227 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002228 if (size == 0)
2229 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002230
2231 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002232 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002233 while (size-- > 0) {
2234 Py_UNICODE ch = *p++;
2235 if (ch >= 128) {
2236 if (ascii_encoding_error(&p, &s, errors,
2237 "ordinal not in range(128)"))
2238 goto onError;
2239 }
2240 else
2241 *s++ = (char)ch;
2242 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002243 /* Resize if error handling skipped some characters */
2244 if (s - start < PyString_GET_SIZE(repr))
2245 if (_PyString_Resize(&repr, s - start))
2246 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002247 return repr;
2248
2249 onError:
2250 Py_DECREF(repr);
2251 return NULL;
2252}
2253
2254PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2255{
2256 if (!PyUnicode_Check(unicode)) {
2257 PyErr_BadArgument();
2258 return NULL;
2259 }
2260 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2261 PyUnicode_GET_SIZE(unicode),
2262 NULL);
2263}
2264
Fredrik Lundh30831632001-06-26 15:11:00 +00002265#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002266
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002267/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002268
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002269PyObject *PyUnicode_DecodeMBCS(const char *s,
2270 int size,
2271 const char *errors)
2272{
2273 PyUnicodeObject *v;
2274 Py_UNICODE *p;
2275
2276 /* First get the size of the result */
2277 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002278 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002279 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2280
2281 v = _PyUnicode_New(usize);
2282 if (v == NULL)
2283 return NULL;
2284 if (usize == 0)
2285 return (PyObject *)v;
2286 p = PyUnicode_AS_UNICODE(v);
2287 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2288 Py_DECREF(v);
2289 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2290 }
2291
2292 return (PyObject *)v;
2293}
2294
2295PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2296 int size,
2297 const char *errors)
2298{
2299 PyObject *repr;
2300 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002301 DWORD mbcssize;
2302
2303 /* If there are no characters, bail now! */
2304 if (size==0)
2305 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002306
2307 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002308 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002309 if (mbcssize==0)
2310 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2311
2312 repr = PyString_FromStringAndSize(NULL, mbcssize);
2313 if (repr == NULL)
2314 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002315 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002316 return repr;
2317
2318 /* Do the conversion */
2319 s = PyString_AS_STRING(repr);
2320 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2321 Py_DECREF(repr);
2322 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2323 }
2324 return repr;
2325}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002326
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002327#endif /* MS_WIN32 */
2328
Guido van Rossumd57fd912000-03-10 22:53:23 +00002329/* --- Character Mapping Codec -------------------------------------------- */
2330
2331static
2332int charmap_decoding_error(const char **source,
2333 Py_UNICODE **dest,
2334 const char *errors,
2335 const char *details)
2336{
2337 if ((errors == NULL) ||
2338 (strcmp(errors,"strict") == 0)) {
2339 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002340 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002341 details);
2342 return -1;
2343 }
2344 else if (strcmp(errors,"ignore") == 0) {
2345 return 0;
2346 }
2347 else if (strcmp(errors,"replace") == 0) {
2348 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2349 (*dest)++;
2350 return 0;
2351 }
2352 else {
2353 PyErr_Format(PyExc_ValueError,
2354 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002355 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002356 errors);
2357 return -1;
2358 }
2359}
2360
2361PyObject *PyUnicode_DecodeCharmap(const char *s,
2362 int size,
2363 PyObject *mapping,
2364 const char *errors)
2365{
2366 PyUnicodeObject *v;
2367 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002368 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002369
2370 /* Default to Latin-1 */
2371 if (mapping == NULL)
2372 return PyUnicode_DecodeLatin1(s, size, errors);
2373
2374 v = _PyUnicode_New(size);
2375 if (v == NULL)
2376 goto onError;
2377 if (size == 0)
2378 return (PyObject *)v;
2379 p = PyUnicode_AS_UNICODE(v);
2380 while (size-- > 0) {
2381 unsigned char ch = *s++;
2382 PyObject *w, *x;
2383
2384 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2385 w = PyInt_FromLong((long)ch);
2386 if (w == NULL)
2387 goto onError;
2388 x = PyObject_GetItem(mapping, w);
2389 Py_DECREF(w);
2390 if (x == NULL) {
2391 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002392 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002393 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002394 x = Py_None;
2395 Py_INCREF(x);
2396 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002397 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002398 }
2399
2400 /* Apply mapping */
2401 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002402 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002403 if (value < 0 || value > 65535) {
2404 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002405 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002406 Py_DECREF(x);
2407 goto onError;
2408 }
2409 *p++ = (Py_UNICODE)value;
2410 }
2411 else if (x == Py_None) {
2412 /* undefined mapping */
2413 if (charmap_decoding_error(&s, &p, errors,
2414 "character maps to <undefined>")) {
2415 Py_DECREF(x);
2416 goto onError;
2417 }
2418 }
2419 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002420 int targetsize = PyUnicode_GET_SIZE(x);
2421
2422 if (targetsize == 1)
2423 /* 1-1 mapping */
2424 *p++ = *PyUnicode_AS_UNICODE(x);
2425
2426 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002427 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002428 if (targetsize > extrachars) {
2429 /* resize first */
2430 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2431 int needed = (targetsize - extrachars) + \
2432 (targetsize << 2);
2433 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002434 if (_PyUnicode_Resize(&v,
2435 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002436 Py_DECREF(x);
2437 goto onError;
2438 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002439 p = PyUnicode_AS_UNICODE(v) + oldpos;
2440 }
2441 Py_UNICODE_COPY(p,
2442 PyUnicode_AS_UNICODE(x),
2443 targetsize);
2444 p += targetsize;
2445 extrachars -= targetsize;
2446 }
2447 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002448 }
2449 else {
2450 /* wrong return value */
2451 PyErr_SetString(PyExc_TypeError,
2452 "character mapping must return integer, None or unicode");
2453 Py_DECREF(x);
2454 goto onError;
2455 }
2456 Py_DECREF(x);
2457 }
2458 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002459 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002460 goto onError;
2461 return (PyObject *)v;
2462
2463 onError:
2464 Py_XDECREF(v);
2465 return NULL;
2466}
2467
2468static
2469int charmap_encoding_error(const Py_UNICODE **source,
2470 char **dest,
2471 const char *errors,
2472 const char *details)
2473{
2474 if ((errors == NULL) ||
2475 (strcmp(errors,"strict") == 0)) {
2476 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002477 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002478 details);
2479 return -1;
2480 }
2481 else if (strcmp(errors,"ignore") == 0) {
2482 return 0;
2483 }
2484 else if (strcmp(errors,"replace") == 0) {
2485 **dest = '?';
2486 (*dest)++;
2487 return 0;
2488 }
2489 else {
2490 PyErr_Format(PyExc_ValueError,
2491 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002492 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002493 errors);
2494 return -1;
2495 }
2496}
2497
2498PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2499 int size,
2500 PyObject *mapping,
2501 const char *errors)
2502{
2503 PyObject *v;
2504 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002505 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002506
2507 /* Default to Latin-1 */
2508 if (mapping == NULL)
2509 return PyUnicode_EncodeLatin1(p, size, errors);
2510
2511 v = PyString_FromStringAndSize(NULL, size);
2512 if (v == NULL)
2513 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002514 if (size == 0)
2515 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002516 s = PyString_AS_STRING(v);
2517 while (size-- > 0) {
2518 Py_UNICODE ch = *p++;
2519 PyObject *w, *x;
2520
2521 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2522 w = PyInt_FromLong((long)ch);
2523 if (w == NULL)
2524 goto onError;
2525 x = PyObject_GetItem(mapping, w);
2526 Py_DECREF(w);
2527 if (x == NULL) {
2528 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002529 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002530 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002531 x = Py_None;
2532 Py_INCREF(x);
2533 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002534 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535 }
2536
2537 /* Apply mapping */
2538 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002539 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002540 if (value < 0 || value > 255) {
2541 PyErr_SetString(PyExc_TypeError,
2542 "character mapping must be in range(256)");
2543 Py_DECREF(x);
2544 goto onError;
2545 }
2546 *s++ = (char)value;
2547 }
2548 else if (x == Py_None) {
2549 /* undefined mapping */
2550 if (charmap_encoding_error(&p, &s, errors,
2551 "character maps to <undefined>")) {
2552 Py_DECREF(x);
2553 goto onError;
2554 }
2555 }
2556 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002557 int targetsize = PyString_GET_SIZE(x);
2558
2559 if (targetsize == 1)
2560 /* 1-1 mapping */
2561 *s++ = *PyString_AS_STRING(x);
2562
2563 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002564 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002565 if (targetsize > extrachars) {
2566 /* resize first */
2567 int oldpos = (int)(s - PyString_AS_STRING(v));
2568 int needed = (targetsize - extrachars) + \
2569 (targetsize << 2);
2570 extrachars += needed;
2571 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002572 Py_DECREF(x);
2573 goto onError;
2574 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002575 s = PyString_AS_STRING(v) + oldpos;
2576 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002577 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002578 s += targetsize;
2579 extrachars -= targetsize;
2580 }
2581 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002582 }
2583 else {
2584 /* wrong return value */
2585 PyErr_SetString(PyExc_TypeError,
2586 "character mapping must return integer, None or unicode");
2587 Py_DECREF(x);
2588 goto onError;
2589 }
2590 Py_DECREF(x);
2591 }
2592 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2593 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2594 goto onError;
2595 return v;
2596
2597 onError:
2598 Py_DECREF(v);
2599 return NULL;
2600}
2601
2602PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2603 PyObject *mapping)
2604{
2605 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2606 PyErr_BadArgument();
2607 return NULL;
2608 }
2609 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2610 PyUnicode_GET_SIZE(unicode),
2611 mapping,
2612 NULL);
2613}
2614
2615static
2616int translate_error(const Py_UNICODE **source,
2617 Py_UNICODE **dest,
2618 const char *errors,
2619 const char *details)
2620{
2621 if ((errors == NULL) ||
2622 (strcmp(errors,"strict") == 0)) {
2623 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002624 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002625 details);
2626 return -1;
2627 }
2628 else if (strcmp(errors,"ignore") == 0) {
2629 return 0;
2630 }
2631 else if (strcmp(errors,"replace") == 0) {
2632 **dest = '?';
2633 (*dest)++;
2634 return 0;
2635 }
2636 else {
2637 PyErr_Format(PyExc_ValueError,
2638 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002639 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002640 errors);
2641 return -1;
2642 }
2643}
2644
2645PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2646 int size,
2647 PyObject *mapping,
2648 const char *errors)
2649{
2650 PyUnicodeObject *v;
2651 Py_UNICODE *p;
2652
2653 if (mapping == NULL) {
2654 PyErr_BadArgument();
2655 return NULL;
2656 }
2657
2658 /* Output will never be longer than input */
2659 v = _PyUnicode_New(size);
2660 if (v == NULL)
2661 goto onError;
2662 if (size == 0)
2663 goto done;
2664 p = PyUnicode_AS_UNICODE(v);
2665 while (size-- > 0) {
2666 Py_UNICODE ch = *s++;
2667 PyObject *w, *x;
2668
2669 /* Get mapping */
2670 w = PyInt_FromLong(ch);
2671 if (w == NULL)
2672 goto onError;
2673 x = PyObject_GetItem(mapping, w);
2674 Py_DECREF(w);
2675 if (x == NULL) {
2676 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2677 /* No mapping found: default to 1-1 mapping */
2678 PyErr_Clear();
2679 *p++ = ch;
2680 continue;
2681 }
2682 goto onError;
2683 }
2684
2685 /* Apply mapping */
2686 if (PyInt_Check(x))
2687 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2688 else if (x == Py_None) {
2689 /* undefined mapping */
2690 if (translate_error(&s, &p, errors,
2691 "character maps to <undefined>")) {
2692 Py_DECREF(x);
2693 goto onError;
2694 }
2695 }
2696 else if (PyUnicode_Check(x)) {
2697 if (PyUnicode_GET_SIZE(x) != 1) {
2698 /* 1-n mapping */
2699 PyErr_SetString(PyExc_NotImplementedError,
2700 "1-n mappings are currently not implemented");
2701 Py_DECREF(x);
2702 goto onError;
2703 }
2704 *p++ = *PyUnicode_AS_UNICODE(x);
2705 }
2706 else {
2707 /* wrong return value */
2708 PyErr_SetString(PyExc_TypeError,
2709 "translate mapping must return integer, None or unicode");
2710 Py_DECREF(x);
2711 goto onError;
2712 }
2713 Py_DECREF(x);
2714 }
2715 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002716 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002717 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002718
2719 done:
2720 return (PyObject *)v;
2721
2722 onError:
2723 Py_XDECREF(v);
2724 return NULL;
2725}
2726
2727PyObject *PyUnicode_Translate(PyObject *str,
2728 PyObject *mapping,
2729 const char *errors)
2730{
2731 PyObject *result;
2732
2733 str = PyUnicode_FromObject(str);
2734 if (str == NULL)
2735 goto onError;
2736 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2737 PyUnicode_GET_SIZE(str),
2738 mapping,
2739 errors);
2740 Py_DECREF(str);
2741 return result;
2742
2743 onError:
2744 Py_XDECREF(str);
2745 return NULL;
2746}
2747
Guido van Rossum9e896b32000-04-05 20:11:21 +00002748/* --- Decimal Encoder ---------------------------------------------------- */
2749
2750int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2751 int length,
2752 char *output,
2753 const char *errors)
2754{
2755 Py_UNICODE *p, *end;
2756
2757 if (output == NULL) {
2758 PyErr_BadArgument();
2759 return -1;
2760 }
2761
2762 p = s;
2763 end = s + length;
2764 while (p < end) {
2765 register Py_UNICODE ch = *p++;
2766 int decimal;
2767
2768 if (Py_UNICODE_ISSPACE(ch)) {
2769 *output++ = ' ';
2770 continue;
2771 }
2772 decimal = Py_UNICODE_TODECIMAL(ch);
2773 if (decimal >= 0) {
2774 *output++ = '0' + decimal;
2775 continue;
2776 }
Guido van Rossumba477042000-04-06 18:18:10 +00002777 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002778 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002779 continue;
2780 }
2781 /* All other characters are considered invalid */
2782 if (errors == NULL || strcmp(errors, "strict") == 0) {
2783 PyErr_SetString(PyExc_ValueError,
2784 "invalid decimal Unicode string");
2785 goto onError;
2786 }
2787 else if (strcmp(errors, "ignore") == 0)
2788 continue;
2789 else if (strcmp(errors, "replace") == 0) {
2790 *output++ = '?';
2791 continue;
2792 }
2793 }
2794 /* 0-terminate the output string */
2795 *output++ = '\0';
2796 return 0;
2797
2798 onError:
2799 return -1;
2800}
2801
Guido van Rossumd57fd912000-03-10 22:53:23 +00002802/* --- Helpers ------------------------------------------------------------ */
2803
2804static
2805int count(PyUnicodeObject *self,
2806 int start,
2807 int end,
2808 PyUnicodeObject *substring)
2809{
2810 int count = 0;
2811
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002812 if (start < 0)
2813 start += self->length;
2814 if (start < 0)
2815 start = 0;
2816 if (end > self->length)
2817 end = self->length;
2818 if (end < 0)
2819 end += self->length;
2820 if (end < 0)
2821 end = 0;
2822
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002823 if (substring->length == 0)
2824 return (end - start + 1);
2825
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 end -= substring->length;
2827
2828 while (start <= end)
2829 if (Py_UNICODE_MATCH(self, start, substring)) {
2830 count++;
2831 start += substring->length;
2832 } else
2833 start++;
2834
2835 return count;
2836}
2837
2838int PyUnicode_Count(PyObject *str,
2839 PyObject *substr,
2840 int start,
2841 int end)
2842{
2843 int result;
2844
2845 str = PyUnicode_FromObject(str);
2846 if (str == NULL)
2847 return -1;
2848 substr = PyUnicode_FromObject(substr);
2849 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002850 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851 return -1;
2852 }
2853
2854 result = count((PyUnicodeObject *)str,
2855 start, end,
2856 (PyUnicodeObject *)substr);
2857
2858 Py_DECREF(str);
2859 Py_DECREF(substr);
2860 return result;
2861}
2862
2863static
2864int findstring(PyUnicodeObject *self,
2865 PyUnicodeObject *substring,
2866 int start,
2867 int end,
2868 int direction)
2869{
2870 if (start < 0)
2871 start += self->length;
2872 if (start < 0)
2873 start = 0;
2874
2875 if (substring->length == 0)
2876 return start;
2877
2878 if (end > self->length)
2879 end = self->length;
2880 if (end < 0)
2881 end += self->length;
2882 if (end < 0)
2883 end = 0;
2884
2885 end -= substring->length;
2886
2887 if (direction < 0) {
2888 for (; end >= start; end--)
2889 if (Py_UNICODE_MATCH(self, end, substring))
2890 return end;
2891 } else {
2892 for (; start <= end; start++)
2893 if (Py_UNICODE_MATCH(self, start, substring))
2894 return start;
2895 }
2896
2897 return -1;
2898}
2899
2900int PyUnicode_Find(PyObject *str,
2901 PyObject *substr,
2902 int start,
2903 int end,
2904 int direction)
2905{
2906 int result;
2907
2908 str = PyUnicode_FromObject(str);
2909 if (str == NULL)
2910 return -1;
2911 substr = PyUnicode_FromObject(substr);
2912 if (substr == NULL) {
2913 Py_DECREF(substr);
2914 return -1;
2915 }
2916
2917 result = findstring((PyUnicodeObject *)str,
2918 (PyUnicodeObject *)substr,
2919 start, end, direction);
2920 Py_DECREF(str);
2921 Py_DECREF(substr);
2922 return result;
2923}
2924
2925static
2926int tailmatch(PyUnicodeObject *self,
2927 PyUnicodeObject *substring,
2928 int start,
2929 int end,
2930 int direction)
2931{
2932 if (start < 0)
2933 start += self->length;
2934 if (start < 0)
2935 start = 0;
2936
2937 if (substring->length == 0)
2938 return 1;
2939
2940 if (end > self->length)
2941 end = self->length;
2942 if (end < 0)
2943 end += self->length;
2944 if (end < 0)
2945 end = 0;
2946
2947 end -= substring->length;
2948 if (end < start)
2949 return 0;
2950
2951 if (direction > 0) {
2952 if (Py_UNICODE_MATCH(self, end, substring))
2953 return 1;
2954 } else {
2955 if (Py_UNICODE_MATCH(self, start, substring))
2956 return 1;
2957 }
2958
2959 return 0;
2960}
2961
2962int PyUnicode_Tailmatch(PyObject *str,
2963 PyObject *substr,
2964 int start,
2965 int end,
2966 int direction)
2967{
2968 int result;
2969
2970 str = PyUnicode_FromObject(str);
2971 if (str == NULL)
2972 return -1;
2973 substr = PyUnicode_FromObject(substr);
2974 if (substr == NULL) {
2975 Py_DECREF(substr);
2976 return -1;
2977 }
2978
2979 result = tailmatch((PyUnicodeObject *)str,
2980 (PyUnicodeObject *)substr,
2981 start, end, direction);
2982 Py_DECREF(str);
2983 Py_DECREF(substr);
2984 return result;
2985}
2986
2987static
2988const Py_UNICODE *findchar(const Py_UNICODE *s,
2989 int size,
2990 Py_UNICODE ch)
2991{
2992 /* like wcschr, but doesn't stop at NULL characters */
2993
2994 while (size-- > 0) {
2995 if (*s == ch)
2996 return s;
2997 s++;
2998 }
2999
3000 return NULL;
3001}
3002
3003/* Apply fixfct filter to the Unicode object self and return a
3004 reference to the modified object */
3005
3006static
3007PyObject *fixup(PyUnicodeObject *self,
3008 int (*fixfct)(PyUnicodeObject *s))
3009{
3010
3011 PyUnicodeObject *u;
3012
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003013 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003014 if (u == NULL)
3015 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003016
3017 Py_UNICODE_COPY(u->str, self->str, self->length);
3018
Tim Peters7a29bd52001-09-12 03:03:31 +00003019 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003020 /* fixfct should return TRUE if it modified the buffer. If
3021 FALSE, return a reference to the original buffer instead
3022 (to save space, not time) */
3023 Py_INCREF(self);
3024 Py_DECREF(u);
3025 return (PyObject*) self;
3026 }
3027 return (PyObject*) u;
3028}
3029
3030static
3031int fixupper(PyUnicodeObject *self)
3032{
3033 int len = self->length;
3034 Py_UNICODE *s = self->str;
3035 int status = 0;
3036
3037 while (len-- > 0) {
3038 register Py_UNICODE ch;
3039
3040 ch = Py_UNICODE_TOUPPER(*s);
3041 if (ch != *s) {
3042 status = 1;
3043 *s = ch;
3044 }
3045 s++;
3046 }
3047
3048 return status;
3049}
3050
3051static
3052int fixlower(PyUnicodeObject *self)
3053{
3054 int len = self->length;
3055 Py_UNICODE *s = self->str;
3056 int status = 0;
3057
3058 while (len-- > 0) {
3059 register Py_UNICODE ch;
3060
3061 ch = Py_UNICODE_TOLOWER(*s);
3062 if (ch != *s) {
3063 status = 1;
3064 *s = ch;
3065 }
3066 s++;
3067 }
3068
3069 return status;
3070}
3071
3072static
3073int fixswapcase(PyUnicodeObject *self)
3074{
3075 int len = self->length;
3076 Py_UNICODE *s = self->str;
3077 int status = 0;
3078
3079 while (len-- > 0) {
3080 if (Py_UNICODE_ISUPPER(*s)) {
3081 *s = Py_UNICODE_TOLOWER(*s);
3082 status = 1;
3083 } else if (Py_UNICODE_ISLOWER(*s)) {
3084 *s = Py_UNICODE_TOUPPER(*s);
3085 status = 1;
3086 }
3087 s++;
3088 }
3089
3090 return status;
3091}
3092
3093static
3094int fixcapitalize(PyUnicodeObject *self)
3095{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003096 int len = self->length;
3097 Py_UNICODE *s = self->str;
3098 int status = 0;
3099
3100 if (len == 0)
3101 return 0;
3102 if (Py_UNICODE_ISLOWER(*s)) {
3103 *s = Py_UNICODE_TOUPPER(*s);
3104 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003105 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003106 s++;
3107 while (--len > 0) {
3108 if (Py_UNICODE_ISUPPER(*s)) {
3109 *s = Py_UNICODE_TOLOWER(*s);
3110 status = 1;
3111 }
3112 s++;
3113 }
3114 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003115}
3116
3117static
3118int fixtitle(PyUnicodeObject *self)
3119{
3120 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3121 register Py_UNICODE *e;
3122 int previous_is_cased;
3123
3124 /* Shortcut for single character strings */
3125 if (PyUnicode_GET_SIZE(self) == 1) {
3126 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3127 if (*p != ch) {
3128 *p = ch;
3129 return 1;
3130 }
3131 else
3132 return 0;
3133 }
3134
3135 e = p + PyUnicode_GET_SIZE(self);
3136 previous_is_cased = 0;
3137 for (; p < e; p++) {
3138 register const Py_UNICODE ch = *p;
3139
3140 if (previous_is_cased)
3141 *p = Py_UNICODE_TOLOWER(ch);
3142 else
3143 *p = Py_UNICODE_TOTITLE(ch);
3144
3145 if (Py_UNICODE_ISLOWER(ch) ||
3146 Py_UNICODE_ISUPPER(ch) ||
3147 Py_UNICODE_ISTITLE(ch))
3148 previous_is_cased = 1;
3149 else
3150 previous_is_cased = 0;
3151 }
3152 return 1;
3153}
3154
3155PyObject *PyUnicode_Join(PyObject *separator,
3156 PyObject *seq)
3157{
3158 Py_UNICODE *sep;
3159 int seplen;
3160 PyUnicodeObject *res = NULL;
3161 int reslen = 0;
3162 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003163 int sz = 100;
3164 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003165 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003166
Tim Peters2cfe3682001-05-05 05:36:48 +00003167 it = PyObject_GetIter(seq);
3168 if (it == NULL)
3169 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003170
3171 if (separator == NULL) {
3172 Py_UNICODE blank = ' ';
3173 sep = &blank;
3174 seplen = 1;
3175 }
3176 else {
3177 separator = PyUnicode_FromObject(separator);
3178 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003179 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003180 sep = PyUnicode_AS_UNICODE(separator);
3181 seplen = PyUnicode_GET_SIZE(separator);
3182 }
3183
3184 res = _PyUnicode_New(sz);
3185 if (res == NULL)
3186 goto onError;
3187 p = PyUnicode_AS_UNICODE(res);
3188 reslen = 0;
3189
Tim Peters2cfe3682001-05-05 05:36:48 +00003190 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003191 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003192 PyObject *item = PyIter_Next(it);
3193 if (item == NULL) {
3194 if (PyErr_Occurred())
3195 goto onError;
3196 break;
3197 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003198 if (!PyUnicode_Check(item)) {
3199 PyObject *v;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003200 if (!PyString_Check(item)) {
3201 PyErr_Format(PyExc_TypeError,
3202 "sequence item %i: expected string or Unicode,"
3203 " %.80s found",
3204 i, item->ob_type->tp_name);
3205 Py_DECREF(item);
3206 goto onError;
3207 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003208 v = PyUnicode_FromObject(item);
3209 Py_DECREF(item);
3210 item = v;
3211 if (item == NULL)
3212 goto onError;
3213 }
3214 itemlen = PyUnicode_GET_SIZE(item);
3215 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003216 if (_PyUnicode_Resize(&res, sz*2)) {
3217 Py_DECREF(item);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003218 goto onError;
Marc-André Lemburg3508e302001-09-20 17:22:58 +00003219 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003220 sz *= 2;
3221 p = PyUnicode_AS_UNICODE(res) + reslen;
3222 }
3223 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003224 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003225 p += seplen;
3226 reslen += seplen;
3227 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003228 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003229 p += itemlen;
3230 reslen += itemlen;
3231 Py_DECREF(item);
3232 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003233 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234 goto onError;
3235
3236 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003237 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003238 return (PyObject *)res;
3239
3240 onError:
3241 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003242 Py_XDECREF(res);
3243 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 return NULL;
3245}
3246
3247static
3248PyUnicodeObject *pad(PyUnicodeObject *self,
3249 int left,
3250 int right,
3251 Py_UNICODE fill)
3252{
3253 PyUnicodeObject *u;
3254
3255 if (left < 0)
3256 left = 0;
3257 if (right < 0)
3258 right = 0;
3259
Tim Peters7a29bd52001-09-12 03:03:31 +00003260 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 Py_INCREF(self);
3262 return self;
3263 }
3264
3265 u = _PyUnicode_New(left + self->length + right);
3266 if (u) {
3267 if (left)
3268 Py_UNICODE_FILL(u->str, fill, left);
3269 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3270 if (right)
3271 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3272 }
3273
3274 return u;
3275}
3276
/* Append the slice data[left:right] to the list being built.

   The macro relies on three names in the invoking function: a
   `PyObject *str` scratch variable, the `PyObject *list` being filled,
   and an `onError` label that is jumped to if either the slice cannot
   be created or the append fails.

   Wrapped in do { } while (0) so that an invocation followed by ';' is
   exactly one statement; arguments are parenthesised so that expression
   arguments expand safely. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list now holds its own reference */                      \
        Py_DECREF(str);                                                 \
    } while (0)
3287
3288static
3289PyObject *split_whitespace(PyUnicodeObject *self,
3290 PyObject *list,
3291 int maxcount)
3292{
3293 register int i;
3294 register int j;
3295 int len = self->length;
3296 PyObject *str;
3297
3298 for (i = j = 0; i < len; ) {
3299 /* find a token */
3300 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3301 i++;
3302 j = i;
3303 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3304 i++;
3305 if (j < i) {
3306 if (maxcount-- <= 0)
3307 break;
3308 SPLIT_APPEND(self->str, j, i);
3309 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3310 i++;
3311 j = i;
3312 }
3313 }
3314 if (j < len) {
3315 SPLIT_APPEND(self->str, j, len);
3316 }
3317 return list;
3318
3319 onError:
3320 Py_DECREF(list);
3321 return NULL;
3322}
3323
3324PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003325 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326{
3327 register int i;
3328 register int j;
3329 int len;
3330 PyObject *list;
3331 PyObject *str;
3332 Py_UNICODE *data;
3333
3334 string = PyUnicode_FromObject(string);
3335 if (string == NULL)
3336 return NULL;
3337 data = PyUnicode_AS_UNICODE(string);
3338 len = PyUnicode_GET_SIZE(string);
3339
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340 list = PyList_New(0);
3341 if (!list)
3342 goto onError;
3343
3344 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003345 int eol;
3346
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347 /* Find a line and append it */
3348 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3349 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350
3351 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003352 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353 if (i < len) {
3354 if (data[i] == '\r' && i + 1 < len &&
3355 data[i+1] == '\n')
3356 i += 2;
3357 else
3358 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003359 if (keepends)
3360 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003361 }
Guido van Rossum86662912000-04-11 15:38:46 +00003362 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003363 j = i;
3364 }
3365 if (j < len) {
3366 SPLIT_APPEND(data, j, len);
3367 }
3368
3369 Py_DECREF(string);
3370 return list;
3371
3372 onError:
3373 Py_DECREF(list);
3374 Py_DECREF(string);
3375 return NULL;
3376}
3377
3378static
3379PyObject *split_char(PyUnicodeObject *self,
3380 PyObject *list,
3381 Py_UNICODE ch,
3382 int maxcount)
3383{
3384 register int i;
3385 register int j;
3386 int len = self->length;
3387 PyObject *str;
3388
3389 for (i = j = 0; i < len; ) {
3390 if (self->str[i] == ch) {
3391 if (maxcount-- <= 0)
3392 break;
3393 SPLIT_APPEND(self->str, j, i);
3394 i = j = i + 1;
3395 } else
3396 i++;
3397 }
3398 if (j <= len) {
3399 SPLIT_APPEND(self->str, j, len);
3400 }
3401 return list;
3402
3403 onError:
3404 Py_DECREF(list);
3405 return NULL;
3406}
3407
3408static
3409PyObject *split_substring(PyUnicodeObject *self,
3410 PyObject *list,
3411 PyUnicodeObject *substring,
3412 int maxcount)
3413{
3414 register int i;
3415 register int j;
3416 int len = self->length;
3417 int sublen = substring->length;
3418 PyObject *str;
3419
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003420 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003421 if (Py_UNICODE_MATCH(self, i, substring)) {
3422 if (maxcount-- <= 0)
3423 break;
3424 SPLIT_APPEND(self->str, j, i);
3425 i = j = i + sublen;
3426 } else
3427 i++;
3428 }
3429 if (j <= len) {
3430 SPLIT_APPEND(self->str, j, len);
3431 }
3432 return list;
3433
3434 onError:
3435 Py_DECREF(list);
3436 return NULL;
3437}
3438
3439#undef SPLIT_APPEND
3440
3441static
3442PyObject *split(PyUnicodeObject *self,
3443 PyUnicodeObject *substring,
3444 int maxcount)
3445{
3446 PyObject *list;
3447
3448 if (maxcount < 0)
3449 maxcount = INT_MAX;
3450
3451 list = PyList_New(0);
3452 if (!list)
3453 return NULL;
3454
3455 if (substring == NULL)
3456 return split_whitespace(self,list,maxcount);
3457
3458 else if (substring->length == 1)
3459 return split_char(self,list,substring->str[0],maxcount);
3460
3461 else if (substring->length == 0) {
3462 Py_DECREF(list);
3463 PyErr_SetString(PyExc_ValueError, "empty separator");
3464 return NULL;
3465 }
3466 else
3467 return split_substring(self,list,substring,maxcount);
3468}
3469
3470static
3471PyObject *strip(PyUnicodeObject *self,
3472 int left,
3473 int right)
3474{
3475 Py_UNICODE *p = self->str;
3476 int start = 0;
3477 int end = self->length;
3478
3479 if (left)
3480 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3481 start++;
3482
3483 if (right)
3484 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3485 end--;
3486
Tim Peters7a29bd52001-09-12 03:03:31 +00003487 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003488 /* couldn't strip anything off, return original string */
3489 Py_INCREF(self);
3490 return (PyObject*) self;
3491 }
3492
3493 return (PyObject*) PyUnicode_FromUnicode(
3494 self->str + start,
3495 end - start
3496 );
3497}
3498
3499static
3500PyObject *replace(PyUnicodeObject *self,
3501 PyUnicodeObject *str1,
3502 PyUnicodeObject *str2,
3503 int maxcount)
3504{
3505 PyUnicodeObject *u;
3506
3507 if (maxcount < 0)
3508 maxcount = INT_MAX;
3509
3510 if (str1->length == 1 && str2->length == 1) {
3511 int i;
3512
3513 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003514 if (!findchar(self->str, self->length, str1->str[0]) &&
3515 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003516 /* nothing to replace, return original string */
3517 Py_INCREF(self);
3518 u = self;
3519 } else {
3520 Py_UNICODE u1 = str1->str[0];
3521 Py_UNICODE u2 = str2->str[0];
3522
3523 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003524 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003525 self->length
3526 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003527 if (u != NULL) {
3528 Py_UNICODE_COPY(u->str, self->str,
3529 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530 for (i = 0; i < u->length; i++)
3531 if (u->str[i] == u1) {
3532 if (--maxcount < 0)
3533 break;
3534 u->str[i] = u2;
3535 }
3536 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003537 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003538
3539 } else {
3540 int n, i;
3541 Py_UNICODE *p;
3542
3543 /* replace strings */
3544 n = count(self, 0, self->length, str1);
3545 if (n > maxcount)
3546 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003547 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003548 /* nothing to replace, return original string */
3549 Py_INCREF(self);
3550 u = self;
3551 } else {
3552 u = _PyUnicode_New(
3553 self->length + n * (str2->length - str1->length));
3554 if (u) {
3555 i = 0;
3556 p = u->str;
3557 while (i <= self->length - str1->length)
3558 if (Py_UNICODE_MATCH(self, i, str1)) {
3559 /* replace string segment */
3560 Py_UNICODE_COPY(p, str2->str, str2->length);
3561 p += str2->length;
3562 i += str1->length;
3563 if (--n <= 0) {
3564 /* copy remaining part */
3565 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3566 break;
3567 }
3568 } else
3569 *p++ = self->str[i++];
3570 }
3571 }
3572 }
3573
3574 return (PyObject *) u;
3575}
3576
3577/* --- Unicode Object Methods --------------------------------------------- */
3578
3579static char title__doc__[] =
3580"S.title() -> unicode\n\
3581\n\
3582Return a titlecased version of S, i.e. words start with title case\n\
3583characters, all remaining cased characters have lower case.";
3584
3585static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003586unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003588 return fixup(self, fixtitle);
3589}
3590
3591static char capitalize__doc__[] =
3592"S.capitalize() -> unicode\n\
3593\n\
3594Return a capitalized version of S, i.e. make the first character\n\
3595have upper case.";
3596
3597static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003598unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003599{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003600 return fixup(self, fixcapitalize);
3601}
3602
3603#if 0
3604static char capwords__doc__[] =
3605"S.capwords() -> unicode\n\
3606\n\
3607Apply .capitalize() to all words in S and return the result with\n\
3608normalized whitespace (all whitespace strings are replaced by ' ').";
3609
3610static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003611unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003612{
3613 PyObject *list;
3614 PyObject *item;
3615 int i;
3616
Guido van Rossumd57fd912000-03-10 22:53:23 +00003617 /* Split into words */
3618 list = split(self, NULL, -1);
3619 if (!list)
3620 return NULL;
3621
3622 /* Capitalize each word */
3623 for (i = 0; i < PyList_GET_SIZE(list); i++) {
3624 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
3625 fixcapitalize);
3626 if (item == NULL)
3627 goto onError;
3628 Py_DECREF(PyList_GET_ITEM(list, i));
3629 PyList_SET_ITEM(list, i, item);
3630 }
3631
3632 /* Join the words to form a new string */
3633 item = PyUnicode_Join(NULL, list);
3634
3635onError:
3636 Py_DECREF(list);
3637 return (PyObject *)item;
3638}
3639#endif
3640
3641static char center__doc__[] =
3642"S.center(width) -> unicode\n\
3643\n\
3644Return S centered in a Unicode string of length width. Padding is done\n\
3645using spaces.";
3646
3647static PyObject *
3648unicode_center(PyUnicodeObject *self, PyObject *args)
3649{
3650 int marg, left;
3651 int width;
3652
3653 if (!PyArg_ParseTuple(args, "i:center", &width))
3654 return NULL;
3655
Tim Peters7a29bd52001-09-12 03:03:31 +00003656 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657 Py_INCREF(self);
3658 return (PyObject*) self;
3659 }
3660
3661 marg = width - self->length;
3662 left = marg / 2 + (marg & width & 1);
3663
3664 return (PyObject*) pad(self, left, marg - left, ' ');
3665}
3666
Marc-André Lemburge5034372000-08-08 08:04:29 +00003667#if 0
3668
3669/* This code should go into some future Unicode collation support
3670 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003671 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003672
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003673/* speedy UTF-16 code point order comparison */
3674/* gleaned from: */
3675/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3676
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003677static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003678{
3679 0, 0, 0, 0, 0, 0, 0, 0,
3680 0, 0, 0, 0, 0, 0, 0, 0,
3681 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003682 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003683};
3684
Guido van Rossumd57fd912000-03-10 22:53:23 +00003685static int
3686unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3687{
3688 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003689
Guido van Rossumd57fd912000-03-10 22:53:23 +00003690 Py_UNICODE *s1 = str1->str;
3691 Py_UNICODE *s2 = str2->str;
3692
3693 len1 = str1->length;
3694 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003695
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003697 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003698
3699 c1 = *s1++;
3700 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003701
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003702 if (c1 > (1<<11) * 26)
3703 c1 += utf16Fixup[c1>>11];
3704 if (c2 > (1<<11) * 26)
3705 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003706 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003707
3708 if (c1 != c2)
3709 return (c1 < c2) ? -1 : 1;
3710
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003711 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003712 }
3713
3714 return (len1 < len2) ? -1 : (len1 != len2);
3715}
3716
Marc-André Lemburge5034372000-08-08 08:04:29 +00003717#else
3718
3719static int
3720unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3721{
3722 register int len1, len2;
3723
3724 Py_UNICODE *s1 = str1->str;
3725 Py_UNICODE *s2 = str2->str;
3726
3727 len1 = str1->length;
3728 len2 = str2->length;
3729
3730 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003731 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003732
Fredrik Lundh45714e92001-06-26 16:39:36 +00003733 c1 = *s1++;
3734 c2 = *s2++;
3735
3736 if (c1 != c2)
3737 return (c1 < c2) ? -1 : 1;
3738
Marc-André Lemburge5034372000-08-08 08:04:29 +00003739 len1--; len2--;
3740 }
3741
3742 return (len1 < len2) ? -1 : (len1 != len2);
3743}
3744
3745#endif
3746
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747int PyUnicode_Compare(PyObject *left,
3748 PyObject *right)
3749{
3750 PyUnicodeObject *u = NULL, *v = NULL;
3751 int result;
3752
3753 /* Coerce the two arguments */
3754 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3755 if (u == NULL)
3756 goto onError;
3757 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3758 if (v == NULL)
3759 goto onError;
3760
Thomas Wouters7e474022000-07-16 12:04:32 +00003761 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003762 if (v == u) {
3763 Py_DECREF(u);
3764 Py_DECREF(v);
3765 return 0;
3766 }
3767
3768 result = unicode_compare(u, v);
3769
3770 Py_DECREF(u);
3771 Py_DECREF(v);
3772 return result;
3773
3774onError:
3775 Py_XDECREF(u);
3776 Py_XDECREF(v);
3777 return -1;
3778}
3779
Guido van Rossum403d68b2000-03-13 15:55:09 +00003780int PyUnicode_Contains(PyObject *container,
3781 PyObject *element)
3782{
3783 PyUnicodeObject *u = NULL, *v = NULL;
3784 int result;
3785 register const Py_UNICODE *p, *e;
3786 register Py_UNICODE ch;
3787
3788 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003789 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003790 if (v == NULL) {
3791 PyErr_SetString(PyExc_TypeError,
3792 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003793 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003794 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003795 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3796 if (u == NULL) {
3797 Py_DECREF(v);
3798 goto onError;
3799 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003800
3801 /* Check v in u */
3802 if (PyUnicode_GET_SIZE(v) != 1) {
3803 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003804 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003805 goto onError;
3806 }
3807 ch = *PyUnicode_AS_UNICODE(v);
3808 p = PyUnicode_AS_UNICODE(u);
3809 e = p + PyUnicode_GET_SIZE(u);
3810 result = 0;
3811 while (p < e) {
3812 if (*p++ == ch) {
3813 result = 1;
3814 break;
3815 }
3816 }
3817
3818 Py_DECREF(u);
3819 Py_DECREF(v);
3820 return result;
3821
3822onError:
3823 Py_XDECREF(u);
3824 Py_XDECREF(v);
3825 return -1;
3826}
3827
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828/* Concat to string or Unicode object giving a new Unicode object. */
3829
3830PyObject *PyUnicode_Concat(PyObject *left,
3831 PyObject *right)
3832{
3833 PyUnicodeObject *u = NULL, *v = NULL, *w;
3834
3835 /* Coerce the two arguments */
3836 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3837 if (u == NULL)
3838 goto onError;
3839 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3840 if (v == NULL)
3841 goto onError;
3842
3843 /* Shortcuts */
3844 if (v == unicode_empty) {
3845 Py_DECREF(v);
3846 return (PyObject *)u;
3847 }
3848 if (u == unicode_empty) {
3849 Py_DECREF(u);
3850 return (PyObject *)v;
3851 }
3852
3853 /* Concat the two Unicode strings */
3854 w = _PyUnicode_New(u->length + v->length);
3855 if (w == NULL)
3856 goto onError;
3857 Py_UNICODE_COPY(w->str, u->str, u->length);
3858 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3859
3860 Py_DECREF(u);
3861 Py_DECREF(v);
3862 return (PyObject *)w;
3863
3864onError:
3865 Py_XDECREF(u);
3866 Py_XDECREF(v);
3867 return NULL;
3868}
3869
3870static char count__doc__[] =
3871"S.count(sub[, start[, end]]) -> int\n\
3872\n\
3873Return the number of occurrences of substring sub in Unicode string\n\
3874S[start:end]. Optional arguments start and end are\n\
3875interpreted as in slice notation.";
3876
3877static PyObject *
3878unicode_count(PyUnicodeObject *self, PyObject *args)
3879{
3880 PyUnicodeObject *substring;
3881 int start = 0;
3882 int end = INT_MAX;
3883 PyObject *result;
3884
Guido van Rossumb8872e62000-05-09 14:14:27 +00003885 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3886 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003887 return NULL;
3888
3889 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3890 (PyObject *)substring);
3891 if (substring == NULL)
3892 return NULL;
3893
Guido van Rossumd57fd912000-03-10 22:53:23 +00003894 if (start < 0)
3895 start += self->length;
3896 if (start < 0)
3897 start = 0;
3898 if (end > self->length)
3899 end = self->length;
3900 if (end < 0)
3901 end += self->length;
3902 if (end < 0)
3903 end = 0;
3904
3905 result = PyInt_FromLong((long) count(self, start, end, substring));
3906
3907 Py_DECREF(substring);
3908 return result;
3909}
3910
3911static char encode__doc__[] =
3912"S.encode([encoding[,errors]]) -> string\n\
3913\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003914Return an encoded string version of S. Default encoding is the current\n\
3915default string encoding. errors may be given to set a different error\n\
3916handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3917a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003918
3919static PyObject *
3920unicode_encode(PyUnicodeObject *self, PyObject *args)
3921{
3922 char *encoding = NULL;
3923 char *errors = NULL;
3924 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3925 return NULL;
3926 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3927}
3928
3929static char expandtabs__doc__[] =
3930"S.expandtabs([tabsize]) -> unicode\n\
3931\n\
3932Return a copy of S where all tab characters are expanded using spaces.\n\
3933If tabsize is not given, a tab size of 8 characters is assumed.";
3934
3935static PyObject*
3936unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3937{
3938 Py_UNICODE *e;
3939 Py_UNICODE *p;
3940 Py_UNICODE *q;
3941 int i, j;
3942 PyUnicodeObject *u;
3943 int tabsize = 8;
3944
3945 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3946 return NULL;
3947
Thomas Wouters7e474022000-07-16 12:04:32 +00003948 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003949 i = j = 0;
3950 e = self->str + self->length;
3951 for (p = self->str; p < e; p++)
3952 if (*p == '\t') {
3953 if (tabsize > 0)
3954 j += tabsize - (j % tabsize);
3955 }
3956 else {
3957 j++;
3958 if (*p == '\n' || *p == '\r') {
3959 i += j;
3960 j = 0;
3961 }
3962 }
3963
3964 /* Second pass: create output string and fill it */
3965 u = _PyUnicode_New(i + j);
3966 if (!u)
3967 return NULL;
3968
3969 j = 0;
3970 q = u->str;
3971
3972 for (p = self->str; p < e; p++)
3973 if (*p == '\t') {
3974 if (tabsize > 0) {
3975 i = tabsize - (j % tabsize);
3976 j += i;
3977 while (i--)
3978 *q++ = ' ';
3979 }
3980 }
3981 else {
3982 j++;
3983 *q++ = *p;
3984 if (*p == '\n' || *p == '\r')
3985 j = 0;
3986 }
3987
3988 return (PyObject*) u;
3989}
3990
3991static char find__doc__[] =
3992"S.find(sub [,start [,end]]) -> int\n\
3993\n\
3994Return the lowest index in S where substring sub is found,\n\
3995such that sub is contained within s[start,end]. Optional\n\
3996arguments start and end are interpreted as in slice notation.\n\
3997\n\
3998Return -1 on failure.";
3999
4000static PyObject *
4001unicode_find(PyUnicodeObject *self, PyObject *args)
4002{
4003 PyUnicodeObject *substring;
4004 int start = 0;
4005 int end = INT_MAX;
4006 PyObject *result;
4007
Guido van Rossumb8872e62000-05-09 14:14:27 +00004008 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
4009 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004010 return NULL;
4011 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4012 (PyObject *)substring);
4013 if (substring == NULL)
4014 return NULL;
4015
4016 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
4017
4018 Py_DECREF(substring);
4019 return result;
4020}
4021
4022static PyObject *
4023unicode_getitem(PyUnicodeObject *self, int index)
4024{
4025 if (index < 0 || index >= self->length) {
4026 PyErr_SetString(PyExc_IndexError, "string index out of range");
4027 return NULL;
4028 }
4029
4030 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4031}
4032
4033static long
4034unicode_hash(PyUnicodeObject *self)
4035{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004036 /* Since Unicode objects compare equal to their ASCII string
4037 counterparts, they should use the individual character values
4038 as basis for their hash value. This is needed to assure that
4039 strings and Unicode objects behave in the same way as
4040 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004041
Fredrik Lundhdde61642000-07-10 18:27:47 +00004042 register int len;
4043 register Py_UNICODE *p;
4044 register long x;
4045
Guido van Rossumd57fd912000-03-10 22:53:23 +00004046 if (self->hash != -1)
4047 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004048 len = PyUnicode_GET_SIZE(self);
4049 p = PyUnicode_AS_UNICODE(self);
4050 x = *p << 7;
4051 while (--len >= 0)
4052 x = (1000003*x) ^ *p++;
4053 x ^= PyUnicode_GET_SIZE(self);
4054 if (x == -1)
4055 x = -2;
4056 self->hash = x;
4057 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058}
4059
4060static char index__doc__[] =
4061"S.index(sub [,start [,end]]) -> int\n\
4062\n\
4063Like S.find() but raise ValueError when the substring is not found.";
4064
4065static PyObject *
4066unicode_index(PyUnicodeObject *self, PyObject *args)
4067{
4068 int result;
4069 PyUnicodeObject *substring;
4070 int start = 0;
4071 int end = INT_MAX;
4072
Guido van Rossumb8872e62000-05-09 14:14:27 +00004073 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4074 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075 return NULL;
4076
4077 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4078 (PyObject *)substring);
4079 if (substring == NULL)
4080 return NULL;
4081
4082 result = findstring(self, substring, start, end, 1);
4083
4084 Py_DECREF(substring);
4085 if (result < 0) {
4086 PyErr_SetString(PyExc_ValueError, "substring not found");
4087 return NULL;
4088 }
4089 return PyInt_FromLong(result);
4090}
4091
4092static char islower__doc__[] =
4093"S.islower() -> int\n\
4094\n\
4095Return 1 if all cased characters in S are lowercase and there is\n\
4096at least one cased character in S, 0 otherwise.";
4097
4098static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004099unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004100{
4101 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4102 register const Py_UNICODE *e;
4103 int cased;
4104
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105 /* Shortcut for single character strings */
4106 if (PyUnicode_GET_SIZE(self) == 1)
4107 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
4108
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004109 /* Special case for empty strings */
4110 if (PyString_GET_SIZE(self) == 0)
4111 return PyInt_FromLong(0);
4112
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113 e = p + PyUnicode_GET_SIZE(self);
4114 cased = 0;
4115 for (; p < e; p++) {
4116 register const Py_UNICODE ch = *p;
4117
4118 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4119 return PyInt_FromLong(0);
4120 else if (!cased && Py_UNICODE_ISLOWER(ch))
4121 cased = 1;
4122 }
4123 return PyInt_FromLong(cased);
4124}
4125
4126static char isupper__doc__[] =
4127"S.isupper() -> int\n\
4128\n\
4129Return 1 if all cased characters in S are uppercase and there is\n\
4130at least one cased character in S, 0 otherwise.";
4131
4132static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004133unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004134{
4135 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4136 register const Py_UNICODE *e;
4137 int cased;
4138
Guido van Rossumd57fd912000-03-10 22:53:23 +00004139 /* Shortcut for single character strings */
4140 if (PyUnicode_GET_SIZE(self) == 1)
4141 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4142
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004143 /* Special case for empty strings */
4144 if (PyString_GET_SIZE(self) == 0)
4145 return PyInt_FromLong(0);
4146
Guido van Rossumd57fd912000-03-10 22:53:23 +00004147 e = p + PyUnicode_GET_SIZE(self);
4148 cased = 0;
4149 for (; p < e; p++) {
4150 register const Py_UNICODE ch = *p;
4151
4152 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4153 return PyInt_FromLong(0);
4154 else if (!cased && Py_UNICODE_ISUPPER(ch))
4155 cased = 1;
4156 }
4157 return PyInt_FromLong(cased);
4158}
4159
4160static char istitle__doc__[] =
4161"S.istitle() -> int\n\
4162\n\
4163Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4164may only follow uncased characters and lowercase characters only cased\n\
4165ones. Return 0 otherwise.";
4166
4167static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004168unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169{
4170 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4171 register const Py_UNICODE *e;
4172 int cased, previous_is_cased;
4173
Guido van Rossumd57fd912000-03-10 22:53:23 +00004174 /* Shortcut for single character strings */
4175 if (PyUnicode_GET_SIZE(self) == 1)
4176 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4177 (Py_UNICODE_ISUPPER(*p) != 0));
4178
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004179 /* Special case for empty strings */
4180 if (PyString_GET_SIZE(self) == 0)
4181 return PyInt_FromLong(0);
4182
Guido van Rossumd57fd912000-03-10 22:53:23 +00004183 e = p + PyUnicode_GET_SIZE(self);
4184 cased = 0;
4185 previous_is_cased = 0;
4186 for (; p < e; p++) {
4187 register const Py_UNICODE ch = *p;
4188
4189 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4190 if (previous_is_cased)
4191 return PyInt_FromLong(0);
4192 previous_is_cased = 1;
4193 cased = 1;
4194 }
4195 else if (Py_UNICODE_ISLOWER(ch)) {
4196 if (!previous_is_cased)
4197 return PyInt_FromLong(0);
4198 previous_is_cased = 1;
4199 cased = 1;
4200 }
4201 else
4202 previous_is_cased = 0;
4203 }
4204 return PyInt_FromLong(cased);
4205}
4206
4207static char isspace__doc__[] =
4208"S.isspace() -> int\n\
4209\n\
4210Return 1 if there are only whitespace characters in S,\n\
42110 otherwise.";
4212
4213static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004214unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004215{
4216 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4217 register const Py_UNICODE *e;
4218
Guido van Rossumd57fd912000-03-10 22:53:23 +00004219 /* Shortcut for single character strings */
4220 if (PyUnicode_GET_SIZE(self) == 1 &&
4221 Py_UNICODE_ISSPACE(*p))
4222 return PyInt_FromLong(1);
4223
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004224 /* Special case for empty strings */
4225 if (PyString_GET_SIZE(self) == 0)
4226 return PyInt_FromLong(0);
4227
Guido van Rossumd57fd912000-03-10 22:53:23 +00004228 e = p + PyUnicode_GET_SIZE(self);
4229 for (; p < e; p++) {
4230 if (!Py_UNICODE_ISSPACE(*p))
4231 return PyInt_FromLong(0);
4232 }
4233 return PyInt_FromLong(1);
4234}
4235
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004236static char isalpha__doc__[] =
4237"S.isalpha() -> int\n\
4238\n\
4239Return 1 if all characters in S are alphabetic\n\
4240and there is at least one character in S, 0 otherwise.";
4241
4242static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004243unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004244{
4245 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4246 register const Py_UNICODE *e;
4247
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004248 /* Shortcut for single character strings */
4249 if (PyUnicode_GET_SIZE(self) == 1 &&
4250 Py_UNICODE_ISALPHA(*p))
4251 return PyInt_FromLong(1);
4252
4253 /* Special case for empty strings */
4254 if (PyString_GET_SIZE(self) == 0)
4255 return PyInt_FromLong(0);
4256
4257 e = p + PyUnicode_GET_SIZE(self);
4258 for (; p < e; p++) {
4259 if (!Py_UNICODE_ISALPHA(*p))
4260 return PyInt_FromLong(0);
4261 }
4262 return PyInt_FromLong(1);
4263}
4264
4265static char isalnum__doc__[] =
4266"S.isalnum() -> int\n\
4267\n\
4268Return 1 if all characters in S are alphanumeric\n\
4269and there is at least one character in S, 0 otherwise.";
4270
4271static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004272unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004273{
4274 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4275 register const Py_UNICODE *e;
4276
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004277 /* Shortcut for single character strings */
4278 if (PyUnicode_GET_SIZE(self) == 1 &&
4279 Py_UNICODE_ISALNUM(*p))
4280 return PyInt_FromLong(1);
4281
4282 /* Special case for empty strings */
4283 if (PyString_GET_SIZE(self) == 0)
4284 return PyInt_FromLong(0);
4285
4286 e = p + PyUnicode_GET_SIZE(self);
4287 for (; p < e; p++) {
4288 if (!Py_UNICODE_ISALNUM(*p))
4289 return PyInt_FromLong(0);
4290 }
4291 return PyInt_FromLong(1);
4292}
4293
Guido van Rossumd57fd912000-03-10 22:53:23 +00004294static char isdecimal__doc__[] =
4295"S.isdecimal() -> int\n\
4296\n\
4297Return 1 if there are only decimal characters in S,\n\
42980 otherwise.";
4299
4300static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004301unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004302{
4303 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4304 register const Py_UNICODE *e;
4305
Guido van Rossumd57fd912000-03-10 22:53:23 +00004306 /* Shortcut for single character strings */
4307 if (PyUnicode_GET_SIZE(self) == 1 &&
4308 Py_UNICODE_ISDECIMAL(*p))
4309 return PyInt_FromLong(1);
4310
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004311 /* Special case for empty strings */
4312 if (PyString_GET_SIZE(self) == 0)
4313 return PyInt_FromLong(0);
4314
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315 e = p + PyUnicode_GET_SIZE(self);
4316 for (; p < e; p++) {
4317 if (!Py_UNICODE_ISDECIMAL(*p))
4318 return PyInt_FromLong(0);
4319 }
4320 return PyInt_FromLong(1);
4321}
4322
4323static char isdigit__doc__[] =
4324"S.isdigit() -> int\n\
4325\n\
4326Return 1 if there are only digit characters in S,\n\
43270 otherwise.";
4328
4329static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004330unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331{
4332 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4333 register const Py_UNICODE *e;
4334
Guido van Rossumd57fd912000-03-10 22:53:23 +00004335 /* Shortcut for single character strings */
4336 if (PyUnicode_GET_SIZE(self) == 1 &&
4337 Py_UNICODE_ISDIGIT(*p))
4338 return PyInt_FromLong(1);
4339
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004340 /* Special case for empty strings */
4341 if (PyString_GET_SIZE(self) == 0)
4342 return PyInt_FromLong(0);
4343
Guido van Rossumd57fd912000-03-10 22:53:23 +00004344 e = p + PyUnicode_GET_SIZE(self);
4345 for (; p < e; p++) {
4346 if (!Py_UNICODE_ISDIGIT(*p))
4347 return PyInt_FromLong(0);
4348 }
4349 return PyInt_FromLong(1);
4350}
4351
4352static char isnumeric__doc__[] =
4353"S.isnumeric() -> int\n\
4354\n\
4355Return 1 if there are only numeric characters in S,\n\
43560 otherwise.";
4357
4358static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004359unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004360{
4361 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4362 register const Py_UNICODE *e;
4363
Guido van Rossumd57fd912000-03-10 22:53:23 +00004364 /* Shortcut for single character strings */
4365 if (PyUnicode_GET_SIZE(self) == 1 &&
4366 Py_UNICODE_ISNUMERIC(*p))
4367 return PyInt_FromLong(1);
4368
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004369 /* Special case for empty strings */
4370 if (PyString_GET_SIZE(self) == 0)
4371 return PyInt_FromLong(0);
4372
Guido van Rossumd57fd912000-03-10 22:53:23 +00004373 e = p + PyUnicode_GET_SIZE(self);
4374 for (; p < e; p++) {
4375 if (!Py_UNICODE_ISNUMERIC(*p))
4376 return PyInt_FromLong(0);
4377 }
4378 return PyInt_FromLong(1);
4379}
4380
4381static char join__doc__[] =
4382"S.join(sequence) -> unicode\n\
4383\n\
4384Return a string which is the concatenation of the strings in the\n\
4385sequence. The separator between elements is S.";
4386
4387static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004388unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004389{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004390 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004391}
4392
4393static int
4394unicode_length(PyUnicodeObject *self)
4395{
4396 return self->length;
4397}
4398
4399static char ljust__doc__[] =
4400"S.ljust(width) -> unicode\n\
4401\n\
4402Return S left justified in a Unicode string of length width. Padding is\n\
4403done using spaces.";
4404
4405static PyObject *
4406unicode_ljust(PyUnicodeObject *self, PyObject *args)
4407{
4408 int width;
4409 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4410 return NULL;
4411
Tim Peters7a29bd52001-09-12 03:03:31 +00004412 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004413 Py_INCREF(self);
4414 return (PyObject*) self;
4415 }
4416
4417 return (PyObject*) pad(self, 0, width - self->length, ' ');
4418}
4419
4420static char lower__doc__[] =
4421"S.lower() -> unicode\n\
4422\n\
4423Return a copy of the string S converted to lowercase.";
4424
4425static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004426unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004427{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004428 return fixup(self, fixlower);
4429}
4430
4431static char lstrip__doc__[] =
4432"S.lstrip() -> unicode\n\
4433\n\
4434Return a copy of the string S with leading whitespace removed.";
4435
4436static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004437unicode_lstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004438{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439 return strip(self, 1, 0);
4440}
4441
4442static PyObject*
4443unicode_repeat(PyUnicodeObject *str, int len)
4444{
4445 PyUnicodeObject *u;
4446 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004447 int nchars;
4448 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004449
4450 if (len < 0)
4451 len = 0;
4452
Tim Peters7a29bd52001-09-12 03:03:31 +00004453 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454 /* no repeat, return original string */
4455 Py_INCREF(str);
4456 return (PyObject*) str;
4457 }
Tim Peters8f422462000-09-09 06:13:41 +00004458
4459 /* ensure # of chars needed doesn't overflow int and # of bytes
4460 * needed doesn't overflow size_t
4461 */
4462 nchars = len * str->length;
4463 if (len && nchars / len != str->length) {
4464 PyErr_SetString(PyExc_OverflowError,
4465 "repeated string is too long");
4466 return NULL;
4467 }
4468 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4469 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4470 PyErr_SetString(PyExc_OverflowError,
4471 "repeated string is too long");
4472 return NULL;
4473 }
4474 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004475 if (!u)
4476 return NULL;
4477
4478 p = u->str;
4479
4480 while (len-- > 0) {
4481 Py_UNICODE_COPY(p, str->str, str->length);
4482 p += str->length;
4483 }
4484
4485 return (PyObject*) u;
4486}
4487
4488PyObject *PyUnicode_Replace(PyObject *obj,
4489 PyObject *subobj,
4490 PyObject *replobj,
4491 int maxcount)
4492{
4493 PyObject *self;
4494 PyObject *str1;
4495 PyObject *str2;
4496 PyObject *result;
4497
4498 self = PyUnicode_FromObject(obj);
4499 if (self == NULL)
4500 return NULL;
4501 str1 = PyUnicode_FromObject(subobj);
4502 if (str1 == NULL) {
4503 Py_DECREF(self);
4504 return NULL;
4505 }
4506 str2 = PyUnicode_FromObject(replobj);
4507 if (str2 == NULL) {
4508 Py_DECREF(self);
4509 Py_DECREF(str1);
4510 return NULL;
4511 }
4512 result = replace((PyUnicodeObject *)self,
4513 (PyUnicodeObject *)str1,
4514 (PyUnicodeObject *)str2,
4515 maxcount);
4516 Py_DECREF(self);
4517 Py_DECREF(str1);
4518 Py_DECREF(str2);
4519 return result;
4520}
4521
4522static char replace__doc__[] =
4523"S.replace (old, new[, maxsplit]) -> unicode\n\
4524\n\
4525Return a copy of S with all occurrences of substring\n\
4526old replaced by new. If the optional argument maxsplit is\n\
4527given, only the first maxsplit occurrences are replaced.";
4528
4529static PyObject*
4530unicode_replace(PyUnicodeObject *self, PyObject *args)
4531{
4532 PyUnicodeObject *str1;
4533 PyUnicodeObject *str2;
4534 int maxcount = -1;
4535 PyObject *result;
4536
4537 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4538 return NULL;
4539 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4540 if (str1 == NULL)
4541 return NULL;
4542 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4543 if (str2 == NULL)
4544 return NULL;
4545
4546 result = replace(self, str1, str2, maxcount);
4547
4548 Py_DECREF(str1);
4549 Py_DECREF(str2);
4550 return result;
4551}
4552
4553static
4554PyObject *unicode_repr(PyObject *unicode)
4555{
4556 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4557 PyUnicode_GET_SIZE(unicode),
4558 1);
4559}
4560
4561static char rfind__doc__[] =
4562"S.rfind(sub [,start [,end]]) -> int\n\
4563\n\
4564Return the highest index in S where substring sub is found,\n\
4565such that sub is contained within s[start,end]. Optional\n\
4566arguments start and end are interpreted as in slice notation.\n\
4567\n\
4568Return -1 on failure.";
4569
4570static PyObject *
4571unicode_rfind(PyUnicodeObject *self, PyObject *args)
4572{
4573 PyUnicodeObject *substring;
4574 int start = 0;
4575 int end = INT_MAX;
4576 PyObject *result;
4577
Guido van Rossumb8872e62000-05-09 14:14:27 +00004578 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4579 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004580 return NULL;
4581 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4582 (PyObject *)substring);
4583 if (substring == NULL)
4584 return NULL;
4585
4586 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4587
4588 Py_DECREF(substring);
4589 return result;
4590}
4591
4592static char rindex__doc__[] =
4593"S.rindex(sub [,start [,end]]) -> int\n\
4594\n\
4595Like S.rfind() but raise ValueError when the substring is not found.";
4596
4597static PyObject *
4598unicode_rindex(PyUnicodeObject *self, PyObject *args)
4599{
4600 int result;
4601 PyUnicodeObject *substring;
4602 int start = 0;
4603 int end = INT_MAX;
4604
Guido van Rossumb8872e62000-05-09 14:14:27 +00004605 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4606 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004607 return NULL;
4608 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4609 (PyObject *)substring);
4610 if (substring == NULL)
4611 return NULL;
4612
4613 result = findstring(self, substring, start, end, -1);
4614
4615 Py_DECREF(substring);
4616 if (result < 0) {
4617 PyErr_SetString(PyExc_ValueError, "substring not found");
4618 return NULL;
4619 }
4620 return PyInt_FromLong(result);
4621}
4622
4623static char rjust__doc__[] =
4624"S.rjust(width) -> unicode\n\
4625\n\
4626Return S right justified in a Unicode string of length width. Padding is\n\
4627done using spaces.";
4628
4629static PyObject *
4630unicode_rjust(PyUnicodeObject *self, PyObject *args)
4631{
4632 int width;
4633 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4634 return NULL;
4635
Tim Peters7a29bd52001-09-12 03:03:31 +00004636 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004637 Py_INCREF(self);
4638 return (PyObject*) self;
4639 }
4640
4641 return (PyObject*) pad(self, width - self->length, 0, ' ');
4642}
4643
4644static char rstrip__doc__[] =
4645"S.rstrip() -> unicode\n\
4646\n\
4647Return a copy of the string S with trailing whitespace removed.";
4648
4649static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004650unicode_rstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004651{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004652 return strip(self, 0, 1);
4653}
4654
4655static PyObject*
4656unicode_slice(PyUnicodeObject *self, int start, int end)
4657{
4658 /* standard clamping */
4659 if (start < 0)
4660 start = 0;
4661 if (end < 0)
4662 end = 0;
4663 if (end > self->length)
4664 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004665 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004666 /* full slice, return original string */
4667 Py_INCREF(self);
4668 return (PyObject*) self;
4669 }
4670 if (start > end)
4671 start = end;
4672 /* copy slice */
4673 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4674 end - start);
4675}
4676
4677PyObject *PyUnicode_Split(PyObject *s,
4678 PyObject *sep,
4679 int maxsplit)
4680{
4681 PyObject *result;
4682
4683 s = PyUnicode_FromObject(s);
4684 if (s == NULL)
4685 return NULL;
4686 if (sep != NULL) {
4687 sep = PyUnicode_FromObject(sep);
4688 if (sep == NULL) {
4689 Py_DECREF(s);
4690 return NULL;
4691 }
4692 }
4693
4694 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4695
4696 Py_DECREF(s);
4697 Py_XDECREF(sep);
4698 return result;
4699}
4700
4701static char split__doc__[] =
4702"S.split([sep [,maxsplit]]) -> list of strings\n\
4703\n\
4704Return a list of the words in S, using sep as the\n\
4705delimiter string. If maxsplit is given, at most maxsplit\n\
4706splits are done. If sep is not specified, any whitespace string\n\
4707is a separator.";
4708
4709static PyObject*
4710unicode_split(PyUnicodeObject *self, PyObject *args)
4711{
4712 PyObject *substring = Py_None;
4713 int maxcount = -1;
4714
4715 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4716 return NULL;
4717
4718 if (substring == Py_None)
4719 return split(self, NULL, maxcount);
4720 else if (PyUnicode_Check(substring))
4721 return split(self, (PyUnicodeObject *)substring, maxcount);
4722 else
4723 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4724}
4725
4726static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004727"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728\n\
4729Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004730Line breaks are not included in the resulting list unless keepends\n\
4731is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732
4733static PyObject*
4734unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4735{
Guido van Rossum86662912000-04-11 15:38:46 +00004736 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737
Guido van Rossum86662912000-04-11 15:38:46 +00004738 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739 return NULL;
4740
Guido van Rossum86662912000-04-11 15:38:46 +00004741 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742}
4743
4744static
4745PyObject *unicode_str(PyUnicodeObject *self)
4746{
Fred Drakee4315f52000-05-09 19:53:39 +00004747 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748}
4749
4750static char strip__doc__[] =
4751"S.strip() -> unicode\n\
4752\n\
4753Return a copy of S with leading and trailing whitespace removed.";
4754
4755static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004756unicode_strip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758 return strip(self, 1, 1);
4759}
4760
4761static char swapcase__doc__[] =
4762"S.swapcase() -> unicode\n\
4763\n\
4764Return a copy of S with uppercase characters converted to lowercase\n\
4765and vice versa.";
4766
4767static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004768unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770 return fixup(self, fixswapcase);
4771}
4772
4773static char translate__doc__[] =
4774"S.translate(table) -> unicode\n\
4775\n\
4776Return a copy of the string S, where all characters have been mapped\n\
4777through the given translation table, which must be a mapping of\n\
4778Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4779are left untouched. Characters mapped to None are deleted.";
4780
4781static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004782unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004784 return PyUnicode_TranslateCharmap(self->str,
4785 self->length,
4786 table,
4787 "ignore");
4788}
4789
4790static char upper__doc__[] =
4791"S.upper() -> unicode\n\
4792\n\
4793Return a copy of S converted to uppercase.";
4794
4795static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004796unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798 return fixup(self, fixupper);
4799}
4800
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method body for S.zfill() (currently compiled out by the surrounding
   #if 0).  Strings already at least `width` long are returned unchanged
   with a new reference.  Otherwise the string is left-padded with '0' via
   pad(), and a leading '+' or '-' of the original is moved to the front of
   the padded result.  Returns NULL if parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() result is dereferenced below, so a failure must be caught */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original string */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4836
#if 0
/* Debugging aid (compiled out): returns the current value of
   unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    long entries = unicode_freelist_size;

    return PyInt_FromLong(entries);
}
#endif
4844
4845static char startswith__doc__[] =
4846"S.startswith(prefix[, start[, end]]) -> int\n\
4847\n\
4848Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4849optional start, test S beginning at that position. With optional end, stop\n\
4850comparing S at that position.";
4851
4852static PyObject *
4853unicode_startswith(PyUnicodeObject *self,
4854 PyObject *args)
4855{
4856 PyUnicodeObject *substring;
4857 int start = 0;
4858 int end = INT_MAX;
4859 PyObject *result;
4860
Guido van Rossumb8872e62000-05-09 14:14:27 +00004861 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4862 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863 return NULL;
4864 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4865 (PyObject *)substring);
4866 if (substring == NULL)
4867 return NULL;
4868
4869 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4870
4871 Py_DECREF(substring);
4872 return result;
4873}
4874
4875
4876static char endswith__doc__[] =
4877"S.endswith(suffix[, start[, end]]) -> int\n\
4878\n\
4879Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4880optional start, test S beginning at that position. With optional end, stop\n\
4881comparing S at that position.";
4882
4883static PyObject *
4884unicode_endswith(PyUnicodeObject *self,
4885 PyObject *args)
4886{
4887 PyUnicodeObject *substring;
4888 int start = 0;
4889 int end = INT_MAX;
4890 PyObject *result;
4891
Guido van Rossumb8872e62000-05-09 14:14:27 +00004892 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4893 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004894 return NULL;
4895 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4896 (PyObject *)substring);
4897 if (substring == NULL)
4898 return NULL;
4899
4900 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4901
4902 Py_DECREF(substring);
4903 return result;
4904}
4905
4906
4907static PyMethodDef unicode_methods[] = {
4908
4909 /* Order is according to common usage: often used methods should
4910 appear first, since lookup is done sequentially. */
4911
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004912 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4913 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4914 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4915 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4916 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4917 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4918 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4919 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4920 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4921 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4922 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4923 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4924 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4925 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4926/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4927 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4928 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4929 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4930 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4931 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4932 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4933 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4934 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4935 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4936 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4937 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4938 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4939 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4940 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4941 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4942 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4943 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4944 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4945 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4946 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004947#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004948 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
4949 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950#endif
4951
4952#if 0
4953 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004954 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004955#endif
4956
4957 {NULL, NULL}
4958};
4959
Guido van Rossumd57fd912000-03-10 22:53:23 +00004960static PySequenceMethods unicode_as_sequence = {
4961 (inquiry) unicode_length, /* sq_length */
4962 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4963 (intargfunc) unicode_repeat, /* sq_repeat */
4964 (intargfunc) unicode_getitem, /* sq_item */
4965 (intintargfunc) unicode_slice, /* sq_slice */
4966 0, /* sq_ass_item */
4967 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004968 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004969};
4970
4971static int
4972unicode_buffer_getreadbuf(PyUnicodeObject *self,
4973 int index,
4974 const void **ptr)
4975{
4976 if (index != 0) {
4977 PyErr_SetString(PyExc_SystemError,
4978 "accessing non-existent unicode segment");
4979 return -1;
4980 }
4981 *ptr = (void *) self->str;
4982 return PyUnicode_GET_DATA_SIZE(self);
4983}
4984
4985static int
4986unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4987 const void **ptr)
4988{
4989 PyErr_SetString(PyExc_TypeError,
4990 "cannot use unicode as modifyable buffer");
4991 return -1;
4992}
4993
4994static int
4995unicode_buffer_getsegcount(PyUnicodeObject *self,
4996 int *lenp)
4997{
4998 if (lenp)
4999 *lenp = PyUnicode_GET_DATA_SIZE(self);
5000 return 1;
5001}
5002
5003static int
5004unicode_buffer_getcharbuf(PyUnicodeObject *self,
5005 int index,
5006 const void **ptr)
5007{
5008 PyObject *str;
5009
5010 if (index != 0) {
5011 PyErr_SetString(PyExc_SystemError,
5012 "accessing non-existent unicode segment");
5013 return -1;
5014 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005015 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005016 if (str == NULL)
5017 return -1;
5018 *ptr = (void *) PyString_AS_STRING(str);
5019 return PyString_GET_SIZE(str);
5020}
5021
5022/* Helpers for PyUnicode_Format() */
5023
5024static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005025getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005026{
5027 int argidx = *p_argidx;
5028 if (argidx < arglen) {
5029 (*p_argidx)++;
5030 if (arglen < 0)
5031 return args;
5032 else
5033 return PyTuple_GetItem(args, argidx);
5034 }
5035 PyErr_SetString(PyExc_TypeError,
5036 "not enough arguments for format string");
5037 return NULL;
5038}
5039
/* Bit flags collected from the flag characters of a format specifier. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
5045
5046static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005048{
5049 register int i;
5050 int len;
5051 va_list va;
5052 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005054
5055 /* First, format the string as char array, then expand to Py_UNICODE
5056 array. */
5057 charbuffer = (char *)buffer;
5058 len = vsprintf(charbuffer, format, va);
5059 for (i = len - 1; i >= 0; i--)
5060 buffer[i] = (Py_UNICODE) charbuffer[i];
5061
5062 va_end(va);
5063 return len;
5064}
5065
5066static int
5067formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005068 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005069 int flags,
5070 int prec,
5071 int type,
5072 PyObject *v)
5073{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005074 /* fmt = '%#.' + `prec` + `type`
5075 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076 char fmt[20];
5077 double x;
5078
5079 x = PyFloat_AsDouble(v);
5080 if (x == -1.0 && PyErr_Occurred())
5081 return -1;
5082 if (prec < 0)
5083 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005084 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5085 type = 'g';
5086 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005087 /* worst case length calc to ensure no buffer overrun:
5088 fmt = %#.<prec>g
5089 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5090 for any double rep.)
5091 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5092 If prec=0 the effective precision is 1 (the leading digit is
5093 always given), therefore increase by one to 10+prec. */
5094 if (buflen <= (size_t)10 + (size_t)prec) {
5095 PyErr_SetString(PyExc_OverflowError,
5096 "formatted float is too long (precision too long?)");
5097 return -1;
5098 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005099 return usprintf(buf, fmt, x);
5100}
5101
Tim Peters38fd5b62000-09-21 05:43:11 +00005102static PyObject*
5103formatlong(PyObject *val, int flags, int prec, int type)
5104{
5105 char *buf;
5106 int i, len;
5107 PyObject *str; /* temporary string object. */
5108 PyUnicodeObject *result;
5109
5110 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5111 if (!str)
5112 return NULL;
5113 result = _PyUnicode_New(len);
5114 for (i = 0; i < len; i++)
5115 result->str[i] = buf[i];
5116 result->str[len] = 0;
5117 Py_DECREF(str);
5118 return (PyObject*)result;
5119}
5120
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121static int
5122formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005123 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124 int flags,
5125 int prec,
5126 int type,
5127 PyObject *v)
5128{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005129 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00005130 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5131 + 1 + 1 = 24*/
5132 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005133 long x;
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005134 int use_native_c_format = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005135
5136 x = PyInt_AsLong(v);
5137 if (x == -1 && PyErr_Occurred())
5138 return -1;
5139 if (prec < 0)
5140 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005141 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5142 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
5143 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
5144 PyErr_SetString(PyExc_OverflowError,
5145 "formatted integer is too long (precision too long?)");
5146 return -1;
5147 }
Tim Petersfff53252001-04-12 18:38:48 +00005148 /* When converting 0 under %#x or %#X, C leaves off the base marker,
5149 * but we want it (for consistency with other %#x conversions, and
5150 * for consistency with Python's hex() function).
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005151 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
5152 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
5153 * So add it only if the platform doesn't already.
Tim Petersfff53252001-04-12 18:38:48 +00005154 */
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005155 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
5156 /* Only way to know what the platform does is to try it. */
5157 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
5158 if (fmt[1] != (char)type) {
5159 /* Supply our own leading 0x/0X -- needed under std C */
5160 use_native_c_format = 0;
5161 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
5162 }
5163 }
5164 if (use_native_c_format)
5165 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166 return usprintf(buf, fmt, x);
5167}
5168
5169static int
5170formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005171 size_t buflen,
5172 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005173{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005174 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005175 if (PyUnicode_Check(v)) {
5176 if (PyUnicode_GET_SIZE(v) != 1)
5177 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005179 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005180
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005181 else if (PyString_Check(v)) {
5182 if (PyString_GET_SIZE(v) != 1)
5183 goto onError;
5184 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5185 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186
5187 else {
5188 /* Integer input truncated to a character */
5189 long x;
5190 x = PyInt_AsLong(v);
5191 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005192 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193 buf[0] = (char) x;
5194 }
5195 buf[1] = '\0';
5196 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005197
5198 onError:
5199 PyErr_SetString(PyExc_TypeError,
5200 "%c requires int or char");
5201 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202}
5203
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so the macro behaves as a single
   operand in any expression (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
5213
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214PyObject *PyUnicode_Format(PyObject *format,
5215 PyObject *args)
5216{
5217 Py_UNICODE *fmt, *res;
5218 int fmtcnt, rescnt, reslen, arglen, argidx;
5219 int args_owned = 0;
5220 PyUnicodeObject *result = NULL;
5221 PyObject *dict = NULL;
5222 PyObject *uformat;
5223
5224 if (format == NULL || args == NULL) {
5225 PyErr_BadInternalCall();
5226 return NULL;
5227 }
5228 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005229 if (uformat == NULL)
5230 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231 fmt = PyUnicode_AS_UNICODE(uformat);
5232 fmtcnt = PyUnicode_GET_SIZE(uformat);
5233
5234 reslen = rescnt = fmtcnt + 100;
5235 result = _PyUnicode_New(reslen);
5236 if (result == NULL)
5237 goto onError;
5238 res = PyUnicode_AS_UNICODE(result);
5239
5240 if (PyTuple_Check(args)) {
5241 arglen = PyTuple_Size(args);
5242 argidx = 0;
5243 }
5244 else {
5245 arglen = -1;
5246 argidx = -2;
5247 }
5248 if (args->ob_type->tp_as_mapping)
5249 dict = args;
5250
5251 while (--fmtcnt >= 0) {
5252 if (*fmt != '%') {
5253 if (--rescnt < 0) {
5254 rescnt = fmtcnt + 100;
5255 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005256 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005257 return NULL;
5258 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5259 --rescnt;
5260 }
5261 *res++ = *fmt++;
5262 }
5263 else {
5264 /* Got a format specifier */
5265 int flags = 0;
5266 int width = -1;
5267 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005268 Py_UNICODE c = '\0';
5269 Py_UNICODE fill;
5270 PyObject *v = NULL;
5271 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005272 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273 Py_UNICODE sign;
5274 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005275 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005276
5277 fmt++;
5278 if (*fmt == '(') {
5279 Py_UNICODE *keystart;
5280 int keylen;
5281 PyObject *key;
5282 int pcount = 1;
5283
5284 if (dict == NULL) {
5285 PyErr_SetString(PyExc_TypeError,
5286 "format requires a mapping");
5287 goto onError;
5288 }
5289 ++fmt;
5290 --fmtcnt;
5291 keystart = fmt;
5292 /* Skip over balanced parentheses */
5293 while (pcount > 0 && --fmtcnt >= 0) {
5294 if (*fmt == ')')
5295 --pcount;
5296 else if (*fmt == '(')
5297 ++pcount;
5298 fmt++;
5299 }
5300 keylen = fmt - keystart - 1;
5301 if (fmtcnt < 0 || pcount > 0) {
5302 PyErr_SetString(PyExc_ValueError,
5303 "incomplete format key");
5304 goto onError;
5305 }
Fred Drakee4315f52000-05-09 19:53:39 +00005306 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 then looked up since Python uses strings to hold
5308 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005309 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005310 key = PyUnicode_EncodeUTF8(keystart,
5311 keylen,
5312 NULL);
5313 if (key == NULL)
5314 goto onError;
5315 if (args_owned) {
5316 Py_DECREF(args);
5317 args_owned = 0;
5318 }
5319 args = PyObject_GetItem(dict, key);
5320 Py_DECREF(key);
5321 if (args == NULL) {
5322 goto onError;
5323 }
5324 args_owned = 1;
5325 arglen = -1;
5326 argidx = -2;
5327 }
5328 while (--fmtcnt >= 0) {
5329 switch (c = *fmt++) {
5330 case '-': flags |= F_LJUST; continue;
5331 case '+': flags |= F_SIGN; continue;
5332 case ' ': flags |= F_BLANK; continue;
5333 case '#': flags |= F_ALT; continue;
5334 case '0': flags |= F_ZERO; continue;
5335 }
5336 break;
5337 }
5338 if (c == '*') {
5339 v = getnextarg(args, arglen, &argidx);
5340 if (v == NULL)
5341 goto onError;
5342 if (!PyInt_Check(v)) {
5343 PyErr_SetString(PyExc_TypeError,
5344 "* wants int");
5345 goto onError;
5346 }
5347 width = PyInt_AsLong(v);
5348 if (width < 0) {
5349 flags |= F_LJUST;
5350 width = -width;
5351 }
5352 if (--fmtcnt >= 0)
5353 c = *fmt++;
5354 }
5355 else if (c >= '0' && c <= '9') {
5356 width = c - '0';
5357 while (--fmtcnt >= 0) {
5358 c = *fmt++;
5359 if (c < '0' || c > '9')
5360 break;
5361 if ((width*10) / 10 != width) {
5362 PyErr_SetString(PyExc_ValueError,
5363 "width too big");
5364 goto onError;
5365 }
5366 width = width*10 + (c - '0');
5367 }
5368 }
5369 if (c == '.') {
5370 prec = 0;
5371 if (--fmtcnt >= 0)
5372 c = *fmt++;
5373 if (c == '*') {
5374 v = getnextarg(args, arglen, &argidx);
5375 if (v == NULL)
5376 goto onError;
5377 if (!PyInt_Check(v)) {
5378 PyErr_SetString(PyExc_TypeError,
5379 "* wants int");
5380 goto onError;
5381 }
5382 prec = PyInt_AsLong(v);
5383 if (prec < 0)
5384 prec = 0;
5385 if (--fmtcnt >= 0)
5386 c = *fmt++;
5387 }
5388 else if (c >= '0' && c <= '9') {
5389 prec = c - '0';
5390 while (--fmtcnt >= 0) {
5391 c = Py_CHARMASK(*fmt++);
5392 if (c < '0' || c > '9')
5393 break;
5394 if ((prec*10) / 10 != prec) {
5395 PyErr_SetString(PyExc_ValueError,
5396 "prec too big");
5397 goto onError;
5398 }
5399 prec = prec*10 + (c - '0');
5400 }
5401 }
5402 } /* prec */
5403 if (fmtcnt >= 0) {
5404 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005405 if (--fmtcnt >= 0)
5406 c = *fmt++;
5407 }
5408 }
5409 if (fmtcnt < 0) {
5410 PyErr_SetString(PyExc_ValueError,
5411 "incomplete format");
5412 goto onError;
5413 }
5414 if (c != '%') {
5415 v = getnextarg(args, arglen, &argidx);
5416 if (v == NULL)
5417 goto onError;
5418 }
5419 sign = 0;
5420 fill = ' ';
5421 switch (c) {
5422
5423 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005424 pbuf = formatbuf;
5425 /* presume that buffer length is at least 1 */
5426 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005427 len = 1;
5428 break;
5429
5430 case 's':
5431 case 'r':
5432 if (PyUnicode_Check(v) && c == 's') {
5433 temp = v;
5434 Py_INCREF(temp);
5435 }
5436 else {
5437 PyObject *unicode;
5438 if (c == 's')
5439 temp = PyObject_Str(v);
5440 else
5441 temp = PyObject_Repr(v);
5442 if (temp == NULL)
5443 goto onError;
5444 if (!PyString_Check(temp)) {
5445 /* XXX Note: this should never happen, since
5446 PyObject_Repr() and PyObject_Str() assure
5447 this */
5448 Py_DECREF(temp);
5449 PyErr_SetString(PyExc_TypeError,
5450 "%s argument has non-string str()");
5451 goto onError;
5452 }
Fred Drakee4315f52000-05-09 19:53:39 +00005453 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005454 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005455 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005456 "strict");
5457 Py_DECREF(temp);
5458 temp = unicode;
5459 if (temp == NULL)
5460 goto onError;
5461 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005462 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005463 len = PyUnicode_GET_SIZE(temp);
5464 if (prec >= 0 && len > prec)
5465 len = prec;
5466 break;
5467
5468 case 'i':
5469 case 'd':
5470 case 'u':
5471 case 'o':
5472 case 'x':
5473 case 'X':
5474 if (c == 'i')
5475 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005476 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005477 temp = formatlong(v, flags, prec, c);
5478 if (!temp)
5479 goto onError;
5480 pbuf = PyUnicode_AS_UNICODE(temp);
5481 len = PyUnicode_GET_SIZE(temp);
5482 /* unbounded ints can always produce
5483 a sign character! */
5484 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005485 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005486 else {
5487 pbuf = formatbuf;
5488 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5489 flags, prec, c, v);
5490 if (len < 0)
5491 goto onError;
5492 /* only d conversion is signed */
5493 sign = c == 'd';
5494 }
5495 if (flags & F_ZERO)
5496 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005497 break;
5498
5499 case 'e':
5500 case 'E':
5501 case 'f':
5502 case 'g':
5503 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005504 pbuf = formatbuf;
5505 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5506 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005507 if (len < 0)
5508 goto onError;
5509 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005510 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005511 fill = '0';
5512 break;
5513
5514 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005515 pbuf = formatbuf;
5516 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005517 if (len < 0)
5518 goto onError;
5519 break;
5520
5521 default:
5522 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005523 "unsupported format character '%c' (0x%x) "
5524 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005525 (31<=c && c<=126) ? c : '?',
5526 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005527 goto onError;
5528 }
5529 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005530 if (*pbuf == '-' || *pbuf == '+') {
5531 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005532 len--;
5533 }
5534 else if (flags & F_SIGN)
5535 sign = '+';
5536 else if (flags & F_BLANK)
5537 sign = ' ';
5538 else
5539 sign = 0;
5540 }
5541 if (width < len)
5542 width = len;
5543 if (rescnt < width + (sign != 0)) {
5544 reslen -= rescnt;
5545 rescnt = width + fmtcnt + 100;
5546 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005547 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005548 return NULL;
5549 res = PyUnicode_AS_UNICODE(result)
5550 + reslen - rescnt;
5551 }
5552 if (sign) {
5553 if (fill != ' ')
5554 *res++ = sign;
5555 rescnt--;
5556 if (width > len)
5557 width--;
5558 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005559 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5560 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005561 assert(pbuf[1] == c);
5562 if (fill != ' ') {
5563 *res++ = *pbuf++;
5564 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005565 }
Tim Petersfff53252001-04-12 18:38:48 +00005566 rescnt -= 2;
5567 width -= 2;
5568 if (width < 0)
5569 width = 0;
5570 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005571 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005572 if (width > len && !(flags & F_LJUST)) {
5573 do {
5574 --rescnt;
5575 *res++ = fill;
5576 } while (--width > len);
5577 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005578 if (fill == ' ') {
5579 if (sign)
5580 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005581 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005582 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005583 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005584 *res++ = *pbuf++;
5585 *res++ = *pbuf++;
5586 }
5587 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005588 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005589 res += len;
5590 rescnt -= len;
5591 while (--width >= len) {
5592 --rescnt;
5593 *res++ = ' ';
5594 }
5595 if (dict && (argidx < arglen) && c != '%') {
5596 PyErr_SetString(PyExc_TypeError,
5597 "not all arguments converted");
5598 goto onError;
5599 }
5600 Py_XDECREF(temp);
5601 } /* '%' */
5602 } /* until end */
5603 if (argidx < arglen && !dict) {
5604 PyErr_SetString(PyExc_TypeError,
5605 "not all arguments converted");
5606 goto onError;
5607 }
5608
5609 if (args_owned) {
5610 Py_DECREF(args);
5611 }
5612 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005613 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005614 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005615 return (PyObject *)result;
5616
5617 onError:
5618 Py_XDECREF(result);
5619 Py_DECREF(uformat);
5620 if (args_owned) {
5621 Py_DECREF(args);
5622 }
5623 return NULL;
5624}
5625
5626static PyBufferProcs unicode_as_buffer = {
5627 (getreadbufferproc) unicode_buffer_getreadbuf,
5628 (getwritebufferproc) unicode_buffer_getwritebuf,
5629 (getsegcountproc) unicode_buffer_getsegcount,
5630 (getcharbufferproc) unicode_buffer_getcharbuf,
5631};
5632
Guido van Rossume023fe02001-08-30 03:12:59 +00005633staticforward PyObject *
5634unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5635
Tim Peters6d6c1a32001-08-02 04:15:00 +00005636static PyObject *
5637unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5638{
5639 PyObject *x = NULL;
5640 static char *kwlist[] = {"string", "encoding", "errors", 0};
5641 char *encoding = NULL;
5642 char *errors = NULL;
5643
Guido van Rossume023fe02001-08-30 03:12:59 +00005644 if (type != &PyUnicode_Type)
5645 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005646 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5647 kwlist, &x, &encoding, &errors))
5648 return NULL;
5649 if (x == NULL)
5650 return (PyObject *)_PyUnicode_New(0);
5651 return PyUnicode_FromEncodedObject(x, encoding, errors);
5652}
5653
Guido van Rossume023fe02001-08-30 03:12:59 +00005654static PyObject *
5655unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5656{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005657 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005658 int n;
5659
5660 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5661 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5662 if (tmp == NULL)
5663 return NULL;
5664 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005665 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5666 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005667 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005668 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5669 if (pnew->str == NULL) {
5670 _Py_ForgetReference((PyObject *)pnew);
5671 PyObject_DEL(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005672 return NULL;
5673 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005674 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5675 pnew->length = n;
5676 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005677 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005678 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005679}
5680
/* Documentation string of the unicode type. */
static char unicode_doc[] =
    "unicode(string [, encoding[, errors]]) -> object\n"
    "\n"
    "Create a new Unicode object from the given encoded string.\n"
    "encoding defaults to the current default string encoding and \n"
    "errors, defining the error handling, to 'strict'.";
5687
Guido van Rossumd57fd912000-03-10 22:53:23 +00005688PyTypeObject PyUnicode_Type = {
5689 PyObject_HEAD_INIT(&PyType_Type)
5690 0, /* ob_size */
5691 "unicode", /* tp_name */
5692 sizeof(PyUnicodeObject), /* tp_size */
5693 0, /* tp_itemsize */
5694 /* Slots */
5695 (destructor)_PyUnicode_Free, /* tp_dealloc */
5696 0, /* tp_print */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005697 0, /* tp_getattr */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698 0, /* tp_setattr */
5699 (cmpfunc) unicode_compare, /* tp_compare */
5700 (reprfunc) unicode_repr, /* tp_repr */
5701 0, /* tp_as_number */
5702 &unicode_as_sequence, /* tp_as_sequence */
5703 0, /* tp_as_mapping */
5704 (hashfunc) unicode_hash, /* tp_hash*/
5705 0, /* tp_call*/
5706 (reprfunc) unicode_str, /* tp_str */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005707 PyObject_GenericGetAttr, /* tp_getattro */
5708 0, /* tp_setattro */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005709 &unicode_as_buffer, /* tp_as_buffer */
Guido van Rossume023fe02001-08-30 03:12:59 +00005710 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
Tim Peters6d6c1a32001-08-02 04:15:00 +00005711 unicode_doc, /* tp_doc */
5712 0, /* tp_traverse */
5713 0, /* tp_clear */
5714 0, /* tp_richcompare */
5715 0, /* tp_weaklistoffset */
5716 0, /* tp_iter */
5717 0, /* tp_iternext */
5718 unicode_methods, /* tp_methods */
5719 0, /* tp_members */
5720 0, /* tp_getset */
5721 0, /* tp_base */
5722 0, /* tp_dict */
5723 0, /* tp_descr_get */
5724 0, /* tp_descr_set */
5725 0, /* tp_dictoffset */
5726 0, /* tp_init */
5727 0, /* tp_alloc */
5728 unicode_new, /* tp_new */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729};
5730
5731/* Initialize the Unicode implementation */
5732
Thomas Wouters78890102000-07-22 19:25:51 +00005733void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005734{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005735 int i;
5736
Fred Drakee4315f52000-05-09 19:53:39 +00005737 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005738 unicode_freelist = NULL;
5739 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005740 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005741 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005742 for (i = 0; i < 256; i++)
5743 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005744}
5745
5746/* Finalize the Unicode implementation */
5747
5748void
Thomas Wouters78890102000-07-22 19:25:51 +00005749_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005750{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005751 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005752 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005753
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005754 Py_XDECREF(unicode_empty);
5755 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005756
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005757 for (i = 0; i < 256; i++) {
5758 if (unicode_latin1[i]) {
5759 Py_DECREF(unicode_latin1[i]);
5760 unicode_latin1[i] = NULL;
5761 }
5762 }
5763
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005764 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005765 PyUnicodeObject *v = u;
5766 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005767 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005768 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005769 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005770 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005771 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005772 unicode_freelist = NULL;
5773 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005774}