blob: 50f2f5c627349cd521cab1716686f24d64a4cd03 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000107PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000222 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
227void _PyUnicode_Free(register PyUnicodeObject *unicode)
228{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000230 /* Keep-Alive optimization */
231 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000232 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 unicode->str = NULL;
234 unicode->length = 0;
235 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000236 if (unicode->defenc) {
237 Py_DECREF(unicode->defenc);
238 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 }
240 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 *(PyUnicodeObject **)unicode = unicode_freelist;
242 unicode_freelist = unicode;
243 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244 }
245 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000247 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249 }
250}
251
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252int PyUnicode_Resize(PyObject **unicode,
253 int length)
254{
255 register PyUnicodeObject *v;
256
257 /* Argument checks */
258 if (unicode == NULL) {
259 PyErr_BadInternalCall();
260 return -1;
261 }
262 v = (PyUnicodeObject *)*unicode;
263 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
264 PyErr_BadInternalCall();
265 return -1;
266 }
267
268 /* Resizing unicode_empty and single character objects is not
269 possible since these are being shared. We simply return a fresh
270 copy with the same Unicode content. */
271 if (v->length != length &&
272 (v == unicode_empty || v->length == 1)) {
273 PyUnicodeObject *w = _PyUnicode_New(length);
274 if (w == NULL)
275 return -1;
276 Py_UNICODE_COPY(w->str, v->str,
277 length < v->length ? length : v->length);
278 *unicode = (PyObject *)w;
279 return 0;
280 }
281
282 /* Note that we don't have to modify *unicode for unshared Unicode
283 objects, since we can modify them in-place. */
284 return unicode_resize(v, length);
285}
286
/* Internal convenience wrapper (for use in unicodeobject.c only!):
   calls PyUnicode_Resize() with the address of a variable of any
   pointer type, e.g. a PyUnicodeObject * local. */
#define _PyUnicode_Resize(unicodevar, length) \
        PyUnicode_Resize((PyObject **)(unicodevar), (length))
290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
292 int size)
293{
294 PyUnicodeObject *unicode;
295
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000296 /* If the Unicode data is known at construction time, we can apply
297 some optimizations which share commonly used objects. */
298 if (u != NULL) {
299
300 /* Optimization for empty strings */
301 if (size == 0 && unicode_empty != NULL) {
302 Py_INCREF(unicode_empty);
303 return (PyObject *)unicode_empty;
304 }
305
306 /* Single character Unicode objects in the Latin-1 range are
307 shared when using this constructor */
308 if (size == 1 && *u < 256) {
309 unicode = unicode_latin1[*u];
310 if (!unicode) {
311 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000312 if (!unicode)
313 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000314 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000315 unicode_latin1[*u] = unicode;
316 }
317 Py_INCREF(unicode);
318 return (PyObject *)unicode;
319 }
320 }
321
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 unicode = _PyUnicode_New(size);
323 if (!unicode)
324 return NULL;
325
326 /* Copy the Unicode data into the new object */
327 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000328 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329
330 return (PyObject *)unicode;
331}
332
333#ifdef HAVE_WCHAR_H
334
335PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
336 int size)
337{
338 PyUnicodeObject *unicode;
339
340 if (w == NULL) {
341 PyErr_BadInternalCall();
342 return NULL;
343 }
344
345 unicode = _PyUnicode_New(size);
346 if (!unicode)
347 return NULL;
348
349 /* Copy the wchar_t data into the new object */
350#ifdef HAVE_USABLE_WCHAR_T
351 memcpy(unicode->str, w, size * sizeof(wchar_t));
352#else
353 {
354 register Py_UNICODE *u;
355 register int i;
356 u = PyUnicode_AS_UNICODE(unicode);
357 for (i = size; i >= 0; i--)
358 *u++ = *w++;
359 }
360#endif
361
362 return (PyObject *)unicode;
363}
364
365int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
366 register wchar_t *w,
367 int size)
368{
369 if (unicode == NULL) {
370 PyErr_BadInternalCall();
371 return -1;
372 }
373 if (size > PyUnicode_GET_SIZE(unicode))
374 size = PyUnicode_GET_SIZE(unicode);
375#ifdef HAVE_USABLE_WCHAR_T
376 memcpy(w, unicode->str, size * sizeof(wchar_t));
377#else
378 {
379 register Py_UNICODE *u;
380 register int i;
381 u = PyUnicode_AS_UNICODE(unicode);
382 for (i = size; i >= 0; i--)
383 *w++ = *u++;
384 }
385#endif
386
387 return size;
388}
389
390#endif
391
392PyObject *PyUnicode_FromObject(register PyObject *obj)
393{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000394 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
395}
396
397PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
398 const char *encoding,
399 const char *errors)
400{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401 const char *s;
402 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000403 int owned = 0;
404 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405
406 if (obj == NULL) {
407 PyErr_BadInternalCall();
408 return NULL;
409 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000410
411 /* Coerce object */
412 if (PyInstance_Check(obj)) {
413 PyObject *func;
414 func = PyObject_GetAttrString(obj, "__str__");
415 if (func == NULL) {
416 PyErr_SetString(PyExc_TypeError,
417 "coercing to Unicode: instance doesn't define __str__");
418 return NULL;
419 }
420 obj = PyEval_CallObject(func, NULL);
421 Py_DECREF(func);
422 if (obj == NULL)
423 return NULL;
424 owned = 1;
425 }
426 if (PyUnicode_Check(obj)) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000427 if (encoding) {
Tim Peters78e0fc72001-09-11 03:07:38 +0000428 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000429 "decoding Unicode is not supported");
Tim Peters78e0fc72001-09-11 03:07:38 +0000430 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000431 }
Tim Peters78e0fc72001-09-11 03:07:38 +0000432 if (PyUnicode_CheckExact(obj)) {
433 Py_INCREF(obj);
434 v = obj;
435 }
436 else {
437 /* For a subclass of unicode, return a true unicode object
438 with the same string value. */
439 v = PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
440 PyUnicode_GET_SIZE(obj));
441 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000442 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000443 }
444 else if (PyString_Check(obj)) {
445 s = PyString_AS_STRING(obj);
446 len = PyString_GET_SIZE(obj);
447 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000448 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
449 /* Overwrite the error message with something more useful in
450 case of a TypeError. */
451 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000452 PyErr_Format(PyExc_TypeError,
453 "coercing to Unicode: need string or buffer, "
454 "%.80s found",
455 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000456 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000457 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000458
459 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 if (len == 0) {
461 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000462 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000463 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000464 else
465 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000466
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000467 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000468 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000469 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000470 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000471 return v;
472
473 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000474 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000475 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000476 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000477 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000478}
479
480PyObject *PyUnicode_Decode(const char *s,
481 int size,
482 const char *encoding,
483 const char *errors)
484{
485 PyObject *buffer = NULL, *unicode;
486
Fred Drakee4315f52000-05-09 19:53:39 +0000487 if (encoding == NULL)
488 encoding = PyUnicode_GetDefaultEncoding();
489
490 /* Shortcuts for common default encodings */
491 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000492 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000493 else if (strcmp(encoding, "latin-1") == 0)
494 return PyUnicode_DecodeLatin1(s, size, errors);
495 else if (strcmp(encoding, "ascii") == 0)
496 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000497
498 /* Decode via the codec registry */
499 buffer = PyBuffer_FromMemory((void *)s, size);
500 if (buffer == NULL)
501 goto onError;
502 unicode = PyCodec_Decode(buffer, encoding, errors);
503 if (unicode == NULL)
504 goto onError;
505 if (!PyUnicode_Check(unicode)) {
506 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000507 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508 unicode->ob_type->tp_name);
509 Py_DECREF(unicode);
510 goto onError;
511 }
512 Py_DECREF(buffer);
513 return unicode;
514
515 onError:
516 Py_XDECREF(buffer);
517 return NULL;
518}
519
520PyObject *PyUnicode_Encode(const Py_UNICODE *s,
521 int size,
522 const char *encoding,
523 const char *errors)
524{
525 PyObject *v, *unicode;
526
527 unicode = PyUnicode_FromUnicode(s, size);
528 if (unicode == NULL)
529 return NULL;
530 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
531 Py_DECREF(unicode);
532 return v;
533}
534
535PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
536 const char *encoding,
537 const char *errors)
538{
539 PyObject *v;
540
541 if (!PyUnicode_Check(unicode)) {
542 PyErr_BadArgument();
543 goto onError;
544 }
Fred Drakee4315f52000-05-09 19:53:39 +0000545
546 if (encoding == NULL)
547 encoding = PyUnicode_GetDefaultEncoding();
548
549 /* Shortcuts for common default encodings */
550 if (errors == NULL) {
551 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000552 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000553 else if (strcmp(encoding, "latin-1") == 0)
554 return PyUnicode_AsLatin1String(unicode);
555 else if (strcmp(encoding, "ascii") == 0)
556 return PyUnicode_AsASCIIString(unicode);
557 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000558
559 /* Encode via the codec registry */
560 v = PyCodec_Encode(unicode, encoding, errors);
561 if (v == NULL)
562 goto onError;
563 /* XXX Should we really enforce this ? */
564 if (!PyString_Check(v)) {
565 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000566 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567 v->ob_type->tp_name);
568 Py_DECREF(v);
569 goto onError;
570 }
571 return v;
572
573 onError:
574 return NULL;
575}
576
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000577PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
578 const char *errors)
579{
580 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
581
582 if (v)
583 return v;
584 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
585 if (v && errors == NULL)
586 ((PyUnicodeObject *)unicode)->defenc = v;
587 return v;
588}
589
Guido van Rossumd57fd912000-03-10 22:53:23 +0000590Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
591{
592 if (!PyUnicode_Check(unicode)) {
593 PyErr_BadArgument();
594 goto onError;
595 }
596 return PyUnicode_AS_UNICODE(unicode);
597
598 onError:
599 return NULL;
600}
601
602int PyUnicode_GetSize(PyObject *unicode)
603{
604 if (!PyUnicode_Check(unicode)) {
605 PyErr_BadArgument();
606 goto onError;
607 }
608 return PyUnicode_GET_SIZE(unicode);
609
610 onError:
611 return -1;
612}
613
Thomas Wouters78890102000-07-22 19:25:51 +0000614const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000615{
616 return unicode_default_encoding;
617}
618
619int PyUnicode_SetDefaultEncoding(const char *encoding)
620{
621 PyObject *v;
622
623 /* Make sure the encoding is valid. As side effect, this also
624 loads the encoding into the codec registry cache. */
625 v = _PyCodec_Lookup(encoding);
626 if (v == NULL)
627 goto onError;
628 Py_DECREF(v);
629 strncpy(unicode_default_encoding,
630 encoding,
631 sizeof(unicode_default_encoding));
632 return 0;
633
634 onError:
635 return -1;
636}
637
/* --- UTF-7 Codec -------------------------------------------------------- */

/* see RFC2152 for details */

/* Classification of the 128 ASCII characters for UTF-7, indexed by
   character code.  It says whether a character is special, i.e.
   cannot be directly encoded:

       0 - not special
       1 - special
       2 - whitespace (optional)
       3 - RFC2152 Set O (optional)

   The table is only ever read, hence const. */
static
const char utf7_special[128] = {
    /* 0x00 - 0x0F: control characters; TAB, LF and CR are whitespace */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
    /* 0x10 - 0x1F: control characters */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 - 0x2F:  sp ! " # $ % & ' ( ) * + , - . / */
    2, 3, 3, 3, 3, 3, 3, 0, 0, 0, 3, 1, 0, 0, 0, 1,
    /* 0x30 - 0x3F:  0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0,
    /* 0x40 - 0x4F:  @ A B C D E F G H I J K L M N O */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F:  P Q R S T U V W X Y Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 1, 3, 3, 3,
    /* 0x60 - 0x6F:  ` a b c d e f g h i j k l m n o */
    3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F:  p q r s t u v w x y z { | } ~ DEL */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 1, 1,
};
660
/* True if character c must be base64-encoded in UTF-7: everything
   above 127, every class-1 character, and optionally the whitespace
   (class 2) and RFC2152 Set O (class 3) characters.  The range test
   comes first so the table is never indexed out of bounds. */
#define SPECIAL(c, encodeO, encodeWS) \
        (((c) > 127 || utf7_special[(c)] == 1) || \
         ((encodeWS) && (utf7_special[(c)] == 2)) || \
         ((encodeO) && (utf7_special[(c)] == 3)))

/* Map the low 6 bits of n to the corresponding base64 character. */
#define B64(n)  \
        ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* True if c is one of the 64 base64 characters.  Plain ASCII range
   tests are used instead of isalnum(), which depends on the locale
   and is undefined for values that do not fit in an unsigned char
   (c may be a Py_UNICODE). */
#define B64CHAR(c) \
        (((c) >= 'A' && (c) <= 'Z') || \
         ((c) >= 'a' && (c) <= 'z') || \
         ((c) >= '0' && (c) <= '9') || \
         (c) == '+' || (c) == '/')

/* Map a base64 character to its 6-bit value; c must satisfy B64CHAR. */
#define UB64(c) \
        ((c) == '+' ? 62 : (c) == '/' ? 63 : (c) >= 'a' ? \
         (c) - 71 : (c) >= 'A' ? (c) - 65 : (c) + 4)

/* Emit base64 characters to out for as long as at least 6 bits are
   pending in ch; bits holds the number of pending bits and is
   reduced accordingly. */
#define ENCODE(out, ch, bits) \
    do { \
        while ((bits) >= 6) { \
            *(out)++ = B64((ch) >> ((bits) - 6)); \
            (bits) -= 6; \
        } \
    } while (0)

/* Emit 16-bit characters to out for as long as at least 16 bits are
   pending in ch.  Values in the range 0xDC00-0xDFFF are not stored:
   errmsg is set and control jumps to the enclosing function's
   utf7Error label, with the surrogate flag set so that the next
   16-bit value is silently dropped.  The expansion relies on
   `errmsg` and `utf7Error` being in scope. */
#define DECODE(out, ch, bits, surrogate) \
    do { \
        while ((bits) >= 16) { \
            Py_UNICODE outCh = (Py_UNICODE) (((ch) >> ((bits) - 16)) & 0xffff); \
            (bits) -= 16; \
            if (surrogate) { \
                /* We have already generated an error for the high \
                   surrogate so let's not bother seeing if the low \
                   surrogate is correct or not */ \
                (surrogate) = 0; \
            } else if (0xDC00 <= outCh && outCh <= 0xDFFF) { \
                /* This is a surrogate pair. Unfortunately we can't \
                   represent it in a 16-bit character */ \
                (surrogate) = 1; \
                errmsg = "code pairs are not supported"; \
                goto utf7Error; \
            } else { \
                *(out)++ = outCh; \
            } \
        } \
    } while (0)

695
696static
697int utf7_decoding_error(Py_UNICODE **dest,
698 const char *errors,
699 const char *details)
700{
701 if ((errors == NULL) ||
702 (strcmp(errors,"strict") == 0)) {
703 PyErr_Format(PyExc_UnicodeError,
704 "UTF-7 decoding error: %.400s",
705 details);
706 return -1;
707 }
708 else if (strcmp(errors,"ignore") == 0) {
709 return 0;
710 }
711 else if (strcmp(errors,"replace") == 0) {
712 if (dest != NULL) {
713 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
714 (*dest)++;
715 }
716 return 0;
717 }
718 else {
719 PyErr_Format(PyExc_ValueError,
720 "UTF-7 decoding error; unknown error handling code: %.400s",
721 errors);
722 return -1;
723 }
724}
725
726PyObject *PyUnicode_DecodeUTF7(const char *s,
727 int size,
728 const char *errors)
729{
730 const char *e;
731 PyUnicodeObject *unicode;
732 Py_UNICODE *p;
733 const char *errmsg = "";
734 int inShift = 0;
735 unsigned int bitsleft = 0;
736 unsigned long charsleft = 0;
737 int surrogate = 0;
738
739 unicode = _PyUnicode_New(size);
740 if (!unicode)
741 return NULL;
742 if (size == 0)
743 return (PyObject *)unicode;
744
745 p = unicode->str;
746 e = s + size;
747
748 while (s < e) {
749 Py_UNICODE ch = *s;
750
751 if (inShift) {
752 if ((ch == '-') || !B64CHAR(ch)) {
753 inShift = 0;
754 s++;
755
756 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
757 if (bitsleft >= 6) {
758 /* The shift sequence has a partial character in it. If
759 bitsleft < 6 then we could just classify it as padding
760 but that is not the case here */
761
762 errmsg = "partial character in shift sequence";
763 goto utf7Error;
764 }
765 /* According to RFC2152 the remaining bits should be zero. We
766 choose to signal an error/insert a replacement character
767 here so indicate the potential of a misencoded character. */
768
769 /* On x86, a << b == a << (b%32) so make sure that bitsleft != 0 */
770 if (bitsleft && charsleft << (sizeof(charsleft) * 8 - bitsleft)) {
771 errmsg = "non-zero padding bits in shift sequence";
772 goto utf7Error;
773 }
774
775 if (ch == '-') {
776 if ((s < e) && (*(s) == '-')) {
777 *p++ = '-';
778 inShift = 1;
779 }
780 } else if (SPECIAL(ch,0,0)) {
781 errmsg = "unexpected special character";
782 goto utf7Error;
783 } else {
784 *p++ = ch;
785 }
786 } else {
787 charsleft = (charsleft << 6) | UB64(ch);
788 bitsleft += 6;
789 s++;
790 /* p, charsleft, bitsleft, surrogate = */ DECODE(p, charsleft, bitsleft, surrogate);
791 }
792 }
793 else if ( ch == '+' ) {
794 s++;
795 if (s < e && *s == '-') {
796 s++;
797 *p++ = '+';
798 } else
799 {
800 inShift = 1;
801 bitsleft = 0;
802 }
803 }
804 else if (SPECIAL(ch,0,0)) {
805 errmsg = "unexpected special character";
806 s++;
807 goto utf7Error;
808 }
809 else {
810 *p++ = ch;
811 s++;
812 }
813 continue;
814 utf7Error:
815 if (utf7_decoding_error(&p, errors, errmsg))
816 goto onError;
817 }
818
819 if (inShift) {
820 if (utf7_decoding_error(&p, errors, "unterminated shift sequence"))
821 goto onError;
822 }
823
824 if (_PyUnicode_Resize(&unicode, p - unicode->str))
825 goto onError;
826
827 return (PyObject *)unicode;
828
829onError:
830 Py_DECREF(unicode);
831 return NULL;
832}
833
834
835PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
836 int size,
837 int encodeSetO,
838 int encodeWhiteSpace,
839 const char *errors)
840{
841 PyObject *v;
842 /* It might be possible to tighten this worst case */
843 unsigned int cbAllocated = 5 * size;
844 int inShift = 0;
845 int i = 0;
846 unsigned int bitsleft = 0;
847 unsigned long charsleft = 0;
848 char * out;
849 char * start;
850
851 if (size == 0)
852 return PyString_FromStringAndSize(NULL, 0);
853
854 v = PyString_FromStringAndSize(NULL, cbAllocated);
855 if (v == NULL)
856 return NULL;
857
858 start = out = PyString_AS_STRING(v);
859 for (;i < size; ++i) {
860 Py_UNICODE ch = s[i];
861
862 if (!inShift) {
863 if (ch == '+') {
864 *out++ = '+';
865 *out++ = '-';
866 } else if (SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
867 charsleft = ch;
868 bitsleft = 16;
869 *out++ = '+';
870 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
871 inShift = bitsleft > 0;
872 } else {
873 *out++ = (char) ch;
874 }
875 } else {
876 if (!SPECIAL(ch, encodeSetO, encodeWhiteSpace)) {
877 *out++ = B64(charsleft << (6-bitsleft));
878 charsleft = 0;
879 bitsleft = 0;
880 /* Characters not in the BASE64 set implicitly unshift the sequence
881 so no '-' is required, except if the character is itself a '-' */
882 if (B64CHAR(ch) || ch == '-') {
883 *out++ = '-';
884 }
885 inShift = 0;
886 *out++ = (char) ch;
887 } else {
888 bitsleft += 16;
889 charsleft = (charsleft << 16) | ch;
890 /* out, charsleft, bitsleft = */ ENCODE(out, charsleft, bitsleft);
891
892 /* If the next character is special then we dont' need to terminate
893 the shift sequence. If the next character is not a BASE64 character
894 or '-' then the shift sequence will be terminated implicitly and we
895 don't have to insert a '-'. */
896
897 if (bitsleft == 0) {
898 if (i + 1 < size) {
899 Py_UNICODE ch2 = s[i+1];
900
901 if (SPECIAL(ch2, encodeSetO, encodeWhiteSpace)) {
902
903 } else if (B64CHAR(ch2) || ch2 == '-') {
904 *out++ = '-';
905 inShift = 0;
906 } else {
907 inShift = 0;
908 }
909
910 }
911 else {
912 *out++ = '-';
913 inShift = 0;
914 }
915 }
916 }
917 }
918 }
919 if (bitsleft) {
920 *out++= B64(charsleft << (6-bitsleft) );
921 *out++ = '-';
922 }
923
924 if (_PyString_Resize(&v, out - start)) {
925 Py_DECREF(v);
926 return NULL;
927 }
928 return v;
929}
930
931#undef SPECIAL
932#undef B64
933#undef B64CHAR
934#undef UB64
935#undef ENCODE
936#undef DECODE
937
/* --- UTF-8 Codec -------------------------------------------------------- */

/* Map a UTF-8 encoded prefix byte to the length of the sequence it
   starts.  Zero means illegal prefix.  See RFC 2279 for details.
   The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single-byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four-, five- and six-byte sequences; 0xFE and
       0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
961
962static
963int utf8_decoding_error(const char **source,
964 Py_UNICODE **dest,
965 const char *errors,
966 const char *details)
967{
968 if ((errors == NULL) ||
969 (strcmp(errors,"strict") == 0)) {
970 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000971 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000972 details);
973 return -1;
974 }
975 else if (strcmp(errors,"ignore") == 0) {
976 (*source)++;
977 return 0;
978 }
979 else if (strcmp(errors,"replace") == 0) {
980 (*source)++;
981 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
982 (*dest)++;
983 return 0;
984 }
985 else {
986 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000987 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000988 errors);
989 return -1;
990 }
991}
992
Guido van Rossumd57fd912000-03-10 22:53:23 +0000993PyObject *PyUnicode_DecodeUTF8(const char *s,
994 int size,
995 const char *errors)
996{
997 int n;
998 const char *e;
999 PyUnicodeObject *unicode;
1000 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001001 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001002
1003 /* Note: size will always be longer than the resulting Unicode
1004 character count */
1005 unicode = _PyUnicode_New(size);
1006 if (!unicode)
1007 return NULL;
1008 if (size == 0)
1009 return (PyObject *)unicode;
1010
1011 /* Unpack UTF-8 encoded data */
1012 p = unicode->str;
1013 e = s + size;
1014
1015 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001016 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001017
1018 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001019 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001020 s++;
1021 continue;
1022 }
1023
1024 n = utf8_code_length[ch];
1025
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001026 if (s + n > e) {
1027 errmsg = "unexpected end of data";
1028 goto utf8Error;
1029 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001030
1031 switch (n) {
1032
1033 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001034 errmsg = "unexpected code byte";
1035 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001036
1037 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001038 errmsg = "internal error";
1039 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001040
1041 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001042 if ((s[1] & 0xc0) != 0x80) {
1043 errmsg = "invalid data";
1044 goto utf8Error;
1045 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001047 if (ch < 0x80) {
1048 errmsg = "illegal encoding";
1049 goto utf8Error;
1050 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001051 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001052 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001053 break;
1054
1055 case 3:
1056 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001057 (s[2] & 0xc0) != 0x80) {
1058 errmsg = "invalid data";
1059 goto utf8Error;
1060 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001061 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001062 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
1063 errmsg = "illegal encoding";
1064 goto utf8Error;
1065 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001066 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001067 *p++ = (Py_UNICODE)ch;
1068 break;
1069
1070 case 4:
1071 if ((s[1] & 0xc0) != 0x80 ||
1072 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001073 (s[3] & 0xc0) != 0x80) {
1074 errmsg = "invalid data";
1075 goto utf8Error;
1076 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001077 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
1078 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
1079 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001080 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001081 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001082 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001083 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001084 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001085 errmsg = "illegal encoding";
1086 goto utf8Error;
1087 }
Fredrik Lundh8f455852001-06-27 18:59:43 +00001088#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001089 *p++ = (Py_UNICODE)ch;
1090#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001091 /* compute and append the two surrogates: */
1092
1093 /* translate from 10000..10FFFF to 0..FFFF */
1094 ch -= 0x10000;
1095
1096 /* high surrogate = top 10 bits added to D800 */
1097 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
1098
1099 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +00001100 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001101#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001102 break;
1103
1104 default:
1105 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001106 errmsg = "unsupported Unicode code range";
1107 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108 }
1109 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001110 continue;
1111
1112 utf8Error:
1113 if (utf8_decoding_error(&s, &p, errors, errmsg))
1114 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 }
1116
1117 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001118 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001119 goto onError;
1120
1121 return (PyObject *)unicode;
1122
1123onError:
1124 Py_DECREF(unicode);
1125 return NULL;
1126}
1127
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the error handling scheme named by `errors` to a UTF-8 encoding
   problem: NULL/"strict" sets a UnicodeError, "ignore" does nothing,
   "replace" appends '?' at *dest, any other name sets a ValueError.
   Returns 0 when encoding may continue and -1 when an exception is set. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || !strcmp(errors, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (!strcmp(errors, "ignore"))
        return 0;

    if (!strcmp(errors, "replace")) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161
1162PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
1163 int size,
1164 const char *errors)
1165{
1166 PyObject *v;
1167 char *p;
1168 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001169 Py_UCS4 ch2;
1170 unsigned int cbAllocated = 3 * size;
1171 unsigned int cbWritten = 0;
1172 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001174 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001175 if (v == NULL)
1176 return NULL;
1177 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001178 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179
1180 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001181 while (i < size) {
1182 Py_UCS4 ch = s[i++];
1183 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001184 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001185 cbWritten++;
1186 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187 else if (ch < 0x0800) {
1188 *p++ = 0xc0 | (ch >> 6);
1189 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001190 cbWritten += 2;
1191 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001192 else if (ch < 0x10000) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001193 /* Check for high surrogate */
1194 if (0xD800 <= ch && ch <= 0xDBFF) {
1195 if (i != size) {
1196 ch2 = s[i];
1197 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
1198
1199 if (cbWritten >= (cbAllocated - 4)) {
1200 /* Provide enough room for some more
1201 surrogates */
1202 cbAllocated += 4*10;
1203 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001204 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001205 }
1206
1207 /* combine the two values */
1208 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
1209
1210 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +00001211 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +00001212 i++;
1213 cbWritten += 4;
1214 }
1215 }
1216 }
1217 else {
1218 *p++ = (char)(0xe0 | (ch >> 12));
1219 cbWritten += 3;
1220 }
1221 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
1222 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001223 } else {
1224 *p++ = 0xf0 | (ch>>18);
1225 *p++ = 0x80 | ((ch>>12) & 0x3f);
1226 *p++ = 0x80 | ((ch>>6) & 0x3f);
1227 *p++ = 0x80 | (ch & 0x3f);
1228 cbWritten += 4;
1229 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001230 }
1231 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001232 if (_PyString_Resize(&v, p - q))
1233 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234 return v;
1235
1236 onError:
1237 Py_DECREF(v);
1238 return NULL;
1239}
1240
Guido van Rossumd57fd912000-03-10 22:53:23 +00001241PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
1242{
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243 if (!PyUnicode_Check(unicode)) {
1244 PyErr_BadArgument();
1245 return NULL;
1246 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +00001247 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1248 PyUnicode_GET_SIZE(unicode),
1249 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001250}
1251
1252/* --- UTF-16 Codec ------------------------------------------------------- */
1253
1254static
Tim Peters772747b2001-08-09 22:21:55 +00001255int utf16_decoding_error(Py_UNICODE **dest,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001256 const char *errors,
1257 const char *details)
1258{
1259 if ((errors == NULL) ||
1260 (strcmp(errors,"strict") == 0)) {
1261 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001262 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001263 details);
1264 return -1;
1265 }
1266 else if (strcmp(errors,"ignore") == 0) {
1267 return 0;
1268 }
1269 else if (strcmp(errors,"replace") == 0) {
1270 if (dest) {
1271 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1272 (*dest)++;
1273 }
1274 return 0;
1275 }
1276 else {
1277 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +00001278 "UTF-16 decoding error; "
1279 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001280 errors);
1281 return -1;
1282 }
1283}
1284
Tim Peters772747b2001-08-09 22:21:55 +00001285PyObject *
1286PyUnicode_DecodeUTF16(const char *s,
1287 int size,
1288 const char *errors,
1289 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001290{
1291 PyUnicodeObject *unicode;
1292 Py_UNICODE *p;
Tim Peters772747b2001-08-09 22:21:55 +00001293 const unsigned char *q, *e;
1294 int bo = 0; /* assume native ordering by default */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001295 const char *errmsg = "";
Tim Peters772747b2001-08-09 22:21:55 +00001296 /* Offsets from q for retrieving byte pairs in the right order. */
1297#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1298 int ihi = 1, ilo = 0;
1299#else
1300 int ihi = 0, ilo = 1;
1301#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001302
1303 /* size should be an even number */
Tim Peters772747b2001-08-09 22:21:55 +00001304 if (size & 1) {
1305 if (utf16_decoding_error(NULL, errors, "truncated data"))
1306 return NULL;
1307 --size; /* else ignore the oddball byte */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 }
1309
1310 /* Note: size will always be longer than the resulting Unicode
1311 character count */
1312 unicode = _PyUnicode_New(size);
1313 if (!unicode)
1314 return NULL;
1315 if (size == 0)
1316 return (PyObject *)unicode;
1317
1318 /* Unpack UTF-16 encoded data */
1319 p = unicode->str;
Tim Peters772747b2001-08-09 22:21:55 +00001320 q = (unsigned char *)s;
1321 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001322
1323 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00001324 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001325
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001326 /* Check for BOM marks (U+FEFF) in the input and adjust current
1327 byte order setting accordingly. In native mode, the leading BOM
1328 mark is skipped, in all other modes, it is copied to the output
1329 stream as-is (giving a ZWNBSP character). */
1330 if (bo == 0) {
Tim Peters772747b2001-08-09 22:21:55 +00001331 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001332#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Tim Peters772747b2001-08-09 22:21:55 +00001333 if (bom == 0xFEFF) {
1334 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001335 bo = -1;
Tim Peters772747b2001-08-09 22:21:55 +00001336 }
1337 else if (bom == 0xFFFE) {
1338 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001339 bo = 1;
1340 }
1341#else
Tim Peters772747b2001-08-09 22:21:55 +00001342 if (bom == 0xFEFF) {
1343 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001344 bo = 1;
Tim Peters772747b2001-08-09 22:21:55 +00001345 }
1346 else if (bom == 0xFFFE) {
1347 q += 2;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001348 bo = -1;
1349 }
1350#endif
1351 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001352
Tim Peters772747b2001-08-09 22:21:55 +00001353 if (bo == -1) {
1354 /* force LE */
1355 ihi = 1;
1356 ilo = 0;
1357 }
1358 else if (bo == 1) {
1359 /* force BE */
1360 ihi = 0;
1361 ilo = 1;
1362 }
1363
1364 while (q < e) {
1365 Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
1366 q += 2;
1367
Guido van Rossumd57fd912000-03-10 22:53:23 +00001368 if (ch < 0xD800 || ch > 0xDFFF) {
1369 *p++ = ch;
1370 continue;
1371 }
1372
1373 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001374 if (q >= e) {
1375 errmsg = "unexpected end of data";
1376 goto utf16Error;
1377 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001378 if (0xD800 <= ch && ch <= 0xDBFF) {
Tim Peters772747b2001-08-09 22:21:55 +00001379 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
1380 q += 2;
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001381 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001382#ifndef Py_UNICODE_WIDE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001383 *p++ = ch;
1384 *p++ = ch2;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001385#else
1386 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001387#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001388 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001389 }
1390 else {
1391 errmsg = "illegal UTF-16 surrogate";
1392 goto utf16Error;
1393 }
1394
Guido van Rossumd57fd912000-03-10 22:53:23 +00001395 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001396 errmsg = "illegal encoding";
1397 /* Fall through to report the error */
1398
1399 utf16Error:
Tim Peters772747b2001-08-09 22:21:55 +00001400 if (utf16_decoding_error(&p, errors, errmsg))
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001401 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001402 }
1403
1404 if (byteorder)
1405 *byteorder = bo;
1406
1407 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001408 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001409 goto onError;
1410
1411 return (PyObject *)unicode;
1412
1413onError:
1414 Py_DECREF(unicode);
1415 return NULL;
1416}
1417
Tim Peters772747b2001-08-09 22:21:55 +00001418PyObject *
1419PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1420 int size,
1421 const char *errors,
1422 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001423{
1424 PyObject *v;
Tim Peters772747b2001-08-09 22:21:55 +00001425 unsigned char *p;
1426 int i, pairs;
1427 /* Offsets from p for storing byte pairs in the right order. */
1428#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1429 int ihi = 1, ilo = 0;
1430#else
1431 int ihi = 0, ilo = 1;
1432#endif
1433
1434#define STORECHAR(CH) \
1435 do { \
1436 p[ihi] = ((CH) >> 8) & 0xff; \
1437 p[ilo] = (CH) & 0xff; \
1438 p += 2; \
1439 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001440
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001441 for (i = pairs = 0; i < size; i++)
1442 if (s[i] >= 0x10000)
1443 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001444 v = PyString_FromStringAndSize(NULL,
Tim Peters772747b2001-08-09 22:21:55 +00001445 2 * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001446 if (v == NULL)
1447 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001448
Tim Peters772747b2001-08-09 22:21:55 +00001449 p = (unsigned char *)PyString_AS_STRING(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001450 if (byteorder == 0)
Tim Peters772747b2001-08-09 22:21:55 +00001451 STORECHAR(0xFEFF);
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001452 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001453 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001454
1455 if (byteorder == -1) {
1456 /* force LE */
1457 ihi = 1;
1458 ilo = 0;
1459 }
1460 else if (byteorder == 1) {
1461 /* force BE */
1462 ihi = 0;
1463 ilo = 1;
1464 }
1465
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001466 while (size-- > 0) {
1467 Py_UNICODE ch = *s++;
1468 Py_UNICODE ch2 = 0;
1469 if (ch >= 0x10000) {
Tim Peters772747b2001-08-09 22:21:55 +00001470 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
1471 ch = 0xD800 | ((ch-0x10000) >> 10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001472 }
Tim Peters772747b2001-08-09 22:21:55 +00001473 STORECHAR(ch);
1474 if (ch2)
1475 STORECHAR(ch2);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001476 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477 return v;
Tim Peters772747b2001-08-09 22:21:55 +00001478#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00001479}
1480
1481PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1482{
1483 if (!PyUnicode_Check(unicode)) {
1484 PyErr_BadArgument();
1485 return NULL;
1486 }
1487 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1488 PyUnicode_GET_SIZE(unicode),
1489 NULL,
1490 0);
1491}
1492
1493/* --- Unicode Escape Codec ----------------------------------------------- */
1494
1495static
1496int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001497 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498 const char *errors,
1499 const char *details)
1500{
1501 if ((errors == NULL) ||
1502 (strcmp(errors,"strict") == 0)) {
1503 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001504 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505 details);
1506 return -1;
1507 }
1508 else if (strcmp(errors,"ignore") == 0) {
1509 return 0;
1510 }
1511 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001512 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001513 return 0;
1514 }
1515 else {
1516 PyErr_Format(PyExc_ValueError,
1517 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001518 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001519 errors);
1520 return -1;
1521 }
1522}
1523
Fredrik Lundh06d12682001-01-24 07:59:11 +00001524static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001525
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1527 int size,
1528 const char *errors)
1529{
1530 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001531 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001532 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001533 char* message;
1534 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1535
Guido van Rossumd57fd912000-03-10 22:53:23 +00001536 /* Escaped strings will always be longer than the resulting
1537 Unicode string, so we start with size here and then reduce the
1538 length after conversion to the true value. */
1539 v = _PyUnicode_New(size);
1540 if (v == NULL)
1541 goto onError;
1542 if (size == 0)
1543 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001544
Guido van Rossumd57fd912000-03-10 22:53:23 +00001545 p = buf = PyUnicode_AS_UNICODE(v);
1546 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001547
Guido van Rossumd57fd912000-03-10 22:53:23 +00001548 while (s < end) {
1549 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001550 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001551 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001552
1553 /* Non-escape characters are interpreted as Unicode ordinals */
1554 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001555 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001556 continue;
1557 }
1558
1559 /* \ - Escapes */
1560 s++;
1561 switch (*s++) {
1562
1563 /* \x escapes */
1564 case '\n': break;
1565 case '\\': *p++ = '\\'; break;
1566 case '\'': *p++ = '\''; break;
1567 case '\"': *p++ = '\"'; break;
1568 case 'b': *p++ = '\b'; break;
1569 case 'f': *p++ = '\014'; break; /* FF */
1570 case 't': *p++ = '\t'; break;
1571 case 'n': *p++ = '\n'; break;
1572 case 'r': *p++ = '\r'; break;
1573 case 'v': *p++ = '\013'; break; /* VT */
1574 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1575
1576 /* \OOO (octal) escapes */
1577 case '0': case '1': case '2': case '3':
1578 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001579 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001580 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001581 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001582 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001583 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001584 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001585 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001586 break;
1587
Fredrik Lundhccc74732001-02-18 22:13:49 +00001588 /* hex escapes */
1589 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001590 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001591 digits = 2;
1592 message = "truncated \\xXX escape";
1593 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001594
Fredrik Lundhccc74732001-02-18 22:13:49 +00001595 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001596 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001597 digits = 4;
1598 message = "truncated \\uXXXX escape";
1599 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001600
Fredrik Lundhccc74732001-02-18 22:13:49 +00001601 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001602 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001603 digits = 8;
1604 message = "truncated \\UXXXXXXXX escape";
1605 hexescape:
1606 chr = 0;
1607 for (i = 0; i < digits; i++) {
1608 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001609 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001610 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001611 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001612 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001613 i++;
1614 break;
1615 }
1616 chr = (chr<<4) & ~0xF;
1617 if (c >= '0' && c <= '9')
1618 chr += c - '0';
1619 else if (c >= 'a' && c <= 'f')
1620 chr += 10 + c - 'a';
1621 else
1622 chr += 10 + c - 'A';
1623 }
1624 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001625 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001626 /* when we get here, chr is a 32-bit unicode character */
1627 if (chr <= 0xffff)
1628 /* UCS-2 character */
1629 *p++ = (Py_UNICODE) chr;
1630 else if (chr <= 0x10ffff) {
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001631 /* UCS-4 character. Either store directly, or as
1632 surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001633#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001634 *p++ = chr;
1635#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001636 chr -= 0x10000L;
1637 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001638 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001639#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001640 } else {
1641 if (unicodeescape_decoding_error(
1642 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001643 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001644 )
1645 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001646 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001647 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001648 break;
1649
1650 /* \N{name} */
1651 case 'N':
1652 message = "malformed \\N character escape";
1653 if (ucnhash_CAPI == NULL) {
1654 /* load the unicode data module */
1655 PyObject *m, *v;
1656 m = PyImport_ImportModule("unicodedata");
1657 if (m == NULL)
1658 goto ucnhashError;
1659 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1660 Py_DECREF(m);
1661 if (v == NULL)
1662 goto ucnhashError;
1663 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1664 Py_DECREF(v);
1665 if (ucnhash_CAPI == NULL)
1666 goto ucnhashError;
1667 }
1668 if (*s == '{') {
1669 const char *start = s+1;
1670 /* look for the closing brace */
1671 while (*s != '}' && s < end)
1672 s++;
1673 if (s > start && s < end && *s == '}') {
1674 /* found a name. look it up in the unicode database */
1675 message = "unknown Unicode character name";
1676 s++;
1677 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1678 goto store;
1679 }
1680 }
1681 if (unicodeescape_decoding_error(&s, &x, errors, message))
1682 goto onError;
1683 *p++ = x;
1684 break;
1685
1686 default:
1687 *p++ = '\\';
1688 *p++ = (unsigned char)s[-1];
1689 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001690 }
1691 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001692 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001693 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694 return (PyObject *)v;
1695
Fredrik Lundhccc74732001-02-18 22:13:49 +00001696ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001697 PyErr_SetString(
1698 PyExc_UnicodeError,
1699 "\\N escapes not supported (can't load unicodedata module)"
1700 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001701 return NULL;
1702
Fredrik Lundhccc74732001-02-18 22:13:49 +00001703onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001704 Py_XDECREF(v);
1705 return NULL;
1706}
1707
1708/* Return a Unicode-Escape string version of the Unicode object.
1709
1710 If quotes is true, the string is enclosed in u"" or u'' quotes as
1711 appropriate.
1712
1713*/
1714
Barry Warsaw51ac5802000-03-20 16:36:48 +00001715static const Py_UNICODE *findchar(const Py_UNICODE *s,
1716 int size,
1717 Py_UNICODE ch);
1718
Guido van Rossumd57fd912000-03-10 22:53:23 +00001719static
1720PyObject *unicodeescape_string(const Py_UNICODE *s,
1721 int size,
1722 int quotes)
1723{
1724 PyObject *repr;
1725 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001726
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001727 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001728
1729 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1730 if (repr == NULL)
1731 return NULL;
1732
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001733 p = PyString_AS_STRING(repr);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734
1735 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001736 *p++ = 'u';
1737 *p++ = (findchar(s, size, '\'') &&
1738 !findchar(s, size, '"')) ? '"' : '\'';
1739 }
1740 while (size-- > 0) {
1741 Py_UNICODE ch = *s++;
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001742
Guido van Rossumd57fd912000-03-10 22:53:23 +00001743 /* Escape quotes */
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001744 if (quotes &&
1745 (ch == (Py_UNICODE) PyString_AS_STRING(repr)[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001746 *p++ = '\\';
1747 *p++ = (char) ch;
1748 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001749
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001750#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001751 /* Map 21-bit characters to '\U00xxxxxx' */
1752 else if (ch >= 0x10000) {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001753 int offset = p - PyString_AS_STRING(repr);
1754
1755 /* Resize the string if necessary */
1756 if (offset + 12 > PyString_GET_SIZE(repr)) {
1757 if (_PyString_Resize(&repr, PyString_GET_SIZE(repr) + 100))
1758 goto onError;
1759 p = PyString_AS_STRING(repr) + offset;
1760 }
1761
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001762 *p++ = '\\';
1763 *p++ = 'U';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001764 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
1765 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
1766 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
1767 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
1768 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
1769 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
1770 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001771 *p++ = hexdigit[ch & 0x0000000F];
1772 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001773 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001774#endif
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001775 /* Map UTF-16 surrogate pairs to Unicode \UXXXXXXXX escapes */
1776 else if (ch >= 0xD800 && ch < 0xDC00) {
1777 Py_UNICODE ch2;
1778 Py_UCS4 ucs;
1779
1780 ch2 = *s++;
1781 size--;
1782 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
1783 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
1784 *p++ = '\\';
1785 *p++ = 'U';
1786 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
1787 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
1788 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
1789 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
1790 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
1791 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
1792 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
1793 *p++ = hexdigit[ucs & 0x0000000F];
1794 continue;
1795 }
1796 /* Fall through: isolated surrogates are copied as-is */
1797 s--;
1798 size++;
1799 }
1800
Guido van Rossumd57fd912000-03-10 22:53:23 +00001801 /* Map 16-bit characters to '\uxxxx' */
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001802 if (ch >= 256) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001803 *p++ = '\\';
1804 *p++ = 'u';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001805 *p++ = hexdigit[(ch >> 12) & 0x000F];
1806 *p++ = hexdigit[(ch >> 8) & 0x000F];
1807 *p++ = hexdigit[(ch >> 4) & 0x000F];
1808 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001809 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001810
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001811 /* Map special whitespace to '\t', \n', '\r' */
1812 else if (ch == '\t') {
1813 *p++ = '\\';
1814 *p++ = 't';
1815 }
1816 else if (ch == '\n') {
1817 *p++ = '\\';
1818 *p++ = 'n';
1819 }
1820 else if (ch == '\r') {
1821 *p++ = '\\';
1822 *p++ = 'r';
1823 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001824
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001825 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001826 else if (ch < ' ' || ch >= 128) {
1827 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001828 *p++ = 'x';
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +00001829 *p++ = hexdigit[(ch >> 4) & 0x000F];
1830 *p++ = hexdigit[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001831 }
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001832
Guido van Rossumd57fd912000-03-10 22:53:23 +00001833 /* Copy everything else as-is */
1834 else
1835 *p++ = (char) ch;
1836 }
1837 if (quotes)
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001838 *p++ = PyString_AS_STRING(repr)[1];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839
1840 *p = '\0';
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00001841 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001842 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001843
1844 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001845
1846 onError:
1847 Py_DECREF(repr);
1848 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849}
1850
1851PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1852 int size)
1853{
1854 return unicodeescape_string(s, size, 0);
1855}
1856
1857PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1858{
1859 if (!PyUnicode_Check(unicode)) {
1860 PyErr_BadArgument();
1861 return NULL;
1862 }
1863 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1864 PyUnicode_GET_SIZE(unicode));
1865}
1866
1867/* --- Raw Unicode Escape Codec ------------------------------------------- */
1868
1869PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1870 int size,
1871 const char *errors)
1872{
1873 PyUnicodeObject *v;
1874 Py_UNICODE *p, *buf;
1875 const char *end;
1876 const char *bs;
1877
1878 /* Escaped strings will always be longer than the resulting
1879 Unicode string, so we start with size here and then reduce the
1880 length after conversion to the true value. */
1881 v = _PyUnicode_New(size);
1882 if (v == NULL)
1883 goto onError;
1884 if (size == 0)
1885 return (PyObject *)v;
1886 p = buf = PyUnicode_AS_UNICODE(v);
1887 end = s + size;
1888 while (s < end) {
1889 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001890 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001891 int i;
1892
1893 /* Non-escape characters are interpreted as Unicode ordinals */
1894 if (*s != '\\') {
1895 *p++ = (unsigned char)*s++;
1896 continue;
1897 }
1898
1899 /* \u-escapes are only interpreted iff the number of leading
1900 backslashes if odd */
1901 bs = s;
1902 for (;s < end;) {
1903 if (*s != '\\')
1904 break;
1905 *p++ = (unsigned char)*s++;
1906 }
1907 if (((s - bs) & 1) == 0 ||
1908 s >= end ||
1909 *s != 'u') {
1910 continue;
1911 }
1912 p--;
1913 s++;
1914
1915 /* \uXXXX with 4 hex digits */
1916 for (x = 0, i = 0; i < 4; i++) {
1917 c = (unsigned char)s[i];
1918 if (!isxdigit(c)) {
1919 if (unicodeescape_decoding_error(&s, &x, errors,
1920 "truncated \\uXXXX"))
1921 goto onError;
1922 i++;
1923 break;
1924 }
1925 x = (x<<4) & ~0xF;
1926 if (c >= '0' && c <= '9')
1927 x += c - '0';
1928 else if (c >= 'a' && c <= 'f')
1929 x += 10 + c - 'a';
1930 else
1931 x += 10 + c - 'A';
1932 }
1933 s += i;
1934 *p++ = x;
1935 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001936 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001937 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001938 return (PyObject *)v;
1939
1940 onError:
1941 Py_XDECREF(v);
1942 return NULL;
1943}
1944
1945PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1946 int size)
1947{
1948 PyObject *repr;
1949 char *p;
1950 char *q;
1951
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001952 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001953
1954 repr = PyString_FromStringAndSize(NULL, 6 * size);
1955 if (repr == NULL)
1956 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001957 if (size == 0)
1958 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959
1960 p = q = PyString_AS_STRING(repr);
1961 while (size-- > 0) {
1962 Py_UNICODE ch = *s++;
1963 /* Map 16-bit characters to '\uxxxx' */
1964 if (ch >= 256) {
1965 *p++ = '\\';
1966 *p++ = 'u';
1967 *p++ = hexdigit[(ch >> 12) & 0xf];
1968 *p++ = hexdigit[(ch >> 8) & 0xf];
1969 *p++ = hexdigit[(ch >> 4) & 0xf];
1970 *p++ = hexdigit[ch & 15];
1971 }
1972 /* Copy everything else as-is */
1973 else
1974 *p++ = (char) ch;
1975 }
1976 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001977 if (_PyString_Resize(&repr, p - q))
1978 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001979
1980 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001981
1982 onError:
1983 Py_DECREF(repr);
1984 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001985}
1986
1987PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1988{
1989 if (!PyUnicode_Check(unicode)) {
1990 PyErr_BadArgument();
1991 return NULL;
1992 }
1993 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1994 PyUnicode_GET_SIZE(unicode));
1995}
1996
1997/* --- Latin-1 Codec ------------------------------------------------------ */
1998
1999PyObject *PyUnicode_DecodeLatin1(const char *s,
2000 int size,
2001 const char *errors)
2002{
2003 PyUnicodeObject *v;
2004 Py_UNICODE *p;
2005
2006 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002007 if (size == 1 && *(unsigned char*)s < 256) {
2008 Py_UNICODE r = *(unsigned char*)s;
2009 return PyUnicode_FromUnicode(&r, 1);
2010 }
2011
Guido van Rossumd57fd912000-03-10 22:53:23 +00002012 v = _PyUnicode_New(size);
2013 if (v == NULL)
2014 goto onError;
2015 if (size == 0)
2016 return (PyObject *)v;
2017 p = PyUnicode_AS_UNICODE(v);
2018 while (size-- > 0)
2019 *p++ = (unsigned char)*s++;
2020 return (PyObject *)v;
2021
2022 onError:
2023 Py_XDECREF(v);
2024 return NULL;
2025}
2026
2027static
2028int latin1_encoding_error(const Py_UNICODE **source,
2029 char **dest,
2030 const char *errors,
2031 const char *details)
2032{
2033 if ((errors == NULL) ||
2034 (strcmp(errors,"strict") == 0)) {
2035 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002036 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037 details);
2038 return -1;
2039 }
2040 else if (strcmp(errors,"ignore") == 0) {
2041 return 0;
2042 }
2043 else if (strcmp(errors,"replace") == 0) {
2044 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002045 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046 return 0;
2047 }
2048 else {
2049 PyErr_Format(PyExc_ValueError,
2050 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002051 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002052 errors);
2053 return -1;
2054 }
2055}
2056
2057PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
2058 int size,
2059 const char *errors)
2060{
2061 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002062 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002063
Guido van Rossumd57fd912000-03-10 22:53:23 +00002064 repr = PyString_FromStringAndSize(NULL, size);
2065 if (repr == NULL)
2066 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002067 if (size == 0)
2068 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069
2070 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002071 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002072 while (size-- > 0) {
2073 Py_UNICODE ch = *p++;
2074 if (ch >= 256) {
2075 if (latin1_encoding_error(&p, &s, errors,
2076 "ordinal not in range(256)"))
2077 goto onError;
2078 }
2079 else
2080 *s++ = (char)ch;
2081 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002082 /* Resize if error handling skipped some characters */
2083 if (s - start < PyString_GET_SIZE(repr))
2084 if (_PyString_Resize(&repr, s - start))
2085 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002086 return repr;
2087
2088 onError:
2089 Py_DECREF(repr);
2090 return NULL;
2091}
2092
2093PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
2094{
2095 if (!PyUnicode_Check(unicode)) {
2096 PyErr_BadArgument();
2097 return NULL;
2098 }
2099 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
2100 PyUnicode_GET_SIZE(unicode),
2101 NULL);
2102}
2103
2104/* --- 7-bit ASCII Codec -------------------------------------------------- */
2105
2106static
2107int ascii_decoding_error(const char **source,
2108 Py_UNICODE **dest,
2109 const char *errors,
2110 const char *details)
2111{
2112 if ((errors == NULL) ||
2113 (strcmp(errors,"strict") == 0)) {
2114 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002115 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 details);
2117 return -1;
2118 }
2119 else if (strcmp(errors,"ignore") == 0) {
2120 return 0;
2121 }
2122 else if (strcmp(errors,"replace") == 0) {
2123 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2124 (*dest)++;
2125 return 0;
2126 }
2127 else {
2128 PyErr_Format(PyExc_ValueError,
2129 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002130 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002131 errors);
2132 return -1;
2133 }
2134}
2135
2136PyObject *PyUnicode_DecodeASCII(const char *s,
2137 int size,
2138 const char *errors)
2139{
2140 PyUnicodeObject *v;
2141 Py_UNICODE *p;
2142
2143 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002144 if (size == 1 && *(unsigned char*)s < 128) {
2145 Py_UNICODE r = *(unsigned char*)s;
2146 return PyUnicode_FromUnicode(&r, 1);
2147 }
2148
Guido van Rossumd57fd912000-03-10 22:53:23 +00002149 v = _PyUnicode_New(size);
2150 if (v == NULL)
2151 goto onError;
2152 if (size == 0)
2153 return (PyObject *)v;
2154 p = PyUnicode_AS_UNICODE(v);
2155 while (size-- > 0) {
2156 register unsigned char c;
2157
2158 c = (unsigned char)*s++;
2159 if (c < 128)
2160 *p++ = c;
2161 else if (ascii_decoding_error(&s, &p, errors,
2162 "ordinal not in range(128)"))
2163 goto onError;
2164 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002165 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002166 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002167 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002168 return (PyObject *)v;
2169
2170 onError:
2171 Py_XDECREF(v);
2172 return NULL;
2173}
2174
2175static
2176int ascii_encoding_error(const Py_UNICODE **source,
2177 char **dest,
2178 const char *errors,
2179 const char *details)
2180{
2181 if ((errors == NULL) ||
2182 (strcmp(errors,"strict") == 0)) {
2183 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002184 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002185 details);
2186 return -1;
2187 }
2188 else if (strcmp(errors,"ignore") == 0) {
2189 return 0;
2190 }
2191 else if (strcmp(errors,"replace") == 0) {
2192 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002193 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002194 return 0;
2195 }
2196 else {
2197 PyErr_Format(PyExc_ValueError,
2198 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002199 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002200 errors);
2201 return -1;
2202 }
2203}
2204
2205PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
2206 int size,
2207 const char *errors)
2208{
2209 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002210 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002211
Guido van Rossumd57fd912000-03-10 22:53:23 +00002212 repr = PyString_FromStringAndSize(NULL, size);
2213 if (repr == NULL)
2214 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002215 if (size == 0)
2216 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217
2218 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002219 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002220 while (size-- > 0) {
2221 Py_UNICODE ch = *p++;
2222 if (ch >= 128) {
2223 if (ascii_encoding_error(&p, &s, errors,
2224 "ordinal not in range(128)"))
2225 goto onError;
2226 }
2227 else
2228 *s++ = (char)ch;
2229 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002230 /* Resize if error handling skipped some characters */
2231 if (s - start < PyString_GET_SIZE(repr))
2232 if (_PyString_Resize(&repr, s - start))
2233 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002234 return repr;
2235
2236 onError:
2237 Py_DECREF(repr);
2238 return NULL;
2239}
2240
2241PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
2242{
2243 if (!PyUnicode_Check(unicode)) {
2244 PyErr_BadArgument();
2245 return NULL;
2246 }
2247 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
2248 PyUnicode_GET_SIZE(unicode),
2249 NULL);
2250}
2251
Fredrik Lundh30831632001-06-26 15:11:00 +00002252#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002253
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002254/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002255
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002256PyObject *PyUnicode_DecodeMBCS(const char *s,
2257 int size,
2258 const char *errors)
2259{
2260 PyUnicodeObject *v;
2261 Py_UNICODE *p;
2262
2263 /* First get the size of the result */
2264 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00002265 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002266 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2267
2268 v = _PyUnicode_New(usize);
2269 if (v == NULL)
2270 return NULL;
2271 if (usize == 0)
2272 return (PyObject *)v;
2273 p = PyUnicode_AS_UNICODE(v);
2274 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
2275 Py_DECREF(v);
2276 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2277 }
2278
2279 return (PyObject *)v;
2280}
2281
2282PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
2283 int size,
2284 const char *errors)
2285{
2286 PyObject *repr;
2287 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00002288 DWORD mbcssize;
2289
2290 /* If there are no characters, bail now! */
2291 if (size==0)
2292 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002293
2294 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00002295 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002296 if (mbcssize==0)
2297 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2298
2299 repr = PyString_FromStringAndSize(NULL, mbcssize);
2300 if (repr == NULL)
2301 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002302 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002303 return repr;
2304
2305 /* Do the conversion */
2306 s = PyString_AS_STRING(repr);
2307 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
2308 Py_DECREF(repr);
2309 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
2310 }
2311 return repr;
2312}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00002313
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00002314#endif /* MS_WIN32 */
2315
Guido van Rossumd57fd912000-03-10 22:53:23 +00002316/* --- Character Mapping Codec -------------------------------------------- */
2317
2318static
2319int charmap_decoding_error(const char **source,
2320 Py_UNICODE **dest,
2321 const char *errors,
2322 const char *details)
2323{
2324 if ((errors == NULL) ||
2325 (strcmp(errors,"strict") == 0)) {
2326 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002327 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002328 details);
2329 return -1;
2330 }
2331 else if (strcmp(errors,"ignore") == 0) {
2332 return 0;
2333 }
2334 else if (strcmp(errors,"replace") == 0) {
2335 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
2336 (*dest)++;
2337 return 0;
2338 }
2339 else {
2340 PyErr_Format(PyExc_ValueError,
2341 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002342 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002343 errors);
2344 return -1;
2345 }
2346}
2347
2348PyObject *PyUnicode_DecodeCharmap(const char *s,
2349 int size,
2350 PyObject *mapping,
2351 const char *errors)
2352{
2353 PyUnicodeObject *v;
2354 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002355 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002356
2357 /* Default to Latin-1 */
2358 if (mapping == NULL)
2359 return PyUnicode_DecodeLatin1(s, size, errors);
2360
2361 v = _PyUnicode_New(size);
2362 if (v == NULL)
2363 goto onError;
2364 if (size == 0)
2365 return (PyObject *)v;
2366 p = PyUnicode_AS_UNICODE(v);
2367 while (size-- > 0) {
2368 unsigned char ch = *s++;
2369 PyObject *w, *x;
2370
2371 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2372 w = PyInt_FromLong((long)ch);
2373 if (w == NULL)
2374 goto onError;
2375 x = PyObject_GetItem(mapping, w);
2376 Py_DECREF(w);
2377 if (x == NULL) {
2378 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002379 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002380 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002381 x = Py_None;
2382 Py_INCREF(x);
2383 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002384 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002385 }
2386
2387 /* Apply mapping */
2388 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002389 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002390 if (value < 0 || value > 65535) {
2391 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002392 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002393 Py_DECREF(x);
2394 goto onError;
2395 }
2396 *p++ = (Py_UNICODE)value;
2397 }
2398 else if (x == Py_None) {
2399 /* undefined mapping */
2400 if (charmap_decoding_error(&s, &p, errors,
2401 "character maps to <undefined>")) {
2402 Py_DECREF(x);
2403 goto onError;
2404 }
2405 }
2406 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002407 int targetsize = PyUnicode_GET_SIZE(x);
2408
2409 if (targetsize == 1)
2410 /* 1-1 mapping */
2411 *p++ = *PyUnicode_AS_UNICODE(x);
2412
2413 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002414 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002415 if (targetsize > extrachars) {
2416 /* resize first */
2417 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2418 int needed = (targetsize - extrachars) + \
2419 (targetsize << 2);
2420 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002421 if (_PyUnicode_Resize(&v,
2422 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002423 Py_DECREF(x);
2424 goto onError;
2425 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002426 p = PyUnicode_AS_UNICODE(v) + oldpos;
2427 }
2428 Py_UNICODE_COPY(p,
2429 PyUnicode_AS_UNICODE(x),
2430 targetsize);
2431 p += targetsize;
2432 extrachars -= targetsize;
2433 }
2434 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002435 }
2436 else {
2437 /* wrong return value */
2438 PyErr_SetString(PyExc_TypeError,
2439 "character mapping must return integer, None or unicode");
2440 Py_DECREF(x);
2441 goto onError;
2442 }
2443 Py_DECREF(x);
2444 }
2445 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002446 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002447 goto onError;
2448 return (PyObject *)v;
2449
2450 onError:
2451 Py_XDECREF(v);
2452 return NULL;
2453}
2454
2455static
2456int charmap_encoding_error(const Py_UNICODE **source,
2457 char **dest,
2458 const char *errors,
2459 const char *details)
2460{
2461 if ((errors == NULL) ||
2462 (strcmp(errors,"strict") == 0)) {
2463 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002464 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002465 details);
2466 return -1;
2467 }
2468 else if (strcmp(errors,"ignore") == 0) {
2469 return 0;
2470 }
2471 else if (strcmp(errors,"replace") == 0) {
2472 **dest = '?';
2473 (*dest)++;
2474 return 0;
2475 }
2476 else {
2477 PyErr_Format(PyExc_ValueError,
2478 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002479 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002480 errors);
2481 return -1;
2482 }
2483}
2484
2485PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2486 int size,
2487 PyObject *mapping,
2488 const char *errors)
2489{
2490 PyObject *v;
2491 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002492 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002493
2494 /* Default to Latin-1 */
2495 if (mapping == NULL)
2496 return PyUnicode_EncodeLatin1(p, size, errors);
2497
2498 v = PyString_FromStringAndSize(NULL, size);
2499 if (v == NULL)
2500 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002501 if (size == 0)
2502 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002503 s = PyString_AS_STRING(v);
2504 while (size-- > 0) {
2505 Py_UNICODE ch = *p++;
2506 PyObject *w, *x;
2507
2508 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2509 w = PyInt_FromLong((long)ch);
2510 if (w == NULL)
2511 goto onError;
2512 x = PyObject_GetItem(mapping, w);
2513 Py_DECREF(w);
2514 if (x == NULL) {
2515 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002516 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002517 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002518 x = Py_None;
2519 Py_INCREF(x);
2520 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002521 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002522 }
2523
2524 /* Apply mapping */
2525 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002526 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002527 if (value < 0 || value > 255) {
2528 PyErr_SetString(PyExc_TypeError,
2529 "character mapping must be in range(256)");
2530 Py_DECREF(x);
2531 goto onError;
2532 }
2533 *s++ = (char)value;
2534 }
2535 else if (x == Py_None) {
2536 /* undefined mapping */
2537 if (charmap_encoding_error(&p, &s, errors,
2538 "character maps to <undefined>")) {
2539 Py_DECREF(x);
2540 goto onError;
2541 }
2542 }
2543 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002544 int targetsize = PyString_GET_SIZE(x);
2545
2546 if (targetsize == 1)
2547 /* 1-1 mapping */
2548 *s++ = *PyString_AS_STRING(x);
2549
2550 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002551 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002552 if (targetsize > extrachars) {
2553 /* resize first */
2554 int oldpos = (int)(s - PyString_AS_STRING(v));
2555 int needed = (targetsize - extrachars) + \
2556 (targetsize << 2);
2557 extrachars += needed;
2558 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002559 Py_DECREF(x);
2560 goto onError;
2561 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002562 s = PyString_AS_STRING(v) + oldpos;
2563 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002564 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002565 s += targetsize;
2566 extrachars -= targetsize;
2567 }
2568 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002569 }
2570 else {
2571 /* wrong return value */
2572 PyErr_SetString(PyExc_TypeError,
2573 "character mapping must return integer, None or unicode");
2574 Py_DECREF(x);
2575 goto onError;
2576 }
2577 Py_DECREF(x);
2578 }
2579 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2580 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2581 goto onError;
2582 return v;
2583
2584 onError:
2585 Py_DECREF(v);
2586 return NULL;
2587}
2588
2589PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2590 PyObject *mapping)
2591{
2592 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2593 PyErr_BadArgument();
2594 return NULL;
2595 }
2596 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2597 PyUnicode_GET_SIZE(unicode),
2598 mapping,
2599 NULL);
2600}
2601
2602static
2603int translate_error(const Py_UNICODE **source,
2604 Py_UNICODE **dest,
2605 const char *errors,
2606 const char *details)
2607{
2608 if ((errors == NULL) ||
2609 (strcmp(errors,"strict") == 0)) {
2610 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002611 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002612 details);
2613 return -1;
2614 }
2615 else if (strcmp(errors,"ignore") == 0) {
2616 return 0;
2617 }
2618 else if (strcmp(errors,"replace") == 0) {
2619 **dest = '?';
2620 (*dest)++;
2621 return 0;
2622 }
2623 else {
2624 PyErr_Format(PyExc_ValueError,
2625 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002626 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002627 errors);
2628 return -1;
2629 }
2630}
2631
2632PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2633 int size,
2634 PyObject *mapping,
2635 const char *errors)
2636{
2637 PyUnicodeObject *v;
2638 Py_UNICODE *p;
2639
2640 if (mapping == NULL) {
2641 PyErr_BadArgument();
2642 return NULL;
2643 }
2644
2645 /* Output will never be longer than input */
2646 v = _PyUnicode_New(size);
2647 if (v == NULL)
2648 goto onError;
2649 if (size == 0)
2650 goto done;
2651 p = PyUnicode_AS_UNICODE(v);
2652 while (size-- > 0) {
2653 Py_UNICODE ch = *s++;
2654 PyObject *w, *x;
2655
2656 /* Get mapping */
2657 w = PyInt_FromLong(ch);
2658 if (w == NULL)
2659 goto onError;
2660 x = PyObject_GetItem(mapping, w);
2661 Py_DECREF(w);
2662 if (x == NULL) {
2663 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2664 /* No mapping found: default to 1-1 mapping */
2665 PyErr_Clear();
2666 *p++ = ch;
2667 continue;
2668 }
2669 goto onError;
2670 }
2671
2672 /* Apply mapping */
2673 if (PyInt_Check(x))
2674 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2675 else if (x == Py_None) {
2676 /* undefined mapping */
2677 if (translate_error(&s, &p, errors,
2678 "character maps to <undefined>")) {
2679 Py_DECREF(x);
2680 goto onError;
2681 }
2682 }
2683 else if (PyUnicode_Check(x)) {
2684 if (PyUnicode_GET_SIZE(x) != 1) {
2685 /* 1-n mapping */
2686 PyErr_SetString(PyExc_NotImplementedError,
2687 "1-n mappings are currently not implemented");
2688 Py_DECREF(x);
2689 goto onError;
2690 }
2691 *p++ = *PyUnicode_AS_UNICODE(x);
2692 }
2693 else {
2694 /* wrong return value */
2695 PyErr_SetString(PyExc_TypeError,
2696 "translate mapping must return integer, None or unicode");
2697 Py_DECREF(x);
2698 goto onError;
2699 }
2700 Py_DECREF(x);
2701 }
2702 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002703 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002704 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002705
2706 done:
2707 return (PyObject *)v;
2708
2709 onError:
2710 Py_XDECREF(v);
2711 return NULL;
2712}
2713
2714PyObject *PyUnicode_Translate(PyObject *str,
2715 PyObject *mapping,
2716 const char *errors)
2717{
2718 PyObject *result;
2719
2720 str = PyUnicode_FromObject(str);
2721 if (str == NULL)
2722 goto onError;
2723 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2724 PyUnicode_GET_SIZE(str),
2725 mapping,
2726 errors);
2727 Py_DECREF(str);
2728 return result;
2729
2730 onError:
2731 Py_XDECREF(str);
2732 return NULL;
2733}
2734
Guido van Rossum9e896b32000-04-05 20:11:21 +00002735/* --- Decimal Encoder ---------------------------------------------------- */
2736
2737int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2738 int length,
2739 char *output,
2740 const char *errors)
2741{
2742 Py_UNICODE *p, *end;
2743
2744 if (output == NULL) {
2745 PyErr_BadArgument();
2746 return -1;
2747 }
2748
2749 p = s;
2750 end = s + length;
2751 while (p < end) {
2752 register Py_UNICODE ch = *p++;
2753 int decimal;
2754
2755 if (Py_UNICODE_ISSPACE(ch)) {
2756 *output++ = ' ';
2757 continue;
2758 }
2759 decimal = Py_UNICODE_TODECIMAL(ch);
2760 if (decimal >= 0) {
2761 *output++ = '0' + decimal;
2762 continue;
2763 }
Guido van Rossumba477042000-04-06 18:18:10 +00002764 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002765 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002766 continue;
2767 }
2768 /* All other characters are considered invalid */
2769 if (errors == NULL || strcmp(errors, "strict") == 0) {
2770 PyErr_SetString(PyExc_ValueError,
2771 "invalid decimal Unicode string");
2772 goto onError;
2773 }
2774 else if (strcmp(errors, "ignore") == 0)
2775 continue;
2776 else if (strcmp(errors, "replace") == 0) {
2777 *output++ = '?';
2778 continue;
2779 }
2780 }
2781 /* 0-terminate the output string */
2782 *output++ = '\0';
2783 return 0;
2784
2785 onError:
2786 return -1;
2787}
2788
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789/* --- Helpers ------------------------------------------------------------ */
2790
2791static
2792int count(PyUnicodeObject *self,
2793 int start,
2794 int end,
2795 PyUnicodeObject *substring)
2796{
2797 int count = 0;
2798
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002799 if (start < 0)
2800 start += self->length;
2801 if (start < 0)
2802 start = 0;
2803 if (end > self->length)
2804 end = self->length;
2805 if (end < 0)
2806 end += self->length;
2807 if (end < 0)
2808 end = 0;
2809
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002810 if (substring->length == 0)
2811 return (end - start + 1);
2812
Guido van Rossumd57fd912000-03-10 22:53:23 +00002813 end -= substring->length;
2814
2815 while (start <= end)
2816 if (Py_UNICODE_MATCH(self, start, substring)) {
2817 count++;
2818 start += substring->length;
2819 } else
2820 start++;
2821
2822 return count;
2823}
2824
2825int PyUnicode_Count(PyObject *str,
2826 PyObject *substr,
2827 int start,
2828 int end)
2829{
2830 int result;
2831
2832 str = PyUnicode_FromObject(str);
2833 if (str == NULL)
2834 return -1;
2835 substr = PyUnicode_FromObject(substr);
2836 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002837 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838 return -1;
2839 }
2840
2841 result = count((PyUnicodeObject *)str,
2842 start, end,
2843 (PyUnicodeObject *)substr);
2844
2845 Py_DECREF(str);
2846 Py_DECREF(substr);
2847 return result;
2848}
2849
2850static
2851int findstring(PyUnicodeObject *self,
2852 PyUnicodeObject *substring,
2853 int start,
2854 int end,
2855 int direction)
2856{
2857 if (start < 0)
2858 start += self->length;
2859 if (start < 0)
2860 start = 0;
2861
2862 if (substring->length == 0)
2863 return start;
2864
2865 if (end > self->length)
2866 end = self->length;
2867 if (end < 0)
2868 end += self->length;
2869 if (end < 0)
2870 end = 0;
2871
2872 end -= substring->length;
2873
2874 if (direction < 0) {
2875 for (; end >= start; end--)
2876 if (Py_UNICODE_MATCH(self, end, substring))
2877 return end;
2878 } else {
2879 for (; start <= end; start++)
2880 if (Py_UNICODE_MATCH(self, start, substring))
2881 return start;
2882 }
2883
2884 return -1;
2885}
2886
2887int PyUnicode_Find(PyObject *str,
2888 PyObject *substr,
2889 int start,
2890 int end,
2891 int direction)
2892{
2893 int result;
2894
2895 str = PyUnicode_FromObject(str);
2896 if (str == NULL)
2897 return -1;
2898 substr = PyUnicode_FromObject(substr);
2899 if (substr == NULL) {
2900 Py_DECREF(substr);
2901 return -1;
2902 }
2903
2904 result = findstring((PyUnicodeObject *)str,
2905 (PyUnicodeObject *)substr,
2906 start, end, direction);
2907 Py_DECREF(str);
2908 Py_DECREF(substr);
2909 return result;
2910}
2911
2912static
2913int tailmatch(PyUnicodeObject *self,
2914 PyUnicodeObject *substring,
2915 int start,
2916 int end,
2917 int direction)
2918{
2919 if (start < 0)
2920 start += self->length;
2921 if (start < 0)
2922 start = 0;
2923
2924 if (substring->length == 0)
2925 return 1;
2926
2927 if (end > self->length)
2928 end = self->length;
2929 if (end < 0)
2930 end += self->length;
2931 if (end < 0)
2932 end = 0;
2933
2934 end -= substring->length;
2935 if (end < start)
2936 return 0;
2937
2938 if (direction > 0) {
2939 if (Py_UNICODE_MATCH(self, end, substring))
2940 return 1;
2941 } else {
2942 if (Py_UNICODE_MATCH(self, start, substring))
2943 return 1;
2944 }
2945
2946 return 0;
2947}
2948
2949int PyUnicode_Tailmatch(PyObject *str,
2950 PyObject *substr,
2951 int start,
2952 int end,
2953 int direction)
2954{
2955 int result;
2956
2957 str = PyUnicode_FromObject(str);
2958 if (str == NULL)
2959 return -1;
2960 substr = PyUnicode_FromObject(substr);
2961 if (substr == NULL) {
2962 Py_DECREF(substr);
2963 return -1;
2964 }
2965
2966 result = tailmatch((PyUnicodeObject *)str,
2967 (PyUnicodeObject *)substr,
2968 start, end, direction);
2969 Py_DECREF(str);
2970 Py_DECREF(substr);
2971 return result;
2972}
2973
2974static
2975const Py_UNICODE *findchar(const Py_UNICODE *s,
2976 int size,
2977 Py_UNICODE ch)
2978{
2979 /* like wcschr, but doesn't stop at NULL characters */
2980
2981 while (size-- > 0) {
2982 if (*s == ch)
2983 return s;
2984 s++;
2985 }
2986
2987 return NULL;
2988}
2989
2990/* Apply fixfct filter to the Unicode object self and return a
2991 reference to the modified object */
2992
2993static
2994PyObject *fixup(PyUnicodeObject *self,
2995 int (*fixfct)(PyUnicodeObject *s))
2996{
2997
2998 PyUnicodeObject *u;
2999
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003000 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003001 if (u == NULL)
3002 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003003
3004 Py_UNICODE_COPY(u->str, self->str, self->length);
3005
Tim Peters7a29bd52001-09-12 03:03:31 +00003006 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003007 /* fixfct should return TRUE if it modified the buffer. If
3008 FALSE, return a reference to the original buffer instead
3009 (to save space, not time) */
3010 Py_INCREF(self);
3011 Py_DECREF(u);
3012 return (PyObject*) self;
3013 }
3014 return (PyObject*) u;
3015}
3016
3017static
3018int fixupper(PyUnicodeObject *self)
3019{
3020 int len = self->length;
3021 Py_UNICODE *s = self->str;
3022 int status = 0;
3023
3024 while (len-- > 0) {
3025 register Py_UNICODE ch;
3026
3027 ch = Py_UNICODE_TOUPPER(*s);
3028 if (ch != *s) {
3029 status = 1;
3030 *s = ch;
3031 }
3032 s++;
3033 }
3034
3035 return status;
3036}
3037
3038static
3039int fixlower(PyUnicodeObject *self)
3040{
3041 int len = self->length;
3042 Py_UNICODE *s = self->str;
3043 int status = 0;
3044
3045 while (len-- > 0) {
3046 register Py_UNICODE ch;
3047
3048 ch = Py_UNICODE_TOLOWER(*s);
3049 if (ch != *s) {
3050 status = 1;
3051 *s = ch;
3052 }
3053 s++;
3054 }
3055
3056 return status;
3057}
3058
3059static
3060int fixswapcase(PyUnicodeObject *self)
3061{
3062 int len = self->length;
3063 Py_UNICODE *s = self->str;
3064 int status = 0;
3065
3066 while (len-- > 0) {
3067 if (Py_UNICODE_ISUPPER(*s)) {
3068 *s = Py_UNICODE_TOLOWER(*s);
3069 status = 1;
3070 } else if (Py_UNICODE_ISLOWER(*s)) {
3071 *s = Py_UNICODE_TOUPPER(*s);
3072 status = 1;
3073 }
3074 s++;
3075 }
3076
3077 return status;
3078}
3079
3080static
3081int fixcapitalize(PyUnicodeObject *self)
3082{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003083 int len = self->length;
3084 Py_UNICODE *s = self->str;
3085 int status = 0;
3086
3087 if (len == 0)
3088 return 0;
3089 if (Py_UNICODE_ISLOWER(*s)) {
3090 *s = Py_UNICODE_TOUPPER(*s);
3091 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00003093 s++;
3094 while (--len > 0) {
3095 if (Py_UNICODE_ISUPPER(*s)) {
3096 *s = Py_UNICODE_TOLOWER(*s);
3097 status = 1;
3098 }
3099 s++;
3100 }
3101 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003102}
3103
3104static
3105int fixtitle(PyUnicodeObject *self)
3106{
3107 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3108 register Py_UNICODE *e;
3109 int previous_is_cased;
3110
3111 /* Shortcut for single character strings */
3112 if (PyUnicode_GET_SIZE(self) == 1) {
3113 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
3114 if (*p != ch) {
3115 *p = ch;
3116 return 1;
3117 }
3118 else
3119 return 0;
3120 }
3121
3122 e = p + PyUnicode_GET_SIZE(self);
3123 previous_is_cased = 0;
3124 for (; p < e; p++) {
3125 register const Py_UNICODE ch = *p;
3126
3127 if (previous_is_cased)
3128 *p = Py_UNICODE_TOLOWER(ch);
3129 else
3130 *p = Py_UNICODE_TOTITLE(ch);
3131
3132 if (Py_UNICODE_ISLOWER(ch) ||
3133 Py_UNICODE_ISUPPER(ch) ||
3134 Py_UNICODE_ISTITLE(ch))
3135 previous_is_cased = 1;
3136 else
3137 previous_is_cased = 0;
3138 }
3139 return 1;
3140}
3141
3142PyObject *PyUnicode_Join(PyObject *separator,
3143 PyObject *seq)
3144{
3145 Py_UNICODE *sep;
3146 int seplen;
3147 PyUnicodeObject *res = NULL;
3148 int reslen = 0;
3149 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003150 int sz = 100;
3151 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00003152 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003153
Tim Peters2cfe3682001-05-05 05:36:48 +00003154 it = PyObject_GetIter(seq);
3155 if (it == NULL)
3156 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003157
3158 if (separator == NULL) {
3159 Py_UNICODE blank = ' ';
3160 sep = &blank;
3161 seplen = 1;
3162 }
3163 else {
3164 separator = PyUnicode_FromObject(separator);
3165 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00003166 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167 sep = PyUnicode_AS_UNICODE(separator);
3168 seplen = PyUnicode_GET_SIZE(separator);
3169 }
3170
3171 res = _PyUnicode_New(sz);
3172 if (res == NULL)
3173 goto onError;
3174 p = PyUnicode_AS_UNICODE(res);
3175 reslen = 0;
3176
Tim Peters2cfe3682001-05-05 05:36:48 +00003177 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003178 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00003179 PyObject *item = PyIter_Next(it);
3180 if (item == NULL) {
3181 if (PyErr_Occurred())
3182 goto onError;
3183 break;
3184 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003185 if (!PyUnicode_Check(item)) {
3186 PyObject *v;
3187 v = PyUnicode_FromObject(item);
3188 Py_DECREF(item);
3189 item = v;
3190 if (item == NULL)
3191 goto onError;
3192 }
3193 itemlen = PyUnicode_GET_SIZE(item);
3194 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003195 if (_PyUnicode_Resize(&res, sz*2))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003196 goto onError;
3197 sz *= 2;
3198 p = PyUnicode_AS_UNICODE(res) + reslen;
3199 }
3200 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003201 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003202 p += seplen;
3203 reslen += seplen;
3204 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003205 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003206 p += itemlen;
3207 reslen += itemlen;
3208 Py_DECREF(item);
3209 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003210 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211 goto onError;
3212
3213 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003214 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003215 return (PyObject *)res;
3216
3217 onError:
3218 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00003219 Py_XDECREF(res);
3220 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221 return NULL;
3222}
3223
3224static
3225PyUnicodeObject *pad(PyUnicodeObject *self,
3226 int left,
3227 int right,
3228 Py_UNICODE fill)
3229{
3230 PyUnicodeObject *u;
3231
3232 if (left < 0)
3233 left = 0;
3234 if (right < 0)
3235 right = 0;
3236
Tim Peters7a29bd52001-09-12 03:03:31 +00003237 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003238 Py_INCREF(self);
3239 return self;
3240 }
3241
3242 u = _PyUnicode_New(left + self->length + right);
3243 if (u) {
3244 if (left)
3245 Py_UNICODE_FILL(u->str, fill, left);
3246 Py_UNICODE_COPY(u->str + left, self->str, self->length);
3247 if (right)
3248 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
3249 }
3250
3251 return u;
3252}
3253
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expanding function must provide a `PyObject *str` scratch
   variable, the `list` being filled and an `onError` label; control
   jumps to onError if the slice cannot be created or appended.

   The body is wrapped in do { } while (0) so that the macro expands
   to exactly one statement, and the arguments are parenthesized so
   that arbitrary expressions can be passed safely. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list holds its own reference now */                     \
        Py_DECREF(str);                                                 \
    } while (0)
3264
3265static
3266PyObject *split_whitespace(PyUnicodeObject *self,
3267 PyObject *list,
3268 int maxcount)
3269{
3270 register int i;
3271 register int j;
3272 int len = self->length;
3273 PyObject *str;
3274
3275 for (i = j = 0; i < len; ) {
3276 /* find a token */
3277 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3278 i++;
3279 j = i;
3280 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
3281 i++;
3282 if (j < i) {
3283 if (maxcount-- <= 0)
3284 break;
3285 SPLIT_APPEND(self->str, j, i);
3286 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
3287 i++;
3288 j = i;
3289 }
3290 }
3291 if (j < len) {
3292 SPLIT_APPEND(self->str, j, len);
3293 }
3294 return list;
3295
3296 onError:
3297 Py_DECREF(list);
3298 return NULL;
3299}
3300
3301PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00003302 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303{
3304 register int i;
3305 register int j;
3306 int len;
3307 PyObject *list;
3308 PyObject *str;
3309 Py_UNICODE *data;
3310
3311 string = PyUnicode_FromObject(string);
3312 if (string == NULL)
3313 return NULL;
3314 data = PyUnicode_AS_UNICODE(string);
3315 len = PyUnicode_GET_SIZE(string);
3316
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317 list = PyList_New(0);
3318 if (!list)
3319 goto onError;
3320
3321 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00003322 int eol;
3323
Guido van Rossumd57fd912000-03-10 22:53:23 +00003324 /* Find a line and append it */
3325 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
3326 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327
3328 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00003329 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003330 if (i < len) {
3331 if (data[i] == '\r' && i + 1 < len &&
3332 data[i+1] == '\n')
3333 i += 2;
3334 else
3335 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00003336 if (keepends)
3337 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003338 }
Guido van Rossum86662912000-04-11 15:38:46 +00003339 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003340 j = i;
3341 }
3342 if (j < len) {
3343 SPLIT_APPEND(data, j, len);
3344 }
3345
3346 Py_DECREF(string);
3347 return list;
3348
3349 onError:
3350 Py_DECREF(list);
3351 Py_DECREF(string);
3352 return NULL;
3353}
3354
3355static
3356PyObject *split_char(PyUnicodeObject *self,
3357 PyObject *list,
3358 Py_UNICODE ch,
3359 int maxcount)
3360{
3361 register int i;
3362 register int j;
3363 int len = self->length;
3364 PyObject *str;
3365
3366 for (i = j = 0; i < len; ) {
3367 if (self->str[i] == ch) {
3368 if (maxcount-- <= 0)
3369 break;
3370 SPLIT_APPEND(self->str, j, i);
3371 i = j = i + 1;
3372 } else
3373 i++;
3374 }
3375 if (j <= len) {
3376 SPLIT_APPEND(self->str, j, len);
3377 }
3378 return list;
3379
3380 onError:
3381 Py_DECREF(list);
3382 return NULL;
3383}
3384
3385static
3386PyObject *split_substring(PyUnicodeObject *self,
3387 PyObject *list,
3388 PyUnicodeObject *substring,
3389 int maxcount)
3390{
3391 register int i;
3392 register int j;
3393 int len = self->length;
3394 int sublen = substring->length;
3395 PyObject *str;
3396
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003397 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003398 if (Py_UNICODE_MATCH(self, i, substring)) {
3399 if (maxcount-- <= 0)
3400 break;
3401 SPLIT_APPEND(self->str, j, i);
3402 i = j = i + sublen;
3403 } else
3404 i++;
3405 }
3406 if (j <= len) {
3407 SPLIT_APPEND(self->str, j, len);
3408 }
3409 return list;
3410
3411 onError:
3412 Py_DECREF(list);
3413 return NULL;
3414}
3415
3416#undef SPLIT_APPEND
3417
3418static
3419PyObject *split(PyUnicodeObject *self,
3420 PyUnicodeObject *substring,
3421 int maxcount)
3422{
3423 PyObject *list;
3424
3425 if (maxcount < 0)
3426 maxcount = INT_MAX;
3427
3428 list = PyList_New(0);
3429 if (!list)
3430 return NULL;
3431
3432 if (substring == NULL)
3433 return split_whitespace(self,list,maxcount);
3434
3435 else if (substring->length == 1)
3436 return split_char(self,list,substring->str[0],maxcount);
3437
3438 else if (substring->length == 0) {
3439 Py_DECREF(list);
3440 PyErr_SetString(PyExc_ValueError, "empty separator");
3441 return NULL;
3442 }
3443 else
3444 return split_substring(self,list,substring,maxcount);
3445}
3446
3447static
3448PyObject *strip(PyUnicodeObject *self,
3449 int left,
3450 int right)
3451{
3452 Py_UNICODE *p = self->str;
3453 int start = 0;
3454 int end = self->length;
3455
3456 if (left)
3457 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3458 start++;
3459
3460 if (right)
3461 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3462 end--;
3463
Tim Peters7a29bd52001-09-12 03:03:31 +00003464 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003465 /* couldn't strip anything off, return original string */
3466 Py_INCREF(self);
3467 return (PyObject*) self;
3468 }
3469
3470 return (PyObject*) PyUnicode_FromUnicode(
3471 self->str + start,
3472 end - start
3473 );
3474}
3475
3476static
3477PyObject *replace(PyUnicodeObject *self,
3478 PyUnicodeObject *str1,
3479 PyUnicodeObject *str2,
3480 int maxcount)
3481{
3482 PyUnicodeObject *u;
3483
3484 if (maxcount < 0)
3485 maxcount = INT_MAX;
3486
3487 if (str1->length == 1 && str2->length == 1) {
3488 int i;
3489
3490 /* replace characters */
Tim Peters7a29bd52001-09-12 03:03:31 +00003491 if (!findchar(self->str, self->length, str1->str[0]) &&
3492 PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003493 /* nothing to replace, return original string */
3494 Py_INCREF(self);
3495 u = self;
3496 } else {
3497 Py_UNICODE u1 = str1->str[0];
3498 Py_UNICODE u2 = str2->str[0];
3499
3500 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003501 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003502 self->length
3503 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003504 if (u != NULL) {
3505 Py_UNICODE_COPY(u->str, self->str,
3506 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003507 for (i = 0; i < u->length; i++)
3508 if (u->str[i] == u1) {
3509 if (--maxcount < 0)
3510 break;
3511 u->str[i] = u2;
3512 }
3513 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003514 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003515
3516 } else {
3517 int n, i;
3518 Py_UNICODE *p;
3519
3520 /* replace strings */
3521 n = count(self, 0, self->length, str1);
3522 if (n > maxcount)
3523 n = maxcount;
Tim Peters7a29bd52001-09-12 03:03:31 +00003524 if (n == 0 && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003525 /* nothing to replace, return original string */
3526 Py_INCREF(self);
3527 u = self;
3528 } else {
3529 u = _PyUnicode_New(
3530 self->length + n * (str2->length - str1->length));
3531 if (u) {
3532 i = 0;
3533 p = u->str;
3534 while (i <= self->length - str1->length)
3535 if (Py_UNICODE_MATCH(self, i, str1)) {
3536 /* replace string segment */
3537 Py_UNICODE_COPY(p, str2->str, str2->length);
3538 p += str2->length;
3539 i += str1->length;
3540 if (--n <= 0) {
3541 /* copy remaining part */
3542 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3543 break;
3544 }
3545 } else
3546 *p++ = self->str[i++];
3547 }
3548 }
3549 }
3550
3551 return (PyObject *) u;
3552}
3553
3554/* --- Unicode Object Methods --------------------------------------------- */
3555
3556static char title__doc__[] =
3557"S.title() -> unicode\n\
3558\n\
3559Return a titlecased version of S, i.e. words start with title case\n\
3560characters, all remaining cased characters have lower case.";
3561
3562static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003563unicode_title(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003565 return fixup(self, fixtitle);
3566}
3567
3568static char capitalize__doc__[] =
3569"S.capitalize() -> unicode\n\
3570\n\
3571Return a capitalized version of S, i.e. make the first character\n\
3572have upper case.";
3573
3574static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003575unicode_capitalize(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003576{
Guido van Rossumd57fd912000-03-10 22:53:23 +00003577 return fixup(self, fixcapitalize);
3578}
3579
3580#if 0
3581static char capwords__doc__[] =
3582"S.capwords() -> unicode\n\
3583\n\
3584Apply .capitalize() to all words in S and return the result with\n\
3585normalized whitespace (all whitespace strings are replaced by ' ').";
3586
3587static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00003588unicode_capwords(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589{
3590 PyObject *list;
3591 PyObject *item;
3592 int i;
3593
Guido van Rossumd57fd912000-03-10 22:53:23 +00003594 /* Split into words */
3595 list = split(self, NULL, -1);
3596 if (!list)
3597 return NULL;
3598
3599 /* Capitalize each word */
3600 for (i = 0; i < PyList_GET_SIZE(list); i++) {
3601 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
3602 fixcapitalize);
3603 if (item == NULL)
3604 goto onError;
3605 Py_DECREF(PyList_GET_ITEM(list, i));
3606 PyList_SET_ITEM(list, i, item);
3607 }
3608
3609 /* Join the words to form a new string */
3610 item = PyUnicode_Join(NULL, list);
3611
3612onError:
3613 Py_DECREF(list);
3614 return (PyObject *)item;
3615}
3616#endif
3617
3618static char center__doc__[] =
3619"S.center(width) -> unicode\n\
3620\n\
3621Return S centered in a Unicode string of length width. Padding is done\n\
3622using spaces.";
3623
3624static PyObject *
3625unicode_center(PyUnicodeObject *self, PyObject *args)
3626{
3627 int marg, left;
3628 int width;
3629
3630 if (!PyArg_ParseTuple(args, "i:center", &width))
3631 return NULL;
3632
Tim Peters7a29bd52001-09-12 03:03:31 +00003633 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003634 Py_INCREF(self);
3635 return (PyObject*) self;
3636 }
3637
3638 marg = width - self->length;
3639 left = marg / 2 + (marg & width & 1);
3640
3641 return (PyObject*) pad(self, left, marg - left, ' ');
3642}
3643
Marc-André Lemburge5034372000-08-08 08:04:29 +00003644#if 0
3645
3646/* This code should go into some future Unicode collation support
3647 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003648 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003649
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003650/* speedy UTF-16 code point order comparison */
3651/* gleaned from: */
3652/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3653
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003654static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003655{
3656 0, 0, 0, 0, 0, 0, 0, 0,
3657 0, 0, 0, 0, 0, 0, 0, 0,
3658 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003659 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003660};
3661
Guido van Rossumd57fd912000-03-10 22:53:23 +00003662static int
3663unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3664{
3665 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003666
Guido van Rossumd57fd912000-03-10 22:53:23 +00003667 Py_UNICODE *s1 = str1->str;
3668 Py_UNICODE *s2 = str2->str;
3669
3670 len1 = str1->length;
3671 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003672
Guido van Rossumd57fd912000-03-10 22:53:23 +00003673 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003674 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003675
3676 c1 = *s1++;
3677 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003678
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003679 if (c1 > (1<<11) * 26)
3680 c1 += utf16Fixup[c1>>11];
3681 if (c2 > (1<<11) * 26)
3682 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003683 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003684
3685 if (c1 != c2)
3686 return (c1 < c2) ? -1 : 1;
3687
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003688 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689 }
3690
3691 return (len1 < len2) ? -1 : (len1 != len2);
3692}
3693
Marc-André Lemburge5034372000-08-08 08:04:29 +00003694#else
3695
3696static int
3697unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3698{
3699 register int len1, len2;
3700
3701 Py_UNICODE *s1 = str1->str;
3702 Py_UNICODE *s2 = str2->str;
3703
3704 len1 = str1->length;
3705 len2 = str2->length;
3706
3707 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003708 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003709
Fredrik Lundh45714e92001-06-26 16:39:36 +00003710 c1 = *s1++;
3711 c2 = *s2++;
3712
3713 if (c1 != c2)
3714 return (c1 < c2) ? -1 : 1;
3715
Marc-André Lemburge5034372000-08-08 08:04:29 +00003716 len1--; len2--;
3717 }
3718
3719 return (len1 < len2) ? -1 : (len1 != len2);
3720}
3721
3722#endif
3723
Guido van Rossumd57fd912000-03-10 22:53:23 +00003724int PyUnicode_Compare(PyObject *left,
3725 PyObject *right)
3726{
3727 PyUnicodeObject *u = NULL, *v = NULL;
3728 int result;
3729
3730 /* Coerce the two arguments */
3731 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3732 if (u == NULL)
3733 goto onError;
3734 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3735 if (v == NULL)
3736 goto onError;
3737
Thomas Wouters7e474022000-07-16 12:04:32 +00003738 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003739 if (v == u) {
3740 Py_DECREF(u);
3741 Py_DECREF(v);
3742 return 0;
3743 }
3744
3745 result = unicode_compare(u, v);
3746
3747 Py_DECREF(u);
3748 Py_DECREF(v);
3749 return result;
3750
3751onError:
3752 Py_XDECREF(u);
3753 Py_XDECREF(v);
3754 return -1;
3755}
3756
Guido van Rossum403d68b2000-03-13 15:55:09 +00003757int PyUnicode_Contains(PyObject *container,
3758 PyObject *element)
3759{
3760 PyUnicodeObject *u = NULL, *v = NULL;
3761 int result;
3762 register const Py_UNICODE *p, *e;
3763 register Py_UNICODE ch;
3764
3765 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003766 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003767 if (v == NULL) {
3768 PyErr_SetString(PyExc_TypeError,
3769 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003770 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003771 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003772 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3773 if (u == NULL) {
3774 Py_DECREF(v);
3775 goto onError;
3776 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003777
3778 /* Check v in u */
3779 if (PyUnicode_GET_SIZE(v) != 1) {
3780 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003781 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003782 goto onError;
3783 }
3784 ch = *PyUnicode_AS_UNICODE(v);
3785 p = PyUnicode_AS_UNICODE(u);
3786 e = p + PyUnicode_GET_SIZE(u);
3787 result = 0;
3788 while (p < e) {
3789 if (*p++ == ch) {
3790 result = 1;
3791 break;
3792 }
3793 }
3794
3795 Py_DECREF(u);
3796 Py_DECREF(v);
3797 return result;
3798
3799onError:
3800 Py_XDECREF(u);
3801 Py_XDECREF(v);
3802 return -1;
3803}
3804
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805/* Concat to string or Unicode object giving a new Unicode object. */
3806
3807PyObject *PyUnicode_Concat(PyObject *left,
3808 PyObject *right)
3809{
3810 PyUnicodeObject *u = NULL, *v = NULL, *w;
3811
3812 /* Coerce the two arguments */
3813 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3814 if (u == NULL)
3815 goto onError;
3816 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3817 if (v == NULL)
3818 goto onError;
3819
3820 /* Shortcuts */
3821 if (v == unicode_empty) {
3822 Py_DECREF(v);
3823 return (PyObject *)u;
3824 }
3825 if (u == unicode_empty) {
3826 Py_DECREF(u);
3827 return (PyObject *)v;
3828 }
3829
3830 /* Concat the two Unicode strings */
3831 w = _PyUnicode_New(u->length + v->length);
3832 if (w == NULL)
3833 goto onError;
3834 Py_UNICODE_COPY(w->str, u->str, u->length);
3835 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3836
3837 Py_DECREF(u);
3838 Py_DECREF(v);
3839 return (PyObject *)w;
3840
3841onError:
3842 Py_XDECREF(u);
3843 Py_XDECREF(v);
3844 return NULL;
3845}
3846
3847static char count__doc__[] =
3848"S.count(sub[, start[, end]]) -> int\n\
3849\n\
3850Return the number of occurrences of substring sub in Unicode string\n\
3851S[start:end]. Optional arguments start and end are\n\
3852interpreted as in slice notation.";
3853
3854static PyObject *
3855unicode_count(PyUnicodeObject *self, PyObject *args)
3856{
3857 PyUnicodeObject *substring;
3858 int start = 0;
3859 int end = INT_MAX;
3860 PyObject *result;
3861
Guido van Rossumb8872e62000-05-09 14:14:27 +00003862 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3863 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864 return NULL;
3865
3866 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3867 (PyObject *)substring);
3868 if (substring == NULL)
3869 return NULL;
3870
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871 if (start < 0)
3872 start += self->length;
3873 if (start < 0)
3874 start = 0;
3875 if (end > self->length)
3876 end = self->length;
3877 if (end < 0)
3878 end += self->length;
3879 if (end < 0)
3880 end = 0;
3881
3882 result = PyInt_FromLong((long) count(self, start, end, substring));
3883
3884 Py_DECREF(substring);
3885 return result;
3886}
3887
3888static char encode__doc__[] =
3889"S.encode([encoding[,errors]]) -> string\n\
3890\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003891Return an encoded string version of S. Default encoding is the current\n\
3892default string encoding. errors may be given to set a different error\n\
3893handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3894a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895
3896static PyObject *
3897unicode_encode(PyUnicodeObject *self, PyObject *args)
3898{
3899 char *encoding = NULL;
3900 char *errors = NULL;
3901 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3902 return NULL;
3903 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3904}
3905
3906static char expandtabs__doc__[] =
3907"S.expandtabs([tabsize]) -> unicode\n\
3908\n\
3909Return a copy of S where all tab characters are expanded using spaces.\n\
3910If tabsize is not given, a tab size of 8 characters is assumed.";
3911
3912static PyObject*
3913unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3914{
3915 Py_UNICODE *e;
3916 Py_UNICODE *p;
3917 Py_UNICODE *q;
3918 int i, j;
3919 PyUnicodeObject *u;
3920 int tabsize = 8;
3921
3922 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3923 return NULL;
3924
Thomas Wouters7e474022000-07-16 12:04:32 +00003925 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003926 i = j = 0;
3927 e = self->str + self->length;
3928 for (p = self->str; p < e; p++)
3929 if (*p == '\t') {
3930 if (tabsize > 0)
3931 j += tabsize - (j % tabsize);
3932 }
3933 else {
3934 j++;
3935 if (*p == '\n' || *p == '\r') {
3936 i += j;
3937 j = 0;
3938 }
3939 }
3940
3941 /* Second pass: create output string and fill it */
3942 u = _PyUnicode_New(i + j);
3943 if (!u)
3944 return NULL;
3945
3946 j = 0;
3947 q = u->str;
3948
3949 for (p = self->str; p < e; p++)
3950 if (*p == '\t') {
3951 if (tabsize > 0) {
3952 i = tabsize - (j % tabsize);
3953 j += i;
3954 while (i--)
3955 *q++ = ' ';
3956 }
3957 }
3958 else {
3959 j++;
3960 *q++ = *p;
3961 if (*p == '\n' || *p == '\r')
3962 j = 0;
3963 }
3964
3965 return (PyObject*) u;
3966}
3967
3968static char find__doc__[] =
3969"S.find(sub [,start [,end]]) -> int\n\
3970\n\
3971Return the lowest index in S where substring sub is found,\n\
3972such that sub is contained within s[start,end]. Optional\n\
3973arguments start and end are interpreted as in slice notation.\n\
3974\n\
3975Return -1 on failure.";
3976
3977static PyObject *
3978unicode_find(PyUnicodeObject *self, PyObject *args)
3979{
3980 PyUnicodeObject *substring;
3981 int start = 0;
3982 int end = INT_MAX;
3983 PyObject *result;
3984
Guido van Rossumb8872e62000-05-09 14:14:27 +00003985 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3986 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003987 return NULL;
3988 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3989 (PyObject *)substring);
3990 if (substring == NULL)
3991 return NULL;
3992
3993 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3994
3995 Py_DECREF(substring);
3996 return result;
3997}
3998
3999static PyObject *
4000unicode_getitem(PyUnicodeObject *self, int index)
4001{
4002 if (index < 0 || index >= self->length) {
4003 PyErr_SetString(PyExc_IndexError, "string index out of range");
4004 return NULL;
4005 }
4006
4007 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
4008}
4009
4010static long
4011unicode_hash(PyUnicodeObject *self)
4012{
Fredrik Lundhdde61642000-07-10 18:27:47 +00004013 /* Since Unicode objects compare equal to their ASCII string
4014 counterparts, they should use the individual character values
4015 as basis for their hash value. This is needed to assure that
4016 strings and Unicode objects behave in the same way as
4017 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004018
Fredrik Lundhdde61642000-07-10 18:27:47 +00004019 register int len;
4020 register Py_UNICODE *p;
4021 register long x;
4022
Guido van Rossumd57fd912000-03-10 22:53:23 +00004023 if (self->hash != -1)
4024 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00004025 len = PyUnicode_GET_SIZE(self);
4026 p = PyUnicode_AS_UNICODE(self);
4027 x = *p << 7;
4028 while (--len >= 0)
4029 x = (1000003*x) ^ *p++;
4030 x ^= PyUnicode_GET_SIZE(self);
4031 if (x == -1)
4032 x = -2;
4033 self->hash = x;
4034 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035}
4036
4037static char index__doc__[] =
4038"S.index(sub [,start [,end]]) -> int\n\
4039\n\
4040Like S.find() but raise ValueError when the substring is not found.";
4041
4042static PyObject *
4043unicode_index(PyUnicodeObject *self, PyObject *args)
4044{
4045 int result;
4046 PyUnicodeObject *substring;
4047 int start = 0;
4048 int end = INT_MAX;
4049
Guido van Rossumb8872e62000-05-09 14:14:27 +00004050 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
4051 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004052 return NULL;
4053
4054 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4055 (PyObject *)substring);
4056 if (substring == NULL)
4057 return NULL;
4058
4059 result = findstring(self, substring, start, end, 1);
4060
4061 Py_DECREF(substring);
4062 if (result < 0) {
4063 PyErr_SetString(PyExc_ValueError, "substring not found");
4064 return NULL;
4065 }
4066 return PyInt_FromLong(result);
4067}
4068
4069static char islower__doc__[] =
4070"S.islower() -> int\n\
4071\n\
4072Return 1 if all cased characters in S are lowercase and there is\n\
4073at least one cased character in S, 0 otherwise.";
4074
4075static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004076unicode_islower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004077{
4078 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4079 register const Py_UNICODE *e;
4080 int cased;
4081
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082 /* Shortcut for single character strings */
4083 if (PyUnicode_GET_SIZE(self) == 1)
4084 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
4085
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004086 /* Special case for empty strings */
4087 if (PyString_GET_SIZE(self) == 0)
4088 return PyInt_FromLong(0);
4089
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090 e = p + PyUnicode_GET_SIZE(self);
4091 cased = 0;
4092 for (; p < e; p++) {
4093 register const Py_UNICODE ch = *p;
4094
4095 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
4096 return PyInt_FromLong(0);
4097 else if (!cased && Py_UNICODE_ISLOWER(ch))
4098 cased = 1;
4099 }
4100 return PyInt_FromLong(cased);
4101}
4102
4103static char isupper__doc__[] =
4104"S.isupper() -> int\n\
4105\n\
4106Return 1 if all cased characters in S are uppercase and there is\n\
4107at least one cased character in S, 0 otherwise.";
4108
4109static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004110unicode_isupper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004111{
4112 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4113 register const Py_UNICODE *e;
4114 int cased;
4115
Guido van Rossumd57fd912000-03-10 22:53:23 +00004116 /* Shortcut for single character strings */
4117 if (PyUnicode_GET_SIZE(self) == 1)
4118 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
4119
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004120 /* Special case for empty strings */
4121 if (PyString_GET_SIZE(self) == 0)
4122 return PyInt_FromLong(0);
4123
Guido van Rossumd57fd912000-03-10 22:53:23 +00004124 e = p + PyUnicode_GET_SIZE(self);
4125 cased = 0;
4126 for (; p < e; p++) {
4127 register const Py_UNICODE ch = *p;
4128
4129 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
4130 return PyInt_FromLong(0);
4131 else if (!cased && Py_UNICODE_ISUPPER(ch))
4132 cased = 1;
4133 }
4134 return PyInt_FromLong(cased);
4135}
4136
4137static char istitle__doc__[] =
4138"S.istitle() -> int\n\
4139\n\
4140Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
4141may only follow uncased characters and lowercase characters only cased\n\
4142ones. Return 0 otherwise.";
4143
4144static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004145unicode_istitle(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146{
4147 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4148 register const Py_UNICODE *e;
4149 int cased, previous_is_cased;
4150
Guido van Rossumd57fd912000-03-10 22:53:23 +00004151 /* Shortcut for single character strings */
4152 if (PyUnicode_GET_SIZE(self) == 1)
4153 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
4154 (Py_UNICODE_ISUPPER(*p) != 0));
4155
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004156 /* Special case for empty strings */
4157 if (PyString_GET_SIZE(self) == 0)
4158 return PyInt_FromLong(0);
4159
Guido van Rossumd57fd912000-03-10 22:53:23 +00004160 e = p + PyUnicode_GET_SIZE(self);
4161 cased = 0;
4162 previous_is_cased = 0;
4163 for (; p < e; p++) {
4164 register const Py_UNICODE ch = *p;
4165
4166 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
4167 if (previous_is_cased)
4168 return PyInt_FromLong(0);
4169 previous_is_cased = 1;
4170 cased = 1;
4171 }
4172 else if (Py_UNICODE_ISLOWER(ch)) {
4173 if (!previous_is_cased)
4174 return PyInt_FromLong(0);
4175 previous_is_cased = 1;
4176 cased = 1;
4177 }
4178 else
4179 previous_is_cased = 0;
4180 }
4181 return PyInt_FromLong(cased);
4182}
4183
4184static char isspace__doc__[] =
4185"S.isspace() -> int\n\
4186\n\
4187Return 1 if there are only whitespace characters in S,\n\
41880 otherwise.";
4189
4190static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004191unicode_isspace(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004192{
4193 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4194 register const Py_UNICODE *e;
4195
Guido van Rossumd57fd912000-03-10 22:53:23 +00004196 /* Shortcut for single character strings */
4197 if (PyUnicode_GET_SIZE(self) == 1 &&
4198 Py_UNICODE_ISSPACE(*p))
4199 return PyInt_FromLong(1);
4200
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004201 /* Special case for empty strings */
4202 if (PyString_GET_SIZE(self) == 0)
4203 return PyInt_FromLong(0);
4204
Guido van Rossumd57fd912000-03-10 22:53:23 +00004205 e = p + PyUnicode_GET_SIZE(self);
4206 for (; p < e; p++) {
4207 if (!Py_UNICODE_ISSPACE(*p))
4208 return PyInt_FromLong(0);
4209 }
4210 return PyInt_FromLong(1);
4211}
4212
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004213static char isalpha__doc__[] =
4214"S.isalpha() -> int\n\
4215\n\
4216Return 1 if all characters in S are alphabetic\n\
4217and there is at least one character in S, 0 otherwise.";
4218
4219static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004220unicode_isalpha(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004221{
4222 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4223 register const Py_UNICODE *e;
4224
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004225 /* Shortcut for single character strings */
4226 if (PyUnicode_GET_SIZE(self) == 1 &&
4227 Py_UNICODE_ISALPHA(*p))
4228 return PyInt_FromLong(1);
4229
4230 /* Special case for empty strings */
4231 if (PyString_GET_SIZE(self) == 0)
4232 return PyInt_FromLong(0);
4233
4234 e = p + PyUnicode_GET_SIZE(self);
4235 for (; p < e; p++) {
4236 if (!Py_UNICODE_ISALPHA(*p))
4237 return PyInt_FromLong(0);
4238 }
4239 return PyInt_FromLong(1);
4240}
4241
4242static char isalnum__doc__[] =
4243"S.isalnum() -> int\n\
4244\n\
4245Return 1 if all characters in S are alphanumeric\n\
4246and there is at least one character in S, 0 otherwise.";
4247
4248static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004249unicode_isalnum(PyUnicodeObject *self)
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004250{
4251 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4252 register const Py_UNICODE *e;
4253
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004254 /* Shortcut for single character strings */
4255 if (PyUnicode_GET_SIZE(self) == 1 &&
4256 Py_UNICODE_ISALNUM(*p))
4257 return PyInt_FromLong(1);
4258
4259 /* Special case for empty strings */
4260 if (PyString_GET_SIZE(self) == 0)
4261 return PyInt_FromLong(0);
4262
4263 e = p + PyUnicode_GET_SIZE(self);
4264 for (; p < e; p++) {
4265 if (!Py_UNICODE_ISALNUM(*p))
4266 return PyInt_FromLong(0);
4267 }
4268 return PyInt_FromLong(1);
4269}
4270
Guido van Rossumd57fd912000-03-10 22:53:23 +00004271static char isdecimal__doc__[] =
4272"S.isdecimal() -> int\n\
4273\n\
4274Return 1 if there are only decimal characters in S,\n\
42750 otherwise.";
4276
4277static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004278unicode_isdecimal(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004279{
4280 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4281 register const Py_UNICODE *e;
4282
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283 /* Shortcut for single character strings */
4284 if (PyUnicode_GET_SIZE(self) == 1 &&
4285 Py_UNICODE_ISDECIMAL(*p))
4286 return PyInt_FromLong(1);
4287
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004288 /* Special case for empty strings */
4289 if (PyString_GET_SIZE(self) == 0)
4290 return PyInt_FromLong(0);
4291
Guido van Rossumd57fd912000-03-10 22:53:23 +00004292 e = p + PyUnicode_GET_SIZE(self);
4293 for (; p < e; p++) {
4294 if (!Py_UNICODE_ISDECIMAL(*p))
4295 return PyInt_FromLong(0);
4296 }
4297 return PyInt_FromLong(1);
4298}
4299
4300static char isdigit__doc__[] =
4301"S.isdigit() -> int\n\
4302\n\
4303Return 1 if there are only digit characters in S,\n\
43040 otherwise.";
4305
4306static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004307unicode_isdigit(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004308{
4309 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4310 register const Py_UNICODE *e;
4311
Guido van Rossumd57fd912000-03-10 22:53:23 +00004312 /* Shortcut for single character strings */
4313 if (PyUnicode_GET_SIZE(self) == 1 &&
4314 Py_UNICODE_ISDIGIT(*p))
4315 return PyInt_FromLong(1);
4316
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004317 /* Special case for empty strings */
4318 if (PyString_GET_SIZE(self) == 0)
4319 return PyInt_FromLong(0);
4320
Guido van Rossumd57fd912000-03-10 22:53:23 +00004321 e = p + PyUnicode_GET_SIZE(self);
4322 for (; p < e; p++) {
4323 if (!Py_UNICODE_ISDIGIT(*p))
4324 return PyInt_FromLong(0);
4325 }
4326 return PyInt_FromLong(1);
4327}
4328
4329static char isnumeric__doc__[] =
4330"S.isnumeric() -> int\n\
4331\n\
4332Return 1 if there are only numeric characters in S,\n\
43330 otherwise.";
4334
4335static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004336unicode_isnumeric(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004337{
4338 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4339 register const Py_UNICODE *e;
4340
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341 /* Shortcut for single character strings */
4342 if (PyUnicode_GET_SIZE(self) == 1 &&
4343 Py_UNICODE_ISNUMERIC(*p))
4344 return PyInt_FromLong(1);
4345
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004346 /* Special case for empty strings */
4347 if (PyString_GET_SIZE(self) == 0)
4348 return PyInt_FromLong(0);
4349
Guido van Rossumd57fd912000-03-10 22:53:23 +00004350 e = p + PyUnicode_GET_SIZE(self);
4351 for (; p < e; p++) {
4352 if (!Py_UNICODE_ISNUMERIC(*p))
4353 return PyInt_FromLong(0);
4354 }
4355 return PyInt_FromLong(1);
4356}
4357
4358static char join__doc__[] =
4359"S.join(sequence) -> unicode\n\
4360\n\
4361Return a string which is the concatenation of the strings in the\n\
4362sequence. The separator between elements is S.";
4363
4364static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004365unicode_join(PyObject *self, PyObject *data)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004366{
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004367 return PyUnicode_Join(self, data);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004368}
4369
4370static int
4371unicode_length(PyUnicodeObject *self)
4372{
4373 return self->length;
4374}
4375
4376static char ljust__doc__[] =
4377"S.ljust(width) -> unicode\n\
4378\n\
4379Return S left justified in a Unicode string of length width. Padding is\n\
4380done using spaces.";
4381
4382static PyObject *
4383unicode_ljust(PyUnicodeObject *self, PyObject *args)
4384{
4385 int width;
4386 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4387 return NULL;
4388
Tim Peters7a29bd52001-09-12 03:03:31 +00004389 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004390 Py_INCREF(self);
4391 return (PyObject*) self;
4392 }
4393
4394 return (PyObject*) pad(self, 0, width - self->length, ' ');
4395}
4396
4397static char lower__doc__[] =
4398"S.lower() -> unicode\n\
4399\n\
4400Return a copy of the string S converted to lowercase.";
4401
4402static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004403unicode_lower(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004404{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405 return fixup(self, fixlower);
4406}
4407
4408static char lstrip__doc__[] =
4409"S.lstrip() -> unicode\n\
4410\n\
4411Return a copy of the string S with leading whitespace removed.";
4412
4413static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004414unicode_lstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416 return strip(self, 1, 0);
4417}
4418
4419static PyObject*
4420unicode_repeat(PyUnicodeObject *str, int len)
4421{
4422 PyUnicodeObject *u;
4423 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004424 int nchars;
4425 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004426
4427 if (len < 0)
4428 len = 0;
4429
Tim Peters7a29bd52001-09-12 03:03:31 +00004430 if (len == 1 && PyUnicode_CheckExact(str)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004431 /* no repeat, return original string */
4432 Py_INCREF(str);
4433 return (PyObject*) str;
4434 }
Tim Peters8f422462000-09-09 06:13:41 +00004435
4436 /* ensure # of chars needed doesn't overflow int and # of bytes
4437 * needed doesn't overflow size_t
4438 */
4439 nchars = len * str->length;
4440 if (len && nchars / len != str->length) {
4441 PyErr_SetString(PyExc_OverflowError,
4442 "repeated string is too long");
4443 return NULL;
4444 }
4445 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4446 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4447 PyErr_SetString(PyExc_OverflowError,
4448 "repeated string is too long");
4449 return NULL;
4450 }
4451 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004452 if (!u)
4453 return NULL;
4454
4455 p = u->str;
4456
4457 while (len-- > 0) {
4458 Py_UNICODE_COPY(p, str->str, str->length);
4459 p += str->length;
4460 }
4461
4462 return (PyObject*) u;
4463}
4464
4465PyObject *PyUnicode_Replace(PyObject *obj,
4466 PyObject *subobj,
4467 PyObject *replobj,
4468 int maxcount)
4469{
4470 PyObject *self;
4471 PyObject *str1;
4472 PyObject *str2;
4473 PyObject *result;
4474
4475 self = PyUnicode_FromObject(obj);
4476 if (self == NULL)
4477 return NULL;
4478 str1 = PyUnicode_FromObject(subobj);
4479 if (str1 == NULL) {
4480 Py_DECREF(self);
4481 return NULL;
4482 }
4483 str2 = PyUnicode_FromObject(replobj);
4484 if (str2 == NULL) {
4485 Py_DECREF(self);
4486 Py_DECREF(str1);
4487 return NULL;
4488 }
4489 result = replace((PyUnicodeObject *)self,
4490 (PyUnicodeObject *)str1,
4491 (PyUnicodeObject *)str2,
4492 maxcount);
4493 Py_DECREF(self);
4494 Py_DECREF(str1);
4495 Py_DECREF(str2);
4496 return result;
4497}
4498
4499static char replace__doc__[] =
4500"S.replace (old, new[, maxsplit]) -> unicode\n\
4501\n\
4502Return a copy of S with all occurrences of substring\n\
4503old replaced by new. If the optional argument maxsplit is\n\
4504given, only the first maxsplit occurrences are replaced.";
4505
4506static PyObject*
4507unicode_replace(PyUnicodeObject *self, PyObject *args)
4508{
4509 PyUnicodeObject *str1;
4510 PyUnicodeObject *str2;
4511 int maxcount = -1;
4512 PyObject *result;
4513
4514 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4515 return NULL;
4516 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4517 if (str1 == NULL)
4518 return NULL;
4519 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4520 if (str2 == NULL)
4521 return NULL;
4522
4523 result = replace(self, str1, str2, maxcount);
4524
4525 Py_DECREF(str1);
4526 Py_DECREF(str2);
4527 return result;
4528}
4529
4530static
4531PyObject *unicode_repr(PyObject *unicode)
4532{
4533 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4534 PyUnicode_GET_SIZE(unicode),
4535 1);
4536}
4537
4538static char rfind__doc__[] =
4539"S.rfind(sub [,start [,end]]) -> int\n\
4540\n\
4541Return the highest index in S where substring sub is found,\n\
4542such that sub is contained within s[start,end]. Optional\n\
4543arguments start and end are interpreted as in slice notation.\n\
4544\n\
4545Return -1 on failure.";
4546
4547static PyObject *
4548unicode_rfind(PyUnicodeObject *self, PyObject *args)
4549{
4550 PyUnicodeObject *substring;
4551 int start = 0;
4552 int end = INT_MAX;
4553 PyObject *result;
4554
Guido van Rossumb8872e62000-05-09 14:14:27 +00004555 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4556 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004557 return NULL;
4558 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4559 (PyObject *)substring);
4560 if (substring == NULL)
4561 return NULL;
4562
4563 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4564
4565 Py_DECREF(substring);
4566 return result;
4567}
4568
4569static char rindex__doc__[] =
4570"S.rindex(sub [,start [,end]]) -> int\n\
4571\n\
4572Like S.rfind() but raise ValueError when the substring is not found.";
4573
4574static PyObject *
4575unicode_rindex(PyUnicodeObject *self, PyObject *args)
4576{
4577 int result;
4578 PyUnicodeObject *substring;
4579 int start = 0;
4580 int end = INT_MAX;
4581
Guido van Rossumb8872e62000-05-09 14:14:27 +00004582 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4583 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584 return NULL;
4585 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4586 (PyObject *)substring);
4587 if (substring == NULL)
4588 return NULL;
4589
4590 result = findstring(self, substring, start, end, -1);
4591
4592 Py_DECREF(substring);
4593 if (result < 0) {
4594 PyErr_SetString(PyExc_ValueError, "substring not found");
4595 return NULL;
4596 }
4597 return PyInt_FromLong(result);
4598}
4599
4600static char rjust__doc__[] =
4601"S.rjust(width) -> unicode\n\
4602\n\
4603Return S right justified in a Unicode string of length width. Padding is\n\
4604done using spaces.";
4605
4606static PyObject *
4607unicode_rjust(PyUnicodeObject *self, PyObject *args)
4608{
4609 int width;
4610 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4611 return NULL;
4612
Tim Peters7a29bd52001-09-12 03:03:31 +00004613 if (self->length >= width && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004614 Py_INCREF(self);
4615 return (PyObject*) self;
4616 }
4617
4618 return (PyObject*) pad(self, width - self->length, 0, ' ');
4619}
4620
4621static char rstrip__doc__[] =
4622"S.rstrip() -> unicode\n\
4623\n\
4624Return a copy of the string S with trailing whitespace removed.";
4625
4626static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004627unicode_rstrip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004628{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004629 return strip(self, 0, 1);
4630}
4631
4632static PyObject*
4633unicode_slice(PyUnicodeObject *self, int start, int end)
4634{
4635 /* standard clamping */
4636 if (start < 0)
4637 start = 0;
4638 if (end < 0)
4639 end = 0;
4640 if (end > self->length)
4641 end = self->length;
Tim Peters7a29bd52001-09-12 03:03:31 +00004642 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004643 /* full slice, return original string */
4644 Py_INCREF(self);
4645 return (PyObject*) self;
4646 }
4647 if (start > end)
4648 start = end;
4649 /* copy slice */
4650 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4651 end - start);
4652}
4653
4654PyObject *PyUnicode_Split(PyObject *s,
4655 PyObject *sep,
4656 int maxsplit)
4657{
4658 PyObject *result;
4659
4660 s = PyUnicode_FromObject(s);
4661 if (s == NULL)
4662 return NULL;
4663 if (sep != NULL) {
4664 sep = PyUnicode_FromObject(sep);
4665 if (sep == NULL) {
4666 Py_DECREF(s);
4667 return NULL;
4668 }
4669 }
4670
4671 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4672
4673 Py_DECREF(s);
4674 Py_XDECREF(sep);
4675 return result;
4676}
4677
4678static char split__doc__[] =
4679"S.split([sep [,maxsplit]]) -> list of strings\n\
4680\n\
4681Return a list of the words in S, using sep as the\n\
4682delimiter string. If maxsplit is given, at most maxsplit\n\
4683splits are done. If sep is not specified, any whitespace string\n\
4684is a separator.";
4685
4686static PyObject*
4687unicode_split(PyUnicodeObject *self, PyObject *args)
4688{
4689 PyObject *substring = Py_None;
4690 int maxcount = -1;
4691
4692 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4693 return NULL;
4694
4695 if (substring == Py_None)
4696 return split(self, NULL, maxcount);
4697 else if (PyUnicode_Check(substring))
4698 return split(self, (PyUnicodeObject *)substring, maxcount);
4699 else
4700 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4701}
4702
4703static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004704"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705\n\
4706Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004707Line breaks are not included in the resulting list unless keepends\n\
4708is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004709
4710static PyObject*
4711unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4712{
Guido van Rossum86662912000-04-11 15:38:46 +00004713 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004714
Guido van Rossum86662912000-04-11 15:38:46 +00004715 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004716 return NULL;
4717
Guido van Rossum86662912000-04-11 15:38:46 +00004718 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719}
4720
4721static
4722PyObject *unicode_str(PyUnicodeObject *self)
4723{
Fred Drakee4315f52000-05-09 19:53:39 +00004724 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004725}
4726
4727static char strip__doc__[] =
4728"S.strip() -> unicode\n\
4729\n\
4730Return a copy of S with leading and trailing whitespace removed.";
4731
4732static PyObject *
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004733unicode_strip(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735 return strip(self, 1, 1);
4736}
4737
4738static char swapcase__doc__[] =
4739"S.swapcase() -> unicode\n\
4740\n\
4741Return a copy of S with uppercase characters converted to lowercase\n\
4742and vice versa.";
4743
4744static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004745unicode_swapcase(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747 return fixup(self, fixswapcase);
4748}
4749
4750static char translate__doc__[] =
4751"S.translate(table) -> unicode\n\
4752\n\
4753Return a copy of the string S, where all characters have been mapped\n\
4754through the given translation table, which must be a mapping of\n\
4755Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4756are left untouched. Characters mapped to None are deleted.";
4757
4758static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004759unicode_translate(PyUnicodeObject *self, PyObject *table)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761 return PyUnicode_TranslateCharmap(self->str,
4762 self->length,
4763 table,
4764 "ignore");
4765}
4766
4767static char upper__doc__[] =
4768"S.upper() -> unicode\n\
4769\n\
4770Return a copy of S converted to uppercase.";
4771
4772static PyObject*
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004773unicode_upper(PyUnicodeObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774{
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775 return fixup(self, fixupper);
4776}
4777
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method S.zfill(width) -- currently compiled out.

   Left-pads self with '0' up to width via pad(); if the original string
   started with a sign, the sign is moved in front of the padding.
   Returns self (new reference) when it is already wide enough, and NULL
   if argument parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    if (u == NULL)
        /* must not touch u->str below when pad() failed */
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4813
#if 0
/* Debugging aid (compiled out): expose unicode_freelist_size as an int
   object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *size;

    size = PyInt_FromLong(unicode_freelist_size);
    return size;
}
#endif
4821
4822static char startswith__doc__[] =
4823"S.startswith(prefix[, start[, end]]) -> int\n\
4824\n\
4825Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4826optional start, test S beginning at that position. With optional end, stop\n\
4827comparing S at that position.";
4828
4829static PyObject *
4830unicode_startswith(PyUnicodeObject *self,
4831 PyObject *args)
4832{
4833 PyUnicodeObject *substring;
4834 int start = 0;
4835 int end = INT_MAX;
4836 PyObject *result;
4837
Guido van Rossumb8872e62000-05-09 14:14:27 +00004838 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4839 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840 return NULL;
4841 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4842 (PyObject *)substring);
4843 if (substring == NULL)
4844 return NULL;
4845
4846 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4847
4848 Py_DECREF(substring);
4849 return result;
4850}
4851
4852
4853static char endswith__doc__[] =
4854"S.endswith(suffix[, start[, end]]) -> int\n\
4855\n\
4856Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4857optional start, test S beginning at that position. With optional end, stop\n\
4858comparing S at that position.";
4859
4860static PyObject *
4861unicode_endswith(PyUnicodeObject *self,
4862 PyObject *args)
4863{
4864 PyUnicodeObject *substring;
4865 int start = 0;
4866 int end = INT_MAX;
4867 PyObject *result;
4868
Guido van Rossumb8872e62000-05-09 14:14:27 +00004869 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4870 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871 return NULL;
4872 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4873 (PyObject *)substring);
4874 if (substring == NULL)
4875 return NULL;
4876
4877 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4878
4879 Py_DECREF(substring);
4880 return result;
4881}
4882
4883
4884static PyMethodDef unicode_methods[] = {
4885
4886 /* Order is according to common usage: often used methods should
4887 appear first, since lookup is done sequentially. */
4888
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004889 {"encode", (PyCFunction) unicode_encode, METH_VARARGS, encode__doc__},
4890 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
4891 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
4892 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
4893 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
4894 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
4895 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
4896 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
4897 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
4898 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
4899 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
4900 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
4901 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
4902 {"lstrip", (PyCFunction) unicode_lstrip, METH_NOARGS, lstrip__doc__},
4903/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
4904 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
4905 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
4906 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
4907 {"rstrip", (PyCFunction) unicode_rstrip, METH_NOARGS, rstrip__doc__},
4908 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
4909 {"strip", (PyCFunction) unicode_strip, METH_NOARGS, strip__doc__},
4910 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
4911 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
4912 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
4913 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
4914 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
4915 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
4916 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
4917 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
4918 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
4919 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
4920 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
4921 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
4922 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
4923 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924#if 0
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004925 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
4926 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927#endif
4928
4929#if 0
4930 /* This one is just used for debugging the implementation. */
Martin v. Löwise3eb1f22001-08-16 13:15:00 +00004931 {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004932#endif
4933
4934 {NULL, NULL}
4935};
4936
/* Sequence-protocol slot table for the unicode type.  Slots that are 0
   are not provided (there is no item or slice assignment). */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, 		/* sq_length */
    (binaryfunc) PyUnicode_Concat, 	/* sq_concat */
    (intargfunc) unicode_repeat, 	/* sq_repeat */
    (intargfunc) unicode_getitem, 	/* sq_item */
    (intintargfunc) unicode_slice, 	/* sq_slice */
    0, 					/* sq_ass_item */
    0, 					/* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, 	/*sq_contains*/
};
4947
4948static int
4949unicode_buffer_getreadbuf(PyUnicodeObject *self,
4950 int index,
4951 const void **ptr)
4952{
4953 if (index != 0) {
4954 PyErr_SetString(PyExc_SystemError,
4955 "accessing non-existent unicode segment");
4956 return -1;
4957 }
4958 *ptr = (void *) self->str;
4959 return PyUnicode_GET_DATA_SIZE(self);
4960}
4961
4962static int
4963unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4964 const void **ptr)
4965{
4966 PyErr_SetString(PyExc_TypeError,
4967 "cannot use unicode as modifyable buffer");
4968 return -1;
4969}
4970
4971static int
4972unicode_buffer_getsegcount(PyUnicodeObject *self,
4973 int *lenp)
4974{
4975 if (lenp)
4976 *lenp = PyUnicode_GET_DATA_SIZE(self);
4977 return 1;
4978}
4979
4980static int
4981unicode_buffer_getcharbuf(PyUnicodeObject *self,
4982 int index,
4983 const void **ptr)
4984{
4985 PyObject *str;
4986
4987 if (index != 0) {
4988 PyErr_SetString(PyExc_SystemError,
4989 "accessing non-existent unicode segment");
4990 return -1;
4991 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004992 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993 if (str == NULL)
4994 return -1;
4995 *ptr = (void *) PyString_AS_STRING(str);
4996 return PyString_GET_SIZE(str);
4997}
4998
4999/* Helpers for PyUnicode_Format() */
5000
5001static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00005002getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005003{
5004 int argidx = *p_argidx;
5005 if (argidx < arglen) {
5006 (*p_argidx)++;
5007 if (arglen < 0)
5008 return args;
5009 else
5010 return PyTuple_GetItem(args, argidx);
5011 }
5012 PyErr_SetString(PyExc_TypeError,
5013 "not enough arguments for format string");
5014 return NULL;
5015}
5016
/* Bit flags collected from the flag characters of a format specifier
   in PyUnicode_Format(). */
#define F_LJUST (1<<0)	/* '-' (also set for a negative '*' width) */
#define F_SIGN	(1<<1)	/* '+' */
#define F_BLANK (1<<2)	/* ' ' */
#define F_ALT	(1<<3)	/* '#' */
#define F_ZERO	(1<<4)	/* '0' */
5022
5023static
Guido van Rossumd57fd912000-03-10 22:53:23 +00005024int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005025{
5026 register int i;
5027 int len;
5028 va_list va;
5029 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005031
5032 /* First, format the string as char array, then expand to Py_UNICODE
5033 array. */
5034 charbuffer = (char *)buffer;
5035 len = vsprintf(charbuffer, format, va);
5036 for (i = len - 1; i >= 0; i--)
5037 buffer[i] = (Py_UNICODE) charbuffer[i];
5038
5039 va_end(va);
5040 return len;
5041}
5042
5043static int
5044formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005045 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005046 int flags,
5047 int prec,
5048 int type,
5049 PyObject *v)
5050{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005051 /* fmt = '%#.' + `prec` + `type`
5052 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00005053 char fmt[20];
5054 double x;
5055
5056 x = PyFloat_AsDouble(v);
5057 if (x == -1.0 && PyErr_Occurred())
5058 return -1;
5059 if (prec < 0)
5060 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005061 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
5062 type = 'g';
5063 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005064 /* worst case length calc to ensure no buffer overrun:
5065 fmt = %#.<prec>g
5066 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
5067 for any double rep.)
5068 len = 1 + prec + 1 + 2 + 5 = 9 + prec
5069 If prec=0 the effective precision is 1 (the leading digit is
5070 always given), therefore increase by one to 10+prec. */
5071 if (buflen <= (size_t)10 + (size_t)prec) {
5072 PyErr_SetString(PyExc_OverflowError,
5073 "formatted float is too long (precision too long?)");
5074 return -1;
5075 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076 return usprintf(buf, fmt, x);
5077}
5078
Tim Peters38fd5b62000-09-21 05:43:11 +00005079static PyObject*
5080formatlong(PyObject *val, int flags, int prec, int type)
5081{
5082 char *buf;
5083 int i, len;
5084 PyObject *str; /* temporary string object. */
5085 PyUnicodeObject *result;
5086
5087 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
5088 if (!str)
5089 return NULL;
5090 result = _PyUnicode_New(len);
5091 for (i = 0; i < len; i++)
5092 result->str[i] = buf[i];
5093 result->str[len] = 0;
5094 Py_DECREF(str);
5095 return (PyObject*)result;
5096}
5097
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098static int
5099formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005100 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005101 int flags,
5102 int prec,
5103 int type,
5104 PyObject *v)
5105{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005106 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00005107 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
5108 + 1 + 1 = 24*/
5109 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005110 long x;
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005111 int use_native_c_format = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112
5113 x = PyInt_AsLong(v);
5114 if (x == -1 && PyErr_Occurred())
5115 return -1;
5116 if (prec < 0)
5117 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005118 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
5119 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
5120 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
5121 PyErr_SetString(PyExc_OverflowError,
5122 "formatted integer is too long (precision too long?)");
5123 return -1;
5124 }
Tim Petersfff53252001-04-12 18:38:48 +00005125 /* When converting 0 under %#x or %#X, C leaves off the base marker,
5126 * but we want it (for consistency with other %#x conversions, and
5127 * for consistency with Python's hex() function).
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005128 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
5129 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
5130 * So add it only if the platform doesn't already.
Tim Petersfff53252001-04-12 18:38:48 +00005131 */
Tim Petersb3d8d1f2001-04-28 05:38:26 +00005132 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
5133 /* Only way to know what the platform does is to try it. */
5134 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
5135 if (fmt[1] != (char)type) {
5136 /* Supply our own leading 0x/0X -- needed under std C */
5137 use_native_c_format = 0;
5138 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
5139 }
5140 }
5141 if (use_native_c_format)
5142 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143 return usprintf(buf, fmt, x);
5144}
5145
5146static int
5147formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005148 size_t buflen,
5149 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005151 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005152 if (PyUnicode_Check(v)) {
5153 if (PyUnicode_GET_SIZE(v) != 1)
5154 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005155 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005156 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005158 else if (PyString_Check(v)) {
5159 if (PyString_GET_SIZE(v) != 1)
5160 goto onError;
5161 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
5162 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005163
5164 else {
5165 /* Integer input truncated to a character */
5166 long x;
5167 x = PyInt_AsLong(v);
5168 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005169 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005170 buf[0] = (char) x;
5171 }
5172 buf[1] = '\0';
5173 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005174
5175 onError:
5176 PyErr_SetString(PyExc_TypeError,
5177 "%c requires int or char");
5178 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005179}
5180
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005181/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
5182
5183 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
5184 chars are formatted. XXX This is a magic number. Each formatting
5185 routine does bounds checking to ensure no overflow, but a better
5186 solution may be to malloc a buffer of appropriate size for each
5187 format. For now, the current solution is sufficient.
5188*/
5189#define FORMATBUFLEN (size_t)120
5190
Guido van Rossumd57fd912000-03-10 22:53:23 +00005191PyObject *PyUnicode_Format(PyObject *format,
5192 PyObject *args)
5193{
5194 Py_UNICODE *fmt, *res;
5195 int fmtcnt, rescnt, reslen, arglen, argidx;
5196 int args_owned = 0;
5197 PyUnicodeObject *result = NULL;
5198 PyObject *dict = NULL;
5199 PyObject *uformat;
5200
5201 if (format == NULL || args == NULL) {
5202 PyErr_BadInternalCall();
5203 return NULL;
5204 }
5205 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00005206 if (uformat == NULL)
5207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 fmt = PyUnicode_AS_UNICODE(uformat);
5209 fmtcnt = PyUnicode_GET_SIZE(uformat);
5210
5211 reslen = rescnt = fmtcnt + 100;
5212 result = _PyUnicode_New(reslen);
5213 if (result == NULL)
5214 goto onError;
5215 res = PyUnicode_AS_UNICODE(result);
5216
5217 if (PyTuple_Check(args)) {
5218 arglen = PyTuple_Size(args);
5219 argidx = 0;
5220 }
5221 else {
5222 arglen = -1;
5223 argidx = -2;
5224 }
5225 if (args->ob_type->tp_as_mapping)
5226 dict = args;
5227
5228 while (--fmtcnt >= 0) {
5229 if (*fmt != '%') {
5230 if (--rescnt < 0) {
5231 rescnt = fmtcnt + 100;
5232 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005233 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234 return NULL;
5235 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
5236 --rescnt;
5237 }
5238 *res++ = *fmt++;
5239 }
5240 else {
5241 /* Got a format specifier */
5242 int flags = 0;
5243 int width = -1;
5244 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005245 Py_UNICODE c = '\0';
5246 Py_UNICODE fill;
5247 PyObject *v = NULL;
5248 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005249 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005250 Py_UNICODE sign;
5251 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005252 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005253
5254 fmt++;
5255 if (*fmt == '(') {
5256 Py_UNICODE *keystart;
5257 int keylen;
5258 PyObject *key;
5259 int pcount = 1;
5260
5261 if (dict == NULL) {
5262 PyErr_SetString(PyExc_TypeError,
5263 "format requires a mapping");
5264 goto onError;
5265 }
5266 ++fmt;
5267 --fmtcnt;
5268 keystart = fmt;
5269 /* Skip over balanced parentheses */
5270 while (pcount > 0 && --fmtcnt >= 0) {
5271 if (*fmt == ')')
5272 --pcount;
5273 else if (*fmt == '(')
5274 ++pcount;
5275 fmt++;
5276 }
5277 keylen = fmt - keystart - 1;
5278 if (fmtcnt < 0 || pcount > 0) {
5279 PyErr_SetString(PyExc_ValueError,
5280 "incomplete format key");
5281 goto onError;
5282 }
Fred Drakee4315f52000-05-09 19:53:39 +00005283 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284 then looked up since Python uses strings to hold
5285 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005286 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005287 key = PyUnicode_EncodeUTF8(keystart,
5288 keylen,
5289 NULL);
5290 if (key == NULL)
5291 goto onError;
5292 if (args_owned) {
5293 Py_DECREF(args);
5294 args_owned = 0;
5295 }
5296 args = PyObject_GetItem(dict, key);
5297 Py_DECREF(key);
5298 if (args == NULL) {
5299 goto onError;
5300 }
5301 args_owned = 1;
5302 arglen = -1;
5303 argidx = -2;
5304 }
5305 while (--fmtcnt >= 0) {
5306 switch (c = *fmt++) {
5307 case '-': flags |= F_LJUST; continue;
5308 case '+': flags |= F_SIGN; continue;
5309 case ' ': flags |= F_BLANK; continue;
5310 case '#': flags |= F_ALT; continue;
5311 case '0': flags |= F_ZERO; continue;
5312 }
5313 break;
5314 }
5315 if (c == '*') {
5316 v = getnextarg(args, arglen, &argidx);
5317 if (v == NULL)
5318 goto onError;
5319 if (!PyInt_Check(v)) {
5320 PyErr_SetString(PyExc_TypeError,
5321 "* wants int");
5322 goto onError;
5323 }
5324 width = PyInt_AsLong(v);
5325 if (width < 0) {
5326 flags |= F_LJUST;
5327 width = -width;
5328 }
5329 if (--fmtcnt >= 0)
5330 c = *fmt++;
5331 }
5332 else if (c >= '0' && c <= '9') {
5333 width = c - '0';
5334 while (--fmtcnt >= 0) {
5335 c = *fmt++;
5336 if (c < '0' || c > '9')
5337 break;
5338 if ((width*10) / 10 != width) {
5339 PyErr_SetString(PyExc_ValueError,
5340 "width too big");
5341 goto onError;
5342 }
5343 width = width*10 + (c - '0');
5344 }
5345 }
5346 if (c == '.') {
5347 prec = 0;
5348 if (--fmtcnt >= 0)
5349 c = *fmt++;
5350 if (c == '*') {
5351 v = getnextarg(args, arglen, &argidx);
5352 if (v == NULL)
5353 goto onError;
5354 if (!PyInt_Check(v)) {
5355 PyErr_SetString(PyExc_TypeError,
5356 "* wants int");
5357 goto onError;
5358 }
5359 prec = PyInt_AsLong(v);
5360 if (prec < 0)
5361 prec = 0;
5362 if (--fmtcnt >= 0)
5363 c = *fmt++;
5364 }
5365 else if (c >= '0' && c <= '9') {
5366 prec = c - '0';
5367 while (--fmtcnt >= 0) {
5368 c = Py_CHARMASK(*fmt++);
5369 if (c < '0' || c > '9')
5370 break;
5371 if ((prec*10) / 10 != prec) {
5372 PyErr_SetString(PyExc_ValueError,
5373 "prec too big");
5374 goto onError;
5375 }
5376 prec = prec*10 + (c - '0');
5377 }
5378 }
5379 } /* prec */
5380 if (fmtcnt >= 0) {
5381 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005382 if (--fmtcnt >= 0)
5383 c = *fmt++;
5384 }
5385 }
5386 if (fmtcnt < 0) {
5387 PyErr_SetString(PyExc_ValueError,
5388 "incomplete format");
5389 goto onError;
5390 }
5391 if (c != '%') {
5392 v = getnextarg(args, arglen, &argidx);
5393 if (v == NULL)
5394 goto onError;
5395 }
5396 sign = 0;
5397 fill = ' ';
5398 switch (c) {
5399
5400 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005401 pbuf = formatbuf;
5402 /* presume that buffer length is at least 1 */
5403 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005404 len = 1;
5405 break;
5406
5407 case 's':
5408 case 'r':
5409 if (PyUnicode_Check(v) && c == 's') {
5410 temp = v;
5411 Py_INCREF(temp);
5412 }
5413 else {
5414 PyObject *unicode;
5415 if (c == 's')
5416 temp = PyObject_Str(v);
5417 else
5418 temp = PyObject_Repr(v);
5419 if (temp == NULL)
5420 goto onError;
5421 if (!PyString_Check(temp)) {
5422 /* XXX Note: this should never happen, since
5423 PyObject_Repr() and PyObject_Str() assure
5424 this */
5425 Py_DECREF(temp);
5426 PyErr_SetString(PyExc_TypeError,
5427 "%s argument has non-string str()");
5428 goto onError;
5429 }
Fred Drakee4315f52000-05-09 19:53:39 +00005430 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005431 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005432 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005433 "strict");
5434 Py_DECREF(temp);
5435 temp = unicode;
5436 if (temp == NULL)
5437 goto onError;
5438 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005439 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005440 len = PyUnicode_GET_SIZE(temp);
5441 if (prec >= 0 && len > prec)
5442 len = prec;
5443 break;
5444
5445 case 'i':
5446 case 'd':
5447 case 'u':
5448 case 'o':
5449 case 'x':
5450 case 'X':
5451 if (c == 'i')
5452 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005453 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005454 temp = formatlong(v, flags, prec, c);
5455 if (!temp)
5456 goto onError;
5457 pbuf = PyUnicode_AS_UNICODE(temp);
5458 len = PyUnicode_GET_SIZE(temp);
5459 /* unbounded ints can always produce
5460 a sign character! */
5461 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005463 else {
5464 pbuf = formatbuf;
5465 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5466 flags, prec, c, v);
5467 if (len < 0)
5468 goto onError;
5469 /* only d conversion is signed */
5470 sign = c == 'd';
5471 }
5472 if (flags & F_ZERO)
5473 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005474 break;
5475
5476 case 'e':
5477 case 'E':
5478 case 'f':
5479 case 'g':
5480 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005481 pbuf = formatbuf;
5482 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5483 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005484 if (len < 0)
5485 goto onError;
5486 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005487 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488 fill = '0';
5489 break;
5490
5491 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005492 pbuf = formatbuf;
5493 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005494 if (len < 0)
5495 goto onError;
5496 break;
5497
5498 default:
5499 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005500 "unsupported format character '%c' (0x%x) "
5501 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005502 (31<=c && c<=126) ? c : '?',
5503 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005504 goto onError;
5505 }
5506 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005507 if (*pbuf == '-' || *pbuf == '+') {
5508 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005509 len--;
5510 }
5511 else if (flags & F_SIGN)
5512 sign = '+';
5513 else if (flags & F_BLANK)
5514 sign = ' ';
5515 else
5516 sign = 0;
5517 }
5518 if (width < len)
5519 width = len;
5520 if (rescnt < width + (sign != 0)) {
5521 reslen -= rescnt;
5522 rescnt = width + fmtcnt + 100;
5523 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005524 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005525 return NULL;
5526 res = PyUnicode_AS_UNICODE(result)
5527 + reslen - rescnt;
5528 }
5529 if (sign) {
5530 if (fill != ' ')
5531 *res++ = sign;
5532 rescnt--;
5533 if (width > len)
5534 width--;
5535 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005536 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5537 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005538 assert(pbuf[1] == c);
5539 if (fill != ' ') {
5540 *res++ = *pbuf++;
5541 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005542 }
Tim Petersfff53252001-04-12 18:38:48 +00005543 rescnt -= 2;
5544 width -= 2;
5545 if (width < 0)
5546 width = 0;
5547 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005548 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005549 if (width > len && !(flags & F_LJUST)) {
5550 do {
5551 --rescnt;
5552 *res++ = fill;
5553 } while (--width > len);
5554 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005555 if (fill == ' ') {
5556 if (sign)
5557 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005558 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005559 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005560 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005561 *res++ = *pbuf++;
5562 *res++ = *pbuf++;
5563 }
5564 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005565 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005566 res += len;
5567 rescnt -= len;
5568 while (--width >= len) {
5569 --rescnt;
5570 *res++ = ' ';
5571 }
5572 if (dict && (argidx < arglen) && c != '%') {
5573 PyErr_SetString(PyExc_TypeError,
5574 "not all arguments converted");
5575 goto onError;
5576 }
5577 Py_XDECREF(temp);
5578 } /* '%' */
5579 } /* until end */
5580 if (argidx < arglen && !dict) {
5581 PyErr_SetString(PyExc_TypeError,
5582 "not all arguments converted");
5583 goto onError;
5584 }
5585
5586 if (args_owned) {
5587 Py_DECREF(args);
5588 }
5589 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005590 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005591 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005592 return (PyObject *)result;
5593
5594 onError:
5595 Py_XDECREF(result);
5596 Py_DECREF(uformat);
5597 if (args_owned) {
5598 Py_DECREF(args);
5599 }
5600 return NULL;
5601}
5602
/* Buffer-protocol slot table for the unicode type, wiring up the four
   unicode_buffer_* functions defined in this file. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
5609
Guido van Rossume023fe02001-08-30 03:12:59 +00005610staticforward PyObject *
5611unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
5612
Tim Peters6d6c1a32001-08-02 04:15:00 +00005613static PyObject *
5614unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5615{
5616 PyObject *x = NULL;
5617 static char *kwlist[] = {"string", "encoding", "errors", 0};
5618 char *encoding = NULL;
5619 char *errors = NULL;
5620
Guido van Rossume023fe02001-08-30 03:12:59 +00005621 if (type != &PyUnicode_Type)
5622 return unicode_subtype_new(type, args, kwds);
Tim Peters6d6c1a32001-08-02 04:15:00 +00005623 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
5624 kwlist, &x, &encoding, &errors))
5625 return NULL;
5626 if (x == NULL)
5627 return (PyObject *)_PyUnicode_New(0);
5628 return PyUnicode_FromEncodedObject(x, encoding, errors);
5629}
5630
Guido van Rossume023fe02001-08-30 03:12:59 +00005631static PyObject *
5632unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
5633{
Tim Petersaf90b3e2001-09-12 05:18:58 +00005634 PyUnicodeObject *tmp, *pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005635 int n;
5636
5637 assert(PyType_IsSubtype(type, &PyUnicode_Type));
5638 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
5639 if (tmp == NULL)
5640 return NULL;
5641 assert(PyUnicode_Check(tmp));
Tim Petersaf90b3e2001-09-12 05:18:58 +00005642 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
5643 if (pnew == NULL)
Guido van Rossume023fe02001-08-30 03:12:59 +00005644 return NULL;
Tim Petersaf90b3e2001-09-12 05:18:58 +00005645 pnew->str = PyMem_NEW(Py_UNICODE, n+1);
5646 if (pnew->str == NULL) {
5647 _Py_ForgetReference((PyObject *)pnew);
5648 PyObject_DEL(pnew);
Guido van Rossume023fe02001-08-30 03:12:59 +00005649 return NULL;
5650 }
Tim Petersaf90b3e2001-09-12 05:18:58 +00005651 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
5652 pnew->length = n;
5653 pnew->hash = tmp->hash;
Guido van Rossume023fe02001-08-30 03:12:59 +00005654 Py_DECREF(tmp);
Tim Petersaf90b3e2001-09-12 05:18:58 +00005655 return (PyObject *)pnew;
Guido van Rossume023fe02001-08-30 03:12:59 +00005656}
5657
/* Docstring of the unicode type (installed as tp_doc below). */
static char unicode_doc[] =
"unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding and \n\
errors, defining the error handling, to 'strict'.";
5664
/* The type object for unicode.  Slots left at 0 are not set here.
   Py_TPFLAGS_BASETYPE is set in tp_flags; subtype creation is handled
   by unicode_new() via unicode_subtype_new(). */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0, 					/* ob_size */
    "unicode", 				/* tp_name */
    sizeof(PyUnicodeObject), 		/* tp_basicsize */
    0, 					/* tp_itemsize */
    /* Slots */
    (destructor)_PyUnicode_Free, 	/* tp_dealloc */
    0, 					/* tp_print */
    0,				 	/* tp_getattr */
    0, 					/* tp_setattr */
    (cmpfunc) unicode_compare, 		/* tp_compare */
    (reprfunc) unicode_repr, 		/* tp_repr */
    0, 					/* tp_as_number */
    &unicode_as_sequence, 		/* tp_as_sequence */
    0, 					/* tp_as_mapping */
    (hashfunc) unicode_hash, 		/* tp_hash*/
    0, 					/* tp_call*/
    (reprfunc) unicode_str,	 	/* tp_str */
    PyObject_GenericGetAttr, 		/* tp_getattro */
    0,			 		/* tp_setattro */
    &unicode_as_buffer,			/* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
    unicode_doc,			/* tp_doc */
    0,					/* tp_traverse */
    0,					/* tp_clear */
    0,					/* tp_richcompare */
    0,					/* tp_weaklistoffset */
    0,					/* tp_iter */
    0,					/* tp_iternext */
    unicode_methods,			/* tp_methods */
    0,					/* tp_members */
    0,					/* tp_getset */
    0,					/* tp_base */
    0,					/* tp_dict */
    0,					/* tp_descr_get */
    0,					/* tp_descr_set */
    0,					/* tp_dictoffset */
    0,					/* tp_init */
    0,					/* tp_alloc */
    unicode_new,			/* tp_new */
};
5707
5708/* Initialize the Unicode implementation */
5709
Thomas Wouters78890102000-07-22 19:25:51 +00005710void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005712 int i;
5713
Fred Drakee4315f52000-05-09 19:53:39 +00005714 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005715 unicode_freelist = NULL;
5716 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005717 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005718 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005719 for (i = 0; i < 256; i++)
5720 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005721}
5722
5723/* Finalize the Unicode implementation */
5724
5725void
Thomas Wouters78890102000-07-22 19:25:51 +00005726_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005728 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005729 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005731 Py_XDECREF(unicode_empty);
5732 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005733
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005734 for (i = 0; i < 256; i++) {
5735 if (unicode_latin1[i]) {
5736 Py_DECREF(unicode_latin1[i]);
5737 unicode_latin1[i] = NULL;
5738 }
5739 }
5740
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005741 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005742 PyUnicodeObject *v = u;
5743 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005744 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005745 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005746 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005747 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005748 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005749 unicode_freelist = NULL;
5750 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751}