blob: 2f66c3cf93ea6d1fcb50e0870d5bed6b89bfb6f8 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Maximum number of Unicode objects that may be parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE       1024

/* Keep-alive threshold for Unicode objects on the free list.

   A Unicode object that lands on the free list keeps its character
   buffer as long as its length is below this threshold; buffers of
   longer objects are released right away. Recycling small objects
   therefore avoids a malloc() call.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory allocated.

   A threshold of 0 switches the optimization off.

   Note: this feature is experimental. If Unicode objects cause core
   dumps, switch it off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order selection; little endian is assumed unless
   WORDS_BIGENDIAN is defined. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
107PyUnicode_GetMax()
108{
109#ifdef USE_UCS4_STORAGE
110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000222 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
227void _PyUnicode_Free(register PyUnicodeObject *unicode)
228{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000230 /* Keep-Alive optimization */
231 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000232 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 unicode->str = NULL;
234 unicode->length = 0;
235 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000236 if (unicode->defenc) {
237 Py_DECREF(unicode->defenc);
238 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 }
240 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 *(PyUnicodeObject **)unicode = unicode_freelist;
242 unicode_freelist = unicode;
243 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244 }
245 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000247 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249 }
250}
251
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252int PyUnicode_Resize(PyObject **unicode,
253 int length)
254{
255 register PyUnicodeObject *v;
256
257 /* Argument checks */
258 if (unicode == NULL) {
259 PyErr_BadInternalCall();
260 return -1;
261 }
262 v = (PyUnicodeObject *)*unicode;
263 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
264 PyErr_BadInternalCall();
265 return -1;
266 }
267
268 /* Resizing unicode_empty and single character objects is not
269 possible since these are being shared. We simply return a fresh
270 copy with the same Unicode content. */
271 if (v->length != length &&
272 (v == unicode_empty || v->length == 1)) {
273 PyUnicodeObject *w = _PyUnicode_New(length);
274 if (w == NULL)
275 return -1;
276 Py_UNICODE_COPY(w->str, v->str,
277 length < v->length ? length : v->length);
278 *unicode = (PyObject *)w;
279 return 0;
280 }
281
282 /* Note that we don't have to modify *unicode for unshared Unicode
283 objects, since we can modify them in-place. */
284 return unicode_resize(v, length);
285}
286
287/* Internal API for use in unicodeobject.c only ! */
288#define _PyUnicode_Resize(unicodevar, length) \
289 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
292 int size)
293{
294 PyUnicodeObject *unicode;
295
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000296 /* If the Unicode data is known at construction time, we can apply
297 some optimizations which share commonly used objects. */
298 if (u != NULL) {
299
300 /* Optimization for empty strings */
301 if (size == 0 && unicode_empty != NULL) {
302 Py_INCREF(unicode_empty);
303 return (PyObject *)unicode_empty;
304 }
305
306 /* Single character Unicode objects in the Latin-1 range are
307 shared when using this constructor */
308 if (size == 1 && *u < 256) {
309 unicode = unicode_latin1[*u];
310 if (!unicode) {
311 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000312 if (!unicode)
313 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000314 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000315 unicode_latin1[*u] = unicode;
316 }
317 Py_INCREF(unicode);
318 return (PyObject *)unicode;
319 }
320 }
321
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 unicode = _PyUnicode_New(size);
323 if (!unicode)
324 return NULL;
325
326 /* Copy the Unicode data into the new object */
327 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000328 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329
330 return (PyObject *)unicode;
331}
332
333#ifdef HAVE_WCHAR_H
334
335PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
336 int size)
337{
338 PyUnicodeObject *unicode;
339
340 if (w == NULL) {
341 PyErr_BadInternalCall();
342 return NULL;
343 }
344
345 unicode = _PyUnicode_New(size);
346 if (!unicode)
347 return NULL;
348
349 /* Copy the wchar_t data into the new object */
350#ifdef HAVE_USABLE_WCHAR_T
351 memcpy(unicode->str, w, size * sizeof(wchar_t));
352#else
353 {
354 register Py_UNICODE *u;
355 register int i;
356 u = PyUnicode_AS_UNICODE(unicode);
357 for (i = size; i >= 0; i--)
358 *u++ = *w++;
359 }
360#endif
361
362 return (PyObject *)unicode;
363}
364
365int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
366 register wchar_t *w,
367 int size)
368{
369 if (unicode == NULL) {
370 PyErr_BadInternalCall();
371 return -1;
372 }
373 if (size > PyUnicode_GET_SIZE(unicode))
374 size = PyUnicode_GET_SIZE(unicode);
375#ifdef HAVE_USABLE_WCHAR_T
376 memcpy(w, unicode->str, size * sizeof(wchar_t));
377#else
378 {
379 register Py_UNICODE *u;
380 register int i;
381 u = PyUnicode_AS_UNICODE(unicode);
382 for (i = size; i >= 0; i--)
383 *w++ = *u++;
384 }
385#endif
386
387 return size;
388}
389
390#endif
391
392PyObject *PyUnicode_FromObject(register PyObject *obj)
393{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000394 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
395}
396
397PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
398 const char *encoding,
399 const char *errors)
400{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401 const char *s;
402 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000403 int owned = 0;
404 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405
406 if (obj == NULL) {
407 PyErr_BadInternalCall();
408 return NULL;
409 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000410
411 /* Coerce object */
412 if (PyInstance_Check(obj)) {
413 PyObject *func;
414 func = PyObject_GetAttrString(obj, "__str__");
415 if (func == NULL) {
416 PyErr_SetString(PyExc_TypeError,
417 "coercing to Unicode: instance doesn't define __str__");
418 return NULL;
419 }
420 obj = PyEval_CallObject(func, NULL);
421 Py_DECREF(func);
422 if (obj == NULL)
423 return NULL;
424 owned = 1;
425 }
426 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000428 v = obj;
429 if (encoding) {
430 PyErr_SetString(PyExc_TypeError,
431 "decoding Unicode is not supported");
432 return NULL;
433 }
434 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435 }
436 else if (PyString_Check(obj)) {
437 s = PyString_AS_STRING(obj);
438 len = PyString_GET_SIZE(obj);
439 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000440 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
441 /* Overwrite the error message with something more useful in
442 case of a TypeError. */
443 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000444 PyErr_Format(PyExc_TypeError,
445 "coercing to Unicode: need string or buffer, "
446 "%.80s found",
447 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000448 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000449 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000450
451 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452 if (len == 0) {
453 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000454 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000456 else
457 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000458
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000460 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000461 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000462 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000463 return v;
464
465 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000466 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000467 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000468 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000469 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470}
471
472PyObject *PyUnicode_Decode(const char *s,
473 int size,
474 const char *encoding,
475 const char *errors)
476{
477 PyObject *buffer = NULL, *unicode;
478
Fred Drakee4315f52000-05-09 19:53:39 +0000479 if (encoding == NULL)
480 encoding = PyUnicode_GetDefaultEncoding();
481
482 /* Shortcuts for common default encodings */
483 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000485 else if (strcmp(encoding, "latin-1") == 0)
486 return PyUnicode_DecodeLatin1(s, size, errors);
487 else if (strcmp(encoding, "ascii") == 0)
488 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489
490 /* Decode via the codec registry */
491 buffer = PyBuffer_FromMemory((void *)s, size);
492 if (buffer == NULL)
493 goto onError;
494 unicode = PyCodec_Decode(buffer, encoding, errors);
495 if (unicode == NULL)
496 goto onError;
497 if (!PyUnicode_Check(unicode)) {
498 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000499 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500 unicode->ob_type->tp_name);
501 Py_DECREF(unicode);
502 goto onError;
503 }
504 Py_DECREF(buffer);
505 return unicode;
506
507 onError:
508 Py_XDECREF(buffer);
509 return NULL;
510}
511
512PyObject *PyUnicode_Encode(const Py_UNICODE *s,
513 int size,
514 const char *encoding,
515 const char *errors)
516{
517 PyObject *v, *unicode;
518
519 unicode = PyUnicode_FromUnicode(s, size);
520 if (unicode == NULL)
521 return NULL;
522 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
523 Py_DECREF(unicode);
524 return v;
525}
526
527PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
528 const char *encoding,
529 const char *errors)
530{
531 PyObject *v;
532
533 if (!PyUnicode_Check(unicode)) {
534 PyErr_BadArgument();
535 goto onError;
536 }
Fred Drakee4315f52000-05-09 19:53:39 +0000537
538 if (encoding == NULL)
539 encoding = PyUnicode_GetDefaultEncoding();
540
541 /* Shortcuts for common default encodings */
542 if (errors == NULL) {
543 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000544 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000545 else if (strcmp(encoding, "latin-1") == 0)
546 return PyUnicode_AsLatin1String(unicode);
547 else if (strcmp(encoding, "ascii") == 0)
548 return PyUnicode_AsASCIIString(unicode);
549 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000550
551 /* Encode via the codec registry */
552 v = PyCodec_Encode(unicode, encoding, errors);
553 if (v == NULL)
554 goto onError;
555 /* XXX Should we really enforce this ? */
556 if (!PyString_Check(v)) {
557 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000558 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000559 v->ob_type->tp_name);
560 Py_DECREF(v);
561 goto onError;
562 }
563 return v;
564
565 onError:
566 return NULL;
567}
568
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000569/* Return a Python string holding the default encoded value of the
570 Unicode object.
571
572 The resulting string is cached in the Unicode object for subsequent
573 usage by this function. The cached version is needed to implement
574 the character buffer interface and will live (at least) as long as
575 the Unicode object itself.
576
577 The refcount of the string is *not* incremented.
578
579 *** Exported for internal use by the interpreter only !!! ***
580
581*/
582
583PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
584 const char *errors)
585{
586 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
587
588 if (v)
589 return v;
590 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
591 if (v && errors == NULL)
592 ((PyUnicodeObject *)unicode)->defenc = v;
593 return v;
594}
595
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
597{
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
601 }
602 return PyUnicode_AS_UNICODE(unicode);
603
604 onError:
605 return NULL;
606}
607
608int PyUnicode_GetSize(PyObject *unicode)
609{
610 if (!PyUnicode_Check(unicode)) {
611 PyErr_BadArgument();
612 goto onError;
613 }
614 return PyUnicode_GET_SIZE(unicode);
615
616 onError:
617 return -1;
618}
619
Thomas Wouters78890102000-07-22 19:25:51 +0000620const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000621{
622 return unicode_default_encoding;
623}
624
625int PyUnicode_SetDefaultEncoding(const char *encoding)
626{
627 PyObject *v;
628
629 /* Make sure the encoding is valid. As side effect, this also
630 loads the encoding into the codec registry cache. */
631 v = _PyCodec_Lookup(encoding);
632 if (v == NULL)
633 goto onError;
634 Py_DECREF(v);
635 strncpy(unicode_default_encoding,
636 encoding,
637 sizeof(unicode_default_encoding));
638 return 0;
639
640 onError:
641 return -1;
642}
643
Guido van Rossumd57fd912000-03-10 22:53:23 +0000644/* --- UTF-8 Codec -------------------------------------------------------- */
645
/* Sequence length implied by each possible first byte of a UTF-8
   sequence; 0 marks a byte that is not a legal prefix. See RFC 2279
   for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
667
668static
669int utf8_decoding_error(const char **source,
670 Py_UNICODE **dest,
671 const char *errors,
672 const char *details)
673{
674 if ((errors == NULL) ||
675 (strcmp(errors,"strict") == 0)) {
676 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000677 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000678 details);
679 return -1;
680 }
681 else if (strcmp(errors,"ignore") == 0) {
682 (*source)++;
683 return 0;
684 }
685 else if (strcmp(errors,"replace") == 0) {
686 (*source)++;
687 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
688 (*dest)++;
689 return 0;
690 }
691 else {
692 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000693 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000694 errors);
695 return -1;
696 }
697}
698
Guido van Rossumd57fd912000-03-10 22:53:23 +0000699PyObject *PyUnicode_DecodeUTF8(const char *s,
700 int size,
701 const char *errors)
702{
703 int n;
704 const char *e;
705 PyUnicodeObject *unicode;
706 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000707 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000708
709 /* Note: size will always be longer than the resulting Unicode
710 character count */
711 unicode = _PyUnicode_New(size);
712 if (!unicode)
713 return NULL;
714 if (size == 0)
715 return (PyObject *)unicode;
716
717 /* Unpack UTF-8 encoded data */
718 p = unicode->str;
719 e = s + size;
720
721 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000722 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723
724 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000725 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000726 s++;
727 continue;
728 }
729
730 n = utf8_code_length[ch];
731
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000732 if (s + n > e) {
733 errmsg = "unexpected end of data";
734 goto utf8Error;
735 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000736
737 switch (n) {
738
739 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000740 errmsg = "unexpected code byte";
741 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000742
743 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000744 errmsg = "internal error";
745 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000746
747 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000748 if ((s[1] & 0xc0) != 0x80) {
749 errmsg = "invalid data";
750 goto utf8Error;
751 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000752 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000753 if (ch < 0x80) {
754 errmsg = "illegal encoding";
755 goto utf8Error;
756 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000757 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000758 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000759 break;
760
761 case 3:
762 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000763 (s[2] & 0xc0) != 0x80) {
764 errmsg = "invalid data";
765 goto utf8Error;
766 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000767 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000768 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
769 errmsg = "illegal encoding";
770 goto utf8Error;
771 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000772 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000773 *p++ = (Py_UNICODE)ch;
774 break;
775
776 case 4:
777 if ((s[1] & 0xc0) != 0x80 ||
778 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000779 (s[3] & 0xc0) != 0x80) {
780 errmsg = "invalid data";
781 goto utf8Error;
782 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000783 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
784 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
785 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000786 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000787 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000788 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000789 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000790 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000791 errmsg = "illegal encoding";
792 goto utf8Error;
793 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000794#if Py_UNICODE_SIZE == 4
795 *p++ = (Py_UNICODE)ch;
796#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000797 /* compute and append the two surrogates: */
798
799 /* translate from 10000..10FFFF to 0..FFFF */
800 ch -= 0x10000;
801
802 /* high surrogate = top 10 bits added to D800 */
803 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
804
805 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +0000806 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000807#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000808 break;
809
810 default:
811 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000812 errmsg = "unsupported Unicode code range";
813 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000814 }
815 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000816 continue;
817
818 utf8Error:
819 if (utf8_decoding_error(&s, &p, errors, errmsg))
820 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000821 }
822
823 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000824 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +0000825 goto onError;
826
827 return (PyObject *)unicode;
828
829onError:
830 Py_DECREF(unicode);
831 return NULL;
832}
833
/* No longer used since the encoder handles UTF-16 surrogates itself;
   kept disabled for reference. */
#if 0
/* Apply the error handling scheme `errors` to a UTF-8 encoding problem
   described by `details`: NULL or "strict" set UnicodeError, "ignore"
   does nothing, "replace" stores '?' at *dest and advances *dest, and
   any other value sets ValueError. Returns 0 when encoding may
   continue and -1 when an exception was set. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000867
868PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
869 int size,
870 const char *errors)
871{
872 PyObject *v;
873 char *p;
874 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000875 Py_UCS4 ch2;
876 unsigned int cbAllocated = 3 * size;
877 unsigned int cbWritten = 0;
878 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000879
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000880 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000881 if (v == NULL)
882 return NULL;
883 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000884 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885
886 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000887 while (i < size) {
888 Py_UCS4 ch = s[i++];
889 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000891 cbWritten++;
892 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893 else if (ch < 0x0800) {
894 *p++ = 0xc0 | (ch >> 6);
895 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000896 cbWritten += 2;
897 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000898 else if (ch < 0x10000) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000899 /* Check for high surrogate */
900 if (0xD800 <= ch && ch <= 0xDBFF) {
901 if (i != size) {
902 ch2 = s[i];
903 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
904
905 if (cbWritten >= (cbAllocated - 4)) {
906 /* Provide enough room for some more
907 surrogates */
908 cbAllocated += 4*10;
909 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000910 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000911 }
912
913 /* combine the two values */
914 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
915
916 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000917 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000918 i++;
919 cbWritten += 4;
920 }
921 }
922 }
923 else {
924 *p++ = (char)(0xe0 | (ch >> 12));
925 cbWritten += 3;
926 }
927 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
928 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000929 } else {
930 *p++ = 0xf0 | (ch>>18);
931 *p++ = 0x80 | ((ch>>12) & 0x3f);
932 *p++ = 0x80 | ((ch>>6) & 0x3f);
933 *p++ = 0x80 | (ch & 0x3f);
934 cbWritten += 4;
935 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000936 }
937 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000938 if (_PyString_Resize(&v, p - q))
939 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000940 return v;
941
942 onError:
943 Py_DECREF(v);
944 return NULL;
945}
946
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
948{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000949 if (!PyUnicode_Check(unicode)) {
950 PyErr_BadArgument();
951 return NULL;
952 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000953 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
954 PyUnicode_GET_SIZE(unicode),
955 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000956}
957
958/* --- UTF-16 Codec ------------------------------------------------------- */
959
960static
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000961int utf16_decoding_error(const Py_UCS2 **source,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000962 Py_UNICODE **dest,
963 const char *errors,
964 const char *details)
965{
966 if ((errors == NULL) ||
967 (strcmp(errors,"strict") == 0)) {
968 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000969 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000970 details);
971 return -1;
972 }
973 else if (strcmp(errors,"ignore") == 0) {
974 return 0;
975 }
976 else if (strcmp(errors,"replace") == 0) {
977 if (dest) {
978 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
979 (*dest)++;
980 }
981 return 0;
982 }
983 else {
984 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000985 "UTF-16 decoding error; "
986 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000987 errors);
988 return -1;
989 }
990}
991
Guido van Rossumd57fd912000-03-10 22:53:23 +0000992PyObject *PyUnicode_DecodeUTF16(const char *s,
993 int size,
994 const char *errors,
995 int *byteorder)
996{
997 PyUnicodeObject *unicode;
998 Py_UNICODE *p;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000999 const Py_UCS2 *q, *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001000 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001001 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001002
1003 /* size should be an even number */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001004 if (size % sizeof(Py_UCS2) != 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001005 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
1006 return NULL;
1007 /* The remaining input chars are ignored if we fall through
1008 here... */
1009 }
1010
1011 /* Note: size will always be longer than the resulting Unicode
1012 character count */
1013 unicode = _PyUnicode_New(size);
1014 if (!unicode)
1015 return NULL;
1016 if (size == 0)
1017 return (PyObject *)unicode;
1018
1019 /* Unpack UTF-16 encoded data */
1020 p = unicode->str;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001021 q = (Py_UCS2 *)s;
1022 e = q + (size / sizeof(Py_UCS2));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023
1024 if (byteorder)
1025 bo = *byteorder;
1026
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001027 /* Check for BOM marks (U+FEFF) in the input and adjust current
1028 byte order setting accordingly. In native mode, the leading BOM
1029 mark is skipped, in all other modes, it is copied to the output
1030 stream as-is (giving a ZWNBSP character). */
1031 if (bo == 0) {
1032#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1033 if (*q == 0xFEFF) {
1034 q++;
1035 bo = -1;
1036 } else if (*q == 0xFFFE) {
1037 q++;
1038 bo = 1;
1039 }
1040#else
1041 if (*q == 0xFEFF) {
1042 q++;
1043 bo = 1;
1044 } else if (*q == 0xFFFE) {
1045 q++;
1046 bo = -1;
1047 }
1048#endif
1049 }
1050
Guido van Rossumd57fd912000-03-10 22:53:23 +00001051 while (q < e) {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001052 register Py_UCS2 ch = *q++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001053
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001054 /* Swap input bytes if needed. (This assumes
1055 sizeof(Py_UNICODE) == 2 !) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057 if (bo == 1)
1058 ch = (ch >> 8) | (ch << 8);
1059#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 if (bo == -1)
1061 ch = (ch >> 8) | (ch << 8);
1062#endif
1063 if (ch < 0xD800 || ch > 0xDFFF) {
1064 *p++ = ch;
1065 continue;
1066 }
1067
1068 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001069 if (q >= e) {
1070 errmsg = "unexpected end of data";
1071 goto utf16Error;
1072 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001073 if (0xD800 <= ch && ch <= 0xDBFF) {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001074 Py_UCS2 ch2 = *q++;
1075#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1076 if (bo == 1)
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001077 ch2 = (ch2 >> 8) | (ch2 << 8);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001078#else
1079 if (bo == -1)
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001080 ch2 = (ch2 >> 8) | (ch2 << 8);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001081#endif
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001082 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001083#if Py_UNICODE_SIZE == 2
Guido van Rossumd57fd912000-03-10 22:53:23 +00001084 /* This is valid data (a UTF-16 surrogate pair), but
1085 we are not able to store this information since our
1086 Py_UNICODE type only has 16 bits... this might
1087 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001088 errmsg = "code pairs are not supported";
1089 goto utf16Error;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001090#else
1091 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001093#endif
1094
1095 }
1096 else {
1097 errmsg = "illegal UTF-16 surrogate";
1098 goto utf16Error;
1099 }
1100
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001102 errmsg = "illegal encoding";
1103 /* Fall through to report the error */
1104
1105 utf16Error:
1106 if (utf16_decoding_error(&q, &p, errors, errmsg))
1107 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108 }
1109
1110 if (byteorder)
1111 *byteorder = bo;
1112
1113 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001114 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 goto onError;
1116
1117 return (PyObject *)unicode;
1118
1119onError:
1120 Py_DECREF(unicode);
1121 return NULL;
1122}
1123
1124#undef UTF16_ERROR
1125
1126PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1127 int size,
1128 const char *errors,
1129 int byteorder)
1130{
1131 PyObject *v;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001132 Py_UCS2 *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001133 char *q;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001134 int i, pairs, doswap = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001135
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001136 for (i = pairs = 0; i < size; i++)
1137 if (s[i] >= 0x10000)
1138 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001139 v = PyString_FromStringAndSize(NULL,
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001140 sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001141 if (v == NULL)
1142 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001143
1144 q = PyString_AS_STRING(v);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001145 p = (Py_UCS2 *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001146 if (byteorder == 0)
1147 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001148 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001149 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150 if (byteorder == 0 ||
1151#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1152 byteorder == -1
1153#else
1154 byteorder == 1
1155#endif
1156 )
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001157 doswap = 0;
1158 while (size-- > 0) {
1159 Py_UNICODE ch = *s++;
1160 Py_UNICODE ch2 = 0;
1161 if (ch >= 0x10000) {
1162 ch2 = 0xDC00|((ch-0x10000) & 0x3FF);
1163 ch = 0xD800|((ch-0x10000)>>10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001165 if (doswap){
1166 *p++ = (ch >> 8) | (ch << 8);
1167 if (ch2)
1168 *p++ = (ch2 >> 8) | (ch2 << 8);
1169 }else{
1170 *p++ = ch;
1171 if(ch2)
1172 *p++ = ch2;
1173 }
1174 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001175 return v;
1176}
1177
1178PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1179{
1180 if (!PyUnicode_Check(unicode)) {
1181 PyErr_BadArgument();
1182 return NULL;
1183 }
1184 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1185 PyUnicode_GET_SIZE(unicode),
1186 NULL,
1187 0);
1188}
1189
1190/* --- Unicode Escape Codec ----------------------------------------------- */
1191
1192static
1193int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001194 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001195 const char *errors,
1196 const char *details)
1197{
1198 if ((errors == NULL) ||
1199 (strcmp(errors,"strict") == 0)) {
1200 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001201 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202 details);
1203 return -1;
1204 }
1205 else if (strcmp(errors,"ignore") == 0) {
1206 return 0;
1207 }
1208 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001209 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001210 return 0;
1211 }
1212 else {
1213 PyErr_Format(PyExc_ValueError,
1214 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001215 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001216 errors);
1217 return -1;
1218 }
1219}
1220
Fredrik Lundh06d12682001-01-24 07:59:11 +00001221static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001222
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1224 int size,
1225 const char *errors)
1226{
1227 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001228 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001230 char* message;
1231 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1232
Guido van Rossumd57fd912000-03-10 22:53:23 +00001233 /* Escaped strings will always be longer than the resulting
1234 Unicode string, so we start with size here and then reduce the
1235 length after conversion to the true value. */
1236 v = _PyUnicode_New(size);
1237 if (v == NULL)
1238 goto onError;
1239 if (size == 0)
1240 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001241
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242 p = buf = PyUnicode_AS_UNICODE(v);
1243 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001244
Guido van Rossumd57fd912000-03-10 22:53:23 +00001245 while (s < end) {
1246 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001247 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001248 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001249
1250 /* Non-escape characters are interpreted as Unicode ordinals */
1251 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001252 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 continue;
1254 }
1255
1256 /* \ - Escapes */
1257 s++;
1258 switch (*s++) {
1259
1260 /* \x escapes */
1261 case '\n': break;
1262 case '\\': *p++ = '\\'; break;
1263 case '\'': *p++ = '\''; break;
1264 case '\"': *p++ = '\"'; break;
1265 case 'b': *p++ = '\b'; break;
1266 case 'f': *p++ = '\014'; break; /* FF */
1267 case 't': *p++ = '\t'; break;
1268 case 'n': *p++ = '\n'; break;
1269 case 'r': *p++ = '\r'; break;
1270 case 'v': *p++ = '\013'; break; /* VT */
1271 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1272
1273 /* \OOO (octal) escapes */
1274 case '0': case '1': case '2': case '3':
1275 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001276 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001278 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001280 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001282 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 break;
1284
Fredrik Lundhccc74732001-02-18 22:13:49 +00001285 /* hex escapes */
1286 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001287 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001288 digits = 2;
1289 message = "truncated \\xXX escape";
1290 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291
Fredrik Lundhccc74732001-02-18 22:13:49 +00001292 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001294 digits = 4;
1295 message = "truncated \\uXXXX escape";
1296 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001297
Fredrik Lundhccc74732001-02-18 22:13:49 +00001298 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001299 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001300 digits = 8;
1301 message = "truncated \\UXXXXXXXX escape";
1302 hexescape:
1303 chr = 0;
1304 for (i = 0; i < digits; i++) {
1305 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001306 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001307 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001308 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001309 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001310 i++;
1311 break;
1312 }
1313 chr = (chr<<4) & ~0xF;
1314 if (c >= '0' && c <= '9')
1315 chr += c - '0';
1316 else if (c >= 'a' && c <= 'f')
1317 chr += 10 + c - 'a';
1318 else
1319 chr += 10 + c - 'A';
1320 }
1321 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001322 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001323 /* when we get here, chr is a 32-bit unicode character */
1324 if (chr <= 0xffff)
1325 /* UCS-2 character */
1326 *p++ = (Py_UNICODE) chr;
1327 else if (chr <= 0x10ffff) {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001328 /* UCS-4 character. Either store directly, or as surrogate pair. */
1329#if Py_UNICODE_SIZE == 4
1330 *p++ = chr;
1331#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001332 chr -= 0x10000L;
1333 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001334 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001335#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001336 } else {
1337 if (unicodeescape_decoding_error(
1338 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001339 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001340 )
1341 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001342 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001343 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001344 break;
1345
1346 /* \N{name} */
1347 case 'N':
1348 message = "malformed \\N character escape";
1349 if (ucnhash_CAPI == NULL) {
1350 /* load the unicode data module */
1351 PyObject *m, *v;
1352 m = PyImport_ImportModule("unicodedata");
1353 if (m == NULL)
1354 goto ucnhashError;
1355 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1356 Py_DECREF(m);
1357 if (v == NULL)
1358 goto ucnhashError;
1359 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1360 Py_DECREF(v);
1361 if (ucnhash_CAPI == NULL)
1362 goto ucnhashError;
1363 }
1364 if (*s == '{') {
1365 const char *start = s+1;
1366 /* look for the closing brace */
1367 while (*s != '}' && s < end)
1368 s++;
1369 if (s > start && s < end && *s == '}') {
1370 /* found a name. look it up in the unicode database */
1371 message = "unknown Unicode character name";
1372 s++;
1373 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1374 goto store;
1375 }
1376 }
1377 if (unicodeescape_decoding_error(&s, &x, errors, message))
1378 goto onError;
1379 *p++ = x;
1380 break;
1381
1382 default:
1383 *p++ = '\\';
1384 *p++ = (unsigned char)s[-1];
1385 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001386 }
1387 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001388 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001389 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001390 return (PyObject *)v;
1391
Fredrik Lundhccc74732001-02-18 22:13:49 +00001392ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001393 PyErr_SetString(
1394 PyExc_UnicodeError,
1395 "\\N escapes not supported (can't load unicodedata module)"
1396 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001397 return NULL;
1398
Fredrik Lundhccc74732001-02-18 22:13:49 +00001399onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001400 Py_XDECREF(v);
1401 return NULL;
1402}
1403
1404/* Return a Unicode-Escape string version of the Unicode object.
1405
1406 If quotes is true, the string is enclosed in u"" or u'' quotes as
1407 appropriate.
1408
1409*/
1410
Barry Warsaw51ac5802000-03-20 16:36:48 +00001411static const Py_UNICODE *findchar(const Py_UNICODE *s,
1412 int size,
1413 Py_UNICODE ch);
1414
Guido van Rossumd57fd912000-03-10 22:53:23 +00001415static
1416PyObject *unicodeescape_string(const Py_UNICODE *s,
1417 int size,
1418 int quotes)
1419{
1420 PyObject *repr;
1421 char *p;
1422 char *q;
1423
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001424 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001425
1426 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1427 if (repr == NULL)
1428 return NULL;
1429
1430 p = q = PyString_AS_STRING(repr);
1431
1432 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001433 *p++ = 'u';
1434 *p++ = (findchar(s, size, '\'') &&
1435 !findchar(s, size, '"')) ? '"' : '\'';
1436 }
1437 while (size-- > 0) {
1438 Py_UNICODE ch = *s++;
1439 /* Escape quotes */
Fredrik Lundh30831632001-06-26 15:11:00 +00001440 if (quotes && (ch == (Py_UNICODE) q[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441 *p++ = '\\';
1442 *p++ = (char) ch;
1443 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001444 /* Map 21-bit characters to '\U00xxxxxx' */
1445 else if (ch >= 0x10000) {
1446 *p++ = '\\';
1447 *p++ = 'U';
1448 *p++ = hexdigit[(ch >> 28) & 0xf];
1449 *p++ = hexdigit[(ch >> 24) & 0xf];
1450 *p++ = hexdigit[(ch >> 20) & 0xf];
1451 *p++ = hexdigit[(ch >> 16) & 0xf];
1452 *p++ = hexdigit[(ch >> 12) & 0xf];
1453 *p++ = hexdigit[(ch >> 8) & 0xf];
1454 *p++ = hexdigit[(ch >> 4) & 0xf];
1455 *p++ = hexdigit[ch & 15];
1456 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001457 /* Map 16-bit characters to '\uxxxx' */
1458 else if (ch >= 256) {
1459 *p++ = '\\';
1460 *p++ = 'u';
1461 *p++ = hexdigit[(ch >> 12) & 0xf];
1462 *p++ = hexdigit[(ch >> 8) & 0xf];
1463 *p++ = hexdigit[(ch >> 4) & 0xf];
1464 *p++ = hexdigit[ch & 15];
1465 }
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001466 /* Map special whitespace to '\t', \n', '\r' */
1467 else if (ch == '\t') {
1468 *p++ = '\\';
1469 *p++ = 't';
1470 }
1471 else if (ch == '\n') {
1472 *p++ = '\\';
1473 *p++ = 'n';
1474 }
1475 else if (ch == '\r') {
1476 *p++ = '\\';
1477 *p++ = 'r';
1478 }
1479 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480 else if (ch < ' ' || ch >= 128) {
1481 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001482 *p++ = 'x';
1483 *p++ = hexdigit[(ch >> 4) & 0xf];
1484 *p++ = hexdigit[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001485 }
1486 /* Copy everything else as-is */
1487 else
1488 *p++ = (char) ch;
1489 }
1490 if (quotes)
1491 *p++ = q[1];
1492
1493 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001494 if (_PyString_Resize(&repr, p - q))
1495 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496
1497 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001498
1499 onError:
1500 Py_DECREF(repr);
1501 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001502}
1503
1504PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1505 int size)
1506{
1507 return unicodeescape_string(s, size, 0);
1508}
1509
1510PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1511{
1512 if (!PyUnicode_Check(unicode)) {
1513 PyErr_BadArgument();
1514 return NULL;
1515 }
1516 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1517 PyUnicode_GET_SIZE(unicode));
1518}
1519
1520/* --- Raw Unicode Escape Codec ------------------------------------------- */
1521
1522PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1523 int size,
1524 const char *errors)
1525{
1526 PyUnicodeObject *v;
1527 Py_UNICODE *p, *buf;
1528 const char *end;
1529 const char *bs;
1530
1531 /* Escaped strings will always be longer than the resulting
1532 Unicode string, so we start with size here and then reduce the
1533 length after conversion to the true value. */
1534 v = _PyUnicode_New(size);
1535 if (v == NULL)
1536 goto onError;
1537 if (size == 0)
1538 return (PyObject *)v;
1539 p = buf = PyUnicode_AS_UNICODE(v);
1540 end = s + size;
1541 while (s < end) {
1542 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001543 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001544 int i;
1545
1546 /* Non-escape characters are interpreted as Unicode ordinals */
1547 if (*s != '\\') {
1548 *p++ = (unsigned char)*s++;
1549 continue;
1550 }
1551
1552 /* \u-escapes are only interpreted iff the number of leading
1553 backslashes if odd */
1554 bs = s;
1555 for (;s < end;) {
1556 if (*s != '\\')
1557 break;
1558 *p++ = (unsigned char)*s++;
1559 }
1560 if (((s - bs) & 1) == 0 ||
1561 s >= end ||
1562 *s != 'u') {
1563 continue;
1564 }
1565 p--;
1566 s++;
1567
1568 /* \uXXXX with 4 hex digits */
1569 for (x = 0, i = 0; i < 4; i++) {
1570 c = (unsigned char)s[i];
1571 if (!isxdigit(c)) {
1572 if (unicodeescape_decoding_error(&s, &x, errors,
1573 "truncated \\uXXXX"))
1574 goto onError;
1575 i++;
1576 break;
1577 }
1578 x = (x<<4) & ~0xF;
1579 if (c >= '0' && c <= '9')
1580 x += c - '0';
1581 else if (c >= 'a' && c <= 'f')
1582 x += 10 + c - 'a';
1583 else
1584 x += 10 + c - 'A';
1585 }
1586 s += i;
1587 *p++ = x;
1588 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001589 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001590 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001591 return (PyObject *)v;
1592
1593 onError:
1594 Py_XDECREF(v);
1595 return NULL;
1596}
1597
1598PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1599 int size)
1600{
1601 PyObject *repr;
1602 char *p;
1603 char *q;
1604
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001605 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001606
1607 repr = PyString_FromStringAndSize(NULL, 6 * size);
1608 if (repr == NULL)
1609 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001610 if (size == 0)
1611 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001612
1613 p = q = PyString_AS_STRING(repr);
1614 while (size-- > 0) {
1615 Py_UNICODE ch = *s++;
1616 /* Map 16-bit characters to '\uxxxx' */
1617 if (ch >= 256) {
1618 *p++ = '\\';
1619 *p++ = 'u';
1620 *p++ = hexdigit[(ch >> 12) & 0xf];
1621 *p++ = hexdigit[(ch >> 8) & 0xf];
1622 *p++ = hexdigit[(ch >> 4) & 0xf];
1623 *p++ = hexdigit[ch & 15];
1624 }
1625 /* Copy everything else as-is */
1626 else
1627 *p++ = (char) ch;
1628 }
1629 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001630 if (_PyString_Resize(&repr, p - q))
1631 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001632
1633 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001634
1635 onError:
1636 Py_DECREF(repr);
1637 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001638}
1639
1640PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1641{
1642 if (!PyUnicode_Check(unicode)) {
1643 PyErr_BadArgument();
1644 return NULL;
1645 }
1646 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1647 PyUnicode_GET_SIZE(unicode));
1648}
1649
1650/* --- Latin-1 Codec ------------------------------------------------------ */
1651
1652PyObject *PyUnicode_DecodeLatin1(const char *s,
1653 int size,
1654 const char *errors)
1655{
1656 PyUnicodeObject *v;
1657 Py_UNICODE *p;
1658
1659 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001660 if (size == 1 && *(unsigned char*)s < 256) {
1661 Py_UNICODE r = *(unsigned char*)s;
1662 return PyUnicode_FromUnicode(&r, 1);
1663 }
1664
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665 v = _PyUnicode_New(size);
1666 if (v == NULL)
1667 goto onError;
1668 if (size == 0)
1669 return (PyObject *)v;
1670 p = PyUnicode_AS_UNICODE(v);
1671 while (size-- > 0)
1672 *p++ = (unsigned char)*s++;
1673 return (PyObject *)v;
1674
1675 onError:
1676 Py_XDECREF(v);
1677 return NULL;
1678}
1679
1680static
1681int latin1_encoding_error(const Py_UNICODE **source,
1682 char **dest,
1683 const char *errors,
1684 const char *details)
1685{
1686 if ((errors == NULL) ||
1687 (strcmp(errors,"strict") == 0)) {
1688 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001689 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001690 details);
1691 return -1;
1692 }
1693 else if (strcmp(errors,"ignore") == 0) {
1694 return 0;
1695 }
1696 else if (strcmp(errors,"replace") == 0) {
1697 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001698 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001699 return 0;
1700 }
1701 else {
1702 PyErr_Format(PyExc_ValueError,
1703 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001704 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001705 errors);
1706 return -1;
1707 }
1708}
1709
1710PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1711 int size,
1712 const char *errors)
1713{
1714 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001715 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001716
Guido van Rossumd57fd912000-03-10 22:53:23 +00001717 repr = PyString_FromStringAndSize(NULL, size);
1718 if (repr == NULL)
1719 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001720 if (size == 0)
1721 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722
1723 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001724 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725 while (size-- > 0) {
1726 Py_UNICODE ch = *p++;
1727 if (ch >= 256) {
1728 if (latin1_encoding_error(&p, &s, errors,
1729 "ordinal not in range(256)"))
1730 goto onError;
1731 }
1732 else
1733 *s++ = (char)ch;
1734 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001735 /* Resize if error handling skipped some characters */
1736 if (s - start < PyString_GET_SIZE(repr))
1737 if (_PyString_Resize(&repr, s - start))
1738 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 return repr;
1740
1741 onError:
1742 Py_DECREF(repr);
1743 return NULL;
1744}
1745
1746PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1747{
1748 if (!PyUnicode_Check(unicode)) {
1749 PyErr_BadArgument();
1750 return NULL;
1751 }
1752 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1753 PyUnicode_GET_SIZE(unicode),
1754 NULL);
1755}
1756
1757/* --- 7-bit ASCII Codec -------------------------------------------------- */
1758
1759static
1760int ascii_decoding_error(const char **source,
1761 Py_UNICODE **dest,
1762 const char *errors,
1763 const char *details)
1764{
1765 if ((errors == NULL) ||
1766 (strcmp(errors,"strict") == 0)) {
1767 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001768 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001769 details);
1770 return -1;
1771 }
1772 else if (strcmp(errors,"ignore") == 0) {
1773 return 0;
1774 }
1775 else if (strcmp(errors,"replace") == 0) {
1776 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1777 (*dest)++;
1778 return 0;
1779 }
1780 else {
1781 PyErr_Format(PyExc_ValueError,
1782 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001783 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001784 errors);
1785 return -1;
1786 }
1787}
1788
1789PyObject *PyUnicode_DecodeASCII(const char *s,
1790 int size,
1791 const char *errors)
1792{
1793 PyUnicodeObject *v;
1794 Py_UNICODE *p;
1795
1796 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001797 if (size == 1 && *(unsigned char*)s < 128) {
1798 Py_UNICODE r = *(unsigned char*)s;
1799 return PyUnicode_FromUnicode(&r, 1);
1800 }
1801
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802 v = _PyUnicode_New(size);
1803 if (v == NULL)
1804 goto onError;
1805 if (size == 0)
1806 return (PyObject *)v;
1807 p = PyUnicode_AS_UNICODE(v);
1808 while (size-- > 0) {
1809 register unsigned char c;
1810
1811 c = (unsigned char)*s++;
1812 if (c < 128)
1813 *p++ = c;
1814 else if (ascii_decoding_error(&s, &p, errors,
1815 "ordinal not in range(128)"))
1816 goto onError;
1817 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001818 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001819 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001820 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001821 return (PyObject *)v;
1822
1823 onError:
1824 Py_XDECREF(v);
1825 return NULL;
1826}
1827
1828static
1829int ascii_encoding_error(const Py_UNICODE **source,
1830 char **dest,
1831 const char *errors,
1832 const char *details)
1833{
1834 if ((errors == NULL) ||
1835 (strcmp(errors,"strict") == 0)) {
1836 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001837 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001838 details);
1839 return -1;
1840 }
1841 else if (strcmp(errors,"ignore") == 0) {
1842 return 0;
1843 }
1844 else if (strcmp(errors,"replace") == 0) {
1845 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001846 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001847 return 0;
1848 }
1849 else {
1850 PyErr_Format(PyExc_ValueError,
1851 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001852 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001853 errors);
1854 return -1;
1855 }
1856}
1857
1858PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1859 int size,
1860 const char *errors)
1861{
1862 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001863 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001864
Guido van Rossumd57fd912000-03-10 22:53:23 +00001865 repr = PyString_FromStringAndSize(NULL, size);
1866 if (repr == NULL)
1867 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001868 if (size == 0)
1869 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001870
1871 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001872 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873 while (size-- > 0) {
1874 Py_UNICODE ch = *p++;
1875 if (ch >= 128) {
1876 if (ascii_encoding_error(&p, &s, errors,
1877 "ordinal not in range(128)"))
1878 goto onError;
1879 }
1880 else
1881 *s++ = (char)ch;
1882 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001883 /* Resize if error handling skipped some characters */
1884 if (s - start < PyString_GET_SIZE(repr))
1885 if (_PyString_Resize(&repr, s - start))
1886 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001887 return repr;
1888
1889 onError:
1890 Py_DECREF(repr);
1891 return NULL;
1892}
1893
1894PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1895{
1896 if (!PyUnicode_Check(unicode)) {
1897 PyErr_BadArgument();
1898 return NULL;
1899 }
1900 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1901 PyUnicode_GET_SIZE(unicode),
1902 NULL);
1903}
1904
Fredrik Lundh30831632001-06-26 15:11:00 +00001905#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001906
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001907/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001908
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001909PyObject *PyUnicode_DecodeMBCS(const char *s,
1910 int size,
1911 const char *errors)
1912{
1913 PyUnicodeObject *v;
1914 Py_UNICODE *p;
1915
1916 /* First get the size of the result */
1917 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001918 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001919 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1920
1921 v = _PyUnicode_New(usize);
1922 if (v == NULL)
1923 return NULL;
1924 if (usize == 0)
1925 return (PyObject *)v;
1926 p = PyUnicode_AS_UNICODE(v);
1927 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1928 Py_DECREF(v);
1929 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1930 }
1931
1932 return (PyObject *)v;
1933}
1934
1935PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1936 int size,
1937 const char *errors)
1938{
1939 PyObject *repr;
1940 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001941 DWORD mbcssize;
1942
1943 /* If there are no characters, bail now! */
1944 if (size==0)
1945 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001946
1947 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001948 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001949 if (mbcssize==0)
1950 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1951
1952 repr = PyString_FromStringAndSize(NULL, mbcssize);
1953 if (repr == NULL)
1954 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001955 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001956 return repr;
1957
1958 /* Do the conversion */
1959 s = PyString_AS_STRING(repr);
1960 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1961 Py_DECREF(repr);
1962 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1963 }
1964 return repr;
1965}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001966
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001967#endif /* MS_WIN32 */
1968
Guido van Rossumd57fd912000-03-10 22:53:23 +00001969/* --- Character Mapping Codec -------------------------------------------- */
1970
1971static
1972int charmap_decoding_error(const char **source,
1973 Py_UNICODE **dest,
1974 const char *errors,
1975 const char *details)
1976{
1977 if ((errors == NULL) ||
1978 (strcmp(errors,"strict") == 0)) {
1979 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001980 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001981 details);
1982 return -1;
1983 }
1984 else if (strcmp(errors,"ignore") == 0) {
1985 return 0;
1986 }
1987 else if (strcmp(errors,"replace") == 0) {
1988 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1989 (*dest)++;
1990 return 0;
1991 }
1992 else {
1993 PyErr_Format(PyExc_ValueError,
1994 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001995 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001996 errors);
1997 return -1;
1998 }
1999}
2000
2001PyObject *PyUnicode_DecodeCharmap(const char *s,
2002 int size,
2003 PyObject *mapping,
2004 const char *errors)
2005{
2006 PyUnicodeObject *v;
2007 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002008 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002009
2010 /* Default to Latin-1 */
2011 if (mapping == NULL)
2012 return PyUnicode_DecodeLatin1(s, size, errors);
2013
2014 v = _PyUnicode_New(size);
2015 if (v == NULL)
2016 goto onError;
2017 if (size == 0)
2018 return (PyObject *)v;
2019 p = PyUnicode_AS_UNICODE(v);
2020 while (size-- > 0) {
2021 unsigned char ch = *s++;
2022 PyObject *w, *x;
2023
2024 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2025 w = PyInt_FromLong((long)ch);
2026 if (w == NULL)
2027 goto onError;
2028 x = PyObject_GetItem(mapping, w);
2029 Py_DECREF(w);
2030 if (x == NULL) {
2031 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002032 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002034 x = Py_None;
2035 Py_INCREF(x);
2036 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002037 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038 }
2039
2040 /* Apply mapping */
2041 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002042 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002043 if (value < 0 || value > 65535) {
2044 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002045 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002046 Py_DECREF(x);
2047 goto onError;
2048 }
2049 *p++ = (Py_UNICODE)value;
2050 }
2051 else if (x == Py_None) {
2052 /* undefined mapping */
2053 if (charmap_decoding_error(&s, &p, errors,
2054 "character maps to <undefined>")) {
2055 Py_DECREF(x);
2056 goto onError;
2057 }
2058 }
2059 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002060 int targetsize = PyUnicode_GET_SIZE(x);
2061
2062 if (targetsize == 1)
2063 /* 1-1 mapping */
2064 *p++ = *PyUnicode_AS_UNICODE(x);
2065
2066 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002067 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002068 if (targetsize > extrachars) {
2069 /* resize first */
2070 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2071 int needed = (targetsize - extrachars) + \
2072 (targetsize << 2);
2073 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002074 if (_PyUnicode_Resize(&v,
2075 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002076 Py_DECREF(x);
2077 goto onError;
2078 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002079 p = PyUnicode_AS_UNICODE(v) + oldpos;
2080 }
2081 Py_UNICODE_COPY(p,
2082 PyUnicode_AS_UNICODE(x),
2083 targetsize);
2084 p += targetsize;
2085 extrachars -= targetsize;
2086 }
2087 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002088 }
2089 else {
2090 /* wrong return value */
2091 PyErr_SetString(PyExc_TypeError,
2092 "character mapping must return integer, None or unicode");
2093 Py_DECREF(x);
2094 goto onError;
2095 }
2096 Py_DECREF(x);
2097 }
2098 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002099 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002100 goto onError;
2101 return (PyObject *)v;
2102
2103 onError:
2104 Py_XDECREF(v);
2105 return NULL;
2106}
2107
2108static
2109int charmap_encoding_error(const Py_UNICODE **source,
2110 char **dest,
2111 const char *errors,
2112 const char *details)
2113{
2114 if ((errors == NULL) ||
2115 (strcmp(errors,"strict") == 0)) {
2116 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002117 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002118 details);
2119 return -1;
2120 }
2121 else if (strcmp(errors,"ignore") == 0) {
2122 return 0;
2123 }
2124 else if (strcmp(errors,"replace") == 0) {
2125 **dest = '?';
2126 (*dest)++;
2127 return 0;
2128 }
2129 else {
2130 PyErr_Format(PyExc_ValueError,
2131 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002132 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002133 errors);
2134 return -1;
2135 }
2136}
2137
2138PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2139 int size,
2140 PyObject *mapping,
2141 const char *errors)
2142{
2143 PyObject *v;
2144 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002145 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002146
2147 /* Default to Latin-1 */
2148 if (mapping == NULL)
2149 return PyUnicode_EncodeLatin1(p, size, errors);
2150
2151 v = PyString_FromStringAndSize(NULL, size);
2152 if (v == NULL)
2153 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002154 if (size == 0)
2155 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002156 s = PyString_AS_STRING(v);
2157 while (size-- > 0) {
2158 Py_UNICODE ch = *p++;
2159 PyObject *w, *x;
2160
2161 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2162 w = PyInt_FromLong((long)ch);
2163 if (w == NULL)
2164 goto onError;
2165 x = PyObject_GetItem(mapping, w);
2166 Py_DECREF(w);
2167 if (x == NULL) {
2168 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002169 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002171 x = Py_None;
2172 Py_INCREF(x);
2173 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002174 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175 }
2176
2177 /* Apply mapping */
2178 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002179 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 if (value < 0 || value > 255) {
2181 PyErr_SetString(PyExc_TypeError,
2182 "character mapping must be in range(256)");
2183 Py_DECREF(x);
2184 goto onError;
2185 }
2186 *s++ = (char)value;
2187 }
2188 else if (x == Py_None) {
2189 /* undefined mapping */
2190 if (charmap_encoding_error(&p, &s, errors,
2191 "character maps to <undefined>")) {
2192 Py_DECREF(x);
2193 goto onError;
2194 }
2195 }
2196 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002197 int targetsize = PyString_GET_SIZE(x);
2198
2199 if (targetsize == 1)
2200 /* 1-1 mapping */
2201 *s++ = *PyString_AS_STRING(x);
2202
2203 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002205 if (targetsize > extrachars) {
2206 /* resize first */
2207 int oldpos = (int)(s - PyString_AS_STRING(v));
2208 int needed = (targetsize - extrachars) + \
2209 (targetsize << 2);
2210 extrachars += needed;
2211 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002212 Py_DECREF(x);
2213 goto onError;
2214 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002215 s = PyString_AS_STRING(v) + oldpos;
2216 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002217 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002218 s += targetsize;
2219 extrachars -= targetsize;
2220 }
2221 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002222 }
2223 else {
2224 /* wrong return value */
2225 PyErr_SetString(PyExc_TypeError,
2226 "character mapping must return integer, None or unicode");
2227 Py_DECREF(x);
2228 goto onError;
2229 }
2230 Py_DECREF(x);
2231 }
2232 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2233 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2234 goto onError;
2235 return v;
2236
2237 onError:
2238 Py_DECREF(v);
2239 return NULL;
2240}
2241
2242PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2243 PyObject *mapping)
2244{
2245 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2246 PyErr_BadArgument();
2247 return NULL;
2248 }
2249 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2250 PyUnicode_GET_SIZE(unicode),
2251 mapping,
2252 NULL);
2253}
2254
2255static
2256int translate_error(const Py_UNICODE **source,
2257 Py_UNICODE **dest,
2258 const char *errors,
2259 const char *details)
2260{
2261 if ((errors == NULL) ||
2262 (strcmp(errors,"strict") == 0)) {
2263 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002264 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002265 details);
2266 return -1;
2267 }
2268 else if (strcmp(errors,"ignore") == 0) {
2269 return 0;
2270 }
2271 else if (strcmp(errors,"replace") == 0) {
2272 **dest = '?';
2273 (*dest)++;
2274 return 0;
2275 }
2276 else {
2277 PyErr_Format(PyExc_ValueError,
2278 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002279 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002280 errors);
2281 return -1;
2282 }
2283}
2284
2285PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2286 int size,
2287 PyObject *mapping,
2288 const char *errors)
2289{
2290 PyUnicodeObject *v;
2291 Py_UNICODE *p;
2292
2293 if (mapping == NULL) {
2294 PyErr_BadArgument();
2295 return NULL;
2296 }
2297
2298 /* Output will never be longer than input */
2299 v = _PyUnicode_New(size);
2300 if (v == NULL)
2301 goto onError;
2302 if (size == 0)
2303 goto done;
2304 p = PyUnicode_AS_UNICODE(v);
2305 while (size-- > 0) {
2306 Py_UNICODE ch = *s++;
2307 PyObject *w, *x;
2308
2309 /* Get mapping */
2310 w = PyInt_FromLong(ch);
2311 if (w == NULL)
2312 goto onError;
2313 x = PyObject_GetItem(mapping, w);
2314 Py_DECREF(w);
2315 if (x == NULL) {
2316 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2317 /* No mapping found: default to 1-1 mapping */
2318 PyErr_Clear();
2319 *p++ = ch;
2320 continue;
2321 }
2322 goto onError;
2323 }
2324
2325 /* Apply mapping */
2326 if (PyInt_Check(x))
2327 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2328 else if (x == Py_None) {
2329 /* undefined mapping */
2330 if (translate_error(&s, &p, errors,
2331 "character maps to <undefined>")) {
2332 Py_DECREF(x);
2333 goto onError;
2334 }
2335 }
2336 else if (PyUnicode_Check(x)) {
2337 if (PyUnicode_GET_SIZE(x) != 1) {
2338 /* 1-n mapping */
2339 PyErr_SetString(PyExc_NotImplementedError,
2340 "1-n mappings are currently not implemented");
2341 Py_DECREF(x);
2342 goto onError;
2343 }
2344 *p++ = *PyUnicode_AS_UNICODE(x);
2345 }
2346 else {
2347 /* wrong return value */
2348 PyErr_SetString(PyExc_TypeError,
2349 "translate mapping must return integer, None or unicode");
2350 Py_DECREF(x);
2351 goto onError;
2352 }
2353 Py_DECREF(x);
2354 }
2355 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002356 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002357 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002358
2359 done:
2360 return (PyObject *)v;
2361
2362 onError:
2363 Py_XDECREF(v);
2364 return NULL;
2365}
2366
2367PyObject *PyUnicode_Translate(PyObject *str,
2368 PyObject *mapping,
2369 const char *errors)
2370{
2371 PyObject *result;
2372
2373 str = PyUnicode_FromObject(str);
2374 if (str == NULL)
2375 goto onError;
2376 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2377 PyUnicode_GET_SIZE(str),
2378 mapping,
2379 errors);
2380 Py_DECREF(str);
2381 return result;
2382
2383 onError:
2384 Py_XDECREF(str);
2385 return NULL;
2386}
2387
Guido van Rossum9e896b32000-04-05 20:11:21 +00002388/* --- Decimal Encoder ---------------------------------------------------- */
2389
2390int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2391 int length,
2392 char *output,
2393 const char *errors)
2394{
2395 Py_UNICODE *p, *end;
2396
2397 if (output == NULL) {
2398 PyErr_BadArgument();
2399 return -1;
2400 }
2401
2402 p = s;
2403 end = s + length;
2404 while (p < end) {
2405 register Py_UNICODE ch = *p++;
2406 int decimal;
2407
2408 if (Py_UNICODE_ISSPACE(ch)) {
2409 *output++ = ' ';
2410 continue;
2411 }
2412 decimal = Py_UNICODE_TODECIMAL(ch);
2413 if (decimal >= 0) {
2414 *output++ = '0' + decimal;
2415 continue;
2416 }
Guido van Rossumba477042000-04-06 18:18:10 +00002417 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002418 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002419 continue;
2420 }
2421 /* All other characters are considered invalid */
2422 if (errors == NULL || strcmp(errors, "strict") == 0) {
2423 PyErr_SetString(PyExc_ValueError,
2424 "invalid decimal Unicode string");
2425 goto onError;
2426 }
2427 else if (strcmp(errors, "ignore") == 0)
2428 continue;
2429 else if (strcmp(errors, "replace") == 0) {
2430 *output++ = '?';
2431 continue;
2432 }
2433 }
2434 /* 0-terminate the output string */
2435 *output++ = '\0';
2436 return 0;
2437
2438 onError:
2439 return -1;
2440}
2441
Guido van Rossumd57fd912000-03-10 22:53:23 +00002442/* --- Helpers ------------------------------------------------------------ */
2443
2444static
2445int count(PyUnicodeObject *self,
2446 int start,
2447 int end,
2448 PyUnicodeObject *substring)
2449{
2450 int count = 0;
2451
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002452 if (start < 0)
2453 start += self->length;
2454 if (start < 0)
2455 start = 0;
2456 if (end > self->length)
2457 end = self->length;
2458 if (end < 0)
2459 end += self->length;
2460 if (end < 0)
2461 end = 0;
2462
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002463 if (substring->length == 0)
2464 return (end - start + 1);
2465
Guido van Rossumd57fd912000-03-10 22:53:23 +00002466 end -= substring->length;
2467
2468 while (start <= end)
2469 if (Py_UNICODE_MATCH(self, start, substring)) {
2470 count++;
2471 start += substring->length;
2472 } else
2473 start++;
2474
2475 return count;
2476}
2477
2478int PyUnicode_Count(PyObject *str,
2479 PyObject *substr,
2480 int start,
2481 int end)
2482{
2483 int result;
2484
2485 str = PyUnicode_FromObject(str);
2486 if (str == NULL)
2487 return -1;
2488 substr = PyUnicode_FromObject(substr);
2489 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002490 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002491 return -1;
2492 }
2493
2494 result = count((PyUnicodeObject *)str,
2495 start, end,
2496 (PyUnicodeObject *)substr);
2497
2498 Py_DECREF(str);
2499 Py_DECREF(substr);
2500 return result;
2501}
2502
2503static
2504int findstring(PyUnicodeObject *self,
2505 PyUnicodeObject *substring,
2506 int start,
2507 int end,
2508 int direction)
2509{
2510 if (start < 0)
2511 start += self->length;
2512 if (start < 0)
2513 start = 0;
2514
2515 if (substring->length == 0)
2516 return start;
2517
2518 if (end > self->length)
2519 end = self->length;
2520 if (end < 0)
2521 end += self->length;
2522 if (end < 0)
2523 end = 0;
2524
2525 end -= substring->length;
2526
2527 if (direction < 0) {
2528 for (; end >= start; end--)
2529 if (Py_UNICODE_MATCH(self, end, substring))
2530 return end;
2531 } else {
2532 for (; start <= end; start++)
2533 if (Py_UNICODE_MATCH(self, start, substring))
2534 return start;
2535 }
2536
2537 return -1;
2538}
2539
2540int PyUnicode_Find(PyObject *str,
2541 PyObject *substr,
2542 int start,
2543 int end,
2544 int direction)
2545{
2546 int result;
2547
2548 str = PyUnicode_FromObject(str);
2549 if (str == NULL)
2550 return -1;
2551 substr = PyUnicode_FromObject(substr);
2552 if (substr == NULL) {
2553 Py_DECREF(substr);
2554 return -1;
2555 }
2556
2557 result = findstring((PyUnicodeObject *)str,
2558 (PyUnicodeObject *)substr,
2559 start, end, direction);
2560 Py_DECREF(str);
2561 Py_DECREF(substr);
2562 return result;
2563}
2564
2565static
2566int tailmatch(PyUnicodeObject *self,
2567 PyUnicodeObject *substring,
2568 int start,
2569 int end,
2570 int direction)
2571{
2572 if (start < 0)
2573 start += self->length;
2574 if (start < 0)
2575 start = 0;
2576
2577 if (substring->length == 0)
2578 return 1;
2579
2580 if (end > self->length)
2581 end = self->length;
2582 if (end < 0)
2583 end += self->length;
2584 if (end < 0)
2585 end = 0;
2586
2587 end -= substring->length;
2588 if (end < start)
2589 return 0;
2590
2591 if (direction > 0) {
2592 if (Py_UNICODE_MATCH(self, end, substring))
2593 return 1;
2594 } else {
2595 if (Py_UNICODE_MATCH(self, start, substring))
2596 return 1;
2597 }
2598
2599 return 0;
2600}
2601
2602int PyUnicode_Tailmatch(PyObject *str,
2603 PyObject *substr,
2604 int start,
2605 int end,
2606 int direction)
2607{
2608 int result;
2609
2610 str = PyUnicode_FromObject(str);
2611 if (str == NULL)
2612 return -1;
2613 substr = PyUnicode_FromObject(substr);
2614 if (substr == NULL) {
2615 Py_DECREF(substr);
2616 return -1;
2617 }
2618
2619 result = tailmatch((PyUnicodeObject *)str,
2620 (PyUnicodeObject *)substr,
2621 start, end, direction);
2622 Py_DECREF(str);
2623 Py_DECREF(substr);
2624 return result;
2625}
2626
2627static
2628const Py_UNICODE *findchar(const Py_UNICODE *s,
2629 int size,
2630 Py_UNICODE ch)
2631{
2632 /* like wcschr, but doesn't stop at NULL characters */
2633
2634 while (size-- > 0) {
2635 if (*s == ch)
2636 return s;
2637 s++;
2638 }
2639
2640 return NULL;
2641}
2642
2643/* Apply fixfct filter to the Unicode object self and return a
2644 reference to the modified object */
2645
2646static
2647PyObject *fixup(PyUnicodeObject *self,
2648 int (*fixfct)(PyUnicodeObject *s))
2649{
2650
2651 PyUnicodeObject *u;
2652
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002653 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002654 if (u == NULL)
2655 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002656
2657 Py_UNICODE_COPY(u->str, self->str, self->length);
2658
Guido van Rossumd57fd912000-03-10 22:53:23 +00002659 if (!fixfct(u)) {
2660 /* fixfct should return TRUE if it modified the buffer. If
2661 FALSE, return a reference to the original buffer instead
2662 (to save space, not time) */
2663 Py_INCREF(self);
2664 Py_DECREF(u);
2665 return (PyObject*) self;
2666 }
2667 return (PyObject*) u;
2668}
2669
2670static
2671int fixupper(PyUnicodeObject *self)
2672{
2673 int len = self->length;
2674 Py_UNICODE *s = self->str;
2675 int status = 0;
2676
2677 while (len-- > 0) {
2678 register Py_UNICODE ch;
2679
2680 ch = Py_UNICODE_TOUPPER(*s);
2681 if (ch != *s) {
2682 status = 1;
2683 *s = ch;
2684 }
2685 s++;
2686 }
2687
2688 return status;
2689}
2690
2691static
2692int fixlower(PyUnicodeObject *self)
2693{
2694 int len = self->length;
2695 Py_UNICODE *s = self->str;
2696 int status = 0;
2697
2698 while (len-- > 0) {
2699 register Py_UNICODE ch;
2700
2701 ch = Py_UNICODE_TOLOWER(*s);
2702 if (ch != *s) {
2703 status = 1;
2704 *s = ch;
2705 }
2706 s++;
2707 }
2708
2709 return status;
2710}
2711
2712static
2713int fixswapcase(PyUnicodeObject *self)
2714{
2715 int len = self->length;
2716 Py_UNICODE *s = self->str;
2717 int status = 0;
2718
2719 while (len-- > 0) {
2720 if (Py_UNICODE_ISUPPER(*s)) {
2721 *s = Py_UNICODE_TOLOWER(*s);
2722 status = 1;
2723 } else if (Py_UNICODE_ISLOWER(*s)) {
2724 *s = Py_UNICODE_TOUPPER(*s);
2725 status = 1;
2726 }
2727 s++;
2728 }
2729
2730 return status;
2731}
2732
2733static
2734int fixcapitalize(PyUnicodeObject *self)
2735{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002736 int len = self->length;
2737 Py_UNICODE *s = self->str;
2738 int status = 0;
2739
2740 if (len == 0)
2741 return 0;
2742 if (Py_UNICODE_ISLOWER(*s)) {
2743 *s = Py_UNICODE_TOUPPER(*s);
2744 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002745 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002746 s++;
2747 while (--len > 0) {
2748 if (Py_UNICODE_ISUPPER(*s)) {
2749 *s = Py_UNICODE_TOLOWER(*s);
2750 status = 1;
2751 }
2752 s++;
2753 }
2754 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002755}
2756
2757static
2758int fixtitle(PyUnicodeObject *self)
2759{
2760 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2761 register Py_UNICODE *e;
2762 int previous_is_cased;
2763
2764 /* Shortcut for single character strings */
2765 if (PyUnicode_GET_SIZE(self) == 1) {
2766 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2767 if (*p != ch) {
2768 *p = ch;
2769 return 1;
2770 }
2771 else
2772 return 0;
2773 }
2774
2775 e = p + PyUnicode_GET_SIZE(self);
2776 previous_is_cased = 0;
2777 for (; p < e; p++) {
2778 register const Py_UNICODE ch = *p;
2779
2780 if (previous_is_cased)
2781 *p = Py_UNICODE_TOLOWER(ch);
2782 else
2783 *p = Py_UNICODE_TOTITLE(ch);
2784
2785 if (Py_UNICODE_ISLOWER(ch) ||
2786 Py_UNICODE_ISUPPER(ch) ||
2787 Py_UNICODE_ISTITLE(ch))
2788 previous_is_cased = 1;
2789 else
2790 previous_is_cased = 0;
2791 }
2792 return 1;
2793}
2794
2795PyObject *PyUnicode_Join(PyObject *separator,
2796 PyObject *seq)
2797{
2798 Py_UNICODE *sep;
2799 int seplen;
2800 PyUnicodeObject *res = NULL;
2801 int reslen = 0;
2802 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 int sz = 100;
2804 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00002805 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002806
Tim Peters2cfe3682001-05-05 05:36:48 +00002807 it = PyObject_GetIter(seq);
2808 if (it == NULL)
2809 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810
2811 if (separator == NULL) {
2812 Py_UNICODE blank = ' ';
2813 sep = &blank;
2814 seplen = 1;
2815 }
2816 else {
2817 separator = PyUnicode_FromObject(separator);
2818 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00002819 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002820 sep = PyUnicode_AS_UNICODE(separator);
2821 seplen = PyUnicode_GET_SIZE(separator);
2822 }
2823
2824 res = _PyUnicode_New(sz);
2825 if (res == NULL)
2826 goto onError;
2827 p = PyUnicode_AS_UNICODE(res);
2828 reslen = 0;
2829
Tim Peters2cfe3682001-05-05 05:36:48 +00002830 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00002832 PyObject *item = PyIter_Next(it);
2833 if (item == NULL) {
2834 if (PyErr_Occurred())
2835 goto onError;
2836 break;
2837 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838 if (!PyUnicode_Check(item)) {
2839 PyObject *v;
2840 v = PyUnicode_FromObject(item);
2841 Py_DECREF(item);
2842 item = v;
2843 if (item == NULL)
2844 goto onError;
2845 }
2846 itemlen = PyUnicode_GET_SIZE(item);
2847 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002848 if (_PyUnicode_Resize(&res, sz*2))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849 goto onError;
2850 sz *= 2;
2851 p = PyUnicode_AS_UNICODE(res) + reslen;
2852 }
2853 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002854 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002855 p += seplen;
2856 reslen += seplen;
2857 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002858 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002859 p += itemlen;
2860 reslen += itemlen;
2861 Py_DECREF(item);
2862 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002863 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002864 goto onError;
2865
2866 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002867 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002868 return (PyObject *)res;
2869
2870 onError:
2871 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002872 Py_XDECREF(res);
2873 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874 return NULL;
2875}
2876
2877static
2878PyUnicodeObject *pad(PyUnicodeObject *self,
2879 int left,
2880 int right,
2881 Py_UNICODE fill)
2882{
2883 PyUnicodeObject *u;
2884
2885 if (left < 0)
2886 left = 0;
2887 if (right < 0)
2888 right = 0;
2889
2890 if (left == 0 && right == 0) {
2891 Py_INCREF(self);
2892 return self;
2893 }
2894
2895 u = _PyUnicode_New(left + self->length + right);
2896 if (u) {
2897 if (left)
2898 Py_UNICODE_FILL(u->str, fill, left);
2899 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2900 if (right)
2901 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2902 }
2903
2904 return u;
2905}
2906
/* Append data[left:right] to `list` as a new Unicode object.

   Relies on the enclosing function providing a `PyObject *str`
   variable, a `list` variable and an `onError` label; control jumps to
   `onError` if the object cannot be created or appended.  The
   expansion ends in an if/else, so use it only as a complete statement
   (not as the unbraced body of an outer `if` that has an `else`).
   Arguments are parenthesized so that arbitrary expressions may be
   passed. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (!str)                                                           \
        goto onError;                                                   \
    if (PyList_Append(list, str)) {                                     \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    else                                                                \
        Py_DECREF(str);
2917
2918static
2919PyObject *split_whitespace(PyUnicodeObject *self,
2920 PyObject *list,
2921 int maxcount)
2922{
2923 register int i;
2924 register int j;
2925 int len = self->length;
2926 PyObject *str;
2927
2928 for (i = j = 0; i < len; ) {
2929 /* find a token */
2930 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2931 i++;
2932 j = i;
2933 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2934 i++;
2935 if (j < i) {
2936 if (maxcount-- <= 0)
2937 break;
2938 SPLIT_APPEND(self->str, j, i);
2939 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2940 i++;
2941 j = i;
2942 }
2943 }
2944 if (j < len) {
2945 SPLIT_APPEND(self->str, j, len);
2946 }
2947 return list;
2948
2949 onError:
2950 Py_DECREF(list);
2951 return NULL;
2952}
2953
2954PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002955 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002956{
2957 register int i;
2958 register int j;
2959 int len;
2960 PyObject *list;
2961 PyObject *str;
2962 Py_UNICODE *data;
2963
2964 string = PyUnicode_FromObject(string);
2965 if (string == NULL)
2966 return NULL;
2967 data = PyUnicode_AS_UNICODE(string);
2968 len = PyUnicode_GET_SIZE(string);
2969
Guido van Rossumd57fd912000-03-10 22:53:23 +00002970 list = PyList_New(0);
2971 if (!list)
2972 goto onError;
2973
2974 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002975 int eol;
2976
Guido van Rossumd57fd912000-03-10 22:53:23 +00002977 /* Find a line and append it */
2978 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2979 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002980
2981 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002982 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002983 if (i < len) {
2984 if (data[i] == '\r' && i + 1 < len &&
2985 data[i+1] == '\n')
2986 i += 2;
2987 else
2988 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002989 if (keepends)
2990 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002991 }
Guido van Rossum86662912000-04-11 15:38:46 +00002992 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 j = i;
2994 }
2995 if (j < len) {
2996 SPLIT_APPEND(data, j, len);
2997 }
2998
2999 Py_DECREF(string);
3000 return list;
3001
3002 onError:
3003 Py_DECREF(list);
3004 Py_DECREF(string);
3005 return NULL;
3006}
3007
3008static
3009PyObject *split_char(PyUnicodeObject *self,
3010 PyObject *list,
3011 Py_UNICODE ch,
3012 int maxcount)
3013{
3014 register int i;
3015 register int j;
3016 int len = self->length;
3017 PyObject *str;
3018
3019 for (i = j = 0; i < len; ) {
3020 if (self->str[i] == ch) {
3021 if (maxcount-- <= 0)
3022 break;
3023 SPLIT_APPEND(self->str, j, i);
3024 i = j = i + 1;
3025 } else
3026 i++;
3027 }
3028 if (j <= len) {
3029 SPLIT_APPEND(self->str, j, len);
3030 }
3031 return list;
3032
3033 onError:
3034 Py_DECREF(list);
3035 return NULL;
3036}
3037
3038static
3039PyObject *split_substring(PyUnicodeObject *self,
3040 PyObject *list,
3041 PyUnicodeObject *substring,
3042 int maxcount)
3043{
3044 register int i;
3045 register int j;
3046 int len = self->length;
3047 int sublen = substring->length;
3048 PyObject *str;
3049
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003050 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003051 if (Py_UNICODE_MATCH(self, i, substring)) {
3052 if (maxcount-- <= 0)
3053 break;
3054 SPLIT_APPEND(self->str, j, i);
3055 i = j = i + sublen;
3056 } else
3057 i++;
3058 }
3059 if (j <= len) {
3060 SPLIT_APPEND(self->str, j, len);
3061 }
3062 return list;
3063
3064 onError:
3065 Py_DECREF(list);
3066 return NULL;
3067}
3068
3069#undef SPLIT_APPEND
3070
3071static
3072PyObject *split(PyUnicodeObject *self,
3073 PyUnicodeObject *substring,
3074 int maxcount)
3075{
3076 PyObject *list;
3077
3078 if (maxcount < 0)
3079 maxcount = INT_MAX;
3080
3081 list = PyList_New(0);
3082 if (!list)
3083 return NULL;
3084
3085 if (substring == NULL)
3086 return split_whitespace(self,list,maxcount);
3087
3088 else if (substring->length == 1)
3089 return split_char(self,list,substring->str[0],maxcount);
3090
3091 else if (substring->length == 0) {
3092 Py_DECREF(list);
3093 PyErr_SetString(PyExc_ValueError, "empty separator");
3094 return NULL;
3095 }
3096 else
3097 return split_substring(self,list,substring,maxcount);
3098}
3099
3100static
3101PyObject *strip(PyUnicodeObject *self,
3102 int left,
3103 int right)
3104{
3105 Py_UNICODE *p = self->str;
3106 int start = 0;
3107 int end = self->length;
3108
3109 if (left)
3110 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3111 start++;
3112
3113 if (right)
3114 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3115 end--;
3116
3117 if (start == 0 && end == self->length) {
3118 /* couldn't strip anything off, return original string */
3119 Py_INCREF(self);
3120 return (PyObject*) self;
3121 }
3122
3123 return (PyObject*) PyUnicode_FromUnicode(
3124 self->str + start,
3125 end - start
3126 );
3127}
3128
3129static
3130PyObject *replace(PyUnicodeObject *self,
3131 PyUnicodeObject *str1,
3132 PyUnicodeObject *str2,
3133 int maxcount)
3134{
3135 PyUnicodeObject *u;
3136
3137 if (maxcount < 0)
3138 maxcount = INT_MAX;
3139
3140 if (str1->length == 1 && str2->length == 1) {
3141 int i;
3142
3143 /* replace characters */
3144 if (!findchar(self->str, self->length, str1->str[0])) {
3145 /* nothing to replace, return original string */
3146 Py_INCREF(self);
3147 u = self;
3148 } else {
3149 Py_UNICODE u1 = str1->str[0];
3150 Py_UNICODE u2 = str2->str[0];
3151
3152 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003153 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003154 self->length
3155 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003156 if (u != NULL) {
3157 Py_UNICODE_COPY(u->str, self->str,
3158 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003159 for (i = 0; i < u->length; i++)
3160 if (u->str[i] == u1) {
3161 if (--maxcount < 0)
3162 break;
3163 u->str[i] = u2;
3164 }
3165 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003166 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003167
3168 } else {
3169 int n, i;
3170 Py_UNICODE *p;
3171
3172 /* replace strings */
3173 n = count(self, 0, self->length, str1);
3174 if (n > maxcount)
3175 n = maxcount;
3176 if (n == 0) {
3177 /* nothing to replace, return original string */
3178 Py_INCREF(self);
3179 u = self;
3180 } else {
3181 u = _PyUnicode_New(
3182 self->length + n * (str2->length - str1->length));
3183 if (u) {
3184 i = 0;
3185 p = u->str;
3186 while (i <= self->length - str1->length)
3187 if (Py_UNICODE_MATCH(self, i, str1)) {
3188 /* replace string segment */
3189 Py_UNICODE_COPY(p, str2->str, str2->length);
3190 p += str2->length;
3191 i += str1->length;
3192 if (--n <= 0) {
3193 /* copy remaining part */
3194 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3195 break;
3196 }
3197 } else
3198 *p++ = self->str[i++];
3199 }
3200 }
3201 }
3202
3203 return (PyObject *) u;
3204}
3205
3206/* --- Unicode Object Methods --------------------------------------------- */
3207
3208static char title__doc__[] =
3209"S.title() -> unicode\n\
3210\n\
3211Return a titlecased version of S, i.e. words start with title case\n\
3212characters, all remaining cased characters have lower case.";
3213
3214static PyObject*
3215unicode_title(PyUnicodeObject *self, PyObject *args)
3216{
3217 if (!PyArg_NoArgs(args))
3218 return NULL;
3219 return fixup(self, fixtitle);
3220}
3221
3222static char capitalize__doc__[] =
3223"S.capitalize() -> unicode\n\
3224\n\
3225Return a capitalized version of S, i.e. make the first character\n\
3226have upper case.";
3227
3228static PyObject*
3229unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3230{
3231 if (!PyArg_NoArgs(args))
3232 return NULL;
3233 return fixup(self, fixcapitalize);
3234}
3235
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method S.capwords() (currently compiled out): split S on whitespace,
   run fixup(..., fixcapitalize) on every word, and join the words
   again with PyUnicode_Join(NULL, ...). */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result = NULL;
    int k;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
3276
3277static char center__doc__[] =
3278"S.center(width) -> unicode\n\
3279\n\
3280Return S centered in a Unicode string of length width. Padding is done\n\
3281using spaces.";
3282
3283static PyObject *
3284unicode_center(PyUnicodeObject *self, PyObject *args)
3285{
3286 int marg, left;
3287 int width;
3288
3289 if (!PyArg_ParseTuple(args, "i:center", &width))
3290 return NULL;
3291
3292 if (self->length >= width) {
3293 Py_INCREF(self);
3294 return (PyObject*) self;
3295 }
3296
3297 marg = width - self->length;
3298 left = marg / 2 + (marg & width & 1);
3299
3300 return (PyObject*) pad(self, left, marg - left, ' ');
3301}
3302
Marc-André Lemburge5034372000-08-08 08:04:29 +00003303#if 0
3304
3305/* This code should go into some future Unicode collation support
3306 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003307 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003308
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003309/* speedy UTF-16 code point order comparison */
3310/* gleaned from: */
3311/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3312
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003313static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003314{
3315 0, 0, 0, 0, 0, 0, 0, 0,
3316 0, 0, 0, 0, 0, 0, 0, 0,
3317 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003318 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003319};
3320
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321static int
3322unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3323{
3324 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003325
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326 Py_UNICODE *s1 = str1->str;
3327 Py_UNICODE *s2 = str2->str;
3328
3329 len1 = str1->length;
3330 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003331
Guido van Rossumd57fd912000-03-10 22:53:23 +00003332 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003333 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003334
3335 c1 = *s1++;
3336 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003337
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003338 if (c1 > (1<<11) * 26)
3339 c1 += utf16Fixup[c1>>11];
3340 if (c2 > (1<<11) * 26)
3341 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003342 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003343
3344 if (c1 != c2)
3345 return (c1 < c2) ? -1 : 1;
3346
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003347 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003348 }
3349
3350 return (len1 < len2) ? -1 : (len1 != len2);
3351}
3352
Marc-André Lemburge5034372000-08-08 08:04:29 +00003353#else
3354
3355static int
3356unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3357{
3358 register int len1, len2;
3359
3360 Py_UNICODE *s1 = str1->str;
3361 Py_UNICODE *s2 = str2->str;
3362
3363 len1 = str1->length;
3364 len2 = str2->length;
3365
3366 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003367 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003368
Fredrik Lundh45714e92001-06-26 16:39:36 +00003369 c1 = *s1++;
3370 c2 = *s2++;
3371
3372 if (c1 != c2)
3373 return (c1 < c2) ? -1 : 1;
3374
Marc-André Lemburge5034372000-08-08 08:04:29 +00003375 len1--; len2--;
3376 }
3377
3378 return (len1 < len2) ? -1 : (len1 != len2);
3379}
3380
3381#endif
3382
Guido van Rossumd57fd912000-03-10 22:53:23 +00003383int PyUnicode_Compare(PyObject *left,
3384 PyObject *right)
3385{
3386 PyUnicodeObject *u = NULL, *v = NULL;
3387 int result;
3388
3389 /* Coerce the two arguments */
3390 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3391 if (u == NULL)
3392 goto onError;
3393 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3394 if (v == NULL)
3395 goto onError;
3396
Thomas Wouters7e474022000-07-16 12:04:32 +00003397 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003398 if (v == u) {
3399 Py_DECREF(u);
3400 Py_DECREF(v);
3401 return 0;
3402 }
3403
3404 result = unicode_compare(u, v);
3405
3406 Py_DECREF(u);
3407 Py_DECREF(v);
3408 return result;
3409
3410onError:
3411 Py_XDECREF(u);
3412 Py_XDECREF(v);
3413 return -1;
3414}
3415
Guido van Rossum403d68b2000-03-13 15:55:09 +00003416int PyUnicode_Contains(PyObject *container,
3417 PyObject *element)
3418{
3419 PyUnicodeObject *u = NULL, *v = NULL;
3420 int result;
3421 register const Py_UNICODE *p, *e;
3422 register Py_UNICODE ch;
3423
3424 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003425 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003426 if (v == NULL) {
3427 PyErr_SetString(PyExc_TypeError,
3428 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003429 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003430 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003431 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3432 if (u == NULL) {
3433 Py_DECREF(v);
3434 goto onError;
3435 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003436
3437 /* Check v in u */
3438 if (PyUnicode_GET_SIZE(v) != 1) {
3439 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003440 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003441 goto onError;
3442 }
3443 ch = *PyUnicode_AS_UNICODE(v);
3444 p = PyUnicode_AS_UNICODE(u);
3445 e = p + PyUnicode_GET_SIZE(u);
3446 result = 0;
3447 while (p < e) {
3448 if (*p++ == ch) {
3449 result = 1;
3450 break;
3451 }
3452 }
3453
3454 Py_DECREF(u);
3455 Py_DECREF(v);
3456 return result;
3457
3458onError:
3459 Py_XDECREF(u);
3460 Py_XDECREF(v);
3461 return -1;
3462}
3463
Guido van Rossumd57fd912000-03-10 22:53:23 +00003464/* Concat to string or Unicode object giving a new Unicode object. */
3465
3466PyObject *PyUnicode_Concat(PyObject *left,
3467 PyObject *right)
3468{
3469 PyUnicodeObject *u = NULL, *v = NULL, *w;
3470
3471 /* Coerce the two arguments */
3472 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3473 if (u == NULL)
3474 goto onError;
3475 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3476 if (v == NULL)
3477 goto onError;
3478
3479 /* Shortcuts */
3480 if (v == unicode_empty) {
3481 Py_DECREF(v);
3482 return (PyObject *)u;
3483 }
3484 if (u == unicode_empty) {
3485 Py_DECREF(u);
3486 return (PyObject *)v;
3487 }
3488
3489 /* Concat the two Unicode strings */
3490 w = _PyUnicode_New(u->length + v->length);
3491 if (w == NULL)
3492 goto onError;
3493 Py_UNICODE_COPY(w->str, u->str, u->length);
3494 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3495
3496 Py_DECREF(u);
3497 Py_DECREF(v);
3498 return (PyObject *)w;
3499
3500onError:
3501 Py_XDECREF(u);
3502 Py_XDECREF(v);
3503 return NULL;
3504}
3505
3506static char count__doc__[] =
3507"S.count(sub[, start[, end]]) -> int\n\
3508\n\
3509Return the number of occurrences of substring sub in Unicode string\n\
3510S[start:end]. Optional arguments start and end are\n\
3511interpreted as in slice notation.";
3512
3513static PyObject *
3514unicode_count(PyUnicodeObject *self, PyObject *args)
3515{
3516 PyUnicodeObject *substring;
3517 int start = 0;
3518 int end = INT_MAX;
3519 PyObject *result;
3520
Guido van Rossumb8872e62000-05-09 14:14:27 +00003521 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3522 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003523 return NULL;
3524
3525 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3526 (PyObject *)substring);
3527 if (substring == NULL)
3528 return NULL;
3529
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530 if (start < 0)
3531 start += self->length;
3532 if (start < 0)
3533 start = 0;
3534 if (end > self->length)
3535 end = self->length;
3536 if (end < 0)
3537 end += self->length;
3538 if (end < 0)
3539 end = 0;
3540
3541 result = PyInt_FromLong((long) count(self, start, end, substring));
3542
3543 Py_DECREF(substring);
3544 return result;
3545}
3546
3547static char encode__doc__[] =
3548"S.encode([encoding[,errors]]) -> string\n\
3549\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003550Return an encoded string version of S. Default encoding is the current\n\
3551default string encoding. errors may be given to set a different error\n\
3552handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3553a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554
3555static PyObject *
3556unicode_encode(PyUnicodeObject *self, PyObject *args)
3557{
3558 char *encoding = NULL;
3559 char *errors = NULL;
3560 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3561 return NULL;
3562 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3563}
3564
3565static char expandtabs__doc__[] =
3566"S.expandtabs([tabsize]) -> unicode\n\
3567\n\
3568Return a copy of S where all tab characters are expanded using spaces.\n\
3569If tabsize is not given, a tab size of 8 characters is assumed.";
3570
3571static PyObject*
3572unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3573{
3574 Py_UNICODE *e;
3575 Py_UNICODE *p;
3576 Py_UNICODE *q;
3577 int i, j;
3578 PyUnicodeObject *u;
3579 int tabsize = 8;
3580
3581 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3582 return NULL;
3583
Thomas Wouters7e474022000-07-16 12:04:32 +00003584 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003585 i = j = 0;
3586 e = self->str + self->length;
3587 for (p = self->str; p < e; p++)
3588 if (*p == '\t') {
3589 if (tabsize > 0)
3590 j += tabsize - (j % tabsize);
3591 }
3592 else {
3593 j++;
3594 if (*p == '\n' || *p == '\r') {
3595 i += j;
3596 j = 0;
3597 }
3598 }
3599
3600 /* Second pass: create output string and fill it */
3601 u = _PyUnicode_New(i + j);
3602 if (!u)
3603 return NULL;
3604
3605 j = 0;
3606 q = u->str;
3607
3608 for (p = self->str; p < e; p++)
3609 if (*p == '\t') {
3610 if (tabsize > 0) {
3611 i = tabsize - (j % tabsize);
3612 j += i;
3613 while (i--)
3614 *q++ = ' ';
3615 }
3616 }
3617 else {
3618 j++;
3619 *q++ = *p;
3620 if (*p == '\n' || *p == '\r')
3621 j = 0;
3622 }
3623
3624 return (PyObject*) u;
3625}
3626
3627static char find__doc__[] =
3628"S.find(sub [,start [,end]]) -> int\n\
3629\n\
3630Return the lowest index in S where substring sub is found,\n\
3631such that sub is contained within s[start,end]. Optional\n\
3632arguments start and end are interpreted as in slice notation.\n\
3633\n\
3634Return -1 on failure.";
3635
3636static PyObject *
3637unicode_find(PyUnicodeObject *self, PyObject *args)
3638{
3639 PyUnicodeObject *substring;
3640 int start = 0;
3641 int end = INT_MAX;
3642 PyObject *result;
3643
Guido van Rossumb8872e62000-05-09 14:14:27 +00003644 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3645 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003646 return NULL;
3647 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3648 (PyObject *)substring);
3649 if (substring == NULL)
3650 return NULL;
3651
3652 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3653
3654 Py_DECREF(substring);
3655 return result;
3656}
3657
3658static PyObject *
3659unicode_getitem(PyUnicodeObject *self, int index)
3660{
3661 if (index < 0 || index >= self->length) {
3662 PyErr_SetString(PyExc_IndexError, "string index out of range");
3663 return NULL;
3664 }
3665
3666 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3667}
3668
3669static long
3670unicode_hash(PyUnicodeObject *self)
3671{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003672 /* Since Unicode objects compare equal to their ASCII string
3673 counterparts, they should use the individual character values
3674 as basis for their hash value. This is needed to assure that
3675 strings and Unicode objects behave in the same way as
3676 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677
Fredrik Lundhdde61642000-07-10 18:27:47 +00003678 register int len;
3679 register Py_UNICODE *p;
3680 register long x;
3681
Guido van Rossumd57fd912000-03-10 22:53:23 +00003682 if (self->hash != -1)
3683 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003684 len = PyUnicode_GET_SIZE(self);
3685 p = PyUnicode_AS_UNICODE(self);
3686 x = *p << 7;
3687 while (--len >= 0)
3688 x = (1000003*x) ^ *p++;
3689 x ^= PyUnicode_GET_SIZE(self);
3690 if (x == -1)
3691 x = -2;
3692 self->hash = x;
3693 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003694}
3695
3696static char index__doc__[] =
3697"S.index(sub [,start [,end]]) -> int\n\
3698\n\
3699Like S.find() but raise ValueError when the substring is not found.";
3700
3701static PyObject *
3702unicode_index(PyUnicodeObject *self, PyObject *args)
3703{
3704 int result;
3705 PyUnicodeObject *substring;
3706 int start = 0;
3707 int end = INT_MAX;
3708
Guido van Rossumb8872e62000-05-09 14:14:27 +00003709 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3710 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 return NULL;
3712
3713 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3714 (PyObject *)substring);
3715 if (substring == NULL)
3716 return NULL;
3717
3718 result = findstring(self, substring, start, end, 1);
3719
3720 Py_DECREF(substring);
3721 if (result < 0) {
3722 PyErr_SetString(PyExc_ValueError, "substring not found");
3723 return NULL;
3724 }
3725 return PyInt_FromLong(result);
3726}
3727
3728static char islower__doc__[] =
3729"S.islower() -> int\n\
3730\n\
3731Return 1 if all cased characters in S are lowercase and there is\n\
3732at least one cased character in S, 0 otherwise.";
3733
3734static PyObject*
3735unicode_islower(PyUnicodeObject *self, PyObject *args)
3736{
3737 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3738 register const Py_UNICODE *e;
3739 int cased;
3740
3741 if (!PyArg_NoArgs(args))
3742 return NULL;
3743
3744 /* Shortcut for single character strings */
3745 if (PyUnicode_GET_SIZE(self) == 1)
3746 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3747
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003748 /* Special case for empty strings */
3749 if (PyString_GET_SIZE(self) == 0)
3750 return PyInt_FromLong(0);
3751
Guido van Rossumd57fd912000-03-10 22:53:23 +00003752 e = p + PyUnicode_GET_SIZE(self);
3753 cased = 0;
3754 for (; p < e; p++) {
3755 register const Py_UNICODE ch = *p;
3756
3757 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3758 return PyInt_FromLong(0);
3759 else if (!cased && Py_UNICODE_ISLOWER(ch))
3760 cased = 1;
3761 }
3762 return PyInt_FromLong(cased);
3763}
3764
3765static char isupper__doc__[] =
3766"S.isupper() -> int\n\
3767\n\
3768Return 1 if all cased characters in S are uppercase and there is\n\
3769at least one cased character in S, 0 otherwise.";
3770
3771static PyObject*
3772unicode_isupper(PyUnicodeObject *self, PyObject *args)
3773{
3774 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3775 register const Py_UNICODE *e;
3776 int cased;
3777
3778 if (!PyArg_NoArgs(args))
3779 return NULL;
3780
3781 /* Shortcut for single character strings */
3782 if (PyUnicode_GET_SIZE(self) == 1)
3783 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3784
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003785 /* Special case for empty strings */
3786 if (PyString_GET_SIZE(self) == 0)
3787 return PyInt_FromLong(0);
3788
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789 e = p + PyUnicode_GET_SIZE(self);
3790 cased = 0;
3791 for (; p < e; p++) {
3792 register const Py_UNICODE ch = *p;
3793
3794 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3795 return PyInt_FromLong(0);
3796 else if (!cased && Py_UNICODE_ISUPPER(ch))
3797 cased = 1;
3798 }
3799 return PyInt_FromLong(cased);
3800}
3801
3802static char istitle__doc__[] =
3803"S.istitle() -> int\n\
3804\n\
3805Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3806may only follow uncased characters and lowercase characters only cased\n\
3807ones. Return 0 otherwise.";
3808
3809static PyObject*
3810unicode_istitle(PyUnicodeObject *self, PyObject *args)
3811{
3812 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3813 register const Py_UNICODE *e;
3814 int cased, previous_is_cased;
3815
3816 if (!PyArg_NoArgs(args))
3817 return NULL;
3818
3819 /* Shortcut for single character strings */
3820 if (PyUnicode_GET_SIZE(self) == 1)
3821 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3822 (Py_UNICODE_ISUPPER(*p) != 0));
3823
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003824 /* Special case for empty strings */
3825 if (PyString_GET_SIZE(self) == 0)
3826 return PyInt_FromLong(0);
3827
Guido van Rossumd57fd912000-03-10 22:53:23 +00003828 e = p + PyUnicode_GET_SIZE(self);
3829 cased = 0;
3830 previous_is_cased = 0;
3831 for (; p < e; p++) {
3832 register const Py_UNICODE ch = *p;
3833
3834 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3835 if (previous_is_cased)
3836 return PyInt_FromLong(0);
3837 previous_is_cased = 1;
3838 cased = 1;
3839 }
3840 else if (Py_UNICODE_ISLOWER(ch)) {
3841 if (!previous_is_cased)
3842 return PyInt_FromLong(0);
3843 previous_is_cased = 1;
3844 cased = 1;
3845 }
3846 else
3847 previous_is_cased = 0;
3848 }
3849 return PyInt_FromLong(cased);
3850}
3851
3852static char isspace__doc__[] =
3853"S.isspace() -> int\n\
3854\n\
3855Return 1 if there are only whitespace characters in S,\n\
38560 otherwise.";
3857
3858static PyObject*
3859unicode_isspace(PyUnicodeObject *self, PyObject *args)
3860{
3861 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3862 register const Py_UNICODE *e;
3863
3864 if (!PyArg_NoArgs(args))
3865 return NULL;
3866
3867 /* Shortcut for single character strings */
3868 if (PyUnicode_GET_SIZE(self) == 1 &&
3869 Py_UNICODE_ISSPACE(*p))
3870 return PyInt_FromLong(1);
3871
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003872 /* Special case for empty strings */
3873 if (PyString_GET_SIZE(self) == 0)
3874 return PyInt_FromLong(0);
3875
Guido van Rossumd57fd912000-03-10 22:53:23 +00003876 e = p + PyUnicode_GET_SIZE(self);
3877 for (; p < e; p++) {
3878 if (!Py_UNICODE_ISSPACE(*p))
3879 return PyInt_FromLong(0);
3880 }
3881 return PyInt_FromLong(1);
3882}
3883
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003884static char isalpha__doc__[] =
3885"S.isalpha() -> int\n\
3886\n\
3887Return 1 if all characters in S are alphabetic\n\
3888and there is at least one character in S, 0 otherwise.";
3889
3890static PyObject*
3891unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3892{
3893 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3894 register const Py_UNICODE *e;
3895
3896 if (!PyArg_NoArgs(args))
3897 return NULL;
3898
3899 /* Shortcut for single character strings */
3900 if (PyUnicode_GET_SIZE(self) == 1 &&
3901 Py_UNICODE_ISALPHA(*p))
3902 return PyInt_FromLong(1);
3903
3904 /* Special case for empty strings */
3905 if (PyString_GET_SIZE(self) == 0)
3906 return PyInt_FromLong(0);
3907
3908 e = p + PyUnicode_GET_SIZE(self);
3909 for (; p < e; p++) {
3910 if (!Py_UNICODE_ISALPHA(*p))
3911 return PyInt_FromLong(0);
3912 }
3913 return PyInt_FromLong(1);
3914}
3915
3916static char isalnum__doc__[] =
3917"S.isalnum() -> int\n\
3918\n\
3919Return 1 if all characters in S are alphanumeric\n\
3920and there is at least one character in S, 0 otherwise.";
3921
3922static PyObject*
3923unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3924{
3925 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3926 register const Py_UNICODE *e;
3927
3928 if (!PyArg_NoArgs(args))
3929 return NULL;
3930
3931 /* Shortcut for single character strings */
3932 if (PyUnicode_GET_SIZE(self) == 1 &&
3933 Py_UNICODE_ISALNUM(*p))
3934 return PyInt_FromLong(1);
3935
3936 /* Special case for empty strings */
3937 if (PyString_GET_SIZE(self) == 0)
3938 return PyInt_FromLong(0);
3939
3940 e = p + PyUnicode_GET_SIZE(self);
3941 for (; p < e; p++) {
3942 if (!Py_UNICODE_ISALNUM(*p))
3943 return PyInt_FromLong(0);
3944 }
3945 return PyInt_FromLong(1);
3946}
3947
Guido van Rossumd57fd912000-03-10 22:53:23 +00003948static char isdecimal__doc__[] =
3949"S.isdecimal() -> int\n\
3950\n\
3951Return 1 if there are only decimal characters in S,\n\
39520 otherwise.";
3953
3954static PyObject*
3955unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3956{
3957 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3958 register const Py_UNICODE *e;
3959
3960 if (!PyArg_NoArgs(args))
3961 return NULL;
3962
3963 /* Shortcut for single character strings */
3964 if (PyUnicode_GET_SIZE(self) == 1 &&
3965 Py_UNICODE_ISDECIMAL(*p))
3966 return PyInt_FromLong(1);
3967
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003968 /* Special case for empty strings */
3969 if (PyString_GET_SIZE(self) == 0)
3970 return PyInt_FromLong(0);
3971
Guido van Rossumd57fd912000-03-10 22:53:23 +00003972 e = p + PyUnicode_GET_SIZE(self);
3973 for (; p < e; p++) {
3974 if (!Py_UNICODE_ISDECIMAL(*p))
3975 return PyInt_FromLong(0);
3976 }
3977 return PyInt_FromLong(1);
3978}
3979
3980static char isdigit__doc__[] =
3981"S.isdigit() -> int\n\
3982\n\
3983Return 1 if there are only digit characters in S,\n\
39840 otherwise.";
3985
3986static PyObject*
3987unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3988{
3989 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3990 register const Py_UNICODE *e;
3991
3992 if (!PyArg_NoArgs(args))
3993 return NULL;
3994
3995 /* Shortcut for single character strings */
3996 if (PyUnicode_GET_SIZE(self) == 1 &&
3997 Py_UNICODE_ISDIGIT(*p))
3998 return PyInt_FromLong(1);
3999
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004000 /* Special case for empty strings */
4001 if (PyString_GET_SIZE(self) == 0)
4002 return PyInt_FromLong(0);
4003
Guido van Rossumd57fd912000-03-10 22:53:23 +00004004 e = p + PyUnicode_GET_SIZE(self);
4005 for (; p < e; p++) {
4006 if (!Py_UNICODE_ISDIGIT(*p))
4007 return PyInt_FromLong(0);
4008 }
4009 return PyInt_FromLong(1);
4010}
4011
4012static char isnumeric__doc__[] =
4013"S.isnumeric() -> int\n\
4014\n\
4015Return 1 if there are only numeric characters in S,\n\
40160 otherwise.";
4017
4018static PyObject*
4019unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
4020{
4021 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4022 register const Py_UNICODE *e;
4023
4024 if (!PyArg_NoArgs(args))
4025 return NULL;
4026
4027 /* Shortcut for single character strings */
4028 if (PyUnicode_GET_SIZE(self) == 1 &&
4029 Py_UNICODE_ISNUMERIC(*p))
4030 return PyInt_FromLong(1);
4031
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004032 /* Special case for empty strings */
4033 if (PyString_GET_SIZE(self) == 0)
4034 return PyInt_FromLong(0);
4035
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036 e = p + PyUnicode_GET_SIZE(self);
4037 for (; p < e; p++) {
4038 if (!Py_UNICODE_ISNUMERIC(*p))
4039 return PyInt_FromLong(0);
4040 }
4041 return PyInt_FromLong(1);
4042}
4043
4044static char join__doc__[] =
4045"S.join(sequence) -> unicode\n\
4046\n\
4047Return a string which is the concatenation of the strings in the\n\
4048sequence. The separator between elements is S.";
4049
4050static PyObject*
4051unicode_join(PyUnicodeObject *self, PyObject *args)
4052{
4053 PyObject *data;
4054 if (!PyArg_ParseTuple(args, "O:join", &data))
4055 return NULL;
4056
4057 return PyUnicode_Join((PyObject *)self, data);
4058}
4059
4060static int
4061unicode_length(PyUnicodeObject *self)
4062{
4063 return self->length;
4064}
4065
4066static char ljust__doc__[] =
4067"S.ljust(width) -> unicode\n\
4068\n\
4069Return S left justified in a Unicode string of length width. Padding is\n\
4070done using spaces.";
4071
4072static PyObject *
4073unicode_ljust(PyUnicodeObject *self, PyObject *args)
4074{
4075 int width;
4076 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4077 return NULL;
4078
4079 if (self->length >= width) {
4080 Py_INCREF(self);
4081 return (PyObject*) self;
4082 }
4083
4084 return (PyObject*) pad(self, 0, width - self->length, ' ');
4085}
4086
4087static char lower__doc__[] =
4088"S.lower() -> unicode\n\
4089\n\
4090Return a copy of the string S converted to lowercase.";
4091
4092static PyObject*
4093unicode_lower(PyUnicodeObject *self, PyObject *args)
4094{
4095 if (!PyArg_NoArgs(args))
4096 return NULL;
4097 return fixup(self, fixlower);
4098}
4099
4100static char lstrip__doc__[] =
4101"S.lstrip() -> unicode\n\
4102\n\
4103Return a copy of the string S with leading whitespace removed.";
4104
4105static PyObject *
4106unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4107{
4108 if (!PyArg_NoArgs(args))
4109 return NULL;
4110 return strip(self, 1, 0);
4111}
4112
4113static PyObject*
4114unicode_repeat(PyUnicodeObject *str, int len)
4115{
4116 PyUnicodeObject *u;
4117 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004118 int nchars;
4119 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004120
4121 if (len < 0)
4122 len = 0;
4123
4124 if (len == 1) {
4125 /* no repeat, return original string */
4126 Py_INCREF(str);
4127 return (PyObject*) str;
4128 }
Tim Peters8f422462000-09-09 06:13:41 +00004129
4130 /* ensure # of chars needed doesn't overflow int and # of bytes
4131 * needed doesn't overflow size_t
4132 */
4133 nchars = len * str->length;
4134 if (len && nchars / len != str->length) {
4135 PyErr_SetString(PyExc_OverflowError,
4136 "repeated string is too long");
4137 return NULL;
4138 }
4139 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4140 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4141 PyErr_SetString(PyExc_OverflowError,
4142 "repeated string is too long");
4143 return NULL;
4144 }
4145 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004146 if (!u)
4147 return NULL;
4148
4149 p = u->str;
4150
4151 while (len-- > 0) {
4152 Py_UNICODE_COPY(p, str->str, str->length);
4153 p += str->length;
4154 }
4155
4156 return (PyObject*) u;
4157}
4158
4159PyObject *PyUnicode_Replace(PyObject *obj,
4160 PyObject *subobj,
4161 PyObject *replobj,
4162 int maxcount)
4163{
4164 PyObject *self;
4165 PyObject *str1;
4166 PyObject *str2;
4167 PyObject *result;
4168
4169 self = PyUnicode_FromObject(obj);
4170 if (self == NULL)
4171 return NULL;
4172 str1 = PyUnicode_FromObject(subobj);
4173 if (str1 == NULL) {
4174 Py_DECREF(self);
4175 return NULL;
4176 }
4177 str2 = PyUnicode_FromObject(replobj);
4178 if (str2 == NULL) {
4179 Py_DECREF(self);
4180 Py_DECREF(str1);
4181 return NULL;
4182 }
4183 result = replace((PyUnicodeObject *)self,
4184 (PyUnicodeObject *)str1,
4185 (PyUnicodeObject *)str2,
4186 maxcount);
4187 Py_DECREF(self);
4188 Py_DECREF(str1);
4189 Py_DECREF(str2);
4190 return result;
4191}
4192
4193static char replace__doc__[] =
4194"S.replace (old, new[, maxsplit]) -> unicode\n\
4195\n\
4196Return a copy of S with all occurrences of substring\n\
4197old replaced by new. If the optional argument maxsplit is\n\
4198given, only the first maxsplit occurrences are replaced.";
4199
4200static PyObject*
4201unicode_replace(PyUnicodeObject *self, PyObject *args)
4202{
4203 PyUnicodeObject *str1;
4204 PyUnicodeObject *str2;
4205 int maxcount = -1;
4206 PyObject *result;
4207
4208 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4209 return NULL;
4210 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4211 if (str1 == NULL)
4212 return NULL;
4213 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4214 if (str2 == NULL)
4215 return NULL;
4216
4217 result = replace(self, str1, str2, maxcount);
4218
4219 Py_DECREF(str1);
4220 Py_DECREF(str2);
4221 return result;
4222}
4223
4224static
4225PyObject *unicode_repr(PyObject *unicode)
4226{
4227 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4228 PyUnicode_GET_SIZE(unicode),
4229 1);
4230}
4231
4232static char rfind__doc__[] =
4233"S.rfind(sub [,start [,end]]) -> int\n\
4234\n\
4235Return the highest index in S where substring sub is found,\n\
4236such that sub is contained within s[start,end]. Optional\n\
4237arguments start and end are interpreted as in slice notation.\n\
4238\n\
4239Return -1 on failure.";
4240
4241static PyObject *
4242unicode_rfind(PyUnicodeObject *self, PyObject *args)
4243{
4244 PyUnicodeObject *substring;
4245 int start = 0;
4246 int end = INT_MAX;
4247 PyObject *result;
4248
Guido van Rossumb8872e62000-05-09 14:14:27 +00004249 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4250 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004251 return NULL;
4252 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4253 (PyObject *)substring);
4254 if (substring == NULL)
4255 return NULL;
4256
4257 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4258
4259 Py_DECREF(substring);
4260 return result;
4261}
4262
4263static char rindex__doc__[] =
4264"S.rindex(sub [,start [,end]]) -> int\n\
4265\n\
4266Like S.rfind() but raise ValueError when the substring is not found.";
4267
4268static PyObject *
4269unicode_rindex(PyUnicodeObject *self, PyObject *args)
4270{
4271 int result;
4272 PyUnicodeObject *substring;
4273 int start = 0;
4274 int end = INT_MAX;
4275
Guido van Rossumb8872e62000-05-09 14:14:27 +00004276 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4277 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004278 return NULL;
4279 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4280 (PyObject *)substring);
4281 if (substring == NULL)
4282 return NULL;
4283
4284 result = findstring(self, substring, start, end, -1);
4285
4286 Py_DECREF(substring);
4287 if (result < 0) {
4288 PyErr_SetString(PyExc_ValueError, "substring not found");
4289 return NULL;
4290 }
4291 return PyInt_FromLong(result);
4292}
4293
4294static char rjust__doc__[] =
4295"S.rjust(width) -> unicode\n\
4296\n\
4297Return S right justified in a Unicode string of length width. Padding is\n\
4298done using spaces.";
4299
4300static PyObject *
4301unicode_rjust(PyUnicodeObject *self, PyObject *args)
4302{
4303 int width;
4304 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4305 return NULL;
4306
4307 if (self->length >= width) {
4308 Py_INCREF(self);
4309 return (PyObject*) self;
4310 }
4311
4312 return (PyObject*) pad(self, width - self->length, 0, ' ');
4313}
4314
4315static char rstrip__doc__[] =
4316"S.rstrip() -> unicode\n\
4317\n\
4318Return a copy of the string S with trailing whitespace removed.";
4319
4320static PyObject *
4321unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4322{
4323 if (!PyArg_NoArgs(args))
4324 return NULL;
4325 return strip(self, 0, 1);
4326}
4327
4328static PyObject*
4329unicode_slice(PyUnicodeObject *self, int start, int end)
4330{
4331 /* standard clamping */
4332 if (start < 0)
4333 start = 0;
4334 if (end < 0)
4335 end = 0;
4336 if (end > self->length)
4337 end = self->length;
4338 if (start == 0 && end == self->length) {
4339 /* full slice, return original string */
4340 Py_INCREF(self);
4341 return (PyObject*) self;
4342 }
4343 if (start > end)
4344 start = end;
4345 /* copy slice */
4346 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4347 end - start);
4348}
4349
4350PyObject *PyUnicode_Split(PyObject *s,
4351 PyObject *sep,
4352 int maxsplit)
4353{
4354 PyObject *result;
4355
4356 s = PyUnicode_FromObject(s);
4357 if (s == NULL)
4358 return NULL;
4359 if (sep != NULL) {
4360 sep = PyUnicode_FromObject(sep);
4361 if (sep == NULL) {
4362 Py_DECREF(s);
4363 return NULL;
4364 }
4365 }
4366
4367 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4368
4369 Py_DECREF(s);
4370 Py_XDECREF(sep);
4371 return result;
4372}
4373
4374static char split__doc__[] =
4375"S.split([sep [,maxsplit]]) -> list of strings\n\
4376\n\
4377Return a list of the words in S, using sep as the\n\
4378delimiter string. If maxsplit is given, at most maxsplit\n\
4379splits are done. If sep is not specified, any whitespace string\n\
4380is a separator.";
4381
4382static PyObject*
4383unicode_split(PyUnicodeObject *self, PyObject *args)
4384{
4385 PyObject *substring = Py_None;
4386 int maxcount = -1;
4387
4388 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4389 return NULL;
4390
4391 if (substring == Py_None)
4392 return split(self, NULL, maxcount);
4393 else if (PyUnicode_Check(substring))
4394 return split(self, (PyUnicodeObject *)substring, maxcount);
4395 else
4396 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4397}
4398
4399static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004400"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004401\n\
4402Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004403Line breaks are not included in the resulting list unless keepends\n\
4404is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405
4406static PyObject*
4407unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4408{
Guido van Rossum86662912000-04-11 15:38:46 +00004409 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004410
Guido van Rossum86662912000-04-11 15:38:46 +00004411 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412 return NULL;
4413
Guido van Rossum86662912000-04-11 15:38:46 +00004414 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004415}
4416
4417static
4418PyObject *unicode_str(PyUnicodeObject *self)
4419{
Fred Drakee4315f52000-05-09 19:53:39 +00004420 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004421}
4422
4423static char strip__doc__[] =
4424"S.strip() -> unicode\n\
4425\n\
4426Return a copy of S with leading and trailing whitespace removed.";
4427
4428static PyObject *
4429unicode_strip(PyUnicodeObject *self, PyObject *args)
4430{
4431 if (!PyArg_NoArgs(args))
4432 return NULL;
4433 return strip(self, 1, 1);
4434}
4435
4436static char swapcase__doc__[] =
4437"S.swapcase() -> unicode\n\
4438\n\
4439Return a copy of S with uppercase characters converted to lowercase\n\
4440and vice versa.";
4441
4442static PyObject*
4443unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4444{
4445 if (!PyArg_NoArgs(args))
4446 return NULL;
4447 return fixup(self, fixswapcase);
4448}
4449
4450static char translate__doc__[] =
4451"S.translate(table) -> unicode\n\
4452\n\
4453Return a copy of the string S, where all characters have been mapped\n\
4454through the given translation table, which must be a mapping of\n\
4455Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4456are left untouched. Characters mapped to None are deleted.";
4457
4458static PyObject*
4459unicode_translate(PyUnicodeObject *self, PyObject *args)
4460{
4461 PyObject *table;
4462
4463 if (!PyArg_ParseTuple(args, "O:translate", &table))
4464 return NULL;
4465 return PyUnicode_TranslateCharmap(self->str,
4466 self->length,
4467 table,
4468 "ignore");
4469}
4470
4471static char upper__doc__[] =
4472"S.upper() -> unicode\n\
4473\n\
4474Return a copy of S converted to uppercase.";
4475
4476static PyObject*
4477unicode_upper(PyUnicodeObject *self, PyObject *args)
4478{
4479 if (!PyArg_NoArgs(args))
4480 return NULL;
4481 return fixup(self, fixupper);
4482}
4483
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method wrapper for S.zfill(width) (currently compiled out).
 *
 * Returns S itself when it is already at least width long.  Otherwise S
 * is left-padded with '0' via pad(); if the first original character is
 * a sign, it is moved to the front of the padded result.  Returns NULL
 * on argument errors or if pad() fails.
 */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    if (u == NULL)
        /* pad() failed; don't dereference the result */
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4519
#if 0
/* Debugging aid (compiled out): report unicode_freelist_size as an int
   object.  Takes no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    return PyArg_NoArgs(args) ? PyInt_FromLong(unicode_freelist_size) : NULL;
}
#endif
4529
4530static char startswith__doc__[] =
4531"S.startswith(prefix[, start[, end]]) -> int\n\
4532\n\
4533Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4534optional start, test S beginning at that position. With optional end, stop\n\
4535comparing S at that position.";
4536
4537static PyObject *
4538unicode_startswith(PyUnicodeObject *self,
4539 PyObject *args)
4540{
4541 PyUnicodeObject *substring;
4542 int start = 0;
4543 int end = INT_MAX;
4544 PyObject *result;
4545
Guido van Rossumb8872e62000-05-09 14:14:27 +00004546 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4547 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004548 return NULL;
4549 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4550 (PyObject *)substring);
4551 if (substring == NULL)
4552 return NULL;
4553
4554 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4555
4556 Py_DECREF(substring);
4557 return result;
4558}
4559
4560
4561static char endswith__doc__[] =
4562"S.endswith(suffix[, start[, end]]) -> int\n\
4563\n\
4564Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4565optional start, test S beginning at that position. With optional end, stop\n\
4566comparing S at that position.";
4567
4568static PyObject *
4569unicode_endswith(PyUnicodeObject *self,
4570 PyObject *args)
4571{
4572 PyUnicodeObject *substring;
4573 int start = 0;
4574 int end = INT_MAX;
4575 PyObject *result;
4576
Guido van Rossumb8872e62000-05-09 14:14:27 +00004577 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4578 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579 return NULL;
4580 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4581 (PyObject *)substring);
4582 if (substring == NULL)
4583 return NULL;
4584
4585 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4586
4587 Py_DECREF(substring);
4588 return result;
4589}
4590
4591
4592static PyMethodDef unicode_methods[] = {
4593
4594 /* Order is according to common usage: often used methods should
4595 appear first, since lookup is done sequentially. */
4596
4597 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4598 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4599 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4600 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4601 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4602 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4603 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4604 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4605 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4606 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4607 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4608 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4609 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4610 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4611/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4612 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4613 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4614 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4615 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4616 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4617 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4618 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4619 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4620 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4621 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4622 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4623 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4624 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4625 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4626 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4627 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4628 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4629 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004630 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4631 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004632#if 0
4633 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4634 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4635#endif
4636
4637#if 0
4638 /* This one is just used for debugging the implementation. */
4639 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4640#endif
4641
4642 {NULL, NULL}
4643};
4644
4645static PyObject *
4646unicode_getattr(PyUnicodeObject *self, char *name)
4647{
4648 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4649}
4650
4651static PySequenceMethods unicode_as_sequence = {
4652 (inquiry) unicode_length, /* sq_length */
4653 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4654 (intargfunc) unicode_repeat, /* sq_repeat */
4655 (intargfunc) unicode_getitem, /* sq_item */
4656 (intintargfunc) unicode_slice, /* sq_slice */
4657 0, /* sq_ass_item */
4658 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004659 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004660};
4661
4662static int
4663unicode_buffer_getreadbuf(PyUnicodeObject *self,
4664 int index,
4665 const void **ptr)
4666{
4667 if (index != 0) {
4668 PyErr_SetString(PyExc_SystemError,
4669 "accessing non-existent unicode segment");
4670 return -1;
4671 }
4672 *ptr = (void *) self->str;
4673 return PyUnicode_GET_DATA_SIZE(self);
4674}
4675
4676static int
4677unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4678 const void **ptr)
4679{
4680 PyErr_SetString(PyExc_TypeError,
4681 "cannot use unicode as modifyable buffer");
4682 return -1;
4683}
4684
4685static int
4686unicode_buffer_getsegcount(PyUnicodeObject *self,
4687 int *lenp)
4688{
4689 if (lenp)
4690 *lenp = PyUnicode_GET_DATA_SIZE(self);
4691 return 1;
4692}
4693
4694static int
4695unicode_buffer_getcharbuf(PyUnicodeObject *self,
4696 int index,
4697 const void **ptr)
4698{
4699 PyObject *str;
4700
4701 if (index != 0) {
4702 PyErr_SetString(PyExc_SystemError,
4703 "accessing non-existent unicode segment");
4704 return -1;
4705 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004706 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707 if (str == NULL)
4708 return -1;
4709 *ptr = (void *) PyString_AS_STRING(str);
4710 return PyString_GET_SIZE(str);
4711}
4712
4713/* Helpers for PyUnicode_Format() */
4714
4715static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004716getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004717{
4718 int argidx = *p_argidx;
4719 if (argidx < arglen) {
4720 (*p_argidx)++;
4721 if (arglen < 0)
4722 return args;
4723 else
4724 return PyTuple_GetItem(args, argidx);
4725 }
4726 PyErr_SetString(PyExc_TypeError,
4727 "not enough arguments for format string");
4728 return NULL;
4729}
4730
/* Conversion flag bits, OR-ed together into the `flags` int that is
   passed to the format*() helpers. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
4736
4737static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004738int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739{
4740 register int i;
4741 int len;
4742 va_list va;
4743 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745
4746 /* First, format the string as char array, then expand to Py_UNICODE
4747 array. */
4748 charbuffer = (char *)buffer;
4749 len = vsprintf(charbuffer, format, va);
4750 for (i = len - 1; i >= 0; i--)
4751 buffer[i] = (Py_UNICODE) charbuffer[i];
4752
4753 va_end(va);
4754 return len;
4755}
4756
4757static int
4758formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004759 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004760 int flags,
4761 int prec,
4762 int type,
4763 PyObject *v)
4764{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004765 /* fmt = '%#.' + `prec` + `type`
4766 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004767 char fmt[20];
4768 double x;
4769
4770 x = PyFloat_AsDouble(v);
4771 if (x == -1.0 && PyErr_Occurred())
4772 return -1;
4773 if (prec < 0)
4774 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4776 type = 'g';
4777 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004778 /* worst case length calc to ensure no buffer overrun:
4779 fmt = %#.<prec>g
4780 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4781 for any double rep.)
4782 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4783 If prec=0 the effective precision is 1 (the leading digit is
4784 always given), therefore increase by one to 10+prec. */
4785 if (buflen <= (size_t)10 + (size_t)prec) {
4786 PyErr_SetString(PyExc_OverflowError,
4787 "formatted float is too long (precision too long?)");
4788 return -1;
4789 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790 return usprintf(buf, fmt, x);
4791}
4792
Tim Peters38fd5b62000-09-21 05:43:11 +00004793static PyObject*
4794formatlong(PyObject *val, int flags, int prec, int type)
4795{
4796 char *buf;
4797 int i, len;
4798 PyObject *str; /* temporary string object. */
4799 PyUnicodeObject *result;
4800
4801 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4802 if (!str)
4803 return NULL;
4804 result = _PyUnicode_New(len);
4805 for (i = 0; i < len; i++)
4806 result->str[i] = buf[i];
4807 result->str[len] = 0;
4808 Py_DECREF(str);
4809 return (PyObject*)result;
4810}
4811
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812static int
4813formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004814 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004815 int flags,
4816 int prec,
4817 int type,
4818 PyObject *v)
4819{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004820 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004821 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4822 + 1 + 1 = 24*/
4823 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824 long x;
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004825 int use_native_c_format = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826
4827 x = PyInt_AsLong(v);
4828 if (x == -1 && PyErr_Occurred())
4829 return -1;
4830 if (prec < 0)
4831 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004832 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4833 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4834 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4835 PyErr_SetString(PyExc_OverflowError,
4836 "formatted integer is too long (precision too long?)");
4837 return -1;
4838 }
Tim Petersfff53252001-04-12 18:38:48 +00004839 /* When converting 0 under %#x or %#X, C leaves off the base marker,
4840 * but we want it (for consistency with other %#x conversions, and
4841 * for consistency with Python's hex() function).
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004842 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
4843 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
4844 * So add it only if the platform doesn't already.
Tim Petersfff53252001-04-12 18:38:48 +00004845 */
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004846 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
4847 /* Only way to know what the platform does is to try it. */
4848 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
4849 if (fmt[1] != (char)type) {
4850 /* Supply our own leading 0x/0X -- needed under std C */
4851 use_native_c_format = 0;
4852 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
4853 }
4854 }
4855 if (use_native_c_format)
4856 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004857 return usprintf(buf, fmt, x);
4858}
4859
4860static int
4861formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004862 size_t buflen,
4863 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004865 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004866 if (PyUnicode_Check(v)) {
4867 if (PyUnicode_GET_SIZE(v) != 1)
4868 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004870 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004872 else if (PyString_Check(v)) {
4873 if (PyString_GET_SIZE(v) != 1)
4874 goto onError;
4875 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4876 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877
4878 else {
4879 /* Integer input truncated to a character */
4880 long x;
4881 x = PyInt_AsLong(v);
4882 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004883 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004884 buf[0] = (char) x;
4885 }
4886 buf[1] = '\0';
4887 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004888
4889 onError:
4890 PyErr_SetString(PyExc_TypeError,
4891 "%c requires int or char");
4892 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893}
4894
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004895/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
4896
4897 FORMATBUFLEN is the length of the buffer in which the floats, ints, &
4898 chars are formatted. XXX This is a magic number. Each formatting
4899 routine does bounds checking to ensure no overflow, but a better
4900 solution may be to malloc a buffer of appropriate size for each
4901 format. For now, the current solution is sufficient.
4902*/
/* Fully parenthesized so the macro expands to a single expression in
   every context (e.g. as the operand of sizeof). */
#define FORMATBUFLEN ((size_t)120)
4904
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905PyObject *PyUnicode_Format(PyObject *format,
4906 PyObject *args)
4907{
4908 Py_UNICODE *fmt, *res;
4909 int fmtcnt, rescnt, reslen, arglen, argidx;
4910 int args_owned = 0;
4911 PyUnicodeObject *result = NULL;
4912 PyObject *dict = NULL;
4913 PyObject *uformat;
4914
4915 if (format == NULL || args == NULL) {
4916 PyErr_BadInternalCall();
4917 return NULL;
4918 }
4919 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004920 if (uformat == NULL)
4921 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004922 fmt = PyUnicode_AS_UNICODE(uformat);
4923 fmtcnt = PyUnicode_GET_SIZE(uformat);
4924
4925 reslen = rescnt = fmtcnt + 100;
4926 result = _PyUnicode_New(reslen);
4927 if (result == NULL)
4928 goto onError;
4929 res = PyUnicode_AS_UNICODE(result);
4930
4931 if (PyTuple_Check(args)) {
4932 arglen = PyTuple_Size(args);
4933 argidx = 0;
4934 }
4935 else {
4936 arglen = -1;
4937 argidx = -2;
4938 }
4939 if (args->ob_type->tp_as_mapping)
4940 dict = args;
4941
4942 while (--fmtcnt >= 0) {
4943 if (*fmt != '%') {
4944 if (--rescnt < 0) {
4945 rescnt = fmtcnt + 100;
4946 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004947 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004948 return NULL;
4949 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4950 --rescnt;
4951 }
4952 *res++ = *fmt++;
4953 }
4954 else {
4955 /* Got a format specifier */
4956 int flags = 0;
4957 int width = -1;
4958 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959 Py_UNICODE c = '\0';
4960 Py_UNICODE fill;
4961 PyObject *v = NULL;
4962 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004963 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004964 Py_UNICODE sign;
4965 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004966 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004967
4968 fmt++;
4969 if (*fmt == '(') {
4970 Py_UNICODE *keystart;
4971 int keylen;
4972 PyObject *key;
4973 int pcount = 1;
4974
4975 if (dict == NULL) {
4976 PyErr_SetString(PyExc_TypeError,
4977 "format requires a mapping");
4978 goto onError;
4979 }
4980 ++fmt;
4981 --fmtcnt;
4982 keystart = fmt;
4983 /* Skip over balanced parentheses */
4984 while (pcount > 0 && --fmtcnt >= 0) {
4985 if (*fmt == ')')
4986 --pcount;
4987 else if (*fmt == '(')
4988 ++pcount;
4989 fmt++;
4990 }
4991 keylen = fmt - keystart - 1;
4992 if (fmtcnt < 0 || pcount > 0) {
4993 PyErr_SetString(PyExc_ValueError,
4994 "incomplete format key");
4995 goto onError;
4996 }
Fred Drakee4315f52000-05-09 19:53:39 +00004997 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998 then looked up since Python uses strings to hold
4999 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005000 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005001 key = PyUnicode_EncodeUTF8(keystart,
5002 keylen,
5003 NULL);
5004 if (key == NULL)
5005 goto onError;
5006 if (args_owned) {
5007 Py_DECREF(args);
5008 args_owned = 0;
5009 }
5010 args = PyObject_GetItem(dict, key);
5011 Py_DECREF(key);
5012 if (args == NULL) {
5013 goto onError;
5014 }
5015 args_owned = 1;
5016 arglen = -1;
5017 argidx = -2;
5018 }
5019 while (--fmtcnt >= 0) {
5020 switch (c = *fmt++) {
5021 case '-': flags |= F_LJUST; continue;
5022 case '+': flags |= F_SIGN; continue;
5023 case ' ': flags |= F_BLANK; continue;
5024 case '#': flags |= F_ALT; continue;
5025 case '0': flags |= F_ZERO; continue;
5026 }
5027 break;
5028 }
5029 if (c == '*') {
5030 v = getnextarg(args, arglen, &argidx);
5031 if (v == NULL)
5032 goto onError;
5033 if (!PyInt_Check(v)) {
5034 PyErr_SetString(PyExc_TypeError,
5035 "* wants int");
5036 goto onError;
5037 }
5038 width = PyInt_AsLong(v);
5039 if (width < 0) {
5040 flags |= F_LJUST;
5041 width = -width;
5042 }
5043 if (--fmtcnt >= 0)
5044 c = *fmt++;
5045 }
5046 else if (c >= '0' && c <= '9') {
5047 width = c - '0';
5048 while (--fmtcnt >= 0) {
5049 c = *fmt++;
5050 if (c < '0' || c > '9')
5051 break;
5052 if ((width*10) / 10 != width) {
5053 PyErr_SetString(PyExc_ValueError,
5054 "width too big");
5055 goto onError;
5056 }
5057 width = width*10 + (c - '0');
5058 }
5059 }
5060 if (c == '.') {
5061 prec = 0;
5062 if (--fmtcnt >= 0)
5063 c = *fmt++;
5064 if (c == '*') {
5065 v = getnextarg(args, arglen, &argidx);
5066 if (v == NULL)
5067 goto onError;
5068 if (!PyInt_Check(v)) {
5069 PyErr_SetString(PyExc_TypeError,
5070 "* wants int");
5071 goto onError;
5072 }
5073 prec = PyInt_AsLong(v);
5074 if (prec < 0)
5075 prec = 0;
5076 if (--fmtcnt >= 0)
5077 c = *fmt++;
5078 }
5079 else if (c >= '0' && c <= '9') {
5080 prec = c - '0';
5081 while (--fmtcnt >= 0) {
5082 c = Py_CHARMASK(*fmt++);
5083 if (c < '0' || c > '9')
5084 break;
5085 if ((prec*10) / 10 != prec) {
5086 PyErr_SetString(PyExc_ValueError,
5087 "prec too big");
5088 goto onError;
5089 }
5090 prec = prec*10 + (c - '0');
5091 }
5092 }
5093 } /* prec */
5094 if (fmtcnt >= 0) {
5095 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005096 if (--fmtcnt >= 0)
5097 c = *fmt++;
5098 }
5099 }
5100 if (fmtcnt < 0) {
5101 PyErr_SetString(PyExc_ValueError,
5102 "incomplete format");
5103 goto onError;
5104 }
5105 if (c != '%') {
5106 v = getnextarg(args, arglen, &argidx);
5107 if (v == NULL)
5108 goto onError;
5109 }
5110 sign = 0;
5111 fill = ' ';
5112 switch (c) {
5113
5114 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005115 pbuf = formatbuf;
5116 /* presume that buffer length is at least 1 */
5117 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005118 len = 1;
5119 break;
5120
5121 case 's':
5122 case 'r':
5123 if (PyUnicode_Check(v) && c == 's') {
5124 temp = v;
5125 Py_INCREF(temp);
5126 }
5127 else {
5128 PyObject *unicode;
5129 if (c == 's')
5130 temp = PyObject_Str(v);
5131 else
5132 temp = PyObject_Repr(v);
5133 if (temp == NULL)
5134 goto onError;
5135 if (!PyString_Check(temp)) {
5136 /* XXX Note: this should never happen, since
5137 PyObject_Repr() and PyObject_Str() assure
5138 this */
5139 Py_DECREF(temp);
5140 PyErr_SetString(PyExc_TypeError,
5141 "%s argument has non-string str()");
5142 goto onError;
5143 }
Fred Drakee4315f52000-05-09 19:53:39 +00005144 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005145 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005146 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147 "strict");
5148 Py_DECREF(temp);
5149 temp = unicode;
5150 if (temp == NULL)
5151 goto onError;
5152 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005153 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005154 len = PyUnicode_GET_SIZE(temp);
5155 if (prec >= 0 && len > prec)
5156 len = prec;
5157 break;
5158
5159 case 'i':
5160 case 'd':
5161 case 'u':
5162 case 'o':
5163 case 'x':
5164 case 'X':
5165 if (c == 'i')
5166 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005167 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005168 temp = formatlong(v, flags, prec, c);
5169 if (!temp)
5170 goto onError;
5171 pbuf = PyUnicode_AS_UNICODE(temp);
5172 len = PyUnicode_GET_SIZE(temp);
5173 /* unbounded ints can always produce
5174 a sign character! */
5175 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005176 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005177 else {
5178 pbuf = formatbuf;
5179 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5180 flags, prec, c, v);
5181 if (len < 0)
5182 goto onError;
5183 /* only d conversion is signed */
5184 sign = c == 'd';
5185 }
5186 if (flags & F_ZERO)
5187 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005188 break;
5189
5190 case 'e':
5191 case 'E':
5192 case 'f':
5193 case 'g':
5194 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005195 pbuf = formatbuf;
5196 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5197 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198 if (len < 0)
5199 goto onError;
5200 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005201 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005202 fill = '0';
5203 break;
5204
5205 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005206 pbuf = formatbuf;
5207 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 if (len < 0)
5209 goto onError;
5210 break;
5211
5212 default:
5213 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005214 "unsupported format character '%c' (0x%x) "
5215 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005216 (31<=c && c<=126) ? c : '?',
5217 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218 goto onError;
5219 }
5220 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005221 if (*pbuf == '-' || *pbuf == '+') {
5222 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005223 len--;
5224 }
5225 else if (flags & F_SIGN)
5226 sign = '+';
5227 else if (flags & F_BLANK)
5228 sign = ' ';
5229 else
5230 sign = 0;
5231 }
5232 if (width < len)
5233 width = len;
5234 if (rescnt < width + (sign != 0)) {
5235 reslen -= rescnt;
5236 rescnt = width + fmtcnt + 100;
5237 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005238 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005239 return NULL;
5240 res = PyUnicode_AS_UNICODE(result)
5241 + reslen - rescnt;
5242 }
5243 if (sign) {
5244 if (fill != ' ')
5245 *res++ = sign;
5246 rescnt--;
5247 if (width > len)
5248 width--;
5249 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005250 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5251 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005252 assert(pbuf[1] == c);
5253 if (fill != ' ') {
5254 *res++ = *pbuf++;
5255 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005256 }
Tim Petersfff53252001-04-12 18:38:48 +00005257 rescnt -= 2;
5258 width -= 2;
5259 if (width < 0)
5260 width = 0;
5261 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005262 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005263 if (width > len && !(flags & F_LJUST)) {
5264 do {
5265 --rescnt;
5266 *res++ = fill;
5267 } while (--width > len);
5268 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005269 if (fill == ' ') {
5270 if (sign)
5271 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005272 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005273 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005274 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005275 *res++ = *pbuf++;
5276 *res++ = *pbuf++;
5277 }
5278 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005279 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280 res += len;
5281 rescnt -= len;
5282 while (--width >= len) {
5283 --rescnt;
5284 *res++ = ' ';
5285 }
5286 if (dict && (argidx < arglen) && c != '%') {
5287 PyErr_SetString(PyExc_TypeError,
5288 "not all arguments converted");
5289 goto onError;
5290 }
5291 Py_XDECREF(temp);
5292 } /* '%' */
5293 } /* until end */
5294 if (argidx < arglen && !dict) {
5295 PyErr_SetString(PyExc_TypeError,
5296 "not all arguments converted");
5297 goto onError;
5298 }
5299
5300 if (args_owned) {
5301 Py_DECREF(args);
5302 }
5303 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005304 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005305 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306 return (PyObject *)result;
5307
5308 onError:
5309 Py_XDECREF(result);
5310 Py_DECREF(uformat);
5311 if (args_owned) {
5312 Py_DECREF(args);
5313 }
5314 return NULL;
5315}
5316
5317static PyBufferProcs unicode_as_buffer = {
5318 (getreadbufferproc) unicode_buffer_getreadbuf,
5319 (getwritebufferproc) unicode_buffer_getwritebuf,
5320 (getsegcountproc) unicode_buffer_getsegcount,
5321 (getcharbufferproc) unicode_buffer_getcharbuf,
5322};
5323
5324PyTypeObject PyUnicode_Type = {
5325 PyObject_HEAD_INIT(&PyType_Type)
5326 0, /* ob_size */
5327 "unicode", /* tp_name */
5328 sizeof(PyUnicodeObject), /* tp_size */
5329 0, /* tp_itemsize */
5330 /* Slots */
5331 (destructor)_PyUnicode_Free, /* tp_dealloc */
5332 0, /* tp_print */
5333 (getattrfunc)unicode_getattr, /* tp_getattr */
5334 0, /* tp_setattr */
5335 (cmpfunc) unicode_compare, /* tp_compare */
5336 (reprfunc) unicode_repr, /* tp_repr */
5337 0, /* tp_as_number */
5338 &unicode_as_sequence, /* tp_as_sequence */
5339 0, /* tp_as_mapping */
5340 (hashfunc) unicode_hash, /* tp_hash*/
5341 0, /* tp_call*/
5342 (reprfunc) unicode_str, /* tp_str */
5343 (getattrofunc) NULL, /* tp_getattro */
5344 (setattrofunc) NULL, /* tp_setattro */
5345 &unicode_as_buffer, /* tp_as_buffer */
5346 Py_TPFLAGS_DEFAULT, /* tp_flags */
5347};
5348
5349/* Initialize the Unicode implementation */
5350
Thomas Wouters78890102000-07-22 19:25:51 +00005351void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005352{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005353 int i;
5354
Fred Drakee4315f52000-05-09 19:53:39 +00005355 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005356 unicode_freelist = NULL;
5357 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005358 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005359 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005360 for (i = 0; i < 256; i++)
5361 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005362}
5363
5364/* Finalize the Unicode implementation */
5365
5366void
Thomas Wouters78890102000-07-22 19:25:51 +00005367_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005369 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005370 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005371
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005372 Py_XDECREF(unicode_empty);
5373 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005374
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005375 for (i = 0; i < 256; i++) {
5376 if (unicode_latin1[i]) {
5377 Py_DECREF(unicode_latin1[i]);
5378 unicode_latin1[i] = NULL;
5379 }
5380 }
5381
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005382 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005383 PyUnicodeObject *v = u;
5384 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005385 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005386 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005387 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005388 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005389 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005390 unicode_freelist = NULL;
5391 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392}