blob: 7dc370a48d423d8bb1f803666023f5383106c182 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* --- Unicode Object ----------------------------------------------------- */
107
108static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000110 int length)
111{
112 void *oldstr;
113
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000114 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000115 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000116 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000117
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000118 /* Resizing shared object (unicode_empty or single character
119 objects) in-place is not allowed. Use PyUnicode_Resize()
120 instead ! */
121 if (unicode == unicode_empty ||
122 (unicode->length == 1 &&
123 unicode->str[0] < 256 &&
124 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000125 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000126 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 return -1;
128 }
129
130 /* We allocate one more byte to make sure the string is
131 Ux0000 terminated -- XXX is this needed ? */
132 oldstr = unicode->str;
133 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
134 if (!unicode->str) {
135 unicode->str = oldstr;
136 PyErr_NoMemory();
137 return -1;
138 }
139 unicode->str[length] = 0;
140 unicode->length = length;
141
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000142 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000143 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000144 if (unicode->defenc) {
145 Py_DECREF(unicode->defenc);
146 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000147 }
148 unicode->hash = -1;
149
150 return 0;
151}
152
153/* We allocate one more byte to make sure the string is
154 Ux0000 terminated -- XXX is this needed ?
155
156 XXX This allocator could further be enhanced by assuring that the
157 free list never reduces its size below 1.
158
159*/
160
161static
162PyUnicodeObject *_PyUnicode_New(int length)
163{
164 register PyUnicodeObject *unicode;
165
166 /* Optimization for empty strings */
167 if (length == 0 && unicode_empty != NULL) {
168 Py_INCREF(unicode_empty);
169 return unicode_empty;
170 }
171
172 /* Unicode freelist & memory allocation */
173 if (unicode_freelist) {
174 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000175 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000176 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 /* Keep-Alive optimization: we only upsize the buffer,
179 never downsize it. */
180 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000182 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000183 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 }
185 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000186 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000188 }
189 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 }
191 else {
192 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
193 if (unicode == NULL)
194 return NULL;
195 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
196 }
197
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000198 if (!unicode->str) {
199 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000200 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 unicode->str[length] = 0;
203 unicode->length = length;
204 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000205 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000207
208 onError:
209 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000210 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212}
213
214static
215void _PyUnicode_Free(register PyUnicodeObject *unicode)
216{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 /* Keep-Alive optimization */
219 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000220 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221 unicode->str = NULL;
222 unicode->length = 0;
223 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000224 if (unicode->defenc) {
225 Py_DECREF(unicode->defenc);
226 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000227 }
228 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 *(PyUnicodeObject **)unicode = unicode_freelist;
230 unicode_freelist = unicode;
231 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000232 }
233 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000234 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000235 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000236 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 }
238}
239
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000240int PyUnicode_Resize(PyObject **unicode,
241 int length)
242{
243 register PyUnicodeObject *v;
244
245 /* Argument checks */
246 if (unicode == NULL) {
247 PyErr_BadInternalCall();
248 return -1;
249 }
250 v = (PyUnicodeObject *)*unicode;
251 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
252 PyErr_BadInternalCall();
253 return -1;
254 }
255
256 /* Resizing unicode_empty and single character objects is not
257 possible since these are being shared. We simply return a fresh
258 copy with the same Unicode content. */
259 if (v->length != length &&
260 (v == unicode_empty || v->length == 1)) {
261 PyUnicodeObject *w = _PyUnicode_New(length);
262 if (w == NULL)
263 return -1;
264 Py_UNICODE_COPY(w->str, v->str,
265 length < v->length ? length : v->length);
266 *unicode = (PyObject *)w;
267 return 0;
268 }
269
270 /* Note that we don't have to modify *unicode for unshared Unicode
271 objects, since we can modify them in-place. */
272 return unicode_resize(v, length);
273}
274
/* File-private shorthand (use in unicodeobject.c only !): calls
   PyUnicode_Resize() after casting the address of the object
   variable to PyObject **. */
#define _PyUnicode_Resize(unicodevar, length) \
    PyUnicode_Resize((PyObject **)(unicodevar), (length))
278
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
280 int size)
281{
282 PyUnicodeObject *unicode;
283
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000284 /* If the Unicode data is known at construction time, we can apply
285 some optimizations which share commonly used objects. */
286 if (u != NULL) {
287
288 /* Optimization for empty strings */
289 if (size == 0 && unicode_empty != NULL) {
290 Py_INCREF(unicode_empty);
291 return (PyObject *)unicode_empty;
292 }
293
294 /* Single character Unicode objects in the Latin-1 range are
295 shared when using this constructor */
296 if (size == 1 && *u < 256) {
297 unicode = unicode_latin1[*u];
298 if (!unicode) {
299 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000300 if (!unicode)
301 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000302 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000303 unicode_latin1[*u] = unicode;
304 }
305 Py_INCREF(unicode);
306 return (PyObject *)unicode;
307 }
308 }
309
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310 unicode = _PyUnicode_New(size);
311 if (!unicode)
312 return NULL;
313
314 /* Copy the Unicode data into the new object */
315 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000317
318 return (PyObject *)unicode;
319}
320
321#ifdef HAVE_WCHAR_H
322
323PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
324 int size)
325{
326 PyUnicodeObject *unicode;
327
328 if (w == NULL) {
329 PyErr_BadInternalCall();
330 return NULL;
331 }
332
333 unicode = _PyUnicode_New(size);
334 if (!unicode)
335 return NULL;
336
337 /* Copy the wchar_t data into the new object */
338#ifdef HAVE_USABLE_WCHAR_T
339 memcpy(unicode->str, w, size * sizeof(wchar_t));
340#else
341 {
342 register Py_UNICODE *u;
343 register int i;
344 u = PyUnicode_AS_UNICODE(unicode);
345 for (i = size; i >= 0; i--)
346 *u++ = *w++;
347 }
348#endif
349
350 return (PyObject *)unicode;
351}
352
353int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
354 register wchar_t *w,
355 int size)
356{
357 if (unicode == NULL) {
358 PyErr_BadInternalCall();
359 return -1;
360 }
361 if (size > PyUnicode_GET_SIZE(unicode))
362 size = PyUnicode_GET_SIZE(unicode);
363#ifdef HAVE_USABLE_WCHAR_T
364 memcpy(w, unicode->str, size * sizeof(wchar_t));
365#else
366 {
367 register Py_UNICODE *u;
368 register int i;
369 u = PyUnicode_AS_UNICODE(unicode);
370 for (i = size; i >= 0; i--)
371 *w++ = *u++;
372 }
373#endif
374
375 return size;
376}
377
378#endif
379
380PyObject *PyUnicode_FromObject(register PyObject *obj)
381{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000382 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
383}
384
385PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
386 const char *encoding,
387 const char *errors)
388{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 const char *s;
390 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000391 int owned = 0;
392 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393
394 if (obj == NULL) {
395 PyErr_BadInternalCall();
396 return NULL;
397 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000398
399 /* Coerce object */
400 if (PyInstance_Check(obj)) {
401 PyObject *func;
402 func = PyObject_GetAttrString(obj, "__str__");
403 if (func == NULL) {
404 PyErr_SetString(PyExc_TypeError,
405 "coercing to Unicode: instance doesn't define __str__");
406 return NULL;
407 }
408 obj = PyEval_CallObject(func, NULL);
409 Py_DECREF(func);
410 if (obj == NULL)
411 return NULL;
412 owned = 1;
413 }
414 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000416 v = obj;
417 if (encoding) {
418 PyErr_SetString(PyExc_TypeError,
419 "decoding Unicode is not supported");
420 return NULL;
421 }
422 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424 else if (PyString_Check(obj)) {
425 s = PyString_AS_STRING(obj);
426 len = PyString_GET_SIZE(obj);
427 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000428 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
429 /* Overwrite the error message with something more useful in
430 case of a TypeError. */
431 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000432 PyErr_Format(PyExc_TypeError,
433 "coercing to Unicode: need string or buffer, "
434 "%.80s found",
435 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000436 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000437 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000438
439 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440 if (len == 0) {
441 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000442 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000443 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000444 else
445 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000446
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000448 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000449 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000450 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000451 return v;
452
453 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000454 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000455 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000456 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000457 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000458}
459
460PyObject *PyUnicode_Decode(const char *s,
461 int size,
462 const char *encoding,
463 const char *errors)
464{
465 PyObject *buffer = NULL, *unicode;
466
Fred Drakee4315f52000-05-09 19:53:39 +0000467 if (encoding == NULL)
468 encoding = PyUnicode_GetDefaultEncoding();
469
470 /* Shortcuts for common default encodings */
471 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000473 else if (strcmp(encoding, "latin-1") == 0)
474 return PyUnicode_DecodeLatin1(s, size, errors);
475 else if (strcmp(encoding, "ascii") == 0)
476 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000477
478 /* Decode via the codec registry */
479 buffer = PyBuffer_FromMemory((void *)s, size);
480 if (buffer == NULL)
481 goto onError;
482 unicode = PyCodec_Decode(buffer, encoding, errors);
483 if (unicode == NULL)
484 goto onError;
485 if (!PyUnicode_Check(unicode)) {
486 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000487 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000488 unicode->ob_type->tp_name);
489 Py_DECREF(unicode);
490 goto onError;
491 }
492 Py_DECREF(buffer);
493 return unicode;
494
495 onError:
496 Py_XDECREF(buffer);
497 return NULL;
498}
499
500PyObject *PyUnicode_Encode(const Py_UNICODE *s,
501 int size,
502 const char *encoding,
503 const char *errors)
504{
505 PyObject *v, *unicode;
506
507 unicode = PyUnicode_FromUnicode(s, size);
508 if (unicode == NULL)
509 return NULL;
510 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
511 Py_DECREF(unicode);
512 return v;
513}
514
515PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
516 const char *encoding,
517 const char *errors)
518{
519 PyObject *v;
520
521 if (!PyUnicode_Check(unicode)) {
522 PyErr_BadArgument();
523 goto onError;
524 }
Fred Drakee4315f52000-05-09 19:53:39 +0000525
526 if (encoding == NULL)
527 encoding = PyUnicode_GetDefaultEncoding();
528
529 /* Shortcuts for common default encodings */
530 if (errors == NULL) {
531 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000532 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000533 else if (strcmp(encoding, "latin-1") == 0)
534 return PyUnicode_AsLatin1String(unicode);
535 else if (strcmp(encoding, "ascii") == 0)
536 return PyUnicode_AsASCIIString(unicode);
537 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000538
539 /* Encode via the codec registry */
540 v = PyCodec_Encode(unicode, encoding, errors);
541 if (v == NULL)
542 goto onError;
543 /* XXX Should we really enforce this ? */
544 if (!PyString_Check(v)) {
545 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000546 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000547 v->ob_type->tp_name);
548 Py_DECREF(v);
549 goto onError;
550 }
551 return v;
552
553 onError:
554 return NULL;
555}
556
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000557/* Return a Python string holding the default encoded value of the
558 Unicode object.
559
560 The resulting string is cached in the Unicode object for subsequent
561 usage by this function. The cached version is needed to implement
562 the character buffer interface and will live (at least) as long as
563 the Unicode object itself.
564
565 The refcount of the string is *not* incremented.
566
567 *** Exported for internal use by the interpreter only !!! ***
568
569*/
570
571PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
572 const char *errors)
573{
574 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
575
576 if (v)
577 return v;
578 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
579 if (v && errors == NULL)
580 ((PyUnicodeObject *)unicode)->defenc = v;
581 return v;
582}
583
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
585{
586 if (!PyUnicode_Check(unicode)) {
587 PyErr_BadArgument();
588 goto onError;
589 }
590 return PyUnicode_AS_UNICODE(unicode);
591
592 onError:
593 return NULL;
594}
595
596int PyUnicode_GetSize(PyObject *unicode)
597{
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
601 }
602 return PyUnicode_GET_SIZE(unicode);
603
604 onError:
605 return -1;
606}
607
Thomas Wouters78890102000-07-22 19:25:51 +0000608const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000609{
610 return unicode_default_encoding;
611}
612
613int PyUnicode_SetDefaultEncoding(const char *encoding)
614{
615 PyObject *v;
616
617 /* Make sure the encoding is valid. As side effect, this also
618 loads the encoding into the codec registry cache. */
619 v = _PyCodec_Lookup(encoding);
620 if (v == NULL)
621 goto onError;
622 Py_DECREF(v);
623 strncpy(unicode_default_encoding,
624 encoding,
625 sizeof(unicode_default_encoding));
626 return 0;
627
628 onError:
629 return -1;
630}
631
Guido van Rossumd57fd912000-03-10 22:53:23 +0000632/* --- UTF-8 Codec -------------------------------------------------------- */
633
/* Map a UTF-8 lead byte to the total length of the sequence it
   starts.  Zero marks a byte that is not a legal prefix.  See RFC
   2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x10 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x20 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x30 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x40 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x50 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x60 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x70 */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x90 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xA0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xB0 */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xD0 */ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 */ 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 */ 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
655
656static
657int utf8_decoding_error(const char **source,
658 Py_UNICODE **dest,
659 const char *errors,
660 const char *details)
661{
662 if ((errors == NULL) ||
663 (strcmp(errors,"strict") == 0)) {
664 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000665 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000666 details);
667 return -1;
668 }
669 else if (strcmp(errors,"ignore") == 0) {
670 (*source)++;
671 return 0;
672 }
673 else if (strcmp(errors,"replace") == 0) {
674 (*source)++;
675 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
676 (*dest)++;
677 return 0;
678 }
679 else {
680 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000681 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000682 errors);
683 return -1;
684 }
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687PyObject *PyUnicode_DecodeUTF8(const char *s,
688 int size,
689 const char *errors)
690{
691 int n;
692 const char *e;
693 PyUnicodeObject *unicode;
694 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000695 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000696
697 /* Note: size will always be longer than the resulting Unicode
698 character count */
699 unicode = _PyUnicode_New(size);
700 if (!unicode)
701 return NULL;
702 if (size == 0)
703 return (PyObject *)unicode;
704
705 /* Unpack UTF-8 encoded data */
706 p = unicode->str;
707 e = s + size;
708
709 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000710 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000711
712 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000713 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 s++;
715 continue;
716 }
717
718 n = utf8_code_length[ch];
719
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000720 if (s + n > e) {
721 errmsg = "unexpected end of data";
722 goto utf8Error;
723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000724
725 switch (n) {
726
727 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000728 errmsg = "unexpected code byte";
729 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000730
731 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000732 errmsg = "internal error";
733 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000734
735 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000736 if ((s[1] & 0xc0) != 0x80) {
737 errmsg = "invalid data";
738 goto utf8Error;
739 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000741 if (ch < 0x80) {
742 errmsg = "illegal encoding";
743 goto utf8Error;
744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000745 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000746 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000747 break;
748
749 case 3:
750 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000751 (s[2] & 0xc0) != 0x80) {
752 errmsg = "invalid data";
753 goto utf8Error;
754 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000755 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000756 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
757 errmsg = "illegal encoding";
758 goto utf8Error;
759 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000760 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000761 *p++ = (Py_UNICODE)ch;
762 break;
763
764 case 4:
765 if ((s[1] & 0xc0) != 0x80 ||
766 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000767 (s[3] & 0xc0) != 0x80) {
768 errmsg = "invalid data";
769 goto utf8Error;
770 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000771 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
772 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
773 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000774 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000775 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000776 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000777 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000778 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000779 errmsg = "illegal encoding";
780 goto utf8Error;
781 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000782#if Py_UNICODE_SIZE == 4
783 *p++ = (Py_UNICODE)ch;
784#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000785 /* compute and append the two surrogates: */
786
787 /* translate from 10000..10FFFF to 0..FFFF */
788 ch -= 0x10000;
789
790 /* high surrogate = top 10 bits added to D800 */
791 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
792
793 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +0000794 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000795#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000796 break;
797
798 default:
799 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000800 errmsg = "unsupported Unicode code range";
801 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000802 }
803 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000804 continue;
805
806 utf8Error:
807 if (utf8_decoding_error(&s, &p, errors, errmsg))
808 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000809 }
810
811 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000812 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +0000813 goto onError;
814
815 return (PyObject *)unicode;
816
817onError:
818 Py_DECREF(unicode);
819 return NULL;
820}
821
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000822/* Not used anymore, now that the encoder supports UTF-16
823 surrogates. */
Greg Steinaf36a3a2000-07-17 09:04:43 +0000824#if 0
Guido van Rossumd57fd912000-03-10 22:53:23 +0000825static
826int utf8_encoding_error(const Py_UNICODE **source,
827 char **dest,
828 const char *errors,
829 const char *details)
830{
831 if ((errors == NULL) ||
832 (strcmp(errors,"strict") == 0)) {
833 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000834 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000835 details);
836 return -1;
837 }
838 else if (strcmp(errors,"ignore") == 0) {
839 return 0;
840 }
841 else if (strcmp(errors,"replace") == 0) {
842 **dest = '?';
843 (*dest)++;
844 return 0;
845 }
846 else {
847 PyErr_Format(PyExc_ValueError,
848 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000849 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000850 errors);
851 return -1;
852 }
853}
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000854#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000855
856PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
857 int size,
858 const char *errors)
859{
860 PyObject *v;
861 char *p;
862 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000863 Py_UCS4 ch2;
864 unsigned int cbAllocated = 3 * size;
865 unsigned int cbWritten = 0;
866 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000867
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000868 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000869 if (v == NULL)
870 return NULL;
871 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000872 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000873
874 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000875 while (i < size) {
876 Py_UCS4 ch = s[i++];
877 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000878 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000879 cbWritten++;
880 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000881 else if (ch < 0x0800) {
882 *p++ = 0xc0 | (ch >> 6);
883 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000884 cbWritten += 2;
885 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000886 else if (ch < 0x10000) {
887#if Py_UNICODE_SIZE == 4
888 *p++ = 0xe0 | (ch>>12);
889 *p++ = 0x80 | ((ch>>6) & 0x3f);
890 *p++ = 0x80 | (ch & 0x3f);
891 cbWritten += 3;
892#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000893 /* Check for high surrogate */
894 if (0xD800 <= ch && ch <= 0xDBFF) {
895 if (i != size) {
896 ch2 = s[i];
897 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
898
899 if (cbWritten >= (cbAllocated - 4)) {
900 /* Provide enough room for some more
901 surrogates */
902 cbAllocated += 4*10;
903 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000904 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000905 }
906
907 /* combine the two values */
908 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
909
910 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000911 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000912 i++;
913 cbWritten += 4;
914 }
915 }
916 }
917 else {
918 *p++ = (char)(0xe0 | (ch >> 12));
919 cbWritten += 3;
920 }
921 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
922 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000923#endif
924 } else {
925 *p++ = 0xf0 | (ch>>18);
926 *p++ = 0x80 | ((ch>>12) & 0x3f);
927 *p++ = 0x80 | ((ch>>6) & 0x3f);
928 *p++ = 0x80 | (ch & 0x3f);
929 cbWritten += 4;
930 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000931 }
932 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000933 if (_PyString_Resize(&v, p - q))
934 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000935 return v;
936
937 onError:
938 Py_DECREF(v);
939 return NULL;
940}
941
Guido van Rossumd57fd912000-03-10 22:53:23 +0000942PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
943{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000944 if (!PyUnicode_Check(unicode)) {
945 PyErr_BadArgument();
946 return NULL;
947 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000948 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
949 PyUnicode_GET_SIZE(unicode),
950 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000951}
952
953/* --- UTF-16 Codec ------------------------------------------------------- */
954
955static
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000956int utf16_decoding_error(const Py_UCS2 **source,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000957 Py_UNICODE **dest,
958 const char *errors,
959 const char *details)
960{
961 if ((errors == NULL) ||
962 (strcmp(errors,"strict") == 0)) {
963 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000964 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000965 details);
966 return -1;
967 }
968 else if (strcmp(errors,"ignore") == 0) {
969 return 0;
970 }
971 else if (strcmp(errors,"replace") == 0) {
972 if (dest) {
973 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
974 (*dest)++;
975 }
976 return 0;
977 }
978 else {
979 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000980 "UTF-16 decoding error; "
981 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000982 errors);
983 return -1;
984 }
985}
986
Guido van Rossumd57fd912000-03-10 22:53:23 +0000987PyObject *PyUnicode_DecodeUTF16(const char *s,
988 int size,
989 const char *errors,
990 int *byteorder)
991{
992 PyUnicodeObject *unicode;
993 Py_UNICODE *p;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000994 const Py_UCS2 *q, *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000995 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000996 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000997
998 /* size should be an even number */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000999 if (size % sizeof(Py_UCS2) != 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001000 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
1001 return NULL;
1002 /* The remaining input chars are ignored if we fall through
1003 here... */
1004 }
1005
1006 /* Note: size will always be longer than the resulting Unicode
1007 character count */
1008 unicode = _PyUnicode_New(size);
1009 if (!unicode)
1010 return NULL;
1011 if (size == 0)
1012 return (PyObject *)unicode;
1013
1014 /* Unpack UTF-16 encoded data */
1015 p = unicode->str;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001016 q = (Py_UCS2 *)s;
1017 e = q + (size / sizeof(Py_UCS2));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001018
1019 if (byteorder)
1020 bo = *byteorder;
1021
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001022 /* Check for BOM marks (U+FEFF) in the input and adjust current
1023 byte order setting accordingly. In native mode, the leading BOM
1024 mark is skipped, in all other modes, it is copied to the output
1025 stream as-is (giving a ZWNBSP character). */
1026 if (bo == 0) {
1027#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1028 if (*q == 0xFEFF) {
1029 q++;
1030 bo = -1;
1031 } else if (*q == 0xFFFE) {
1032 q++;
1033 bo = 1;
1034 }
1035#else
1036 if (*q == 0xFEFF) {
1037 q++;
1038 bo = 1;
1039 } else if (*q == 0xFFFE) {
1040 q++;
1041 bo = -1;
1042 }
1043#endif
1044 }
1045
Guido van Rossumd57fd912000-03-10 22:53:23 +00001046 while (q < e) {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001047 register Py_UCS2 ch = *q++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001048
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001049 /* Swap input bytes if needed. (This assumes
1050 sizeof(Py_UNICODE) == 2 !) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001051#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Guido van Rossumd57fd912000-03-10 22:53:23 +00001052 if (bo == 1)
1053 ch = (ch >> 8) | (ch << 8);
1054#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001055 if (bo == -1)
1056 ch = (ch >> 8) | (ch << 8);
1057#endif
1058 if (ch < 0xD800 || ch > 0xDFFF) {
1059 *p++ = ch;
1060 continue;
1061 }
1062
1063 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001064 if (q >= e) {
1065 errmsg = "unexpected end of data";
1066 goto utf16Error;
1067 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068 if (0xDC00 <= *q && *q <= 0xDFFF) {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001069 Py_UCS2 ch2 = *q++;
1070#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1071 if (bo == 1)
1072 ch = (ch >> 8) | (ch << 8);
1073#else
1074 if (bo == -1)
1075 ch = (ch >> 8) | (ch << 8);
1076#endif
1077 if (0xD800 <= ch && ch <= 0xDBFF) {
1078#if Py_UNICODE_SIZE == 2
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079 /* This is valid data (a UTF-16 surrogate pair), but
1080 we are not able to store this information since our
1081 Py_UNICODE type only has 16 bits... this might
1082 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001083 errmsg = "code pairs are not supported";
1084 goto utf16Error;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001085#else
1086 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001088#endif
1089
1090 }
1091 else {
1092 errmsg = "illegal UTF-16 surrogate";
1093 goto utf16Error;
1094 }
1095
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001097 errmsg = "illegal encoding";
1098 /* Fall through to report the error */
1099
1100 utf16Error:
1101 if (utf16_decoding_error(&q, &p, errors, errmsg))
1102 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001103 }
1104
1105 if (byteorder)
1106 *byteorder = bo;
1107
1108 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001109 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001110 goto onError;
1111
1112 return (PyObject *)unicode;
1113
1114onError:
1115 Py_DECREF(unicode);
1116 return NULL;
1117}
1118
1119#undef UTF16_ERROR
1120
1121PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1122 int size,
1123 const char *errors,
1124 int byteorder)
1125{
1126 PyObject *v;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001127 Py_UCS2 *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001128 char *q;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001129 int i, pairs, doswap = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001130
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001131 for (i = pairs = 0; i < size; i++)
1132 if (s[i] >= 0x10000)
1133 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001134 v = PyString_FromStringAndSize(NULL,
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001135 sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001136 if (v == NULL)
1137 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138
1139 q = PyString_AS_STRING(v);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001140 p = (Py_UCS2 *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001141 if (byteorder == 0)
1142 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001143 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001144 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001145 if (byteorder == 0 ||
1146#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1147 byteorder == -1
1148#else
1149 byteorder == 1
1150#endif
1151 )
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001152 doswap = 0;
1153 while (size-- > 0) {
1154 Py_UNICODE ch = *s++;
1155 Py_UNICODE ch2 = 0;
1156 if (ch >= 0x10000) {
1157 ch2 = 0xDC00|((ch-0x10000) & 0x3FF);
1158 ch = 0xD800|((ch-0x10000)>>10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001160 if (doswap){
1161 *p++ = (ch >> 8) | (ch << 8);
1162 if (ch2)
1163 *p++ = (ch2 >> 8) | (ch2 << 8);
1164 }else{
1165 *p++ = ch;
1166 if(ch2)
1167 *p++ = ch2;
1168 }
1169 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 return v;
1171}
1172
1173PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1174{
1175 if (!PyUnicode_Check(unicode)) {
1176 PyErr_BadArgument();
1177 return NULL;
1178 }
1179 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1180 PyUnicode_GET_SIZE(unicode),
1181 NULL,
1182 0);
1183}
1184
1185/* --- Unicode Escape Codec ----------------------------------------------- */
1186
1187static
1188int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001189 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001190 const char *errors,
1191 const char *details)
1192{
1193 if ((errors == NULL) ||
1194 (strcmp(errors,"strict") == 0)) {
1195 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001196 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197 details);
1198 return -1;
1199 }
1200 else if (strcmp(errors,"ignore") == 0) {
1201 return 0;
1202 }
1203 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001204 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205 return 0;
1206 }
1207 else {
1208 PyErr_Format(PyExc_ValueError,
1209 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001210 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 errors);
1212 return -1;
1213 }
1214}
1215
Fredrik Lundh06d12682001-01-24 07:59:11 +00001216static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001217
Guido van Rossumd57fd912000-03-10 22:53:23 +00001218PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1219 int size,
1220 const char *errors)
1221{
1222 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001223 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001224 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001225 char* message;
1226 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1227
Guido van Rossumd57fd912000-03-10 22:53:23 +00001228 /* Escaped strings will always be longer than the resulting
1229 Unicode string, so we start with size here and then reduce the
1230 length after conversion to the true value. */
1231 v = _PyUnicode_New(size);
1232 if (v == NULL)
1233 goto onError;
1234 if (size == 0)
1235 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001236
Guido van Rossumd57fd912000-03-10 22:53:23 +00001237 p = buf = PyUnicode_AS_UNICODE(v);
1238 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001239
Guido van Rossumd57fd912000-03-10 22:53:23 +00001240 while (s < end) {
1241 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001242 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001243 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001244
1245 /* Non-escape characters are interpreted as Unicode ordinals */
1246 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001247 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 continue;
1249 }
1250
1251 /* \ - Escapes */
1252 s++;
1253 switch (*s++) {
1254
1255 /* \x escapes */
1256 case '\n': break;
1257 case '\\': *p++ = '\\'; break;
1258 case '\'': *p++ = '\''; break;
1259 case '\"': *p++ = '\"'; break;
1260 case 'b': *p++ = '\b'; break;
1261 case 'f': *p++ = '\014'; break; /* FF */
1262 case 't': *p++ = '\t'; break;
1263 case 'n': *p++ = '\n'; break;
1264 case 'r': *p++ = '\r'; break;
1265 case 'v': *p++ = '\013'; break; /* VT */
1266 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1267
1268 /* \OOO (octal) escapes */
1269 case '0': case '1': case '2': case '3':
1270 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001271 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001272 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001273 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001274 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001275 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001276 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001277 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001278 break;
1279
Fredrik Lundhccc74732001-02-18 22:13:49 +00001280 /* hex escapes */
1281 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001282 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001283 digits = 2;
1284 message = "truncated \\xXX escape";
1285 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286
Fredrik Lundhccc74732001-02-18 22:13:49 +00001287 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001289 digits = 4;
1290 message = "truncated \\uXXXX escape";
1291 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001292
Fredrik Lundhccc74732001-02-18 22:13:49 +00001293 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001294 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001295 digits = 8;
1296 message = "truncated \\UXXXXXXXX escape";
1297 hexescape:
1298 chr = 0;
1299 for (i = 0; i < digits; i++) {
1300 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001301 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001302 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001303 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001304 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001305 i++;
1306 break;
1307 }
1308 chr = (chr<<4) & ~0xF;
1309 if (c >= '0' && c <= '9')
1310 chr += c - '0';
1311 else if (c >= 'a' && c <= 'f')
1312 chr += 10 + c - 'a';
1313 else
1314 chr += 10 + c - 'A';
1315 }
1316 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001317 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001318 /* when we get here, chr is a 32-bit unicode character */
1319 if (chr <= 0xffff)
1320 /* UCS-2 character */
1321 *p++ = (Py_UNICODE) chr;
1322 else if (chr <= 0x10ffff) {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001323 /* UCS-4 character. Either store directly, or as surrogate pair. */
1324#if Py_UNICODE_SIZE == 4
1325 *p++ = chr;
1326#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001327 chr -= 0x10000L;
1328 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001329 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001330#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001331 } else {
1332 if (unicodeescape_decoding_error(
1333 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001334 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001335 )
1336 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001337 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001338 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001339 break;
1340
1341 /* \N{name} */
1342 case 'N':
1343 message = "malformed \\N character escape";
1344 if (ucnhash_CAPI == NULL) {
1345 /* load the unicode data module */
1346 PyObject *m, *v;
1347 m = PyImport_ImportModule("unicodedata");
1348 if (m == NULL)
1349 goto ucnhashError;
1350 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1351 Py_DECREF(m);
1352 if (v == NULL)
1353 goto ucnhashError;
1354 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1355 Py_DECREF(v);
1356 if (ucnhash_CAPI == NULL)
1357 goto ucnhashError;
1358 }
1359 if (*s == '{') {
1360 const char *start = s+1;
1361 /* look for the closing brace */
1362 while (*s != '}' && s < end)
1363 s++;
1364 if (s > start && s < end && *s == '}') {
1365 /* found a name. look it up in the unicode database */
1366 message = "unknown Unicode character name";
1367 s++;
1368 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1369 goto store;
1370 }
1371 }
1372 if (unicodeescape_decoding_error(&s, &x, errors, message))
1373 goto onError;
1374 *p++ = x;
1375 break;
1376
1377 default:
1378 *p++ = '\\';
1379 *p++ = (unsigned char)s[-1];
1380 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001381 }
1382 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001383 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001384 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001385 return (PyObject *)v;
1386
Fredrik Lundhccc74732001-02-18 22:13:49 +00001387ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001388 PyErr_SetString(
1389 PyExc_UnicodeError,
1390 "\\N escapes not supported (can't load unicodedata module)"
1391 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001392 return NULL;
1393
Fredrik Lundhccc74732001-02-18 22:13:49 +00001394onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001395 Py_XDECREF(v);
1396 return NULL;
1397}
1398
1399/* Return a Unicode-Escape string version of the Unicode object.
1400
1401 If quotes is true, the string is enclosed in u"" or u'' quotes as
1402 appropriate.
1403
1404*/
1405
Barry Warsaw51ac5802000-03-20 16:36:48 +00001406static const Py_UNICODE *findchar(const Py_UNICODE *s,
1407 int size,
1408 Py_UNICODE ch);
1409
Guido van Rossumd57fd912000-03-10 22:53:23 +00001410static
1411PyObject *unicodeescape_string(const Py_UNICODE *s,
1412 int size,
1413 int quotes)
1414{
1415 PyObject *repr;
1416 char *p;
1417 char *q;
1418
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001419 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001420
1421 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1422 if (repr == NULL)
1423 return NULL;
1424
1425 p = q = PyString_AS_STRING(repr);
1426
1427 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001428 *p++ = 'u';
1429 *p++ = (findchar(s, size, '\'') &&
1430 !findchar(s, size, '"')) ? '"' : '\'';
1431 }
1432 while (size-- > 0) {
1433 Py_UNICODE ch = *s++;
1434 /* Escape quotes */
Fredrik Lundh30831632001-06-26 15:11:00 +00001435 if (quotes && (ch == (Py_UNICODE) q[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001436 *p++ = '\\';
1437 *p++ = (char) ch;
1438 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001439 /* Map 21-bit characters to '\U00xxxxxx' */
1440 else if (ch >= 0x10000) {
1441 *p++ = '\\';
1442 *p++ = 'U';
1443 *p++ = hexdigit[(ch >> 28) & 0xf];
1444 *p++ = hexdigit[(ch >> 24) & 0xf];
1445 *p++ = hexdigit[(ch >> 20) & 0xf];
1446 *p++ = hexdigit[(ch >> 16) & 0xf];
1447 *p++ = hexdigit[(ch >> 12) & 0xf];
1448 *p++ = hexdigit[(ch >> 8) & 0xf];
1449 *p++ = hexdigit[(ch >> 4) & 0xf];
1450 *p++ = hexdigit[ch & 15];
1451 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001452 /* Map 16-bit characters to '\uxxxx' */
1453 else if (ch >= 256) {
1454 *p++ = '\\';
1455 *p++ = 'u';
1456 *p++ = hexdigit[(ch >> 12) & 0xf];
1457 *p++ = hexdigit[(ch >> 8) & 0xf];
1458 *p++ = hexdigit[(ch >> 4) & 0xf];
1459 *p++ = hexdigit[ch & 15];
1460 }
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001461 /* Map special whitespace to '\t', \n', '\r' */
1462 else if (ch == '\t') {
1463 *p++ = '\\';
1464 *p++ = 't';
1465 }
1466 else if (ch == '\n') {
1467 *p++ = '\\';
1468 *p++ = 'n';
1469 }
1470 else if (ch == '\r') {
1471 *p++ = '\\';
1472 *p++ = 'r';
1473 }
1474 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001475 else if (ch < ' ' || ch >= 128) {
1476 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001477 *p++ = 'x';
1478 *p++ = hexdigit[(ch >> 4) & 0xf];
1479 *p++ = hexdigit[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480 }
1481 /* Copy everything else as-is */
1482 else
1483 *p++ = (char) ch;
1484 }
1485 if (quotes)
1486 *p++ = q[1];
1487
1488 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001489 if (_PyString_Resize(&repr, p - q))
1490 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001491
1492 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001493
1494 onError:
1495 Py_DECREF(repr);
1496 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001497}
1498
1499PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1500 int size)
1501{
1502 return unicodeescape_string(s, size, 0);
1503}
1504
1505PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1506{
1507 if (!PyUnicode_Check(unicode)) {
1508 PyErr_BadArgument();
1509 return NULL;
1510 }
1511 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1512 PyUnicode_GET_SIZE(unicode));
1513}
1514
1515/* --- Raw Unicode Escape Codec ------------------------------------------- */
1516
1517PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1518 int size,
1519 const char *errors)
1520{
1521 PyUnicodeObject *v;
1522 Py_UNICODE *p, *buf;
1523 const char *end;
1524 const char *bs;
1525
1526 /* Escaped strings will always be longer than the resulting
1527 Unicode string, so we start with size here and then reduce the
1528 length after conversion to the true value. */
1529 v = _PyUnicode_New(size);
1530 if (v == NULL)
1531 goto onError;
1532 if (size == 0)
1533 return (PyObject *)v;
1534 p = buf = PyUnicode_AS_UNICODE(v);
1535 end = s + size;
1536 while (s < end) {
1537 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001538 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001539 int i;
1540
1541 /* Non-escape characters are interpreted as Unicode ordinals */
1542 if (*s != '\\') {
1543 *p++ = (unsigned char)*s++;
1544 continue;
1545 }
1546
1547 /* \u-escapes are only interpreted iff the number of leading
1548 backslashes if odd */
1549 bs = s;
1550 for (;s < end;) {
1551 if (*s != '\\')
1552 break;
1553 *p++ = (unsigned char)*s++;
1554 }
1555 if (((s - bs) & 1) == 0 ||
1556 s >= end ||
1557 *s != 'u') {
1558 continue;
1559 }
1560 p--;
1561 s++;
1562
1563 /* \uXXXX with 4 hex digits */
1564 for (x = 0, i = 0; i < 4; i++) {
1565 c = (unsigned char)s[i];
1566 if (!isxdigit(c)) {
1567 if (unicodeescape_decoding_error(&s, &x, errors,
1568 "truncated \\uXXXX"))
1569 goto onError;
1570 i++;
1571 break;
1572 }
1573 x = (x<<4) & ~0xF;
1574 if (c >= '0' && c <= '9')
1575 x += c - '0';
1576 else if (c >= 'a' && c <= 'f')
1577 x += 10 + c - 'a';
1578 else
1579 x += 10 + c - 'A';
1580 }
1581 s += i;
1582 *p++ = x;
1583 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001584 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001585 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001586 return (PyObject *)v;
1587
1588 onError:
1589 Py_XDECREF(v);
1590 return NULL;
1591}
1592
1593PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1594 int size)
1595{
1596 PyObject *repr;
1597 char *p;
1598 char *q;
1599
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001600 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001601
1602 repr = PyString_FromStringAndSize(NULL, 6 * size);
1603 if (repr == NULL)
1604 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001605 if (size == 0)
1606 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001607
1608 p = q = PyString_AS_STRING(repr);
1609 while (size-- > 0) {
1610 Py_UNICODE ch = *s++;
1611 /* Map 16-bit characters to '\uxxxx' */
1612 if (ch >= 256) {
1613 *p++ = '\\';
1614 *p++ = 'u';
1615 *p++ = hexdigit[(ch >> 12) & 0xf];
1616 *p++ = hexdigit[(ch >> 8) & 0xf];
1617 *p++ = hexdigit[(ch >> 4) & 0xf];
1618 *p++ = hexdigit[ch & 15];
1619 }
1620 /* Copy everything else as-is */
1621 else
1622 *p++ = (char) ch;
1623 }
1624 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001625 if (_PyString_Resize(&repr, p - q))
1626 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001627
1628 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001629
1630 onError:
1631 Py_DECREF(repr);
1632 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001633}
1634
1635PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1636{
1637 if (!PyUnicode_Check(unicode)) {
1638 PyErr_BadArgument();
1639 return NULL;
1640 }
1641 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1642 PyUnicode_GET_SIZE(unicode));
1643}
1644
1645/* --- Latin-1 Codec ------------------------------------------------------ */
1646
1647PyObject *PyUnicode_DecodeLatin1(const char *s,
1648 int size,
1649 const char *errors)
1650{
1651 PyUnicodeObject *v;
1652 Py_UNICODE *p;
1653
1654 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001655 if (size == 1 && *(unsigned char*)s < 256) {
1656 Py_UNICODE r = *(unsigned char*)s;
1657 return PyUnicode_FromUnicode(&r, 1);
1658 }
1659
Guido van Rossumd57fd912000-03-10 22:53:23 +00001660 v = _PyUnicode_New(size);
1661 if (v == NULL)
1662 goto onError;
1663 if (size == 0)
1664 return (PyObject *)v;
1665 p = PyUnicode_AS_UNICODE(v);
1666 while (size-- > 0)
1667 *p++ = (unsigned char)*s++;
1668 return (PyObject *)v;
1669
1670 onError:
1671 Py_XDECREF(v);
1672 return NULL;
1673}
1674
1675static
1676int latin1_encoding_error(const Py_UNICODE **source,
1677 char **dest,
1678 const char *errors,
1679 const char *details)
1680{
1681 if ((errors == NULL) ||
1682 (strcmp(errors,"strict") == 0)) {
1683 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001684 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001685 details);
1686 return -1;
1687 }
1688 else if (strcmp(errors,"ignore") == 0) {
1689 return 0;
1690 }
1691 else if (strcmp(errors,"replace") == 0) {
1692 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001693 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001694 return 0;
1695 }
1696 else {
1697 PyErr_Format(PyExc_ValueError,
1698 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001699 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001700 errors);
1701 return -1;
1702 }
1703}
1704
1705PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1706 int size,
1707 const char *errors)
1708{
1709 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001710 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001711
Guido van Rossumd57fd912000-03-10 22:53:23 +00001712 repr = PyString_FromStringAndSize(NULL, size);
1713 if (repr == NULL)
1714 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001715 if (size == 0)
1716 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001717
1718 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001719 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720 while (size-- > 0) {
1721 Py_UNICODE ch = *p++;
1722 if (ch >= 256) {
1723 if (latin1_encoding_error(&p, &s, errors,
1724 "ordinal not in range(256)"))
1725 goto onError;
1726 }
1727 else
1728 *s++ = (char)ch;
1729 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001730 /* Resize if error handling skipped some characters */
1731 if (s - start < PyString_GET_SIZE(repr))
1732 if (_PyString_Resize(&repr, s - start))
1733 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001734 return repr;
1735
1736 onError:
1737 Py_DECREF(repr);
1738 return NULL;
1739}
1740
1741PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1742{
1743 if (!PyUnicode_Check(unicode)) {
1744 PyErr_BadArgument();
1745 return NULL;
1746 }
1747 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1748 PyUnicode_GET_SIZE(unicode),
1749 NULL);
1750}
1751
1752/* --- 7-bit ASCII Codec -------------------------------------------------- */
1753
1754static
1755int ascii_decoding_error(const char **source,
1756 Py_UNICODE **dest,
1757 const char *errors,
1758 const char *details)
1759{
1760 if ((errors == NULL) ||
1761 (strcmp(errors,"strict") == 0)) {
1762 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001763 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764 details);
1765 return -1;
1766 }
1767 else if (strcmp(errors,"ignore") == 0) {
1768 return 0;
1769 }
1770 else if (strcmp(errors,"replace") == 0) {
1771 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1772 (*dest)++;
1773 return 0;
1774 }
1775 else {
1776 PyErr_Format(PyExc_ValueError,
1777 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001778 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001779 errors);
1780 return -1;
1781 }
1782}
1783
1784PyObject *PyUnicode_DecodeASCII(const char *s,
1785 int size,
1786 const char *errors)
1787{
1788 PyUnicodeObject *v;
1789 Py_UNICODE *p;
1790
1791 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001792 if (size == 1 && *(unsigned char*)s < 128) {
1793 Py_UNICODE r = *(unsigned char*)s;
1794 return PyUnicode_FromUnicode(&r, 1);
1795 }
1796
Guido van Rossumd57fd912000-03-10 22:53:23 +00001797 v = _PyUnicode_New(size);
1798 if (v == NULL)
1799 goto onError;
1800 if (size == 0)
1801 return (PyObject *)v;
1802 p = PyUnicode_AS_UNICODE(v);
1803 while (size-- > 0) {
1804 register unsigned char c;
1805
1806 c = (unsigned char)*s++;
1807 if (c < 128)
1808 *p++ = c;
1809 else if (ascii_decoding_error(&s, &p, errors,
1810 "ordinal not in range(128)"))
1811 goto onError;
1812 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001813 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001814 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001815 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001816 return (PyObject *)v;
1817
1818 onError:
1819 Py_XDECREF(v);
1820 return NULL;
1821}
1822
1823static
1824int ascii_encoding_error(const Py_UNICODE **source,
1825 char **dest,
1826 const char *errors,
1827 const char *details)
1828{
1829 if ((errors == NULL) ||
1830 (strcmp(errors,"strict") == 0)) {
1831 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001832 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001833 details);
1834 return -1;
1835 }
1836 else if (strcmp(errors,"ignore") == 0) {
1837 return 0;
1838 }
1839 else if (strcmp(errors,"replace") == 0) {
1840 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001841 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001842 return 0;
1843 }
1844 else {
1845 PyErr_Format(PyExc_ValueError,
1846 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001847 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001848 errors);
1849 return -1;
1850 }
1851}
1852
1853PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1854 int size,
1855 const char *errors)
1856{
1857 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001858 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001859
Guido van Rossumd57fd912000-03-10 22:53:23 +00001860 repr = PyString_FromStringAndSize(NULL, size);
1861 if (repr == NULL)
1862 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001863 if (size == 0)
1864 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001865
1866 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001867 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868 while (size-- > 0) {
1869 Py_UNICODE ch = *p++;
1870 if (ch >= 128) {
1871 if (ascii_encoding_error(&p, &s, errors,
1872 "ordinal not in range(128)"))
1873 goto onError;
1874 }
1875 else
1876 *s++ = (char)ch;
1877 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001878 /* Resize if error handling skipped some characters */
1879 if (s - start < PyString_GET_SIZE(repr))
1880 if (_PyString_Resize(&repr, s - start))
1881 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001882 return repr;
1883
1884 onError:
1885 Py_DECREF(repr);
1886 return NULL;
1887}
1888
1889PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1890{
1891 if (!PyUnicode_Check(unicode)) {
1892 PyErr_BadArgument();
1893 return NULL;
1894 }
1895 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1896 PyUnicode_GET_SIZE(unicode),
1897 NULL);
1898}
1899
Fredrik Lundh30831632001-06-26 15:11:00 +00001900#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001901
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001902/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001903
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001904PyObject *PyUnicode_DecodeMBCS(const char *s,
1905 int size,
1906 const char *errors)
1907{
1908 PyUnicodeObject *v;
1909 Py_UNICODE *p;
1910
1911 /* First get the size of the result */
1912 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001913 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001914 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1915
1916 v = _PyUnicode_New(usize);
1917 if (v == NULL)
1918 return NULL;
1919 if (usize == 0)
1920 return (PyObject *)v;
1921 p = PyUnicode_AS_UNICODE(v);
1922 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1923 Py_DECREF(v);
1924 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1925 }
1926
1927 return (PyObject *)v;
1928}
1929
1930PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1931 int size,
1932 const char *errors)
1933{
1934 PyObject *repr;
1935 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001936 DWORD mbcssize;
1937
1938 /* If there are no characters, bail now! */
1939 if (size==0)
1940 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001941
1942 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001943 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001944 if (mbcssize==0)
1945 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1946
1947 repr = PyString_FromStringAndSize(NULL, mbcssize);
1948 if (repr == NULL)
1949 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001950 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001951 return repr;
1952
1953 /* Do the conversion */
1954 s = PyString_AS_STRING(repr);
1955 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1956 Py_DECREF(repr);
1957 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1958 }
1959 return repr;
1960}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001961
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001962#endif /* MS_WIN32 */
1963
Guido van Rossumd57fd912000-03-10 22:53:23 +00001964/* --- Character Mapping Codec -------------------------------------------- */
1965
1966static
1967int charmap_decoding_error(const char **source,
1968 Py_UNICODE **dest,
1969 const char *errors,
1970 const char *details)
1971{
1972 if ((errors == NULL) ||
1973 (strcmp(errors,"strict") == 0)) {
1974 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001975 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001976 details);
1977 return -1;
1978 }
1979 else if (strcmp(errors,"ignore") == 0) {
1980 return 0;
1981 }
1982 else if (strcmp(errors,"replace") == 0) {
1983 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1984 (*dest)++;
1985 return 0;
1986 }
1987 else {
1988 PyErr_Format(PyExc_ValueError,
1989 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001990 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001991 errors);
1992 return -1;
1993 }
1994}
1995
1996PyObject *PyUnicode_DecodeCharmap(const char *s,
1997 int size,
1998 PyObject *mapping,
1999 const char *errors)
2000{
2001 PyUnicodeObject *v;
2002 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002003 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002004
2005 /* Default to Latin-1 */
2006 if (mapping == NULL)
2007 return PyUnicode_DecodeLatin1(s, size, errors);
2008
2009 v = _PyUnicode_New(size);
2010 if (v == NULL)
2011 goto onError;
2012 if (size == 0)
2013 return (PyObject *)v;
2014 p = PyUnicode_AS_UNICODE(v);
2015 while (size-- > 0) {
2016 unsigned char ch = *s++;
2017 PyObject *w, *x;
2018
2019 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2020 w = PyInt_FromLong((long)ch);
2021 if (w == NULL)
2022 goto onError;
2023 x = PyObject_GetItem(mapping, w);
2024 Py_DECREF(w);
2025 if (x == NULL) {
2026 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002027 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002028 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002029 x = Py_None;
2030 Py_INCREF(x);
2031 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002032 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002033 }
2034
2035 /* Apply mapping */
2036 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002037 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002038 if (value < 0 || value > 65535) {
2039 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002040 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 Py_DECREF(x);
2042 goto onError;
2043 }
2044 *p++ = (Py_UNICODE)value;
2045 }
2046 else if (x == Py_None) {
2047 /* undefined mapping */
2048 if (charmap_decoding_error(&s, &p, errors,
2049 "character maps to <undefined>")) {
2050 Py_DECREF(x);
2051 goto onError;
2052 }
2053 }
2054 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002055 int targetsize = PyUnicode_GET_SIZE(x);
2056
2057 if (targetsize == 1)
2058 /* 1-1 mapping */
2059 *p++ = *PyUnicode_AS_UNICODE(x);
2060
2061 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002062 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002063 if (targetsize > extrachars) {
2064 /* resize first */
2065 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2066 int needed = (targetsize - extrachars) + \
2067 (targetsize << 2);
2068 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002069 if (_PyUnicode_Resize(&v,
2070 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002071 Py_DECREF(x);
2072 goto onError;
2073 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002074 p = PyUnicode_AS_UNICODE(v) + oldpos;
2075 }
2076 Py_UNICODE_COPY(p,
2077 PyUnicode_AS_UNICODE(x),
2078 targetsize);
2079 p += targetsize;
2080 extrachars -= targetsize;
2081 }
2082 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002083 }
2084 else {
2085 /* wrong return value */
2086 PyErr_SetString(PyExc_TypeError,
2087 "character mapping must return integer, None or unicode");
2088 Py_DECREF(x);
2089 goto onError;
2090 }
2091 Py_DECREF(x);
2092 }
2093 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002094 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002095 goto onError;
2096 return (PyObject *)v;
2097
2098 onError:
2099 Py_XDECREF(v);
2100 return NULL;
2101}
2102
2103static
2104int charmap_encoding_error(const Py_UNICODE **source,
2105 char **dest,
2106 const char *errors,
2107 const char *details)
2108{
2109 if ((errors == NULL) ||
2110 (strcmp(errors,"strict") == 0)) {
2111 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002112 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002113 details);
2114 return -1;
2115 }
2116 else if (strcmp(errors,"ignore") == 0) {
2117 return 0;
2118 }
2119 else if (strcmp(errors,"replace") == 0) {
2120 **dest = '?';
2121 (*dest)++;
2122 return 0;
2123 }
2124 else {
2125 PyErr_Format(PyExc_ValueError,
2126 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002127 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002128 errors);
2129 return -1;
2130 }
2131}
2132
2133PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2134 int size,
2135 PyObject *mapping,
2136 const char *errors)
2137{
2138 PyObject *v;
2139 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002140 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002141
2142 /* Default to Latin-1 */
2143 if (mapping == NULL)
2144 return PyUnicode_EncodeLatin1(p, size, errors);
2145
2146 v = PyString_FromStringAndSize(NULL, size);
2147 if (v == NULL)
2148 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002149 if (size == 0)
2150 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002151 s = PyString_AS_STRING(v);
2152 while (size-- > 0) {
2153 Py_UNICODE ch = *p++;
2154 PyObject *w, *x;
2155
2156 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2157 w = PyInt_FromLong((long)ch);
2158 if (w == NULL)
2159 goto onError;
2160 x = PyObject_GetItem(mapping, w);
2161 Py_DECREF(w);
2162 if (x == NULL) {
2163 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002164 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002166 x = Py_None;
2167 Py_INCREF(x);
2168 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002169 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002170 }
2171
2172 /* Apply mapping */
2173 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002174 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175 if (value < 0 || value > 255) {
2176 PyErr_SetString(PyExc_TypeError,
2177 "character mapping must be in range(256)");
2178 Py_DECREF(x);
2179 goto onError;
2180 }
2181 *s++ = (char)value;
2182 }
2183 else if (x == Py_None) {
2184 /* undefined mapping */
2185 if (charmap_encoding_error(&p, &s, errors,
2186 "character maps to <undefined>")) {
2187 Py_DECREF(x);
2188 goto onError;
2189 }
2190 }
2191 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002192 int targetsize = PyString_GET_SIZE(x);
2193
2194 if (targetsize == 1)
2195 /* 1-1 mapping */
2196 *s++ = *PyString_AS_STRING(x);
2197
2198 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002199 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002200 if (targetsize > extrachars) {
2201 /* resize first */
2202 int oldpos = (int)(s - PyString_AS_STRING(v));
2203 int needed = (targetsize - extrachars) + \
2204 (targetsize << 2);
2205 extrachars += needed;
2206 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002207 Py_DECREF(x);
2208 goto onError;
2209 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002210 s = PyString_AS_STRING(v) + oldpos;
2211 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002212 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002213 s += targetsize;
2214 extrachars -= targetsize;
2215 }
2216 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217 }
2218 else {
2219 /* wrong return value */
2220 PyErr_SetString(PyExc_TypeError,
2221 "character mapping must return integer, None or unicode");
2222 Py_DECREF(x);
2223 goto onError;
2224 }
2225 Py_DECREF(x);
2226 }
2227 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2228 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2229 goto onError;
2230 return v;
2231
2232 onError:
2233 Py_DECREF(v);
2234 return NULL;
2235}
2236
2237PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2238 PyObject *mapping)
2239{
2240 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2241 PyErr_BadArgument();
2242 return NULL;
2243 }
2244 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2245 PyUnicode_GET_SIZE(unicode),
2246 mapping,
2247 NULL);
2248}
2249
2250static
2251int translate_error(const Py_UNICODE **source,
2252 Py_UNICODE **dest,
2253 const char *errors,
2254 const char *details)
2255{
2256 if ((errors == NULL) ||
2257 (strcmp(errors,"strict") == 0)) {
2258 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002259 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002260 details);
2261 return -1;
2262 }
2263 else if (strcmp(errors,"ignore") == 0) {
2264 return 0;
2265 }
2266 else if (strcmp(errors,"replace") == 0) {
2267 **dest = '?';
2268 (*dest)++;
2269 return 0;
2270 }
2271 else {
2272 PyErr_Format(PyExc_ValueError,
2273 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002274 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002275 errors);
2276 return -1;
2277 }
2278}
2279
2280PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2281 int size,
2282 PyObject *mapping,
2283 const char *errors)
2284{
2285 PyUnicodeObject *v;
2286 Py_UNICODE *p;
2287
2288 if (mapping == NULL) {
2289 PyErr_BadArgument();
2290 return NULL;
2291 }
2292
2293 /* Output will never be longer than input */
2294 v = _PyUnicode_New(size);
2295 if (v == NULL)
2296 goto onError;
2297 if (size == 0)
2298 goto done;
2299 p = PyUnicode_AS_UNICODE(v);
2300 while (size-- > 0) {
2301 Py_UNICODE ch = *s++;
2302 PyObject *w, *x;
2303
2304 /* Get mapping */
2305 w = PyInt_FromLong(ch);
2306 if (w == NULL)
2307 goto onError;
2308 x = PyObject_GetItem(mapping, w);
2309 Py_DECREF(w);
2310 if (x == NULL) {
2311 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2312 /* No mapping found: default to 1-1 mapping */
2313 PyErr_Clear();
2314 *p++ = ch;
2315 continue;
2316 }
2317 goto onError;
2318 }
2319
2320 /* Apply mapping */
2321 if (PyInt_Check(x))
2322 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2323 else if (x == Py_None) {
2324 /* undefined mapping */
2325 if (translate_error(&s, &p, errors,
2326 "character maps to <undefined>")) {
2327 Py_DECREF(x);
2328 goto onError;
2329 }
2330 }
2331 else if (PyUnicode_Check(x)) {
2332 if (PyUnicode_GET_SIZE(x) != 1) {
2333 /* 1-n mapping */
2334 PyErr_SetString(PyExc_NotImplementedError,
2335 "1-n mappings are currently not implemented");
2336 Py_DECREF(x);
2337 goto onError;
2338 }
2339 *p++ = *PyUnicode_AS_UNICODE(x);
2340 }
2341 else {
2342 /* wrong return value */
2343 PyErr_SetString(PyExc_TypeError,
2344 "translate mapping must return integer, None or unicode");
2345 Py_DECREF(x);
2346 goto onError;
2347 }
2348 Py_DECREF(x);
2349 }
2350 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002351 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002352 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002353
2354 done:
2355 return (PyObject *)v;
2356
2357 onError:
2358 Py_XDECREF(v);
2359 return NULL;
2360}
2361
2362PyObject *PyUnicode_Translate(PyObject *str,
2363 PyObject *mapping,
2364 const char *errors)
2365{
2366 PyObject *result;
2367
2368 str = PyUnicode_FromObject(str);
2369 if (str == NULL)
2370 goto onError;
2371 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2372 PyUnicode_GET_SIZE(str),
2373 mapping,
2374 errors);
2375 Py_DECREF(str);
2376 return result;
2377
2378 onError:
2379 Py_XDECREF(str);
2380 return NULL;
2381}
2382
Guido van Rossum9e896b32000-04-05 20:11:21 +00002383/* --- Decimal Encoder ---------------------------------------------------- */
2384
2385int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2386 int length,
2387 char *output,
2388 const char *errors)
2389{
2390 Py_UNICODE *p, *end;
2391
2392 if (output == NULL) {
2393 PyErr_BadArgument();
2394 return -1;
2395 }
2396
2397 p = s;
2398 end = s + length;
2399 while (p < end) {
2400 register Py_UNICODE ch = *p++;
2401 int decimal;
2402
2403 if (Py_UNICODE_ISSPACE(ch)) {
2404 *output++ = ' ';
2405 continue;
2406 }
2407 decimal = Py_UNICODE_TODECIMAL(ch);
2408 if (decimal >= 0) {
2409 *output++ = '0' + decimal;
2410 continue;
2411 }
Guido van Rossumba477042000-04-06 18:18:10 +00002412 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002413 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002414 continue;
2415 }
2416 /* All other characters are considered invalid */
2417 if (errors == NULL || strcmp(errors, "strict") == 0) {
2418 PyErr_SetString(PyExc_ValueError,
2419 "invalid decimal Unicode string");
2420 goto onError;
2421 }
2422 else if (strcmp(errors, "ignore") == 0)
2423 continue;
2424 else if (strcmp(errors, "replace") == 0) {
2425 *output++ = '?';
2426 continue;
2427 }
2428 }
2429 /* 0-terminate the output string */
2430 *output++ = '\0';
2431 return 0;
2432
2433 onError:
2434 return -1;
2435}
2436
Guido van Rossumd57fd912000-03-10 22:53:23 +00002437/* --- Helpers ------------------------------------------------------------ */
2438
2439static
2440int count(PyUnicodeObject *self,
2441 int start,
2442 int end,
2443 PyUnicodeObject *substring)
2444{
2445 int count = 0;
2446
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002447 if (start < 0)
2448 start += self->length;
2449 if (start < 0)
2450 start = 0;
2451 if (end > self->length)
2452 end = self->length;
2453 if (end < 0)
2454 end += self->length;
2455 if (end < 0)
2456 end = 0;
2457
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002458 if (substring->length == 0)
2459 return (end - start + 1);
2460
Guido van Rossumd57fd912000-03-10 22:53:23 +00002461 end -= substring->length;
2462
2463 while (start <= end)
2464 if (Py_UNICODE_MATCH(self, start, substring)) {
2465 count++;
2466 start += substring->length;
2467 } else
2468 start++;
2469
2470 return count;
2471}
2472
2473int PyUnicode_Count(PyObject *str,
2474 PyObject *substr,
2475 int start,
2476 int end)
2477{
2478 int result;
2479
2480 str = PyUnicode_FromObject(str);
2481 if (str == NULL)
2482 return -1;
2483 substr = PyUnicode_FromObject(substr);
2484 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002485 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002486 return -1;
2487 }
2488
2489 result = count((PyUnicodeObject *)str,
2490 start, end,
2491 (PyUnicodeObject *)substr);
2492
2493 Py_DECREF(str);
2494 Py_DECREF(substr);
2495 return result;
2496}
2497
2498static
2499int findstring(PyUnicodeObject *self,
2500 PyUnicodeObject *substring,
2501 int start,
2502 int end,
2503 int direction)
2504{
2505 if (start < 0)
2506 start += self->length;
2507 if (start < 0)
2508 start = 0;
2509
2510 if (substring->length == 0)
2511 return start;
2512
2513 if (end > self->length)
2514 end = self->length;
2515 if (end < 0)
2516 end += self->length;
2517 if (end < 0)
2518 end = 0;
2519
2520 end -= substring->length;
2521
2522 if (direction < 0) {
2523 for (; end >= start; end--)
2524 if (Py_UNICODE_MATCH(self, end, substring))
2525 return end;
2526 } else {
2527 for (; start <= end; start++)
2528 if (Py_UNICODE_MATCH(self, start, substring))
2529 return start;
2530 }
2531
2532 return -1;
2533}
2534
2535int PyUnicode_Find(PyObject *str,
2536 PyObject *substr,
2537 int start,
2538 int end,
2539 int direction)
2540{
2541 int result;
2542
2543 str = PyUnicode_FromObject(str);
2544 if (str == NULL)
2545 return -1;
2546 substr = PyUnicode_FromObject(substr);
2547 if (substr == NULL) {
2548 Py_DECREF(substr);
2549 return -1;
2550 }
2551
2552 result = findstring((PyUnicodeObject *)str,
2553 (PyUnicodeObject *)substr,
2554 start, end, direction);
2555 Py_DECREF(str);
2556 Py_DECREF(substr);
2557 return result;
2558}
2559
2560static
2561int tailmatch(PyUnicodeObject *self,
2562 PyUnicodeObject *substring,
2563 int start,
2564 int end,
2565 int direction)
2566{
2567 if (start < 0)
2568 start += self->length;
2569 if (start < 0)
2570 start = 0;
2571
2572 if (substring->length == 0)
2573 return 1;
2574
2575 if (end > self->length)
2576 end = self->length;
2577 if (end < 0)
2578 end += self->length;
2579 if (end < 0)
2580 end = 0;
2581
2582 end -= substring->length;
2583 if (end < start)
2584 return 0;
2585
2586 if (direction > 0) {
2587 if (Py_UNICODE_MATCH(self, end, substring))
2588 return 1;
2589 } else {
2590 if (Py_UNICODE_MATCH(self, start, substring))
2591 return 1;
2592 }
2593
2594 return 0;
2595}
2596
2597int PyUnicode_Tailmatch(PyObject *str,
2598 PyObject *substr,
2599 int start,
2600 int end,
2601 int direction)
2602{
2603 int result;
2604
2605 str = PyUnicode_FromObject(str);
2606 if (str == NULL)
2607 return -1;
2608 substr = PyUnicode_FromObject(substr);
2609 if (substr == NULL) {
2610 Py_DECREF(substr);
2611 return -1;
2612 }
2613
2614 result = tailmatch((PyUnicodeObject *)str,
2615 (PyUnicodeObject *)substr,
2616 start, end, direction);
2617 Py_DECREF(str);
2618 Py_DECREF(substr);
2619 return result;
2620}
2621
2622static
2623const Py_UNICODE *findchar(const Py_UNICODE *s,
2624 int size,
2625 Py_UNICODE ch)
2626{
2627 /* like wcschr, but doesn't stop at NULL characters */
2628
2629 while (size-- > 0) {
2630 if (*s == ch)
2631 return s;
2632 s++;
2633 }
2634
2635 return NULL;
2636}
2637
2638/* Apply fixfct filter to the Unicode object self and return a
2639 reference to the modified object */
2640
2641static
2642PyObject *fixup(PyUnicodeObject *self,
2643 int (*fixfct)(PyUnicodeObject *s))
2644{
2645
2646 PyUnicodeObject *u;
2647
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002648 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002649 if (u == NULL)
2650 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002651
2652 Py_UNICODE_COPY(u->str, self->str, self->length);
2653
Guido van Rossumd57fd912000-03-10 22:53:23 +00002654 if (!fixfct(u)) {
2655 /* fixfct should return TRUE if it modified the buffer. If
2656 FALSE, return a reference to the original buffer instead
2657 (to save space, not time) */
2658 Py_INCREF(self);
2659 Py_DECREF(u);
2660 return (PyObject*) self;
2661 }
2662 return (PyObject*) u;
2663}
2664
2665static
2666int fixupper(PyUnicodeObject *self)
2667{
2668 int len = self->length;
2669 Py_UNICODE *s = self->str;
2670 int status = 0;
2671
2672 while (len-- > 0) {
2673 register Py_UNICODE ch;
2674
2675 ch = Py_UNICODE_TOUPPER(*s);
2676 if (ch != *s) {
2677 status = 1;
2678 *s = ch;
2679 }
2680 s++;
2681 }
2682
2683 return status;
2684}
2685
2686static
2687int fixlower(PyUnicodeObject *self)
2688{
2689 int len = self->length;
2690 Py_UNICODE *s = self->str;
2691 int status = 0;
2692
2693 while (len-- > 0) {
2694 register Py_UNICODE ch;
2695
2696 ch = Py_UNICODE_TOLOWER(*s);
2697 if (ch != *s) {
2698 status = 1;
2699 *s = ch;
2700 }
2701 s++;
2702 }
2703
2704 return status;
2705}
2706
2707static
2708int fixswapcase(PyUnicodeObject *self)
2709{
2710 int len = self->length;
2711 Py_UNICODE *s = self->str;
2712 int status = 0;
2713
2714 while (len-- > 0) {
2715 if (Py_UNICODE_ISUPPER(*s)) {
2716 *s = Py_UNICODE_TOLOWER(*s);
2717 status = 1;
2718 } else if (Py_UNICODE_ISLOWER(*s)) {
2719 *s = Py_UNICODE_TOUPPER(*s);
2720 status = 1;
2721 }
2722 s++;
2723 }
2724
2725 return status;
2726}
2727
2728static
2729int fixcapitalize(PyUnicodeObject *self)
2730{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002731 int len = self->length;
2732 Py_UNICODE *s = self->str;
2733 int status = 0;
2734
2735 if (len == 0)
2736 return 0;
2737 if (Py_UNICODE_ISLOWER(*s)) {
2738 *s = Py_UNICODE_TOUPPER(*s);
2739 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002740 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002741 s++;
2742 while (--len > 0) {
2743 if (Py_UNICODE_ISUPPER(*s)) {
2744 *s = Py_UNICODE_TOLOWER(*s);
2745 status = 1;
2746 }
2747 s++;
2748 }
2749 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002750}
2751
2752static
2753int fixtitle(PyUnicodeObject *self)
2754{
2755 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2756 register Py_UNICODE *e;
2757 int previous_is_cased;
2758
2759 /* Shortcut for single character strings */
2760 if (PyUnicode_GET_SIZE(self) == 1) {
2761 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2762 if (*p != ch) {
2763 *p = ch;
2764 return 1;
2765 }
2766 else
2767 return 0;
2768 }
2769
2770 e = p + PyUnicode_GET_SIZE(self);
2771 previous_is_cased = 0;
2772 for (; p < e; p++) {
2773 register const Py_UNICODE ch = *p;
2774
2775 if (previous_is_cased)
2776 *p = Py_UNICODE_TOLOWER(ch);
2777 else
2778 *p = Py_UNICODE_TOTITLE(ch);
2779
2780 if (Py_UNICODE_ISLOWER(ch) ||
2781 Py_UNICODE_ISUPPER(ch) ||
2782 Py_UNICODE_ISTITLE(ch))
2783 previous_is_cased = 1;
2784 else
2785 previous_is_cased = 0;
2786 }
2787 return 1;
2788}
2789
2790PyObject *PyUnicode_Join(PyObject *separator,
2791 PyObject *seq)
2792{
2793 Py_UNICODE *sep;
2794 int seplen;
2795 PyUnicodeObject *res = NULL;
2796 int reslen = 0;
2797 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002798 int sz = 100;
2799 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00002800 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002801
Tim Peters2cfe3682001-05-05 05:36:48 +00002802 it = PyObject_GetIter(seq);
2803 if (it == NULL)
2804 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805
2806 if (separator == NULL) {
2807 Py_UNICODE blank = ' ';
2808 sep = &blank;
2809 seplen = 1;
2810 }
2811 else {
2812 separator = PyUnicode_FromObject(separator);
2813 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00002814 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002815 sep = PyUnicode_AS_UNICODE(separator);
2816 seplen = PyUnicode_GET_SIZE(separator);
2817 }
2818
2819 res = _PyUnicode_New(sz);
2820 if (res == NULL)
2821 goto onError;
2822 p = PyUnicode_AS_UNICODE(res);
2823 reslen = 0;
2824
Tim Peters2cfe3682001-05-05 05:36:48 +00002825 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002826 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00002827 PyObject *item = PyIter_Next(it);
2828 if (item == NULL) {
2829 if (PyErr_Occurred())
2830 goto onError;
2831 break;
2832 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002833 if (!PyUnicode_Check(item)) {
2834 PyObject *v;
2835 v = PyUnicode_FromObject(item);
2836 Py_DECREF(item);
2837 item = v;
2838 if (item == NULL)
2839 goto onError;
2840 }
2841 itemlen = PyUnicode_GET_SIZE(item);
2842 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002843 if (_PyUnicode_Resize(&res, sz*2))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002844 goto onError;
2845 sz *= 2;
2846 p = PyUnicode_AS_UNICODE(res) + reslen;
2847 }
2848 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002849 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002850 p += seplen;
2851 reslen += seplen;
2852 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002853 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002854 p += itemlen;
2855 reslen += itemlen;
2856 Py_DECREF(item);
2857 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002858 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002859 goto onError;
2860
2861 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002862 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002863 return (PyObject *)res;
2864
2865 onError:
2866 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002867 Py_XDECREF(res);
2868 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002869 return NULL;
2870}
2871
2872static
2873PyUnicodeObject *pad(PyUnicodeObject *self,
2874 int left,
2875 int right,
2876 Py_UNICODE fill)
2877{
2878 PyUnicodeObject *u;
2879
2880 if (left < 0)
2881 left = 0;
2882 if (right < 0)
2883 right = 0;
2884
2885 if (left == 0 && right == 0) {
2886 Py_INCREF(self);
2887 return self;
2888 }
2889
2890 u = _PyUnicode_New(left + self->length + right);
2891 if (u) {
2892 if (left)
2893 Py_UNICODE_FILL(u->str, fill, left);
2894 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2895 if (right)
2896 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2897 }
2898
2899 return u;
2900}
2901
/* Append the slice data[left:right] to `list' as a new Unicode object.

   The macro relies on names from the expanding function: a PyObject*
   variable `str' used as scratch, the target `list', and an `onError'
   label that is jumped to on failure.  It expands to several
   statements, so it must be used inside a braced block.  The arguments
   are parenthesized so that arbitrary expressions can be passed. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (!str)                                                           \
        goto onError;                                                   \
    if (PyList_Append(list, str)) {                                     \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    else                                                                \
        Py_DECREF(str);
2912
2913static
2914PyObject *split_whitespace(PyUnicodeObject *self,
2915 PyObject *list,
2916 int maxcount)
2917{
2918 register int i;
2919 register int j;
2920 int len = self->length;
2921 PyObject *str;
2922
2923 for (i = j = 0; i < len; ) {
2924 /* find a token */
2925 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2926 i++;
2927 j = i;
2928 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2929 i++;
2930 if (j < i) {
2931 if (maxcount-- <= 0)
2932 break;
2933 SPLIT_APPEND(self->str, j, i);
2934 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2935 i++;
2936 j = i;
2937 }
2938 }
2939 if (j < len) {
2940 SPLIT_APPEND(self->str, j, len);
2941 }
2942 return list;
2943
2944 onError:
2945 Py_DECREF(list);
2946 return NULL;
2947}
2948
2949PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002950 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002951{
2952 register int i;
2953 register int j;
2954 int len;
2955 PyObject *list;
2956 PyObject *str;
2957 Py_UNICODE *data;
2958
2959 string = PyUnicode_FromObject(string);
2960 if (string == NULL)
2961 return NULL;
2962 data = PyUnicode_AS_UNICODE(string);
2963 len = PyUnicode_GET_SIZE(string);
2964
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965 list = PyList_New(0);
2966 if (!list)
2967 goto onError;
2968
2969 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002970 int eol;
2971
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972 /* Find a line and append it */
2973 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2974 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002975
2976 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002977 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002978 if (i < len) {
2979 if (data[i] == '\r' && i + 1 < len &&
2980 data[i+1] == '\n')
2981 i += 2;
2982 else
2983 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002984 if (keepends)
2985 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002986 }
Guido van Rossum86662912000-04-11 15:38:46 +00002987 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002988 j = i;
2989 }
2990 if (j < len) {
2991 SPLIT_APPEND(data, j, len);
2992 }
2993
2994 Py_DECREF(string);
2995 return list;
2996
2997 onError:
2998 Py_DECREF(list);
2999 Py_DECREF(string);
3000 return NULL;
3001}
3002
3003static
3004PyObject *split_char(PyUnicodeObject *self,
3005 PyObject *list,
3006 Py_UNICODE ch,
3007 int maxcount)
3008{
3009 register int i;
3010 register int j;
3011 int len = self->length;
3012 PyObject *str;
3013
3014 for (i = j = 0; i < len; ) {
3015 if (self->str[i] == ch) {
3016 if (maxcount-- <= 0)
3017 break;
3018 SPLIT_APPEND(self->str, j, i);
3019 i = j = i + 1;
3020 } else
3021 i++;
3022 }
3023 if (j <= len) {
3024 SPLIT_APPEND(self->str, j, len);
3025 }
3026 return list;
3027
3028 onError:
3029 Py_DECREF(list);
3030 return NULL;
3031}
3032
3033static
3034PyObject *split_substring(PyUnicodeObject *self,
3035 PyObject *list,
3036 PyUnicodeObject *substring,
3037 int maxcount)
3038{
3039 register int i;
3040 register int j;
3041 int len = self->length;
3042 int sublen = substring->length;
3043 PyObject *str;
3044
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003045 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003046 if (Py_UNICODE_MATCH(self, i, substring)) {
3047 if (maxcount-- <= 0)
3048 break;
3049 SPLIT_APPEND(self->str, j, i);
3050 i = j = i + sublen;
3051 } else
3052 i++;
3053 }
3054 if (j <= len) {
3055 SPLIT_APPEND(self->str, j, len);
3056 }
3057 return list;
3058
3059 onError:
3060 Py_DECREF(list);
3061 return NULL;
3062}
3063
3064#undef SPLIT_APPEND
3065
3066static
3067PyObject *split(PyUnicodeObject *self,
3068 PyUnicodeObject *substring,
3069 int maxcount)
3070{
3071 PyObject *list;
3072
3073 if (maxcount < 0)
3074 maxcount = INT_MAX;
3075
3076 list = PyList_New(0);
3077 if (!list)
3078 return NULL;
3079
3080 if (substring == NULL)
3081 return split_whitespace(self,list,maxcount);
3082
3083 else if (substring->length == 1)
3084 return split_char(self,list,substring->str[0],maxcount);
3085
3086 else if (substring->length == 0) {
3087 Py_DECREF(list);
3088 PyErr_SetString(PyExc_ValueError, "empty separator");
3089 return NULL;
3090 }
3091 else
3092 return split_substring(self,list,substring,maxcount);
3093}
3094
3095static
3096PyObject *strip(PyUnicodeObject *self,
3097 int left,
3098 int right)
3099{
3100 Py_UNICODE *p = self->str;
3101 int start = 0;
3102 int end = self->length;
3103
3104 if (left)
3105 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3106 start++;
3107
3108 if (right)
3109 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3110 end--;
3111
3112 if (start == 0 && end == self->length) {
3113 /* couldn't strip anything off, return original string */
3114 Py_INCREF(self);
3115 return (PyObject*) self;
3116 }
3117
3118 return (PyObject*) PyUnicode_FromUnicode(
3119 self->str + start,
3120 end - start
3121 );
3122}
3123
3124static
3125PyObject *replace(PyUnicodeObject *self,
3126 PyUnicodeObject *str1,
3127 PyUnicodeObject *str2,
3128 int maxcount)
3129{
3130 PyUnicodeObject *u;
3131
3132 if (maxcount < 0)
3133 maxcount = INT_MAX;
3134
3135 if (str1->length == 1 && str2->length == 1) {
3136 int i;
3137
3138 /* replace characters */
3139 if (!findchar(self->str, self->length, str1->str[0])) {
3140 /* nothing to replace, return original string */
3141 Py_INCREF(self);
3142 u = self;
3143 } else {
3144 Py_UNICODE u1 = str1->str[0];
3145 Py_UNICODE u2 = str2->str[0];
3146
3147 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003148 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003149 self->length
3150 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003151 if (u != NULL) {
3152 Py_UNICODE_COPY(u->str, self->str,
3153 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003154 for (i = 0; i < u->length; i++)
3155 if (u->str[i] == u1) {
3156 if (--maxcount < 0)
3157 break;
3158 u->str[i] = u2;
3159 }
3160 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003161 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003162
3163 } else {
3164 int n, i;
3165 Py_UNICODE *p;
3166
3167 /* replace strings */
3168 n = count(self, 0, self->length, str1);
3169 if (n > maxcount)
3170 n = maxcount;
3171 if (n == 0) {
3172 /* nothing to replace, return original string */
3173 Py_INCREF(self);
3174 u = self;
3175 } else {
3176 u = _PyUnicode_New(
3177 self->length + n * (str2->length - str1->length));
3178 if (u) {
3179 i = 0;
3180 p = u->str;
3181 while (i <= self->length - str1->length)
3182 if (Py_UNICODE_MATCH(self, i, str1)) {
3183 /* replace string segment */
3184 Py_UNICODE_COPY(p, str2->str, str2->length);
3185 p += str2->length;
3186 i += str1->length;
3187 if (--n <= 0) {
3188 /* copy remaining part */
3189 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3190 break;
3191 }
3192 } else
3193 *p++ = self->str[i++];
3194 }
3195 }
3196 }
3197
3198 return (PyObject *) u;
3199}
3200
3201/* --- Unicode Object Methods --------------------------------------------- */
3202
3203static char title__doc__[] =
3204"S.title() -> unicode\n\
3205\n\
3206Return a titlecased version of S, i.e. words start with title case\n\
3207characters, all remaining cased characters have lower case.";
3208
3209static PyObject*
3210unicode_title(PyUnicodeObject *self, PyObject *args)
3211{
3212 if (!PyArg_NoArgs(args))
3213 return NULL;
3214 return fixup(self, fixtitle);
3215}
3216
3217static char capitalize__doc__[] =
3218"S.capitalize() -> unicode\n\
3219\n\
3220Return a capitalized version of S, i.e. make the first character\n\
3221have upper case.";
3222
3223static PyObject*
3224unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3225{
3226 if (!PyArg_NoArgs(args))
3227 return NULL;
3228 return fixup(self, fixcapitalize);
3229}
3230
#if 0
/* Disabled: this method is compiled out by the surrounding #if 0. */
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* Split S on whitespace, capitalize every word in place in the list,
   and join the words again with PyUnicode_Join(NULL, ...). */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *piece;
    int k;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *old = PyList_GET_ITEM(words, k);

        piece = fixup((PyUnicodeObject *)old, fixcapitalize);
        if (piece == NULL)
            goto done;          /* piece is NULL, which is returned */
        /* swap the capitalized word in for the original one */
        Py_DECREF(old);
        PyList_SET_ITEM(words, k, piece);
    }

    /* Join the words to form a new string */
    piece = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)piece;
}
#endif
3271
3272static char center__doc__[] =
3273"S.center(width) -> unicode\n\
3274\n\
3275Return S centered in a Unicode string of length width. Padding is done\n\
3276using spaces.";
3277
3278static PyObject *
3279unicode_center(PyUnicodeObject *self, PyObject *args)
3280{
3281 int marg, left;
3282 int width;
3283
3284 if (!PyArg_ParseTuple(args, "i:center", &width))
3285 return NULL;
3286
3287 if (self->length >= width) {
3288 Py_INCREF(self);
3289 return (PyObject*) self;
3290 }
3291
3292 marg = width - self->length;
3293 left = marg / 2 + (marg & width & 1);
3294
3295 return (PyObject*) pad(self, left, marg - left, ' ');
3296}
3297
Marc-André Lemburge5034372000-08-08 08:04:29 +00003298#if 0
3299
3300/* This code should go into some future Unicode collation support
3301 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003302 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003303
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003304/* speedy UTF-16 code point order comparison */
3305/* gleaned from: */
3306/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3307
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003308static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003309{
3310 0, 0, 0, 0, 0, 0, 0, 0,
3311 0, 0, 0, 0, 0, 0, 0, 0,
3312 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003313 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003314};
3315
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316static int
3317unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3318{
3319 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003320
Guido van Rossumd57fd912000-03-10 22:53:23 +00003321 Py_UNICODE *s1 = str1->str;
3322 Py_UNICODE *s2 = str2->str;
3323
3324 len1 = str1->length;
3325 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003326
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003328 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003329
3330 c1 = *s1++;
3331 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003332
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003333 if (c1 > (1<<11) * 26)
3334 c1 += utf16Fixup[c1>>11];
3335 if (c2 > (1<<11) * 26)
3336 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003337 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003338
3339 if (c1 != c2)
3340 return (c1 < c2) ? -1 : 1;
3341
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003342 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003343 }
3344
3345 return (len1 < len2) ? -1 : (len1 != len2);
3346}
3347
Marc-André Lemburge5034372000-08-08 08:04:29 +00003348#else
3349
3350static int
3351unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3352{
3353 register int len1, len2;
3354
3355 Py_UNICODE *s1 = str1->str;
3356 Py_UNICODE *s2 = str2->str;
3357
3358 len1 = str1->length;
3359 len2 = str2->length;
3360
3361 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003362 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003363
Fredrik Lundh45714e92001-06-26 16:39:36 +00003364 c1 = *s1++;
3365 c2 = *s2++;
3366
3367 if (c1 != c2)
3368 return (c1 < c2) ? -1 : 1;
3369
Marc-André Lemburge5034372000-08-08 08:04:29 +00003370 len1--; len2--;
3371 }
3372
3373 return (len1 < len2) ? -1 : (len1 != len2);
3374}
3375
3376#endif
3377
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378int PyUnicode_Compare(PyObject *left,
3379 PyObject *right)
3380{
3381 PyUnicodeObject *u = NULL, *v = NULL;
3382 int result;
3383
3384 /* Coerce the two arguments */
3385 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3386 if (u == NULL)
3387 goto onError;
3388 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3389 if (v == NULL)
3390 goto onError;
3391
Thomas Wouters7e474022000-07-16 12:04:32 +00003392 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393 if (v == u) {
3394 Py_DECREF(u);
3395 Py_DECREF(v);
3396 return 0;
3397 }
3398
3399 result = unicode_compare(u, v);
3400
3401 Py_DECREF(u);
3402 Py_DECREF(v);
3403 return result;
3404
3405onError:
3406 Py_XDECREF(u);
3407 Py_XDECREF(v);
3408 return -1;
3409}
3410
Guido van Rossum403d68b2000-03-13 15:55:09 +00003411int PyUnicode_Contains(PyObject *container,
3412 PyObject *element)
3413{
3414 PyUnicodeObject *u = NULL, *v = NULL;
3415 int result;
3416 register const Py_UNICODE *p, *e;
3417 register Py_UNICODE ch;
3418
3419 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003420 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003421 if (v == NULL) {
3422 PyErr_SetString(PyExc_TypeError,
3423 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003424 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003425 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003426 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3427 if (u == NULL) {
3428 Py_DECREF(v);
3429 goto onError;
3430 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003431
3432 /* Check v in u */
3433 if (PyUnicode_GET_SIZE(v) != 1) {
3434 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003435 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003436 goto onError;
3437 }
3438 ch = *PyUnicode_AS_UNICODE(v);
3439 p = PyUnicode_AS_UNICODE(u);
3440 e = p + PyUnicode_GET_SIZE(u);
3441 result = 0;
3442 while (p < e) {
3443 if (*p++ == ch) {
3444 result = 1;
3445 break;
3446 }
3447 }
3448
3449 Py_DECREF(u);
3450 Py_DECREF(v);
3451 return result;
3452
3453onError:
3454 Py_XDECREF(u);
3455 Py_XDECREF(v);
3456 return -1;
3457}
3458
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459/* Concat to string or Unicode object giving a new Unicode object. */
3460
3461PyObject *PyUnicode_Concat(PyObject *left,
3462 PyObject *right)
3463{
3464 PyUnicodeObject *u = NULL, *v = NULL, *w;
3465
3466 /* Coerce the two arguments */
3467 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3468 if (u == NULL)
3469 goto onError;
3470 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3471 if (v == NULL)
3472 goto onError;
3473
3474 /* Shortcuts */
3475 if (v == unicode_empty) {
3476 Py_DECREF(v);
3477 return (PyObject *)u;
3478 }
3479 if (u == unicode_empty) {
3480 Py_DECREF(u);
3481 return (PyObject *)v;
3482 }
3483
3484 /* Concat the two Unicode strings */
3485 w = _PyUnicode_New(u->length + v->length);
3486 if (w == NULL)
3487 goto onError;
3488 Py_UNICODE_COPY(w->str, u->str, u->length);
3489 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3490
3491 Py_DECREF(u);
3492 Py_DECREF(v);
3493 return (PyObject *)w;
3494
3495onError:
3496 Py_XDECREF(u);
3497 Py_XDECREF(v);
3498 return NULL;
3499}
3500
3501static char count__doc__[] =
3502"S.count(sub[, start[, end]]) -> int\n\
3503\n\
3504Return the number of occurrences of substring sub in Unicode string\n\
3505S[start:end]. Optional arguments start and end are\n\
3506interpreted as in slice notation.";
3507
3508static PyObject *
3509unicode_count(PyUnicodeObject *self, PyObject *args)
3510{
3511 PyUnicodeObject *substring;
3512 int start = 0;
3513 int end = INT_MAX;
3514 PyObject *result;
3515
Guido van Rossumb8872e62000-05-09 14:14:27 +00003516 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3517 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003518 return NULL;
3519
3520 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3521 (PyObject *)substring);
3522 if (substring == NULL)
3523 return NULL;
3524
Guido van Rossumd57fd912000-03-10 22:53:23 +00003525 if (start < 0)
3526 start += self->length;
3527 if (start < 0)
3528 start = 0;
3529 if (end > self->length)
3530 end = self->length;
3531 if (end < 0)
3532 end += self->length;
3533 if (end < 0)
3534 end = 0;
3535
3536 result = PyInt_FromLong((long) count(self, start, end, substring));
3537
3538 Py_DECREF(substring);
3539 return result;
3540}
3541
3542static char encode__doc__[] =
3543"S.encode([encoding[,errors]]) -> string\n\
3544\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003545Return an encoded string version of S. Default encoding is the current\n\
3546default string encoding. errors may be given to set a different error\n\
3547handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3548a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003549
3550static PyObject *
3551unicode_encode(PyUnicodeObject *self, PyObject *args)
3552{
3553 char *encoding = NULL;
3554 char *errors = NULL;
3555 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3556 return NULL;
3557 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3558}
3559
3560static char expandtabs__doc__[] =
3561"S.expandtabs([tabsize]) -> unicode\n\
3562\n\
3563Return a copy of S where all tab characters are expanded using spaces.\n\
3564If tabsize is not given, a tab size of 8 characters is assumed.";
3565
3566static PyObject*
3567unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3568{
3569 Py_UNICODE *e;
3570 Py_UNICODE *p;
3571 Py_UNICODE *q;
3572 int i, j;
3573 PyUnicodeObject *u;
3574 int tabsize = 8;
3575
3576 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3577 return NULL;
3578
Thomas Wouters7e474022000-07-16 12:04:32 +00003579 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003580 i = j = 0;
3581 e = self->str + self->length;
3582 for (p = self->str; p < e; p++)
3583 if (*p == '\t') {
3584 if (tabsize > 0)
3585 j += tabsize - (j % tabsize);
3586 }
3587 else {
3588 j++;
3589 if (*p == '\n' || *p == '\r') {
3590 i += j;
3591 j = 0;
3592 }
3593 }
3594
3595 /* Second pass: create output string and fill it */
3596 u = _PyUnicode_New(i + j);
3597 if (!u)
3598 return NULL;
3599
3600 j = 0;
3601 q = u->str;
3602
3603 for (p = self->str; p < e; p++)
3604 if (*p == '\t') {
3605 if (tabsize > 0) {
3606 i = tabsize - (j % tabsize);
3607 j += i;
3608 while (i--)
3609 *q++ = ' ';
3610 }
3611 }
3612 else {
3613 j++;
3614 *q++ = *p;
3615 if (*p == '\n' || *p == '\r')
3616 j = 0;
3617 }
3618
3619 return (PyObject*) u;
3620}
3621
3622static char find__doc__[] =
3623"S.find(sub [,start [,end]]) -> int\n\
3624\n\
3625Return the lowest index in S where substring sub is found,\n\
3626such that sub is contained within s[start,end]. Optional\n\
3627arguments start and end are interpreted as in slice notation.\n\
3628\n\
3629Return -1 on failure.";
3630
3631static PyObject *
3632unicode_find(PyUnicodeObject *self, PyObject *args)
3633{
3634 PyUnicodeObject *substring;
3635 int start = 0;
3636 int end = INT_MAX;
3637 PyObject *result;
3638
Guido van Rossumb8872e62000-05-09 14:14:27 +00003639 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3640 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003641 return NULL;
3642 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3643 (PyObject *)substring);
3644 if (substring == NULL)
3645 return NULL;
3646
3647 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3648
3649 Py_DECREF(substring);
3650 return result;
3651}
3652
3653static PyObject *
3654unicode_getitem(PyUnicodeObject *self, int index)
3655{
3656 if (index < 0 || index >= self->length) {
3657 PyErr_SetString(PyExc_IndexError, "string index out of range");
3658 return NULL;
3659 }
3660
3661 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3662}
3663
3664static long
3665unicode_hash(PyUnicodeObject *self)
3666{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003667 /* Since Unicode objects compare equal to their ASCII string
3668 counterparts, they should use the individual character values
3669 as basis for their hash value. This is needed to assure that
3670 strings and Unicode objects behave in the same way as
3671 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003672
Fredrik Lundhdde61642000-07-10 18:27:47 +00003673 register int len;
3674 register Py_UNICODE *p;
3675 register long x;
3676
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 if (self->hash != -1)
3678 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003679 len = PyUnicode_GET_SIZE(self);
3680 p = PyUnicode_AS_UNICODE(self);
3681 x = *p << 7;
3682 while (--len >= 0)
3683 x = (1000003*x) ^ *p++;
3684 x ^= PyUnicode_GET_SIZE(self);
3685 if (x == -1)
3686 x = -2;
3687 self->hash = x;
3688 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003689}
3690
3691static char index__doc__[] =
3692"S.index(sub [,start [,end]]) -> int\n\
3693\n\
3694Like S.find() but raise ValueError when the substring is not found.";
3695
3696static PyObject *
3697unicode_index(PyUnicodeObject *self, PyObject *args)
3698{
3699 int result;
3700 PyUnicodeObject *substring;
3701 int start = 0;
3702 int end = INT_MAX;
3703
Guido van Rossumb8872e62000-05-09 14:14:27 +00003704 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3705 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003706 return NULL;
3707
3708 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3709 (PyObject *)substring);
3710 if (substring == NULL)
3711 return NULL;
3712
3713 result = findstring(self, substring, start, end, 1);
3714
3715 Py_DECREF(substring);
3716 if (result < 0) {
3717 PyErr_SetString(PyExc_ValueError, "substring not found");
3718 return NULL;
3719 }
3720 return PyInt_FromLong(result);
3721}
3722
3723static char islower__doc__[] =
3724"S.islower() -> int\n\
3725\n\
3726Return 1 if all cased characters in S are lowercase and there is\n\
3727at least one cased character in S, 0 otherwise.";
3728
3729static PyObject*
3730unicode_islower(PyUnicodeObject *self, PyObject *args)
3731{
3732 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3733 register const Py_UNICODE *e;
3734 int cased;
3735
3736 if (!PyArg_NoArgs(args))
3737 return NULL;
3738
3739 /* Shortcut for single character strings */
3740 if (PyUnicode_GET_SIZE(self) == 1)
3741 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3742
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003743 /* Special case for empty strings */
3744 if (PyString_GET_SIZE(self) == 0)
3745 return PyInt_FromLong(0);
3746
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747 e = p + PyUnicode_GET_SIZE(self);
3748 cased = 0;
3749 for (; p < e; p++) {
3750 register const Py_UNICODE ch = *p;
3751
3752 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3753 return PyInt_FromLong(0);
3754 else if (!cased && Py_UNICODE_ISLOWER(ch))
3755 cased = 1;
3756 }
3757 return PyInt_FromLong(cased);
3758}
3759
3760static char isupper__doc__[] =
3761"S.isupper() -> int\n\
3762\n\
3763Return 1 if all cased characters in S are uppercase and there is\n\
3764at least one cased character in S, 0 otherwise.";
3765
3766static PyObject*
3767unicode_isupper(PyUnicodeObject *self, PyObject *args)
3768{
3769 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3770 register const Py_UNICODE *e;
3771 int cased;
3772
3773 if (!PyArg_NoArgs(args))
3774 return NULL;
3775
3776 /* Shortcut for single character strings */
3777 if (PyUnicode_GET_SIZE(self) == 1)
3778 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3779
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003780 /* Special case for empty strings */
3781 if (PyString_GET_SIZE(self) == 0)
3782 return PyInt_FromLong(0);
3783
Guido van Rossumd57fd912000-03-10 22:53:23 +00003784 e = p + PyUnicode_GET_SIZE(self);
3785 cased = 0;
3786 for (; p < e; p++) {
3787 register const Py_UNICODE ch = *p;
3788
3789 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3790 return PyInt_FromLong(0);
3791 else if (!cased && Py_UNICODE_ISUPPER(ch))
3792 cased = 1;
3793 }
3794 return PyInt_FromLong(cased);
3795}
3796
3797static char istitle__doc__[] =
3798"S.istitle() -> int\n\
3799\n\
3800Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3801may only follow uncased characters and lowercase characters only cased\n\
3802ones. Return 0 otherwise.";
3803
3804static PyObject*
3805unicode_istitle(PyUnicodeObject *self, PyObject *args)
3806{
3807 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3808 register const Py_UNICODE *e;
3809 int cased, previous_is_cased;
3810
3811 if (!PyArg_NoArgs(args))
3812 return NULL;
3813
3814 /* Shortcut for single character strings */
3815 if (PyUnicode_GET_SIZE(self) == 1)
3816 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3817 (Py_UNICODE_ISUPPER(*p) != 0));
3818
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003819 /* Special case for empty strings */
3820 if (PyString_GET_SIZE(self) == 0)
3821 return PyInt_FromLong(0);
3822
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823 e = p + PyUnicode_GET_SIZE(self);
3824 cased = 0;
3825 previous_is_cased = 0;
3826 for (; p < e; p++) {
3827 register const Py_UNICODE ch = *p;
3828
3829 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3830 if (previous_is_cased)
3831 return PyInt_FromLong(0);
3832 previous_is_cased = 1;
3833 cased = 1;
3834 }
3835 else if (Py_UNICODE_ISLOWER(ch)) {
3836 if (!previous_is_cased)
3837 return PyInt_FromLong(0);
3838 previous_is_cased = 1;
3839 cased = 1;
3840 }
3841 else
3842 previous_is_cased = 0;
3843 }
3844 return PyInt_FromLong(cased);
3845}
3846
3847static char isspace__doc__[] =
3848"S.isspace() -> int\n\
3849\n\
3850Return 1 if there are only whitespace characters in S,\n\
38510 otherwise.";
3852
3853static PyObject*
3854unicode_isspace(PyUnicodeObject *self, PyObject *args)
3855{
3856 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3857 register const Py_UNICODE *e;
3858
3859 if (!PyArg_NoArgs(args))
3860 return NULL;
3861
3862 /* Shortcut for single character strings */
3863 if (PyUnicode_GET_SIZE(self) == 1 &&
3864 Py_UNICODE_ISSPACE(*p))
3865 return PyInt_FromLong(1);
3866
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003867 /* Special case for empty strings */
3868 if (PyString_GET_SIZE(self) == 0)
3869 return PyInt_FromLong(0);
3870
Guido van Rossumd57fd912000-03-10 22:53:23 +00003871 e = p + PyUnicode_GET_SIZE(self);
3872 for (; p < e; p++) {
3873 if (!Py_UNICODE_ISSPACE(*p))
3874 return PyInt_FromLong(0);
3875 }
3876 return PyInt_FromLong(1);
3877}
3878
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003879static char isalpha__doc__[] =
3880"S.isalpha() -> int\n\
3881\n\
3882Return 1 if all characters in S are alphabetic\n\
3883and there is at least one character in S, 0 otherwise.";
3884
3885static PyObject*
3886unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3887{
3888 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3889 register const Py_UNICODE *e;
3890
3891 if (!PyArg_NoArgs(args))
3892 return NULL;
3893
3894 /* Shortcut for single character strings */
3895 if (PyUnicode_GET_SIZE(self) == 1 &&
3896 Py_UNICODE_ISALPHA(*p))
3897 return PyInt_FromLong(1);
3898
3899 /* Special case for empty strings */
3900 if (PyString_GET_SIZE(self) == 0)
3901 return PyInt_FromLong(0);
3902
3903 e = p + PyUnicode_GET_SIZE(self);
3904 for (; p < e; p++) {
3905 if (!Py_UNICODE_ISALPHA(*p))
3906 return PyInt_FromLong(0);
3907 }
3908 return PyInt_FromLong(1);
3909}
3910
3911static char isalnum__doc__[] =
3912"S.isalnum() -> int\n\
3913\n\
3914Return 1 if all characters in S are alphanumeric\n\
3915and there is at least one character in S, 0 otherwise.";
3916
3917static PyObject*
3918unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3919{
3920 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3921 register const Py_UNICODE *e;
3922
3923 if (!PyArg_NoArgs(args))
3924 return NULL;
3925
3926 /* Shortcut for single character strings */
3927 if (PyUnicode_GET_SIZE(self) == 1 &&
3928 Py_UNICODE_ISALNUM(*p))
3929 return PyInt_FromLong(1);
3930
3931 /* Special case for empty strings */
3932 if (PyString_GET_SIZE(self) == 0)
3933 return PyInt_FromLong(0);
3934
3935 e = p + PyUnicode_GET_SIZE(self);
3936 for (; p < e; p++) {
3937 if (!Py_UNICODE_ISALNUM(*p))
3938 return PyInt_FromLong(0);
3939 }
3940 return PyInt_FromLong(1);
3941}
3942
Guido van Rossumd57fd912000-03-10 22:53:23 +00003943static char isdecimal__doc__[] =
3944"S.isdecimal() -> int\n\
3945\n\
3946Return 1 if there are only decimal characters in S,\n\
39470 otherwise.";
3948
3949static PyObject*
3950unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3951{
3952 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3953 register const Py_UNICODE *e;
3954
3955 if (!PyArg_NoArgs(args))
3956 return NULL;
3957
3958 /* Shortcut for single character strings */
3959 if (PyUnicode_GET_SIZE(self) == 1 &&
3960 Py_UNICODE_ISDECIMAL(*p))
3961 return PyInt_FromLong(1);
3962
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003963 /* Special case for empty strings */
3964 if (PyString_GET_SIZE(self) == 0)
3965 return PyInt_FromLong(0);
3966
Guido van Rossumd57fd912000-03-10 22:53:23 +00003967 e = p + PyUnicode_GET_SIZE(self);
3968 for (; p < e; p++) {
3969 if (!Py_UNICODE_ISDECIMAL(*p))
3970 return PyInt_FromLong(0);
3971 }
3972 return PyInt_FromLong(1);
3973}
3974
3975static char isdigit__doc__[] =
3976"S.isdigit() -> int\n\
3977\n\
3978Return 1 if there are only digit characters in S,\n\
39790 otherwise.";
3980
3981static PyObject*
3982unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3983{
3984 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3985 register const Py_UNICODE *e;
3986
3987 if (!PyArg_NoArgs(args))
3988 return NULL;
3989
3990 /* Shortcut for single character strings */
3991 if (PyUnicode_GET_SIZE(self) == 1 &&
3992 Py_UNICODE_ISDIGIT(*p))
3993 return PyInt_FromLong(1);
3994
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003995 /* Special case for empty strings */
3996 if (PyString_GET_SIZE(self) == 0)
3997 return PyInt_FromLong(0);
3998
Guido van Rossumd57fd912000-03-10 22:53:23 +00003999 e = p + PyUnicode_GET_SIZE(self);
4000 for (; p < e; p++) {
4001 if (!Py_UNICODE_ISDIGIT(*p))
4002 return PyInt_FromLong(0);
4003 }
4004 return PyInt_FromLong(1);
4005}
4006
4007static char isnumeric__doc__[] =
4008"S.isnumeric() -> int\n\
4009\n\
4010Return 1 if there are only numeric characters in S,\n\
40110 otherwise.";
4012
4013static PyObject*
4014unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
4015{
4016 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4017 register const Py_UNICODE *e;
4018
4019 if (!PyArg_NoArgs(args))
4020 return NULL;
4021
4022 /* Shortcut for single character strings */
4023 if (PyUnicode_GET_SIZE(self) == 1 &&
4024 Py_UNICODE_ISNUMERIC(*p))
4025 return PyInt_FromLong(1);
4026
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004027 /* Special case for empty strings */
4028 if (PyString_GET_SIZE(self) == 0)
4029 return PyInt_FromLong(0);
4030
Guido van Rossumd57fd912000-03-10 22:53:23 +00004031 e = p + PyUnicode_GET_SIZE(self);
4032 for (; p < e; p++) {
4033 if (!Py_UNICODE_ISNUMERIC(*p))
4034 return PyInt_FromLong(0);
4035 }
4036 return PyInt_FromLong(1);
4037}
4038
4039static char join__doc__[] =
4040"S.join(sequence) -> unicode\n\
4041\n\
4042Return a string which is the concatenation of the strings in the\n\
4043sequence. The separator between elements is S.";
4044
4045static PyObject*
4046unicode_join(PyUnicodeObject *self, PyObject *args)
4047{
4048 PyObject *data;
4049 if (!PyArg_ParseTuple(args, "O:join", &data))
4050 return NULL;
4051
4052 return PyUnicode_Join((PyObject *)self, data);
4053}
4054
4055static int
4056unicode_length(PyUnicodeObject *self)
4057{
4058 return self->length;
4059}
4060
4061static char ljust__doc__[] =
4062"S.ljust(width) -> unicode\n\
4063\n\
4064Return S left justified in a Unicode string of length width. Padding is\n\
4065done using spaces.";
4066
4067static PyObject *
4068unicode_ljust(PyUnicodeObject *self, PyObject *args)
4069{
4070 int width;
4071 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4072 return NULL;
4073
4074 if (self->length >= width) {
4075 Py_INCREF(self);
4076 return (PyObject*) self;
4077 }
4078
4079 return (PyObject*) pad(self, 0, width - self->length, ' ');
4080}
4081
4082static char lower__doc__[] =
4083"S.lower() -> unicode\n\
4084\n\
4085Return a copy of the string S converted to lowercase.";
4086
4087static PyObject*
4088unicode_lower(PyUnicodeObject *self, PyObject *args)
4089{
4090 if (!PyArg_NoArgs(args))
4091 return NULL;
4092 return fixup(self, fixlower);
4093}
4094
4095static char lstrip__doc__[] =
4096"S.lstrip() -> unicode\n\
4097\n\
4098Return a copy of the string S with leading whitespace removed.";
4099
4100static PyObject *
4101unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4102{
4103 if (!PyArg_NoArgs(args))
4104 return NULL;
4105 return strip(self, 1, 0);
4106}
4107
4108static PyObject*
4109unicode_repeat(PyUnicodeObject *str, int len)
4110{
4111 PyUnicodeObject *u;
4112 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004113 int nchars;
4114 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115
4116 if (len < 0)
4117 len = 0;
4118
4119 if (len == 1) {
4120 /* no repeat, return original string */
4121 Py_INCREF(str);
4122 return (PyObject*) str;
4123 }
Tim Peters8f422462000-09-09 06:13:41 +00004124
4125 /* ensure # of chars needed doesn't overflow int and # of bytes
4126 * needed doesn't overflow size_t
4127 */
4128 nchars = len * str->length;
4129 if (len && nchars / len != str->length) {
4130 PyErr_SetString(PyExc_OverflowError,
4131 "repeated string is too long");
4132 return NULL;
4133 }
4134 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4135 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4136 PyErr_SetString(PyExc_OverflowError,
4137 "repeated string is too long");
4138 return NULL;
4139 }
4140 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004141 if (!u)
4142 return NULL;
4143
4144 p = u->str;
4145
4146 while (len-- > 0) {
4147 Py_UNICODE_COPY(p, str->str, str->length);
4148 p += str->length;
4149 }
4150
4151 return (PyObject*) u;
4152}
4153
4154PyObject *PyUnicode_Replace(PyObject *obj,
4155 PyObject *subobj,
4156 PyObject *replobj,
4157 int maxcount)
4158{
4159 PyObject *self;
4160 PyObject *str1;
4161 PyObject *str2;
4162 PyObject *result;
4163
4164 self = PyUnicode_FromObject(obj);
4165 if (self == NULL)
4166 return NULL;
4167 str1 = PyUnicode_FromObject(subobj);
4168 if (str1 == NULL) {
4169 Py_DECREF(self);
4170 return NULL;
4171 }
4172 str2 = PyUnicode_FromObject(replobj);
4173 if (str2 == NULL) {
4174 Py_DECREF(self);
4175 Py_DECREF(str1);
4176 return NULL;
4177 }
4178 result = replace((PyUnicodeObject *)self,
4179 (PyUnicodeObject *)str1,
4180 (PyUnicodeObject *)str2,
4181 maxcount);
4182 Py_DECREF(self);
4183 Py_DECREF(str1);
4184 Py_DECREF(str2);
4185 return result;
4186}
4187
4188static char replace__doc__[] =
4189"S.replace (old, new[, maxsplit]) -> unicode\n\
4190\n\
4191Return a copy of S with all occurrences of substring\n\
4192old replaced by new. If the optional argument maxsplit is\n\
4193given, only the first maxsplit occurrences are replaced.";
4194
4195static PyObject*
4196unicode_replace(PyUnicodeObject *self, PyObject *args)
4197{
4198 PyUnicodeObject *str1;
4199 PyUnicodeObject *str2;
4200 int maxcount = -1;
4201 PyObject *result;
4202
4203 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4204 return NULL;
4205 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4206 if (str1 == NULL)
4207 return NULL;
4208 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4209 if (str2 == NULL)
4210 return NULL;
4211
4212 result = replace(self, str1, str2, maxcount);
4213
4214 Py_DECREF(str1);
4215 Py_DECREF(str2);
4216 return result;
4217}
4218
4219static
4220PyObject *unicode_repr(PyObject *unicode)
4221{
4222 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4223 PyUnicode_GET_SIZE(unicode),
4224 1);
4225}
4226
4227static char rfind__doc__[] =
4228"S.rfind(sub [,start [,end]]) -> int\n\
4229\n\
4230Return the highest index in S where substring sub is found,\n\
4231such that sub is contained within s[start,end]. Optional\n\
4232arguments start and end are interpreted as in slice notation.\n\
4233\n\
4234Return -1 on failure.";
4235
4236static PyObject *
4237unicode_rfind(PyUnicodeObject *self, PyObject *args)
4238{
4239 PyUnicodeObject *substring;
4240 int start = 0;
4241 int end = INT_MAX;
4242 PyObject *result;
4243
Guido van Rossumb8872e62000-05-09 14:14:27 +00004244 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4245 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004246 return NULL;
4247 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4248 (PyObject *)substring);
4249 if (substring == NULL)
4250 return NULL;
4251
4252 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4253
4254 Py_DECREF(substring);
4255 return result;
4256}
4257
4258static char rindex__doc__[] =
4259"S.rindex(sub [,start [,end]]) -> int\n\
4260\n\
4261Like S.rfind() but raise ValueError when the substring is not found.";
4262
4263static PyObject *
4264unicode_rindex(PyUnicodeObject *self, PyObject *args)
4265{
4266 int result;
4267 PyUnicodeObject *substring;
4268 int start = 0;
4269 int end = INT_MAX;
4270
Guido van Rossumb8872e62000-05-09 14:14:27 +00004271 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4272 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004273 return NULL;
4274 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4275 (PyObject *)substring);
4276 if (substring == NULL)
4277 return NULL;
4278
4279 result = findstring(self, substring, start, end, -1);
4280
4281 Py_DECREF(substring);
4282 if (result < 0) {
4283 PyErr_SetString(PyExc_ValueError, "substring not found");
4284 return NULL;
4285 }
4286 return PyInt_FromLong(result);
4287}
4288
4289static char rjust__doc__[] =
4290"S.rjust(width) -> unicode\n\
4291\n\
4292Return S right justified in a Unicode string of length width. Padding is\n\
4293done using spaces.";
4294
4295static PyObject *
4296unicode_rjust(PyUnicodeObject *self, PyObject *args)
4297{
4298 int width;
4299 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4300 return NULL;
4301
4302 if (self->length >= width) {
4303 Py_INCREF(self);
4304 return (PyObject*) self;
4305 }
4306
4307 return (PyObject*) pad(self, width - self->length, 0, ' ');
4308}
4309
4310static char rstrip__doc__[] =
4311"S.rstrip() -> unicode\n\
4312\n\
4313Return a copy of the string S with trailing whitespace removed.";
4314
4315static PyObject *
4316unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4317{
4318 if (!PyArg_NoArgs(args))
4319 return NULL;
4320 return strip(self, 0, 1);
4321}
4322
4323static PyObject*
4324unicode_slice(PyUnicodeObject *self, int start, int end)
4325{
4326 /* standard clamping */
4327 if (start < 0)
4328 start = 0;
4329 if (end < 0)
4330 end = 0;
4331 if (end > self->length)
4332 end = self->length;
4333 if (start == 0 && end == self->length) {
4334 /* full slice, return original string */
4335 Py_INCREF(self);
4336 return (PyObject*) self;
4337 }
4338 if (start > end)
4339 start = end;
4340 /* copy slice */
4341 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4342 end - start);
4343}
4344
4345PyObject *PyUnicode_Split(PyObject *s,
4346 PyObject *sep,
4347 int maxsplit)
4348{
4349 PyObject *result;
4350
4351 s = PyUnicode_FromObject(s);
4352 if (s == NULL)
4353 return NULL;
4354 if (sep != NULL) {
4355 sep = PyUnicode_FromObject(sep);
4356 if (sep == NULL) {
4357 Py_DECREF(s);
4358 return NULL;
4359 }
4360 }
4361
4362 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4363
4364 Py_DECREF(s);
4365 Py_XDECREF(sep);
4366 return result;
4367}
4368
4369static char split__doc__[] =
4370"S.split([sep [,maxsplit]]) -> list of strings\n\
4371\n\
4372Return a list of the words in S, using sep as the\n\
4373delimiter string. If maxsplit is given, at most maxsplit\n\
4374splits are done. If sep is not specified, any whitespace string\n\
4375is a separator.";
4376
4377static PyObject*
4378unicode_split(PyUnicodeObject *self, PyObject *args)
4379{
4380 PyObject *substring = Py_None;
4381 int maxcount = -1;
4382
4383 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4384 return NULL;
4385
4386 if (substring == Py_None)
4387 return split(self, NULL, maxcount);
4388 else if (PyUnicode_Check(substring))
4389 return split(self, (PyUnicodeObject *)substring, maxcount);
4390 else
4391 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4392}
4393
4394static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004395"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396\n\
4397Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004398Line breaks are not included in the resulting list unless keepends\n\
4399is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004400
4401static PyObject*
4402unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4403{
Guido van Rossum86662912000-04-11 15:38:46 +00004404 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004405
Guido van Rossum86662912000-04-11 15:38:46 +00004406 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407 return NULL;
4408
Guido van Rossum86662912000-04-11 15:38:46 +00004409 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004410}
4411
4412static
4413PyObject *unicode_str(PyUnicodeObject *self)
4414{
Fred Drakee4315f52000-05-09 19:53:39 +00004415 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004416}
4417
4418static char strip__doc__[] =
4419"S.strip() -> unicode\n\
4420\n\
4421Return a copy of S with leading and trailing whitespace removed.";
4422
4423static PyObject *
4424unicode_strip(PyUnicodeObject *self, PyObject *args)
4425{
4426 if (!PyArg_NoArgs(args))
4427 return NULL;
4428 return strip(self, 1, 1);
4429}
4430
4431static char swapcase__doc__[] =
4432"S.swapcase() -> unicode\n\
4433\n\
4434Return a copy of S with uppercase characters converted to lowercase\n\
4435and vice versa.";
4436
4437static PyObject*
4438unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4439{
4440 if (!PyArg_NoArgs(args))
4441 return NULL;
4442 return fixup(self, fixswapcase);
4443}
4444
4445static char translate__doc__[] =
4446"S.translate(table) -> unicode\n\
4447\n\
4448Return a copy of the string S, where all characters have been mapped\n\
4449through the given translation table, which must be a mapping of\n\
4450Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4451are left untouched. Characters mapped to None are deleted.";
4452
4453static PyObject*
4454unicode_translate(PyUnicodeObject *self, PyObject *args)
4455{
4456 PyObject *table;
4457
4458 if (!PyArg_ParseTuple(args, "O:translate", &table))
4459 return NULL;
4460 return PyUnicode_TranslateCharmap(self->str,
4461 self->length,
4462 table,
4463 "ignore");
4464}
4465
4466static char upper__doc__[] =
4467"S.upper() -> unicode\n\
4468\n\
4469Return a copy of S converted to uppercase.";
4470
4471static PyObject*
4472unicode_upper(PyUnicodeObject *self, PyObject *args)
4473{
4474 if (!PyArg_NoArgs(args))
4475 return NULL;
4476 return fixup(self, fixupper);
4477}
4478
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* S.zfill(width) -- currently compiled out.

   Strings already at least `width` long are returned unchanged (same
   object, new reference).  Otherwise the string is padded on the left
   with '0' via pad(), and a '+' or '-' that was the first character of
   the original is moved in front of the padding.  Returns NULL if
   argument parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    if (u == NULL)
        /* pad() failed; u->str must not be touched. */
        return NULL;

    /* u->str[fill] is the first character of the original string. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4514
#if 0
/* Debugging aid (compiled out): return the value of
   unicode_freelist_size as a Python int.  Takes no arguments; returns
   NULL when PyArg_NoArgs() fails. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
4524
4525static char startswith__doc__[] =
4526"S.startswith(prefix[, start[, end]]) -> int\n\
4527\n\
4528Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4529optional start, test S beginning at that position. With optional end, stop\n\
4530comparing S at that position.";
4531
4532static PyObject *
4533unicode_startswith(PyUnicodeObject *self,
4534 PyObject *args)
4535{
4536 PyUnicodeObject *substring;
4537 int start = 0;
4538 int end = INT_MAX;
4539 PyObject *result;
4540
Guido van Rossumb8872e62000-05-09 14:14:27 +00004541 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4542 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004543 return NULL;
4544 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4545 (PyObject *)substring);
4546 if (substring == NULL)
4547 return NULL;
4548
4549 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4550
4551 Py_DECREF(substring);
4552 return result;
4553}
4554
4555
4556static char endswith__doc__[] =
4557"S.endswith(suffix[, start[, end]]) -> int\n\
4558\n\
4559Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4560optional start, test S beginning at that position. With optional end, stop\n\
4561comparing S at that position.";
4562
4563static PyObject *
4564unicode_endswith(PyUnicodeObject *self,
4565 PyObject *args)
4566{
4567 PyUnicodeObject *substring;
4568 int start = 0;
4569 int end = INT_MAX;
4570 PyObject *result;
4571
Guido van Rossumb8872e62000-05-09 14:14:27 +00004572 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4573 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004574 return NULL;
4575 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4576 (PyObject *)substring);
4577 if (substring == NULL)
4578 return NULL;
4579
4580 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4581
4582 Py_DECREF(substring);
4583 return result;
4584}
4585
4586
/* Method table of the Unicode type; unicode_getattr() looks attribute
   names up here via Py_FindMethod().

   Each entry is {name, C function, flag, doc string}.  Among the
   handlers defined in this part of the file, flag 1 marks those that
   parse their arguments with PyArg_ParseTuple() and flag 0 those that
   use PyArg_NoArgs().  The table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
    /* Disabled entries: unicode_zfill is itself compiled out with
       #if 0 further up in this file. */
#if 0
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    {NULL, NULL}
};
4639
4640static PyObject *
4641unicode_getattr(PyUnicodeObject *self, char *name)
4642{
4643 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4644}
4645
/* Sequence protocol slots of the Unicode type.  The two assignment
   slots are 0, i.e. no item or slice assignment handler is installed. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length,           /* sq_length */
    (binaryfunc) PyUnicode_Concat,      /* sq_concat */
    (intargfunc) unicode_repeat,        /* sq_repeat */
    (intargfunc) unicode_getitem,       /* sq_item */
    (intintargfunc) unicode_slice,      /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains,     /*sq_contains*/
};
4656
4657static int
4658unicode_buffer_getreadbuf(PyUnicodeObject *self,
4659 int index,
4660 const void **ptr)
4661{
4662 if (index != 0) {
4663 PyErr_SetString(PyExc_SystemError,
4664 "accessing non-existent unicode segment");
4665 return -1;
4666 }
4667 *ptr = (void *) self->str;
4668 return PyUnicode_GET_DATA_SIZE(self);
4669}
4670
4671static int
4672unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4673 const void **ptr)
4674{
4675 PyErr_SetString(PyExc_TypeError,
4676 "cannot use unicode as modifyable buffer");
4677 return -1;
4678}
4679
4680static int
4681unicode_buffer_getsegcount(PyUnicodeObject *self,
4682 int *lenp)
4683{
4684 if (lenp)
4685 *lenp = PyUnicode_GET_DATA_SIZE(self);
4686 return 1;
4687}
4688
4689static int
4690unicode_buffer_getcharbuf(PyUnicodeObject *self,
4691 int index,
4692 const void **ptr)
4693{
4694 PyObject *str;
4695
4696 if (index != 0) {
4697 PyErr_SetString(PyExc_SystemError,
4698 "accessing non-existent unicode segment");
4699 return -1;
4700 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004701 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702 if (str == NULL)
4703 return -1;
4704 *ptr = (void *) PyString_AS_STRING(str);
4705 return PyString_GET_SIZE(str);
4706}
4707
4708/* Helpers for PyUnicode_Format() */
4709
4710static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004711getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004712{
4713 int argidx = *p_argidx;
4714 if (argidx < arglen) {
4715 (*p_argidx)++;
4716 if (arglen < 0)
4717 return args;
4718 else
4719 return PyTuple_GetItem(args, argidx);
4720 }
4721 PyErr_SetString(PyExc_TypeError,
4722 "not enough arguments for format string");
4723 return NULL;
4724}
4725
/* Conversion flag bits collected by PyUnicode_Format() while it scans
   the flag characters of a format specifier. */
#define F_LJUST (1<<0)  /* '-' flag, also set for a negative '*' width */
#define F_SIGN  (1<<1)  /* '+' flag */
#define F_BLANK (1<<2)  /* ' ' flag */
#define F_ALT   (1<<3)  /* '#' flag */
#define F_ZERO  (1<<4)  /* '0' flag */
4731
4732static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734{
4735 register int i;
4736 int len;
4737 va_list va;
4738 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740
4741 /* First, format the string as char array, then expand to Py_UNICODE
4742 array. */
4743 charbuffer = (char *)buffer;
4744 len = vsprintf(charbuffer, format, va);
4745 for (i = len - 1; i >= 0; i--)
4746 buffer[i] = (Py_UNICODE) charbuffer[i];
4747
4748 va_end(va);
4749 return len;
4750}
4751
4752static int
4753formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004754 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755 int flags,
4756 int prec,
4757 int type,
4758 PyObject *v)
4759{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004760 /* fmt = '%#.' + `prec` + `type`
4761 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762 char fmt[20];
4763 double x;
4764
4765 x = PyFloat_AsDouble(v);
4766 if (x == -1.0 && PyErr_Occurred())
4767 return -1;
4768 if (prec < 0)
4769 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4771 type = 'g';
4772 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004773 /* worst case length calc to ensure no buffer overrun:
4774 fmt = %#.<prec>g
4775 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4776 for any double rep.)
4777 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4778 If prec=0 the effective precision is 1 (the leading digit is
4779 always given), therefore increase by one to 10+prec. */
4780 if (buflen <= (size_t)10 + (size_t)prec) {
4781 PyErr_SetString(PyExc_OverflowError,
4782 "formatted float is too long (precision too long?)");
4783 return -1;
4784 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785 return usprintf(buf, fmt, x);
4786}
4787
Tim Peters38fd5b62000-09-21 05:43:11 +00004788static PyObject*
4789formatlong(PyObject *val, int flags, int prec, int type)
4790{
4791 char *buf;
4792 int i, len;
4793 PyObject *str; /* temporary string object. */
4794 PyUnicodeObject *result;
4795
4796 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4797 if (!str)
4798 return NULL;
4799 result = _PyUnicode_New(len);
4800 for (i = 0; i < len; i++)
4801 result->str[i] = buf[i];
4802 result->str[len] = 0;
4803 Py_DECREF(str);
4804 return (PyObject*)result;
4805}
4806
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807static int
4808formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004809 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810 int flags,
4811 int prec,
4812 int type,
4813 PyObject *v)
4814{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004815 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004816 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4817 + 1 + 1 = 24*/
4818 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004819 long x;
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004820 int use_native_c_format = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004821
4822 x = PyInt_AsLong(v);
4823 if (x == -1 && PyErr_Occurred())
4824 return -1;
4825 if (prec < 0)
4826 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004827 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4828 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4829 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4830 PyErr_SetString(PyExc_OverflowError,
4831 "formatted integer is too long (precision too long?)");
4832 return -1;
4833 }
Tim Petersfff53252001-04-12 18:38:48 +00004834 /* When converting 0 under %#x or %#X, C leaves off the base marker,
4835 * but we want it (for consistency with other %#x conversions, and
4836 * for consistency with Python's hex() function).
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004837 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
4838 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
4839 * So add it only if the platform doesn't already.
Tim Petersfff53252001-04-12 18:38:48 +00004840 */
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004841 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
4842 /* Only way to know what the platform does is to try it. */
4843 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
4844 if (fmt[1] != (char)type) {
4845 /* Supply our own leading 0x/0X -- needed under std C */
4846 use_native_c_format = 0;
4847 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
4848 }
4849 }
4850 if (use_native_c_format)
4851 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852 return usprintf(buf, fmt, x);
4853}
4854
4855static int
4856formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004857 size_t buflen,
4858 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004860 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004861 if (PyUnicode_Check(v)) {
4862 if (PyUnicode_GET_SIZE(v) != 1)
4863 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004864 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004865 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004867 else if (PyString_Check(v)) {
4868 if (PyString_GET_SIZE(v) != 1)
4869 goto onError;
4870 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4871 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004872
4873 else {
4874 /* Integer input truncated to a character */
4875 long x;
4876 x = PyInt_AsLong(v);
4877 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004878 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879 buf[0] = (char) x;
4880 }
4881 buf[1] = '\0';
4882 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004883
4884 onError:
4885 PyErr_SetString(PyExc_TypeError,
4886 "%c requires int or char");
4887 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888}
4889
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in Py_UNICODE units, of the buffer in
   which the floats, ints, & chars are formatted.  XXX This is a magic
   number.  Each formatting routine does bounds checking to ensure no
   overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.
*/
#define FORMATBUFLEN (size_t)120
4899
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900PyObject *PyUnicode_Format(PyObject *format,
4901 PyObject *args)
4902{
4903 Py_UNICODE *fmt, *res;
4904 int fmtcnt, rescnt, reslen, arglen, argidx;
4905 int args_owned = 0;
4906 PyUnicodeObject *result = NULL;
4907 PyObject *dict = NULL;
4908 PyObject *uformat;
4909
4910 if (format == NULL || args == NULL) {
4911 PyErr_BadInternalCall();
4912 return NULL;
4913 }
4914 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004915 if (uformat == NULL)
4916 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917 fmt = PyUnicode_AS_UNICODE(uformat);
4918 fmtcnt = PyUnicode_GET_SIZE(uformat);
4919
4920 reslen = rescnt = fmtcnt + 100;
4921 result = _PyUnicode_New(reslen);
4922 if (result == NULL)
4923 goto onError;
4924 res = PyUnicode_AS_UNICODE(result);
4925
4926 if (PyTuple_Check(args)) {
4927 arglen = PyTuple_Size(args);
4928 argidx = 0;
4929 }
4930 else {
4931 arglen = -1;
4932 argidx = -2;
4933 }
4934 if (args->ob_type->tp_as_mapping)
4935 dict = args;
4936
4937 while (--fmtcnt >= 0) {
4938 if (*fmt != '%') {
4939 if (--rescnt < 0) {
4940 rescnt = fmtcnt + 100;
4941 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004942 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004943 return NULL;
4944 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4945 --rescnt;
4946 }
4947 *res++ = *fmt++;
4948 }
4949 else {
4950 /* Got a format specifier */
4951 int flags = 0;
4952 int width = -1;
4953 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004954 Py_UNICODE c = '\0';
4955 Py_UNICODE fill;
4956 PyObject *v = NULL;
4957 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004958 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004959 Py_UNICODE sign;
4960 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004961 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004962
4963 fmt++;
4964 if (*fmt == '(') {
4965 Py_UNICODE *keystart;
4966 int keylen;
4967 PyObject *key;
4968 int pcount = 1;
4969
4970 if (dict == NULL) {
4971 PyErr_SetString(PyExc_TypeError,
4972 "format requires a mapping");
4973 goto onError;
4974 }
4975 ++fmt;
4976 --fmtcnt;
4977 keystart = fmt;
4978 /* Skip over balanced parentheses */
4979 while (pcount > 0 && --fmtcnt >= 0) {
4980 if (*fmt == ')')
4981 --pcount;
4982 else if (*fmt == '(')
4983 ++pcount;
4984 fmt++;
4985 }
4986 keylen = fmt - keystart - 1;
4987 if (fmtcnt < 0 || pcount > 0) {
4988 PyErr_SetString(PyExc_ValueError,
4989 "incomplete format key");
4990 goto onError;
4991 }
Fred Drakee4315f52000-05-09 19:53:39 +00004992 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993 then looked up since Python uses strings to hold
4994 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004995 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004996 key = PyUnicode_EncodeUTF8(keystart,
4997 keylen,
4998 NULL);
4999 if (key == NULL)
5000 goto onError;
5001 if (args_owned) {
5002 Py_DECREF(args);
5003 args_owned = 0;
5004 }
5005 args = PyObject_GetItem(dict, key);
5006 Py_DECREF(key);
5007 if (args == NULL) {
5008 goto onError;
5009 }
5010 args_owned = 1;
5011 arglen = -1;
5012 argidx = -2;
5013 }
5014 while (--fmtcnt >= 0) {
5015 switch (c = *fmt++) {
5016 case '-': flags |= F_LJUST; continue;
5017 case '+': flags |= F_SIGN; continue;
5018 case ' ': flags |= F_BLANK; continue;
5019 case '#': flags |= F_ALT; continue;
5020 case '0': flags |= F_ZERO; continue;
5021 }
5022 break;
5023 }
5024 if (c == '*') {
5025 v = getnextarg(args, arglen, &argidx);
5026 if (v == NULL)
5027 goto onError;
5028 if (!PyInt_Check(v)) {
5029 PyErr_SetString(PyExc_TypeError,
5030 "* wants int");
5031 goto onError;
5032 }
5033 width = PyInt_AsLong(v);
5034 if (width < 0) {
5035 flags |= F_LJUST;
5036 width = -width;
5037 }
5038 if (--fmtcnt >= 0)
5039 c = *fmt++;
5040 }
5041 else if (c >= '0' && c <= '9') {
5042 width = c - '0';
5043 while (--fmtcnt >= 0) {
5044 c = *fmt++;
5045 if (c < '0' || c > '9')
5046 break;
5047 if ((width*10) / 10 != width) {
5048 PyErr_SetString(PyExc_ValueError,
5049 "width too big");
5050 goto onError;
5051 }
5052 width = width*10 + (c - '0');
5053 }
5054 }
5055 if (c == '.') {
5056 prec = 0;
5057 if (--fmtcnt >= 0)
5058 c = *fmt++;
5059 if (c == '*') {
5060 v = getnextarg(args, arglen, &argidx);
5061 if (v == NULL)
5062 goto onError;
5063 if (!PyInt_Check(v)) {
5064 PyErr_SetString(PyExc_TypeError,
5065 "* wants int");
5066 goto onError;
5067 }
5068 prec = PyInt_AsLong(v);
5069 if (prec < 0)
5070 prec = 0;
5071 if (--fmtcnt >= 0)
5072 c = *fmt++;
5073 }
5074 else if (c >= '0' && c <= '9') {
5075 prec = c - '0';
5076 while (--fmtcnt >= 0) {
5077 c = Py_CHARMASK(*fmt++);
5078 if (c < '0' || c > '9')
5079 break;
5080 if ((prec*10) / 10 != prec) {
5081 PyErr_SetString(PyExc_ValueError,
5082 "prec too big");
5083 goto onError;
5084 }
5085 prec = prec*10 + (c - '0');
5086 }
5087 }
5088 } /* prec */
5089 if (fmtcnt >= 0) {
5090 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091 if (--fmtcnt >= 0)
5092 c = *fmt++;
5093 }
5094 }
5095 if (fmtcnt < 0) {
5096 PyErr_SetString(PyExc_ValueError,
5097 "incomplete format");
5098 goto onError;
5099 }
5100 if (c != '%') {
5101 v = getnextarg(args, arglen, &argidx);
5102 if (v == NULL)
5103 goto onError;
5104 }
5105 sign = 0;
5106 fill = ' ';
5107 switch (c) {
5108
5109 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005110 pbuf = formatbuf;
5111 /* presume that buffer length is at least 1 */
5112 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005113 len = 1;
5114 break;
5115
5116 case 's':
5117 case 'r':
5118 if (PyUnicode_Check(v) && c == 's') {
5119 temp = v;
5120 Py_INCREF(temp);
5121 }
5122 else {
5123 PyObject *unicode;
5124 if (c == 's')
5125 temp = PyObject_Str(v);
5126 else
5127 temp = PyObject_Repr(v);
5128 if (temp == NULL)
5129 goto onError;
5130 if (!PyString_Check(temp)) {
5131 /* XXX Note: this should never happen, since
5132 PyObject_Repr() and PyObject_Str() assure
5133 this */
5134 Py_DECREF(temp);
5135 PyErr_SetString(PyExc_TypeError,
5136 "%s argument has non-string str()");
5137 goto onError;
5138 }
Fred Drakee4315f52000-05-09 19:53:39 +00005139 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005140 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005141 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142 "strict");
5143 Py_DECREF(temp);
5144 temp = unicode;
5145 if (temp == NULL)
5146 goto onError;
5147 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005148 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 len = PyUnicode_GET_SIZE(temp);
5150 if (prec >= 0 && len > prec)
5151 len = prec;
5152 break;
5153
5154 case 'i':
5155 case 'd':
5156 case 'u':
5157 case 'o':
5158 case 'x':
5159 case 'X':
5160 if (c == 'i')
5161 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005162 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005163 temp = formatlong(v, flags, prec, c);
5164 if (!temp)
5165 goto onError;
5166 pbuf = PyUnicode_AS_UNICODE(temp);
5167 len = PyUnicode_GET_SIZE(temp);
5168 /* unbounded ints can always produce
5169 a sign character! */
5170 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005171 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005172 else {
5173 pbuf = formatbuf;
5174 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5175 flags, prec, c, v);
5176 if (len < 0)
5177 goto onError;
5178 /* only d conversion is signed */
5179 sign = c == 'd';
5180 }
5181 if (flags & F_ZERO)
5182 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 break;
5184
5185 case 'e':
5186 case 'E':
5187 case 'f':
5188 case 'g':
5189 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005190 pbuf = formatbuf;
5191 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5192 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193 if (len < 0)
5194 goto onError;
5195 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005196 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197 fill = '0';
5198 break;
5199
5200 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005201 pbuf = formatbuf;
5202 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203 if (len < 0)
5204 goto onError;
5205 break;
5206
5207 default:
5208 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005209 "unsupported format character '%c' (0x%x) "
5210 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005211 (31<=c && c<=126) ? c : '?',
5212 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213 goto onError;
5214 }
5215 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005216 if (*pbuf == '-' || *pbuf == '+') {
5217 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005218 len--;
5219 }
5220 else if (flags & F_SIGN)
5221 sign = '+';
5222 else if (flags & F_BLANK)
5223 sign = ' ';
5224 else
5225 sign = 0;
5226 }
5227 if (width < len)
5228 width = len;
5229 if (rescnt < width + (sign != 0)) {
5230 reslen -= rescnt;
5231 rescnt = width + fmtcnt + 100;
5232 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005233 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005234 return NULL;
5235 res = PyUnicode_AS_UNICODE(result)
5236 + reslen - rescnt;
5237 }
5238 if (sign) {
5239 if (fill != ' ')
5240 *res++ = sign;
5241 rescnt--;
5242 if (width > len)
5243 width--;
5244 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005245 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5246 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005247 assert(pbuf[1] == c);
5248 if (fill != ' ') {
5249 *res++ = *pbuf++;
5250 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005251 }
Tim Petersfff53252001-04-12 18:38:48 +00005252 rescnt -= 2;
5253 width -= 2;
5254 if (width < 0)
5255 width = 0;
5256 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005257 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005258 if (width > len && !(flags & F_LJUST)) {
5259 do {
5260 --rescnt;
5261 *res++ = fill;
5262 } while (--width > len);
5263 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005264 if (fill == ' ') {
5265 if (sign)
5266 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005267 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005268 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005269 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005270 *res++ = *pbuf++;
5271 *res++ = *pbuf++;
5272 }
5273 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005274 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005275 res += len;
5276 rescnt -= len;
5277 while (--width >= len) {
5278 --rescnt;
5279 *res++ = ' ';
5280 }
5281 if (dict && (argidx < arglen) && c != '%') {
5282 PyErr_SetString(PyExc_TypeError,
5283 "not all arguments converted");
5284 goto onError;
5285 }
5286 Py_XDECREF(temp);
5287 } /* '%' */
5288 } /* until end */
5289 if (argidx < arglen && !dict) {
5290 PyErr_SetString(PyExc_TypeError,
5291 "not all arguments converted");
5292 goto onError;
5293 }
5294
5295 if (args_owned) {
5296 Py_DECREF(args);
5297 }
5298 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005299 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005300 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301 return (PyObject *)result;
5302
5303 onError:
5304 Py_XDECREF(result);
5305 Py_DECREF(uformat);
5306 if (args_owned) {
5307 Py_DECREF(args);
5308 }
5309 return NULL;
5310}
5311
/* Buffer-procedure table for Unicode objects.  PyUnicode_Type points at
   it from its tp_as_buffer slot.  The initializer is positional, so the
   order of the entries is significant; each handler is cast to the
   function-pointer type of the slot it fills. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,    /* read-buffer getter */
    (getwritebufferproc) unicode_buffer_getwritebuf,  /* write-buffer getter */
    (getsegcountproc) unicode_buffer_getsegcount,     /* segment-count getter */
    (getcharbufferproc) unicode_buffer_getcharbuf,    /* char-buffer getter */
};
5318
/* Type object for Unicode objects.

   The initializer is positional: every entry must stay in the order in
   which PyTypeObject declares its fields.  Entries given as 0 or NULL
   (tp_print, tp_setattr, tp_as_number, tp_as_mapping, tp_call,
   tp_getattro, tp_setattro) are left unset by this table. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_basicsize */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)_PyUnicode_Free,        /* tp_dealloc */
    0,                                  /* tp_print */
    (getattrfunc)unicode_getattr,       /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    0,                                  /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash */
    0,                                  /* tp_call */
    (reprfunc) unicode_str,             /* tp_str */
    (getattrofunc) NULL,                /* tp_getattro */
    (setattrofunc) NULL,                /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
};
5343
5344/* Initialize the Unicode implementation */
5345
Thomas Wouters78890102000-07-22 19:25:51 +00005346void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005347{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005348 int i;
5349
Fred Drakee4315f52000-05-09 19:53:39 +00005350 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005351 unicode_freelist = NULL;
5352 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005353 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005354 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005355 for (i = 0; i < 256; i++)
5356 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005357}
5358
5359/* Finalize the Unicode implementation */
5360
5361void
Thomas Wouters78890102000-07-22 19:25:51 +00005362_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005363{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005364 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005365 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005367 Py_XDECREF(unicode_empty);
5368 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005369
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005370 for (i = 0; i < 256; i++) {
5371 if (unicode_latin1[i]) {
5372 Py_DECREF(unicode_latin1[i]);
5373 unicode_latin1[i] = NULL;
5374 }
5375 }
5376
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005377 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378 PyUnicodeObject *v = u;
5379 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005380 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005381 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005382 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005383 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005384 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005385 unicode_freelist = NULL;
5386 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005387}