blob: a46df163f6f951cad2a249218812ae64a19343a8 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000106Py_UNICODE
107PyUnicode_GetMax()
108{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000109#ifdef Py_UNICODE_WIDE
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000110 return 0x10FFFF;
111#else
112 /* This is actually an illegal character, so it should
113 not be passed to unichr. */
114 return 0xFFFF;
115#endif
116}
117
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118/* --- Unicode Object ----------------------------------------------------- */
119
120static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000121int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000122 int length)
123{
124 void *oldstr;
125
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000126 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000130 /* Resizing shared object (unicode_empty or single character
131 objects) in-place is not allowed. Use PyUnicode_Resize()
132 instead ! */
133 if (unicode == unicode_empty ||
134 (unicode->length == 1 &&
135 unicode->str[0] < 256 &&
136 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000137 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000138 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 return -1;
140 }
141
142 /* We allocate one more byte to make sure the string is
143 Ux0000 terminated -- XXX is this needed ? */
144 oldstr = unicode->str;
145 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
146 if (!unicode->str) {
147 unicode->str = oldstr;
148 PyErr_NoMemory();
149 return -1;
150 }
151 unicode->str[length] = 0;
152 unicode->length = length;
153
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000154 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000155 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000156 if (unicode->defenc) {
157 Py_DECREF(unicode->defenc);
158 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000159 }
160 unicode->hash = -1;
161
162 return 0;
163}
164
165/* We allocate one more byte to make sure the string is
166 Ux0000 terminated -- XXX is this needed ?
167
168 XXX This allocator could further be enhanced by assuring that the
169 free list never reduces its size below 1.
170
171*/
172
173static
174PyUnicodeObject *_PyUnicode_New(int length)
175{
176 register PyUnicodeObject *unicode;
177
178 /* Optimization for empty strings */
179 if (length == 0 && unicode_empty != NULL) {
180 Py_INCREF(unicode_empty);
181 return unicode_empty;
182 }
183
184 /* Unicode freelist & memory allocation */
185 if (unicode_freelist) {
186 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000187 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000188 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000190 /* Keep-Alive optimization: we only upsize the buffer,
191 never downsize it. */
192 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000193 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000194 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000195 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 }
197 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000198 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000200 }
201 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 }
203 else {
204 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
205 if (unicode == NULL)
206 return NULL;
207 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
208 }
209
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 if (!unicode->str) {
211 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000213 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 unicode->str[length] = 0;
215 unicode->length = length;
216 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000217 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000219
220 onError:
221 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000222 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000223 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000224}
225
226static
227void _PyUnicode_Free(register PyUnicodeObject *unicode)
228{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000230 /* Keep-Alive optimization */
231 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000232 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 unicode->str = NULL;
234 unicode->length = 0;
235 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000236 if (unicode->defenc) {
237 Py_DECREF(unicode->defenc);
238 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000239 }
240 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 *(PyUnicodeObject **)unicode = unicode_freelist;
242 unicode_freelist = unicode;
243 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000244 }
245 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000246 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000247 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000248 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000249 }
250}
251
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000252int PyUnicode_Resize(PyObject **unicode,
253 int length)
254{
255 register PyUnicodeObject *v;
256
257 /* Argument checks */
258 if (unicode == NULL) {
259 PyErr_BadInternalCall();
260 return -1;
261 }
262 v = (PyUnicodeObject *)*unicode;
263 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
264 PyErr_BadInternalCall();
265 return -1;
266 }
267
268 /* Resizing unicode_empty and single character objects is not
269 possible since these are being shared. We simply return a fresh
270 copy with the same Unicode content. */
271 if (v->length != length &&
272 (v == unicode_empty || v->length == 1)) {
273 PyUnicodeObject *w = _PyUnicode_New(length);
274 if (w == NULL)
275 return -1;
276 Py_UNICODE_COPY(w->str, v->str,
277 length < v->length ? length : v->length);
278 *unicode = (PyObject *)w;
279 return 0;
280 }
281
282 /* Note that we don't have to modify *unicode for unshared Unicode
283 objects, since we can modify them in-place. */
284 return unicode_resize(v, length);
285}
286
287/* Internal API for use in unicodeobject.c only ! */
288#define _PyUnicode_Resize(unicodevar, length) \
289 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
290
Guido van Rossumd57fd912000-03-10 22:53:23 +0000291PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
292 int size)
293{
294 PyUnicodeObject *unicode;
295
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000296 /* If the Unicode data is known at construction time, we can apply
297 some optimizations which share commonly used objects. */
298 if (u != NULL) {
299
300 /* Optimization for empty strings */
301 if (size == 0 && unicode_empty != NULL) {
302 Py_INCREF(unicode_empty);
303 return (PyObject *)unicode_empty;
304 }
305
306 /* Single character Unicode objects in the Latin-1 range are
307 shared when using this constructor */
308 if (size == 1 && *u < 256) {
309 unicode = unicode_latin1[*u];
310 if (!unicode) {
311 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000312 if (!unicode)
313 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000314 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000315 unicode_latin1[*u] = unicode;
316 }
317 Py_INCREF(unicode);
318 return (PyObject *)unicode;
319 }
320 }
321
Guido van Rossumd57fd912000-03-10 22:53:23 +0000322 unicode = _PyUnicode_New(size);
323 if (!unicode)
324 return NULL;
325
326 /* Copy the Unicode data into the new object */
327 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000328 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000329
330 return (PyObject *)unicode;
331}
332
333#ifdef HAVE_WCHAR_H
334
335PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
336 int size)
337{
338 PyUnicodeObject *unicode;
339
340 if (w == NULL) {
341 PyErr_BadInternalCall();
342 return NULL;
343 }
344
345 unicode = _PyUnicode_New(size);
346 if (!unicode)
347 return NULL;
348
349 /* Copy the wchar_t data into the new object */
350#ifdef HAVE_USABLE_WCHAR_T
351 memcpy(unicode->str, w, size * sizeof(wchar_t));
352#else
353 {
354 register Py_UNICODE *u;
355 register int i;
356 u = PyUnicode_AS_UNICODE(unicode);
357 for (i = size; i >= 0; i--)
358 *u++ = *w++;
359 }
360#endif
361
362 return (PyObject *)unicode;
363}
364
365int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
366 register wchar_t *w,
367 int size)
368{
369 if (unicode == NULL) {
370 PyErr_BadInternalCall();
371 return -1;
372 }
373 if (size > PyUnicode_GET_SIZE(unicode))
374 size = PyUnicode_GET_SIZE(unicode);
375#ifdef HAVE_USABLE_WCHAR_T
376 memcpy(w, unicode->str, size * sizeof(wchar_t));
377#else
378 {
379 register Py_UNICODE *u;
380 register int i;
381 u = PyUnicode_AS_UNICODE(unicode);
382 for (i = size; i >= 0; i--)
383 *w++ = *u++;
384 }
385#endif
386
387 return size;
388}
389
390#endif
391
392PyObject *PyUnicode_FromObject(register PyObject *obj)
393{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000394 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
395}
396
397PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
398 const char *encoding,
399 const char *errors)
400{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000401 const char *s;
402 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000403 int owned = 0;
404 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405
406 if (obj == NULL) {
407 PyErr_BadInternalCall();
408 return NULL;
409 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000410
411 /* Coerce object */
412 if (PyInstance_Check(obj)) {
413 PyObject *func;
414 func = PyObject_GetAttrString(obj, "__str__");
415 if (func == NULL) {
416 PyErr_SetString(PyExc_TypeError,
417 "coercing to Unicode: instance doesn't define __str__");
418 return NULL;
419 }
420 obj = PyEval_CallObject(func, NULL);
421 Py_DECREF(func);
422 if (obj == NULL)
423 return NULL;
424 owned = 1;
425 }
426 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000427 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000428 v = obj;
429 if (encoding) {
430 PyErr_SetString(PyExc_TypeError,
431 "decoding Unicode is not supported");
432 return NULL;
433 }
434 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000435 }
436 else if (PyString_Check(obj)) {
437 s = PyString_AS_STRING(obj);
438 len = PyString_GET_SIZE(obj);
439 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000440 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
441 /* Overwrite the error message with something more useful in
442 case of a TypeError. */
443 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000444 PyErr_Format(PyExc_TypeError,
445 "coercing to Unicode: need string or buffer, "
446 "%.80s found",
447 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000448 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000449 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000450
451 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000452 if (len == 0) {
453 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000454 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000455 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000456 else
457 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000458
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000459 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000460 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000461 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000462 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000463 return v;
464
465 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000466 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000467 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000468 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000469 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000470}
471
472PyObject *PyUnicode_Decode(const char *s,
473 int size,
474 const char *encoding,
475 const char *errors)
476{
477 PyObject *buffer = NULL, *unicode;
478
Fred Drakee4315f52000-05-09 19:53:39 +0000479 if (encoding == NULL)
480 encoding = PyUnicode_GetDefaultEncoding();
481
482 /* Shortcuts for common default encodings */
483 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000484 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000485 else if (strcmp(encoding, "latin-1") == 0)
486 return PyUnicode_DecodeLatin1(s, size, errors);
487 else if (strcmp(encoding, "ascii") == 0)
488 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000489
490 /* Decode via the codec registry */
491 buffer = PyBuffer_FromMemory((void *)s, size);
492 if (buffer == NULL)
493 goto onError;
494 unicode = PyCodec_Decode(buffer, encoding, errors);
495 if (unicode == NULL)
496 goto onError;
497 if (!PyUnicode_Check(unicode)) {
498 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000499 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000500 unicode->ob_type->tp_name);
501 Py_DECREF(unicode);
502 goto onError;
503 }
504 Py_DECREF(buffer);
505 return unicode;
506
507 onError:
508 Py_XDECREF(buffer);
509 return NULL;
510}
511
512PyObject *PyUnicode_Encode(const Py_UNICODE *s,
513 int size,
514 const char *encoding,
515 const char *errors)
516{
517 PyObject *v, *unicode;
518
519 unicode = PyUnicode_FromUnicode(s, size);
520 if (unicode == NULL)
521 return NULL;
522 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
523 Py_DECREF(unicode);
524 return v;
525}
526
527PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
528 const char *encoding,
529 const char *errors)
530{
531 PyObject *v;
532
533 if (!PyUnicode_Check(unicode)) {
534 PyErr_BadArgument();
535 goto onError;
536 }
Fred Drakee4315f52000-05-09 19:53:39 +0000537
538 if (encoding == NULL)
539 encoding = PyUnicode_GetDefaultEncoding();
540
541 /* Shortcuts for common default encodings */
542 if (errors == NULL) {
543 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000544 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000545 else if (strcmp(encoding, "latin-1") == 0)
546 return PyUnicode_AsLatin1String(unicode);
547 else if (strcmp(encoding, "ascii") == 0)
548 return PyUnicode_AsASCIIString(unicode);
549 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000550
551 /* Encode via the codec registry */
552 v = PyCodec_Encode(unicode, encoding, errors);
553 if (v == NULL)
554 goto onError;
555 /* XXX Should we really enforce this ? */
556 if (!PyString_Check(v)) {
557 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000558 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000559 v->ob_type->tp_name);
560 Py_DECREF(v);
561 goto onError;
562 }
563 return v;
564
565 onError:
566 return NULL;
567}
568
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000569/* Return a Python string holding the default encoded value of the
570 Unicode object.
571
572 The resulting string is cached in the Unicode object for subsequent
573 usage by this function. The cached version is needed to implement
574 the character buffer interface and will live (at least) as long as
575 the Unicode object itself.
576
577 The refcount of the string is *not* incremented.
578
579 *** Exported for internal use by the interpreter only !!! ***
580
581*/
582
583PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
584 const char *errors)
585{
586 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
587
588 if (v)
589 return v;
590 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
591 if (v && errors == NULL)
592 ((PyUnicodeObject *)unicode)->defenc = v;
593 return v;
594}
595
Guido van Rossumd57fd912000-03-10 22:53:23 +0000596Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
597{
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
601 }
602 return PyUnicode_AS_UNICODE(unicode);
603
604 onError:
605 return NULL;
606}
607
608int PyUnicode_GetSize(PyObject *unicode)
609{
610 if (!PyUnicode_Check(unicode)) {
611 PyErr_BadArgument();
612 goto onError;
613 }
614 return PyUnicode_GET_SIZE(unicode);
615
616 onError:
617 return -1;
618}
619
Thomas Wouters78890102000-07-22 19:25:51 +0000620const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000621{
622 return unicode_default_encoding;
623}
624
625int PyUnicode_SetDefaultEncoding(const char *encoding)
626{
627 PyObject *v;
628
629 /* Make sure the encoding is valid. As side effect, this also
630 loads the encoding into the codec registry cache. */
631 v = _PyCodec_Lookup(encoding);
632 if (v == NULL)
633 goto onError;
634 Py_DECREF(v);
635 strncpy(unicode_default_encoding,
636 encoding,
637 sizeof(unicode_default_encoding));
638 return 0;
639
640 onError:
641 return -1;
642}
643
Guido van Rossumd57fd912000-03-10 22:53:23 +0000644/* --- UTF-8 Codec -------------------------------------------------------- */
645
/* Map a UTF-8 lead byte to the length of the sequence it introduces.
   Zero marks a byte that is not a legal prefix; see RFC 2279 for
   details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0
};
667
668static
669int utf8_decoding_error(const char **source,
670 Py_UNICODE **dest,
671 const char *errors,
672 const char *details)
673{
674 if ((errors == NULL) ||
675 (strcmp(errors,"strict") == 0)) {
676 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000677 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000678 details);
679 return -1;
680 }
681 else if (strcmp(errors,"ignore") == 0) {
682 (*source)++;
683 return 0;
684 }
685 else if (strcmp(errors,"replace") == 0) {
686 (*source)++;
687 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
688 (*dest)++;
689 return 0;
690 }
691 else {
692 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000693 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000694 errors);
695 return -1;
696 }
697}
698
Guido van Rossumd57fd912000-03-10 22:53:23 +0000699PyObject *PyUnicode_DecodeUTF8(const char *s,
700 int size,
701 const char *errors)
702{
703 int n;
704 const char *e;
705 PyUnicodeObject *unicode;
706 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000707 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000708
709 /* Note: size will always be longer than the resulting Unicode
710 character count */
711 unicode = _PyUnicode_New(size);
712 if (!unicode)
713 return NULL;
714 if (size == 0)
715 return (PyObject *)unicode;
716
717 /* Unpack UTF-8 encoded data */
718 p = unicode->str;
719 e = s + size;
720
721 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000722 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723
724 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000725 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000726 s++;
727 continue;
728 }
729
730 n = utf8_code_length[ch];
731
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000732 if (s + n > e) {
733 errmsg = "unexpected end of data";
734 goto utf8Error;
735 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000736
737 switch (n) {
738
739 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000740 errmsg = "unexpected code byte";
741 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000742
743 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000744 errmsg = "internal error";
745 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000746
747 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000748 if ((s[1] & 0xc0) != 0x80) {
749 errmsg = "invalid data";
750 goto utf8Error;
751 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000752 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000753 if (ch < 0x80) {
754 errmsg = "illegal encoding";
755 goto utf8Error;
756 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000757 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000758 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000759 break;
760
761 case 3:
762 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000763 (s[2] & 0xc0) != 0x80) {
764 errmsg = "invalid data";
765 goto utf8Error;
766 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000767 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000768 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
769 errmsg = "illegal encoding";
770 goto utf8Error;
771 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000772 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000773 *p++ = (Py_UNICODE)ch;
774 break;
775
776 case 4:
777 if ((s[1] & 0xc0) != 0x80 ||
778 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000779 (s[3] & 0xc0) != 0x80) {
780 errmsg = "invalid data";
781 goto utf8Error;
782 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000783 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
784 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
785 /* validate and convert to UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000786 if ((ch < 0x10000) /* minimum value allowed for 4
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000787 byte encoding */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000788 || (ch > 0x10ffff)) /* maximum value allowed for
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000789 UTF-16 */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000790 {
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000791 errmsg = "illegal encoding";
792 goto utf8Error;
793 }
Fredrik Lundh8f455852001-06-27 18:59:43 +0000794#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000795 *p++ = (Py_UNICODE)ch;
796#else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000797 /* compute and append the two surrogates: */
798
799 /* translate from 10000..10FFFF to 0..FFFF */
800 ch -= 0x10000;
801
802 /* high surrogate = top 10 bits added to D800 */
803 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
804
805 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +0000806 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000807#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000808 break;
809
810 default:
811 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000812 errmsg = "unsupported Unicode code range";
813 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000814 }
815 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000816 continue;
817
818 utf8Error:
819 if (utf8_decoding_error(&s, &p, errors, errmsg))
820 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000821 }
822
823 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000824 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +0000825 goto onError;
826
827 return (PyObject *)unicode;
828
829onError:
830 Py_DECREF(unicode);
831 return NULL;
832}
833
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000834/* Not used anymore, now that the encoder supports UTF-16
835 surrogates. */
Greg Steinaf36a3a2000-07-17 09:04:43 +0000836#if 0
Guido van Rossumd57fd912000-03-10 22:53:23 +0000837static
838int utf8_encoding_error(const Py_UNICODE **source,
839 char **dest,
840 const char *errors,
841 const char *details)
842{
843 if ((errors == NULL) ||
844 (strcmp(errors,"strict") == 0)) {
845 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000846 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000847 details);
848 return -1;
849 }
850 else if (strcmp(errors,"ignore") == 0) {
851 return 0;
852 }
853 else if (strcmp(errors,"replace") == 0) {
854 **dest = '?';
855 (*dest)++;
856 return 0;
857 }
858 else {
859 PyErr_Format(PyExc_ValueError,
860 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000861 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000862 errors);
863 return -1;
864 }
865}
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000866#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000867
868PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
869 int size,
870 const char *errors)
871{
872 PyObject *v;
873 char *p;
874 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000875 Py_UCS4 ch2;
876 unsigned int cbAllocated = 3 * size;
877 unsigned int cbWritten = 0;
878 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000879
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000880 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000881 if (v == NULL)
882 return NULL;
883 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000884 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885
886 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000887 while (i < size) {
888 Py_UCS4 ch = s[i++];
889 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000891 cbWritten++;
892 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000893 else if (ch < 0x0800) {
894 *p++ = 0xc0 | (ch >> 6);
895 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000896 cbWritten += 2;
897 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000898 else if (ch < 0x10000) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000899 /* Check for high surrogate */
900 if (0xD800 <= ch && ch <= 0xDBFF) {
901 if (i != size) {
902 ch2 = s[i];
903 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
904
905 if (cbWritten >= (cbAllocated - 4)) {
906 /* Provide enough room for some more
907 surrogates */
908 cbAllocated += 4*10;
909 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000910 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000911 }
912
913 /* combine the two values */
914 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
915
916 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000917 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000918 i++;
919 cbWritten += 4;
920 }
921 }
922 }
923 else {
924 *p++ = (char)(0xe0 | (ch >> 12));
925 cbWritten += 3;
926 }
927 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
928 *p++ = (char)(0x80 | (ch & 0x3f));
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000929 } else {
930 *p++ = 0xf0 | (ch>>18);
931 *p++ = 0x80 | ((ch>>12) & 0x3f);
932 *p++ = 0x80 | ((ch>>6) & 0x3f);
933 *p++ = 0x80 | (ch & 0x3f);
934 cbWritten += 4;
935 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000936 }
937 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000938 if (_PyString_Resize(&v, p - q))
939 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000940 return v;
941
942 onError:
943 Py_DECREF(v);
944 return NULL;
945}
946
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
948{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000949 if (!PyUnicode_Check(unicode)) {
950 PyErr_BadArgument();
951 return NULL;
952 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000953 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
954 PyUnicode_GET_SIZE(unicode),
955 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000956}
957
958/* --- UTF-16 Codec ------------------------------------------------------- */
959
960static
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000961int utf16_decoding_error(const Py_UCS2 **source,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000962 Py_UNICODE **dest,
963 const char *errors,
964 const char *details)
965{
966 if ((errors == NULL) ||
967 (strcmp(errors,"strict") == 0)) {
968 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000969 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000970 details);
971 return -1;
972 }
973 else if (strcmp(errors,"ignore") == 0) {
974 return 0;
975 }
976 else if (strcmp(errors,"replace") == 0) {
977 if (dest) {
978 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
979 (*dest)++;
980 }
981 return 0;
982 }
983 else {
984 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000985 "UTF-16 decoding error; "
986 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000987 errors);
988 return -1;
989 }
990}
991
Guido van Rossumd57fd912000-03-10 22:53:23 +0000992PyObject *PyUnicode_DecodeUTF16(const char *s,
993 int size,
994 const char *errors,
995 int *byteorder)
996{
997 PyUnicodeObject *unicode;
998 Py_UNICODE *p;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +0000999 const Py_UCS2 *q, *e;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001000 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001001 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001002
1003 /* size should be an even number */
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001004 if (size % sizeof(Py_UCS2) != 0) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001005 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
1006 return NULL;
1007 /* The remaining input chars are ignored if we fall through
1008 here... */
1009 }
1010
1011 /* Note: size will always be longer than the resulting Unicode
1012 character count */
1013 unicode = _PyUnicode_New(size);
1014 if (!unicode)
1015 return NULL;
1016 if (size == 0)
1017 return (PyObject *)unicode;
1018
1019 /* Unpack UTF-16 encoded data */
1020 p = unicode->str;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001021 q = (Py_UCS2 *)s;
1022 e = q + (size / sizeof(Py_UCS2));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023
1024 if (byteorder)
1025 bo = *byteorder;
1026
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001027 /* Check for BOM marks (U+FEFF) in the input and adjust current
1028 byte order setting accordingly. In native mode, the leading BOM
1029 mark is skipped, in all other modes, it is copied to the output
1030 stream as-is (giving a ZWNBSP character). */
1031 if (bo == 0) {
1032#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1033 if (*q == 0xFEFF) {
1034 q++;
1035 bo = -1;
1036 } else if (*q == 0xFFFE) {
1037 q++;
1038 bo = 1;
1039 }
1040#else
1041 if (*q == 0xFEFF) {
1042 q++;
1043 bo = 1;
1044 } else if (*q == 0xFFFE) {
1045 q++;
1046 bo = -1;
1047 }
1048#endif
1049 }
1050
Guido van Rossumd57fd912000-03-10 22:53:23 +00001051 while (q < e) {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001052 register Py_UCS2 ch = *q++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001053
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001054 /* Swap input bytes if needed. (This assumes
1055 sizeof(Py_UNICODE) == 2 !) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001056#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057 if (bo == 1)
1058 ch = (ch >> 8) | (ch << 8);
1059#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 if (bo == -1)
1061 ch = (ch >> 8) | (ch << 8);
1062#endif
1063 if (ch < 0xD800 || ch > 0xDFFF) {
1064 *p++ = ch;
1065 continue;
1066 }
1067
1068 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001069 if (q >= e) {
1070 errmsg = "unexpected end of data";
1071 goto utf16Error;
1072 }
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001073 if (0xD800 <= ch && ch <= 0xDBFF) {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001074 Py_UCS2 ch2 = *q++;
1075#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1076 if (bo == 1)
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001077 ch2 = (ch2 >> 8) | (ch2 << 8);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001078#else
1079 if (bo == -1)
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001080 ch2 = (ch2 >> 8) | (ch2 << 8);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001081#endif
Martin v. Löwisac93bc22001-06-26 22:43:40 +00001082 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
Fredrik Lundh8f455852001-06-27 18:59:43 +00001083#ifndef Py_UNICODE_WIDE
Guido van Rossumd57fd912000-03-10 22:53:23 +00001084 /* This is valid data (a UTF-16 surrogate pair), but
1085 we are not able to store this information since our
1086 Py_UNICODE type only has 16 bits... this might
1087 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001088 errmsg = "code pairs are not supported";
1089 goto utf16Error;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001090#else
1091 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092 continue;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001093#endif
1094
1095 }
1096 else {
1097 errmsg = "illegal UTF-16 surrogate";
1098 goto utf16Error;
1099 }
1100
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001102 errmsg = "illegal encoding";
1103 /* Fall through to report the error */
1104
1105 utf16Error:
1106 if (utf16_decoding_error(&q, &p, errors, errmsg))
1107 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108 }
1109
1110 if (byteorder)
1111 *byteorder = bo;
1112
1113 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001114 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 goto onError;
1116
1117 return (PyObject *)unicode;
1118
1119onError:
1120 Py_DECREF(unicode);
1121 return NULL;
1122}
1123
1124#undef UTF16_ERROR
1125
1126PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1127 int size,
1128 const char *errors,
1129 int byteorder)
1130{
1131 PyObject *v;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001132 Py_UCS2 *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001133 char *q;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001134 int i, pairs, doswap = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001135
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001136 for (i = pairs = 0; i < size; i++)
1137 if (s[i] >= 0x10000)
1138 pairs++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001139 v = PyString_FromStringAndSize(NULL,
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001140 sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001141 if (v == NULL)
1142 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001143
1144 q = PyString_AS_STRING(v);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001145 p = (Py_UCS2 *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001146 if (byteorder == 0)
1147 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001148 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001149 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001150 if (byteorder == 0 ||
1151#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1152 byteorder == -1
1153#else
1154 byteorder == 1
1155#endif
1156 )
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001157 doswap = 0;
1158 while (size-- > 0) {
1159 Py_UNICODE ch = *s++;
1160 Py_UNICODE ch2 = 0;
1161 if (ch >= 0x10000) {
1162 ch2 = 0xDC00|((ch-0x10000) & 0x3FF);
1163 ch = 0xD800|((ch-0x10000)>>10);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164 }
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001165 if (doswap){
1166 *p++ = (ch >> 8) | (ch << 8);
1167 if (ch2)
1168 *p++ = (ch2 >> 8) | (ch2 << 8);
1169 }else{
1170 *p++ = ch;
1171 if(ch2)
1172 *p++ = ch2;
1173 }
1174 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001175 return v;
1176}
1177
1178PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1179{
1180 if (!PyUnicode_Check(unicode)) {
1181 PyErr_BadArgument();
1182 return NULL;
1183 }
1184 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1185 PyUnicode_GET_SIZE(unicode),
1186 NULL,
1187 0);
1188}
1189
1190/* --- Unicode Escape Codec ----------------------------------------------- */
1191
1192static
1193int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001194 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001195 const char *errors,
1196 const char *details)
1197{
1198 if ((errors == NULL) ||
1199 (strcmp(errors,"strict") == 0)) {
1200 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001201 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202 details);
1203 return -1;
1204 }
1205 else if (strcmp(errors,"ignore") == 0) {
1206 return 0;
1207 }
1208 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001209 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001210 return 0;
1211 }
1212 else {
1213 PyErr_Format(PyExc_ValueError,
1214 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001215 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001216 errors);
1217 return -1;
1218 }
1219}
1220
Fredrik Lundh06d12682001-01-24 07:59:11 +00001221static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001222
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1224 int size,
1225 const char *errors)
1226{
1227 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001228 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001230 char* message;
1231 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1232
Guido van Rossumd57fd912000-03-10 22:53:23 +00001233 /* Escaped strings will always be longer than the resulting
1234 Unicode string, so we start with size here and then reduce the
1235 length after conversion to the true value. */
1236 v = _PyUnicode_New(size);
1237 if (v == NULL)
1238 goto onError;
1239 if (size == 0)
1240 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001241
Guido van Rossumd57fd912000-03-10 22:53:23 +00001242 p = buf = PyUnicode_AS_UNICODE(v);
1243 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001244
Guido van Rossumd57fd912000-03-10 22:53:23 +00001245 while (s < end) {
1246 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001247 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001248 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001249
1250 /* Non-escape characters are interpreted as Unicode ordinals */
1251 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001252 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 continue;
1254 }
1255
1256 /* \ - Escapes */
1257 s++;
1258 switch (*s++) {
1259
1260 /* \x escapes */
1261 case '\n': break;
1262 case '\\': *p++ = '\\'; break;
1263 case '\'': *p++ = '\''; break;
1264 case '\"': *p++ = '\"'; break;
1265 case 'b': *p++ = '\b'; break;
1266 case 'f': *p++ = '\014'; break; /* FF */
1267 case 't': *p++ = '\t'; break;
1268 case 'n': *p++ = '\n'; break;
1269 case 'r': *p++ = '\r'; break;
1270 case 'v': *p++ = '\013'; break; /* VT */
1271 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1272
1273 /* \OOO (octal) escapes */
1274 case '0': case '1': case '2': case '3':
1275 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001276 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001277 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001278 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001279 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001280 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001282 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001283 break;
1284
Fredrik Lundhccc74732001-02-18 22:13:49 +00001285 /* hex escapes */
1286 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001287 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001288 digits = 2;
1289 message = "truncated \\xXX escape";
1290 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001291
Fredrik Lundhccc74732001-02-18 22:13:49 +00001292 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001294 digits = 4;
1295 message = "truncated \\uXXXX escape";
1296 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001297
Fredrik Lundhccc74732001-02-18 22:13:49 +00001298 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001299 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001300 digits = 8;
1301 message = "truncated \\UXXXXXXXX escape";
1302 hexescape:
1303 chr = 0;
1304 for (i = 0; i < digits; i++) {
1305 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001306 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001307 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001308 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001309 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001310 i++;
1311 break;
1312 }
1313 chr = (chr<<4) & ~0xF;
1314 if (c >= '0' && c <= '9')
1315 chr += c - '0';
1316 else if (c >= 'a' && c <= 'f')
1317 chr += 10 + c - 'a';
1318 else
1319 chr += 10 + c - 'A';
1320 }
1321 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001322 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001323 /* when we get here, chr is a 32-bit unicode character */
1324 if (chr <= 0xffff)
1325 /* UCS-2 character */
1326 *p++ = (Py_UNICODE) chr;
1327 else if (chr <= 0x10ffff) {
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001328 /* UCS-4 character. Either store directly, or as surrogate pair. */
Fredrik Lundh8f455852001-06-27 18:59:43 +00001329#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001330 *p++ = chr;
1331#else
Fredrik Lundhdf846752000-09-03 11:29:49 +00001332 chr -= 0x10000L;
1333 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001334 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001335#endif
Fredrik Lundhdf846752000-09-03 11:29:49 +00001336 } else {
1337 if (unicodeescape_decoding_error(
1338 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001339 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001340 )
1341 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001342 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001343 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001344 break;
1345
1346 /* \N{name} */
1347 case 'N':
1348 message = "malformed \\N character escape";
1349 if (ucnhash_CAPI == NULL) {
1350 /* load the unicode data module */
1351 PyObject *m, *v;
1352 m = PyImport_ImportModule("unicodedata");
1353 if (m == NULL)
1354 goto ucnhashError;
1355 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1356 Py_DECREF(m);
1357 if (v == NULL)
1358 goto ucnhashError;
1359 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1360 Py_DECREF(v);
1361 if (ucnhash_CAPI == NULL)
1362 goto ucnhashError;
1363 }
1364 if (*s == '{') {
1365 const char *start = s+1;
1366 /* look for the closing brace */
1367 while (*s != '}' && s < end)
1368 s++;
1369 if (s > start && s < end && *s == '}') {
1370 /* found a name. look it up in the unicode database */
1371 message = "unknown Unicode character name";
1372 s++;
1373 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1374 goto store;
1375 }
1376 }
1377 if (unicodeescape_decoding_error(&s, &x, errors, message))
1378 goto onError;
1379 *p++ = x;
1380 break;
1381
1382 default:
1383 *p++ = '\\';
1384 *p++ = (unsigned char)s[-1];
1385 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001386 }
1387 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001388 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001389 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001390 return (PyObject *)v;
1391
Fredrik Lundhccc74732001-02-18 22:13:49 +00001392ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001393 PyErr_SetString(
1394 PyExc_UnicodeError,
1395 "\\N escapes not supported (can't load unicodedata module)"
1396 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001397 return NULL;
1398
Fredrik Lundhccc74732001-02-18 22:13:49 +00001399onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001400 Py_XDECREF(v);
1401 return NULL;
1402}
1403
1404/* Return a Unicode-Escape string version of the Unicode object.
1405
1406 If quotes is true, the string is enclosed in u"" or u'' quotes as
1407 appropriate.
1408
1409*/
1410
Barry Warsaw51ac5802000-03-20 16:36:48 +00001411static const Py_UNICODE *findchar(const Py_UNICODE *s,
1412 int size,
1413 Py_UNICODE ch);
1414
Guido van Rossumd57fd912000-03-10 22:53:23 +00001415static
1416PyObject *unicodeescape_string(const Py_UNICODE *s,
1417 int size,
1418 int quotes)
1419{
1420 PyObject *repr;
1421 char *p;
1422 char *q;
1423
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001424 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001425
1426 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1427 if (repr == NULL)
1428 return NULL;
1429
1430 p = q = PyString_AS_STRING(repr);
1431
1432 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001433 *p++ = 'u';
1434 *p++ = (findchar(s, size, '\'') &&
1435 !findchar(s, size, '"')) ? '"' : '\'';
1436 }
1437 while (size-- > 0) {
1438 Py_UNICODE ch = *s++;
1439 /* Escape quotes */
Fredrik Lundh30831632001-06-26 15:11:00 +00001440 if (quotes && (ch == (Py_UNICODE) q[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441 *p++ = '\\';
1442 *p++ = (char) ch;
1443 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001444#ifdef Py_UNICODE_WIDE
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00001445 /* Map 21-bit characters to '\U00xxxxxx' */
1446 else if (ch >= 0x10000) {
1447 *p++ = '\\';
1448 *p++ = 'U';
1449 *p++ = hexdigit[(ch >> 28) & 0xf];
1450 *p++ = hexdigit[(ch >> 24) & 0xf];
1451 *p++ = hexdigit[(ch >> 20) & 0xf];
1452 *p++ = hexdigit[(ch >> 16) & 0xf];
1453 *p++ = hexdigit[(ch >> 12) & 0xf];
1454 *p++ = hexdigit[(ch >> 8) & 0xf];
1455 *p++ = hexdigit[(ch >> 4) & 0xf];
1456 *p++ = hexdigit[ch & 15];
1457 }
Guido van Rossum0d42e0c2001-07-20 16:36:21 +00001458#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +00001459 /* Map 16-bit characters to '\uxxxx' */
1460 else if (ch >= 256) {
1461 *p++ = '\\';
1462 *p++ = 'u';
1463 *p++ = hexdigit[(ch >> 12) & 0xf];
1464 *p++ = hexdigit[(ch >> 8) & 0xf];
1465 *p++ = hexdigit[(ch >> 4) & 0xf];
1466 *p++ = hexdigit[ch & 15];
1467 }
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001468 /* Map special whitespace to '\t', \n', '\r' */
1469 else if (ch == '\t') {
1470 *p++ = '\\';
1471 *p++ = 't';
1472 }
1473 else if (ch == '\n') {
1474 *p++ = '\\';
1475 *p++ = 'n';
1476 }
1477 else if (ch == '\r') {
1478 *p++ = '\\';
1479 *p++ = 'r';
1480 }
1481 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001482 else if (ch < ' ' || ch >= 128) {
1483 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001484 *p++ = 'x';
1485 *p++ = hexdigit[(ch >> 4) & 0xf];
1486 *p++ = hexdigit[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001487 }
1488 /* Copy everything else as-is */
1489 else
1490 *p++ = (char) ch;
1491 }
1492 if (quotes)
1493 *p++ = q[1];
1494
1495 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001496 if (_PyString_Resize(&repr, p - q))
1497 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001498
1499 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001500
1501 onError:
1502 Py_DECREF(repr);
1503 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001504}
1505
1506PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1507 int size)
1508{
1509 return unicodeescape_string(s, size, 0);
1510}
1511
1512PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1513{
1514 if (!PyUnicode_Check(unicode)) {
1515 PyErr_BadArgument();
1516 return NULL;
1517 }
1518 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1519 PyUnicode_GET_SIZE(unicode));
1520}
1521
1522/* --- Raw Unicode Escape Codec ------------------------------------------- */
1523
1524PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1525 int size,
1526 const char *errors)
1527{
1528 PyUnicodeObject *v;
1529 Py_UNICODE *p, *buf;
1530 const char *end;
1531 const char *bs;
1532
1533 /* Escaped strings will always be longer than the resulting
1534 Unicode string, so we start with size here and then reduce the
1535 length after conversion to the true value. */
1536 v = _PyUnicode_New(size);
1537 if (v == NULL)
1538 goto onError;
1539 if (size == 0)
1540 return (PyObject *)v;
1541 p = buf = PyUnicode_AS_UNICODE(v);
1542 end = s + size;
1543 while (s < end) {
1544 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001545 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001546 int i;
1547
1548 /* Non-escape characters are interpreted as Unicode ordinals */
1549 if (*s != '\\') {
1550 *p++ = (unsigned char)*s++;
1551 continue;
1552 }
1553
1554 /* \u-escapes are only interpreted iff the number of leading
1555 backslashes if odd */
1556 bs = s;
1557 for (;s < end;) {
1558 if (*s != '\\')
1559 break;
1560 *p++ = (unsigned char)*s++;
1561 }
1562 if (((s - bs) & 1) == 0 ||
1563 s >= end ||
1564 *s != 'u') {
1565 continue;
1566 }
1567 p--;
1568 s++;
1569
1570 /* \uXXXX with 4 hex digits */
1571 for (x = 0, i = 0; i < 4; i++) {
1572 c = (unsigned char)s[i];
1573 if (!isxdigit(c)) {
1574 if (unicodeescape_decoding_error(&s, &x, errors,
1575 "truncated \\uXXXX"))
1576 goto onError;
1577 i++;
1578 break;
1579 }
1580 x = (x<<4) & ~0xF;
1581 if (c >= '0' && c <= '9')
1582 x += c - '0';
1583 else if (c >= 'a' && c <= 'f')
1584 x += 10 + c - 'a';
1585 else
1586 x += 10 + c - 'A';
1587 }
1588 s += i;
1589 *p++ = x;
1590 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001591 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001592 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001593 return (PyObject *)v;
1594
1595 onError:
1596 Py_XDECREF(v);
1597 return NULL;
1598}
1599
1600PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1601 int size)
1602{
1603 PyObject *repr;
1604 char *p;
1605 char *q;
1606
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001607 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608
1609 repr = PyString_FromStringAndSize(NULL, 6 * size);
1610 if (repr == NULL)
1611 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001612 if (size == 0)
1613 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001614
1615 p = q = PyString_AS_STRING(repr);
1616 while (size-- > 0) {
1617 Py_UNICODE ch = *s++;
1618 /* Map 16-bit characters to '\uxxxx' */
1619 if (ch >= 256) {
1620 *p++ = '\\';
1621 *p++ = 'u';
1622 *p++ = hexdigit[(ch >> 12) & 0xf];
1623 *p++ = hexdigit[(ch >> 8) & 0xf];
1624 *p++ = hexdigit[(ch >> 4) & 0xf];
1625 *p++ = hexdigit[ch & 15];
1626 }
1627 /* Copy everything else as-is */
1628 else
1629 *p++ = (char) ch;
1630 }
1631 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001632 if (_PyString_Resize(&repr, p - q))
1633 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001634
1635 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001636
1637 onError:
1638 Py_DECREF(repr);
1639 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001640}
1641
1642PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1643{
1644 if (!PyUnicode_Check(unicode)) {
1645 PyErr_BadArgument();
1646 return NULL;
1647 }
1648 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1649 PyUnicode_GET_SIZE(unicode));
1650}
1651
1652/* --- Latin-1 Codec ------------------------------------------------------ */
1653
1654PyObject *PyUnicode_DecodeLatin1(const char *s,
1655 int size,
1656 const char *errors)
1657{
1658 PyUnicodeObject *v;
1659 Py_UNICODE *p;
1660
1661 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001662 if (size == 1 && *(unsigned char*)s < 256) {
1663 Py_UNICODE r = *(unsigned char*)s;
1664 return PyUnicode_FromUnicode(&r, 1);
1665 }
1666
Guido van Rossumd57fd912000-03-10 22:53:23 +00001667 v = _PyUnicode_New(size);
1668 if (v == NULL)
1669 goto onError;
1670 if (size == 0)
1671 return (PyObject *)v;
1672 p = PyUnicode_AS_UNICODE(v);
1673 while (size-- > 0)
1674 *p++ = (unsigned char)*s++;
1675 return (PyObject *)v;
1676
1677 onError:
1678 Py_XDECREF(v);
1679 return NULL;
1680}
1681
1682static
1683int latin1_encoding_error(const Py_UNICODE **source,
1684 char **dest,
1685 const char *errors,
1686 const char *details)
1687{
1688 if ((errors == NULL) ||
1689 (strcmp(errors,"strict") == 0)) {
1690 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001691 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001692 details);
1693 return -1;
1694 }
1695 else if (strcmp(errors,"ignore") == 0) {
1696 return 0;
1697 }
1698 else if (strcmp(errors,"replace") == 0) {
1699 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001700 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001701 return 0;
1702 }
1703 else {
1704 PyErr_Format(PyExc_ValueError,
1705 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001706 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707 errors);
1708 return -1;
1709 }
1710}
1711
1712PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1713 int size,
1714 const char *errors)
1715{
1716 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001717 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001718
Guido van Rossumd57fd912000-03-10 22:53:23 +00001719 repr = PyString_FromStringAndSize(NULL, size);
1720 if (repr == NULL)
1721 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001722 if (size == 0)
1723 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724
1725 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001726 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001727 while (size-- > 0) {
1728 Py_UNICODE ch = *p++;
1729 if (ch >= 256) {
1730 if (latin1_encoding_error(&p, &s, errors,
1731 "ordinal not in range(256)"))
1732 goto onError;
1733 }
1734 else
1735 *s++ = (char)ch;
1736 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001737 /* Resize if error handling skipped some characters */
1738 if (s - start < PyString_GET_SIZE(repr))
1739 if (_PyString_Resize(&repr, s - start))
1740 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001741 return repr;
1742
1743 onError:
1744 Py_DECREF(repr);
1745 return NULL;
1746}
1747
1748PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1749{
1750 if (!PyUnicode_Check(unicode)) {
1751 PyErr_BadArgument();
1752 return NULL;
1753 }
1754 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1755 PyUnicode_GET_SIZE(unicode),
1756 NULL);
1757}
1758
1759/* --- 7-bit ASCII Codec -------------------------------------------------- */
1760
1761static
1762int ascii_decoding_error(const char **source,
1763 Py_UNICODE **dest,
1764 const char *errors,
1765 const char *details)
1766{
1767 if ((errors == NULL) ||
1768 (strcmp(errors,"strict") == 0)) {
1769 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001770 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771 details);
1772 return -1;
1773 }
1774 else if (strcmp(errors,"ignore") == 0) {
1775 return 0;
1776 }
1777 else if (strcmp(errors,"replace") == 0) {
1778 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1779 (*dest)++;
1780 return 0;
1781 }
1782 else {
1783 PyErr_Format(PyExc_ValueError,
1784 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001785 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 errors);
1787 return -1;
1788 }
1789}
1790
1791PyObject *PyUnicode_DecodeASCII(const char *s,
1792 int size,
1793 const char *errors)
1794{
1795 PyUnicodeObject *v;
1796 Py_UNICODE *p;
1797
1798 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001799 if (size == 1 && *(unsigned char*)s < 128) {
1800 Py_UNICODE r = *(unsigned char*)s;
1801 return PyUnicode_FromUnicode(&r, 1);
1802 }
1803
Guido van Rossumd57fd912000-03-10 22:53:23 +00001804 v = _PyUnicode_New(size);
1805 if (v == NULL)
1806 goto onError;
1807 if (size == 0)
1808 return (PyObject *)v;
1809 p = PyUnicode_AS_UNICODE(v);
1810 while (size-- > 0) {
1811 register unsigned char c;
1812
1813 c = (unsigned char)*s++;
1814 if (c < 128)
1815 *p++ = c;
1816 else if (ascii_decoding_error(&s, &p, errors,
1817 "ordinal not in range(128)"))
1818 goto onError;
1819 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001820 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001821 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001822 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001823 return (PyObject *)v;
1824
1825 onError:
1826 Py_XDECREF(v);
1827 return NULL;
1828}
1829
1830static
1831int ascii_encoding_error(const Py_UNICODE **source,
1832 char **dest,
1833 const char *errors,
1834 const char *details)
1835{
1836 if ((errors == NULL) ||
1837 (strcmp(errors,"strict") == 0)) {
1838 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001839 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001840 details);
1841 return -1;
1842 }
1843 else if (strcmp(errors,"ignore") == 0) {
1844 return 0;
1845 }
1846 else if (strcmp(errors,"replace") == 0) {
1847 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001848 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001849 return 0;
1850 }
1851 else {
1852 PyErr_Format(PyExc_ValueError,
1853 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001854 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001855 errors);
1856 return -1;
1857 }
1858}
1859
1860PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1861 int size,
1862 const char *errors)
1863{
1864 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001865 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001866
Guido van Rossumd57fd912000-03-10 22:53:23 +00001867 repr = PyString_FromStringAndSize(NULL, size);
1868 if (repr == NULL)
1869 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001870 if (size == 0)
1871 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001872
1873 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001874 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001875 while (size-- > 0) {
1876 Py_UNICODE ch = *p++;
1877 if (ch >= 128) {
1878 if (ascii_encoding_error(&p, &s, errors,
1879 "ordinal not in range(128)"))
1880 goto onError;
1881 }
1882 else
1883 *s++ = (char)ch;
1884 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001885 /* Resize if error handling skipped some characters */
1886 if (s - start < PyString_GET_SIZE(repr))
1887 if (_PyString_Resize(&repr, s - start))
1888 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001889 return repr;
1890
1891 onError:
1892 Py_DECREF(repr);
1893 return NULL;
1894}
1895
1896PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1897{
1898 if (!PyUnicode_Check(unicode)) {
1899 PyErr_BadArgument();
1900 return NULL;
1901 }
1902 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1903 PyUnicode_GET_SIZE(unicode),
1904 NULL);
1905}
1906
Fredrik Lundh30831632001-06-26 15:11:00 +00001907#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001908
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001909/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001910
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001911PyObject *PyUnicode_DecodeMBCS(const char *s,
1912 int size,
1913 const char *errors)
1914{
1915 PyUnicodeObject *v;
1916 Py_UNICODE *p;
1917
1918 /* First get the size of the result */
1919 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001920 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001921 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1922
1923 v = _PyUnicode_New(usize);
1924 if (v == NULL)
1925 return NULL;
1926 if (usize == 0)
1927 return (PyObject *)v;
1928 p = PyUnicode_AS_UNICODE(v);
1929 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1930 Py_DECREF(v);
1931 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1932 }
1933
1934 return (PyObject *)v;
1935}
1936
1937PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1938 int size,
1939 const char *errors)
1940{
1941 PyObject *repr;
1942 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001943 DWORD mbcssize;
1944
1945 /* If there are no characters, bail now! */
1946 if (size==0)
1947 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001948
1949 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001950 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001951 if (mbcssize==0)
1952 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1953
1954 repr = PyString_FromStringAndSize(NULL, mbcssize);
1955 if (repr == NULL)
1956 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001957 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001958 return repr;
1959
1960 /* Do the conversion */
1961 s = PyString_AS_STRING(repr);
1962 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1963 Py_DECREF(repr);
1964 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1965 }
1966 return repr;
1967}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001968
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001969#endif /* MS_WIN32 */
1970
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971/* --- Character Mapping Codec -------------------------------------------- */
1972
1973static
1974int charmap_decoding_error(const char **source,
1975 Py_UNICODE **dest,
1976 const char *errors,
1977 const char *details)
1978{
1979 if ((errors == NULL) ||
1980 (strcmp(errors,"strict") == 0)) {
1981 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001982 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001983 details);
1984 return -1;
1985 }
1986 else if (strcmp(errors,"ignore") == 0) {
1987 return 0;
1988 }
1989 else if (strcmp(errors,"replace") == 0) {
1990 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1991 (*dest)++;
1992 return 0;
1993 }
1994 else {
1995 PyErr_Format(PyExc_ValueError,
1996 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001997 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001998 errors);
1999 return -1;
2000 }
2001}
2002
2003PyObject *PyUnicode_DecodeCharmap(const char *s,
2004 int size,
2005 PyObject *mapping,
2006 const char *errors)
2007{
2008 PyUnicodeObject *v;
2009 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002010 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002011
2012 /* Default to Latin-1 */
2013 if (mapping == NULL)
2014 return PyUnicode_DecodeLatin1(s, size, errors);
2015
2016 v = _PyUnicode_New(size);
2017 if (v == NULL)
2018 goto onError;
2019 if (size == 0)
2020 return (PyObject *)v;
2021 p = PyUnicode_AS_UNICODE(v);
2022 while (size-- > 0) {
2023 unsigned char ch = *s++;
2024 PyObject *w, *x;
2025
2026 /* Get mapping (char ordinal -> integer, Unicode char or None) */
2027 w = PyInt_FromLong((long)ch);
2028 if (w == NULL)
2029 goto onError;
2030 x = PyObject_GetItem(mapping, w);
2031 Py_DECREF(w);
2032 if (x == NULL) {
2033 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002034 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002035 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002036 x = Py_None;
2037 Py_INCREF(x);
2038 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002039 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040 }
2041
2042 /* Apply mapping */
2043 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002044 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045 if (value < 0 || value > 65535) {
2046 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00002047 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00002048 Py_DECREF(x);
2049 goto onError;
2050 }
2051 *p++ = (Py_UNICODE)value;
2052 }
2053 else if (x == Py_None) {
2054 /* undefined mapping */
2055 if (charmap_decoding_error(&s, &p, errors,
2056 "character maps to <undefined>")) {
2057 Py_DECREF(x);
2058 goto onError;
2059 }
2060 }
2061 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002062 int targetsize = PyUnicode_GET_SIZE(x);
2063
2064 if (targetsize == 1)
2065 /* 1-1 mapping */
2066 *p++ = *PyUnicode_AS_UNICODE(x);
2067
2068 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002070 if (targetsize > extrachars) {
2071 /* resize first */
2072 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2073 int needed = (targetsize - extrachars) + \
2074 (targetsize << 2);
2075 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002076 if (_PyUnicode_Resize(&v,
2077 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002078 Py_DECREF(x);
2079 goto onError;
2080 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002081 p = PyUnicode_AS_UNICODE(v) + oldpos;
2082 }
2083 Py_UNICODE_COPY(p,
2084 PyUnicode_AS_UNICODE(x),
2085 targetsize);
2086 p += targetsize;
2087 extrachars -= targetsize;
2088 }
2089 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002090 }
2091 else {
2092 /* wrong return value */
2093 PyErr_SetString(PyExc_TypeError,
2094 "character mapping must return integer, None or unicode");
2095 Py_DECREF(x);
2096 goto onError;
2097 }
2098 Py_DECREF(x);
2099 }
2100 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002101 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002102 goto onError;
2103 return (PyObject *)v;
2104
2105 onError:
2106 Py_XDECREF(v);
2107 return NULL;
2108}
2109
2110static
2111int charmap_encoding_error(const Py_UNICODE **source,
2112 char **dest,
2113 const char *errors,
2114 const char *details)
2115{
2116 if ((errors == NULL) ||
2117 (strcmp(errors,"strict") == 0)) {
2118 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002119 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002120 details);
2121 return -1;
2122 }
2123 else if (strcmp(errors,"ignore") == 0) {
2124 return 0;
2125 }
2126 else if (strcmp(errors,"replace") == 0) {
2127 **dest = '?';
2128 (*dest)++;
2129 return 0;
2130 }
2131 else {
2132 PyErr_Format(PyExc_ValueError,
2133 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002134 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135 errors);
2136 return -1;
2137 }
2138}
2139
2140PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2141 int size,
2142 PyObject *mapping,
2143 const char *errors)
2144{
2145 PyObject *v;
2146 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002147 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002148
2149 /* Default to Latin-1 */
2150 if (mapping == NULL)
2151 return PyUnicode_EncodeLatin1(p, size, errors);
2152
2153 v = PyString_FromStringAndSize(NULL, size);
2154 if (v == NULL)
2155 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002156 if (size == 0)
2157 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002158 s = PyString_AS_STRING(v);
2159 while (size-- > 0) {
2160 Py_UNICODE ch = *p++;
2161 PyObject *w, *x;
2162
2163 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2164 w = PyInt_FromLong((long)ch);
2165 if (w == NULL)
2166 goto onError;
2167 x = PyObject_GetItem(mapping, w);
2168 Py_DECREF(w);
2169 if (x == NULL) {
2170 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002171 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002172 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002173 x = Py_None;
2174 Py_INCREF(x);
2175 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002176 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002177 }
2178
2179 /* Apply mapping */
2180 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002181 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 if (value < 0 || value > 255) {
2183 PyErr_SetString(PyExc_TypeError,
2184 "character mapping must be in range(256)");
2185 Py_DECREF(x);
2186 goto onError;
2187 }
2188 *s++ = (char)value;
2189 }
2190 else if (x == Py_None) {
2191 /* undefined mapping */
2192 if (charmap_encoding_error(&p, &s, errors,
2193 "character maps to <undefined>")) {
2194 Py_DECREF(x);
2195 goto onError;
2196 }
2197 }
2198 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002199 int targetsize = PyString_GET_SIZE(x);
2200
2201 if (targetsize == 1)
2202 /* 1-1 mapping */
2203 *s++ = *PyString_AS_STRING(x);
2204
2205 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002207 if (targetsize > extrachars) {
2208 /* resize first */
2209 int oldpos = (int)(s - PyString_AS_STRING(v));
2210 int needed = (targetsize - extrachars) + \
2211 (targetsize << 2);
2212 extrachars += needed;
2213 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002214 Py_DECREF(x);
2215 goto onError;
2216 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002217 s = PyString_AS_STRING(v) + oldpos;
2218 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002219 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002220 s += targetsize;
2221 extrachars -= targetsize;
2222 }
2223 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224 }
2225 else {
2226 /* wrong return value */
2227 PyErr_SetString(PyExc_TypeError,
2228 "character mapping must return integer, None or unicode");
2229 Py_DECREF(x);
2230 goto onError;
2231 }
2232 Py_DECREF(x);
2233 }
2234 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2235 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2236 goto onError;
2237 return v;
2238
2239 onError:
2240 Py_DECREF(v);
2241 return NULL;
2242}
2243
2244PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2245 PyObject *mapping)
2246{
2247 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2248 PyErr_BadArgument();
2249 return NULL;
2250 }
2251 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2252 PyUnicode_GET_SIZE(unicode),
2253 mapping,
2254 NULL);
2255}
2256
2257static
2258int translate_error(const Py_UNICODE **source,
2259 Py_UNICODE **dest,
2260 const char *errors,
2261 const char *details)
2262{
2263 if ((errors == NULL) ||
2264 (strcmp(errors,"strict") == 0)) {
2265 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002266 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002267 details);
2268 return -1;
2269 }
2270 else if (strcmp(errors,"ignore") == 0) {
2271 return 0;
2272 }
2273 else if (strcmp(errors,"replace") == 0) {
2274 **dest = '?';
2275 (*dest)++;
2276 return 0;
2277 }
2278 else {
2279 PyErr_Format(PyExc_ValueError,
2280 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002281 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002282 errors);
2283 return -1;
2284 }
2285}
2286
2287PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2288 int size,
2289 PyObject *mapping,
2290 const char *errors)
2291{
2292 PyUnicodeObject *v;
2293 Py_UNICODE *p;
2294
2295 if (mapping == NULL) {
2296 PyErr_BadArgument();
2297 return NULL;
2298 }
2299
2300 /* Output will never be longer than input */
2301 v = _PyUnicode_New(size);
2302 if (v == NULL)
2303 goto onError;
2304 if (size == 0)
2305 goto done;
2306 p = PyUnicode_AS_UNICODE(v);
2307 while (size-- > 0) {
2308 Py_UNICODE ch = *s++;
2309 PyObject *w, *x;
2310
2311 /* Get mapping */
2312 w = PyInt_FromLong(ch);
2313 if (w == NULL)
2314 goto onError;
2315 x = PyObject_GetItem(mapping, w);
2316 Py_DECREF(w);
2317 if (x == NULL) {
2318 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2319 /* No mapping found: default to 1-1 mapping */
2320 PyErr_Clear();
2321 *p++ = ch;
2322 continue;
2323 }
2324 goto onError;
2325 }
2326
2327 /* Apply mapping */
2328 if (PyInt_Check(x))
2329 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2330 else if (x == Py_None) {
2331 /* undefined mapping */
2332 if (translate_error(&s, &p, errors,
2333 "character maps to <undefined>")) {
2334 Py_DECREF(x);
2335 goto onError;
2336 }
2337 }
2338 else if (PyUnicode_Check(x)) {
2339 if (PyUnicode_GET_SIZE(x) != 1) {
2340 /* 1-n mapping */
2341 PyErr_SetString(PyExc_NotImplementedError,
2342 "1-n mappings are currently not implemented");
2343 Py_DECREF(x);
2344 goto onError;
2345 }
2346 *p++ = *PyUnicode_AS_UNICODE(x);
2347 }
2348 else {
2349 /* wrong return value */
2350 PyErr_SetString(PyExc_TypeError,
2351 "translate mapping must return integer, None or unicode");
2352 Py_DECREF(x);
2353 goto onError;
2354 }
2355 Py_DECREF(x);
2356 }
2357 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002358 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002359 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002360
2361 done:
2362 return (PyObject *)v;
2363
2364 onError:
2365 Py_XDECREF(v);
2366 return NULL;
2367}
2368
2369PyObject *PyUnicode_Translate(PyObject *str,
2370 PyObject *mapping,
2371 const char *errors)
2372{
2373 PyObject *result;
2374
2375 str = PyUnicode_FromObject(str);
2376 if (str == NULL)
2377 goto onError;
2378 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2379 PyUnicode_GET_SIZE(str),
2380 mapping,
2381 errors);
2382 Py_DECREF(str);
2383 return result;
2384
2385 onError:
2386 Py_XDECREF(str);
2387 return NULL;
2388}
2389
Guido van Rossum9e896b32000-04-05 20:11:21 +00002390/* --- Decimal Encoder ---------------------------------------------------- */
2391
2392int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2393 int length,
2394 char *output,
2395 const char *errors)
2396{
2397 Py_UNICODE *p, *end;
2398
2399 if (output == NULL) {
2400 PyErr_BadArgument();
2401 return -1;
2402 }
2403
2404 p = s;
2405 end = s + length;
2406 while (p < end) {
2407 register Py_UNICODE ch = *p++;
2408 int decimal;
2409
2410 if (Py_UNICODE_ISSPACE(ch)) {
2411 *output++ = ' ';
2412 continue;
2413 }
2414 decimal = Py_UNICODE_TODECIMAL(ch);
2415 if (decimal >= 0) {
2416 *output++ = '0' + decimal;
2417 continue;
2418 }
Guido van Rossumba477042000-04-06 18:18:10 +00002419 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002420 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002421 continue;
2422 }
2423 /* All other characters are considered invalid */
2424 if (errors == NULL || strcmp(errors, "strict") == 0) {
2425 PyErr_SetString(PyExc_ValueError,
2426 "invalid decimal Unicode string");
2427 goto onError;
2428 }
2429 else if (strcmp(errors, "ignore") == 0)
2430 continue;
2431 else if (strcmp(errors, "replace") == 0) {
2432 *output++ = '?';
2433 continue;
2434 }
2435 }
2436 /* 0-terminate the output string */
2437 *output++ = '\0';
2438 return 0;
2439
2440 onError:
2441 return -1;
2442}
2443
Guido van Rossumd57fd912000-03-10 22:53:23 +00002444/* --- Helpers ------------------------------------------------------------ */
2445
2446static
2447int count(PyUnicodeObject *self,
2448 int start,
2449 int end,
2450 PyUnicodeObject *substring)
2451{
2452 int count = 0;
2453
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002454 if (start < 0)
2455 start += self->length;
2456 if (start < 0)
2457 start = 0;
2458 if (end > self->length)
2459 end = self->length;
2460 if (end < 0)
2461 end += self->length;
2462 if (end < 0)
2463 end = 0;
2464
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002465 if (substring->length == 0)
2466 return (end - start + 1);
2467
Guido van Rossumd57fd912000-03-10 22:53:23 +00002468 end -= substring->length;
2469
2470 while (start <= end)
2471 if (Py_UNICODE_MATCH(self, start, substring)) {
2472 count++;
2473 start += substring->length;
2474 } else
2475 start++;
2476
2477 return count;
2478}
2479
2480int PyUnicode_Count(PyObject *str,
2481 PyObject *substr,
2482 int start,
2483 int end)
2484{
2485 int result;
2486
2487 str = PyUnicode_FromObject(str);
2488 if (str == NULL)
2489 return -1;
2490 substr = PyUnicode_FromObject(substr);
2491 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002492 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002493 return -1;
2494 }
2495
2496 result = count((PyUnicodeObject *)str,
2497 start, end,
2498 (PyUnicodeObject *)substr);
2499
2500 Py_DECREF(str);
2501 Py_DECREF(substr);
2502 return result;
2503}
2504
2505static
2506int findstring(PyUnicodeObject *self,
2507 PyUnicodeObject *substring,
2508 int start,
2509 int end,
2510 int direction)
2511{
2512 if (start < 0)
2513 start += self->length;
2514 if (start < 0)
2515 start = 0;
2516
2517 if (substring->length == 0)
2518 return start;
2519
2520 if (end > self->length)
2521 end = self->length;
2522 if (end < 0)
2523 end += self->length;
2524 if (end < 0)
2525 end = 0;
2526
2527 end -= substring->length;
2528
2529 if (direction < 0) {
2530 for (; end >= start; end--)
2531 if (Py_UNICODE_MATCH(self, end, substring))
2532 return end;
2533 } else {
2534 for (; start <= end; start++)
2535 if (Py_UNICODE_MATCH(self, start, substring))
2536 return start;
2537 }
2538
2539 return -1;
2540}
2541
2542int PyUnicode_Find(PyObject *str,
2543 PyObject *substr,
2544 int start,
2545 int end,
2546 int direction)
2547{
2548 int result;
2549
2550 str = PyUnicode_FromObject(str);
2551 if (str == NULL)
2552 return -1;
2553 substr = PyUnicode_FromObject(substr);
2554 if (substr == NULL) {
2555 Py_DECREF(substr);
2556 return -1;
2557 }
2558
2559 result = findstring((PyUnicodeObject *)str,
2560 (PyUnicodeObject *)substr,
2561 start, end, direction);
2562 Py_DECREF(str);
2563 Py_DECREF(substr);
2564 return result;
2565}
2566
2567static
2568int tailmatch(PyUnicodeObject *self,
2569 PyUnicodeObject *substring,
2570 int start,
2571 int end,
2572 int direction)
2573{
2574 if (start < 0)
2575 start += self->length;
2576 if (start < 0)
2577 start = 0;
2578
2579 if (substring->length == 0)
2580 return 1;
2581
2582 if (end > self->length)
2583 end = self->length;
2584 if (end < 0)
2585 end += self->length;
2586 if (end < 0)
2587 end = 0;
2588
2589 end -= substring->length;
2590 if (end < start)
2591 return 0;
2592
2593 if (direction > 0) {
2594 if (Py_UNICODE_MATCH(self, end, substring))
2595 return 1;
2596 } else {
2597 if (Py_UNICODE_MATCH(self, start, substring))
2598 return 1;
2599 }
2600
2601 return 0;
2602}
2603
2604int PyUnicode_Tailmatch(PyObject *str,
2605 PyObject *substr,
2606 int start,
2607 int end,
2608 int direction)
2609{
2610 int result;
2611
2612 str = PyUnicode_FromObject(str);
2613 if (str == NULL)
2614 return -1;
2615 substr = PyUnicode_FromObject(substr);
2616 if (substr == NULL) {
2617 Py_DECREF(substr);
2618 return -1;
2619 }
2620
2621 result = tailmatch((PyUnicodeObject *)str,
2622 (PyUnicodeObject *)substr,
2623 start, end, direction);
2624 Py_DECREF(str);
2625 Py_DECREF(substr);
2626 return result;
2627}
2628
2629static
2630const Py_UNICODE *findchar(const Py_UNICODE *s,
2631 int size,
2632 Py_UNICODE ch)
2633{
2634 /* like wcschr, but doesn't stop at NULL characters */
2635
2636 while (size-- > 0) {
2637 if (*s == ch)
2638 return s;
2639 s++;
2640 }
2641
2642 return NULL;
2643}
2644
2645/* Apply fixfct filter to the Unicode object self and return a
2646 reference to the modified object */
2647
2648static
2649PyObject *fixup(PyUnicodeObject *self,
2650 int (*fixfct)(PyUnicodeObject *s))
2651{
2652
2653 PyUnicodeObject *u;
2654
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002655 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002656 if (u == NULL)
2657 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002658
2659 Py_UNICODE_COPY(u->str, self->str, self->length);
2660
Guido van Rossumd57fd912000-03-10 22:53:23 +00002661 if (!fixfct(u)) {
2662 /* fixfct should return TRUE if it modified the buffer. If
2663 FALSE, return a reference to the original buffer instead
2664 (to save space, not time) */
2665 Py_INCREF(self);
2666 Py_DECREF(u);
2667 return (PyObject*) self;
2668 }
2669 return (PyObject*) u;
2670}
2671
2672static
2673int fixupper(PyUnicodeObject *self)
2674{
2675 int len = self->length;
2676 Py_UNICODE *s = self->str;
2677 int status = 0;
2678
2679 while (len-- > 0) {
2680 register Py_UNICODE ch;
2681
2682 ch = Py_UNICODE_TOUPPER(*s);
2683 if (ch != *s) {
2684 status = 1;
2685 *s = ch;
2686 }
2687 s++;
2688 }
2689
2690 return status;
2691}
2692
2693static
2694int fixlower(PyUnicodeObject *self)
2695{
2696 int len = self->length;
2697 Py_UNICODE *s = self->str;
2698 int status = 0;
2699
2700 while (len-- > 0) {
2701 register Py_UNICODE ch;
2702
2703 ch = Py_UNICODE_TOLOWER(*s);
2704 if (ch != *s) {
2705 status = 1;
2706 *s = ch;
2707 }
2708 s++;
2709 }
2710
2711 return status;
2712}
2713
2714static
2715int fixswapcase(PyUnicodeObject *self)
2716{
2717 int len = self->length;
2718 Py_UNICODE *s = self->str;
2719 int status = 0;
2720
2721 while (len-- > 0) {
2722 if (Py_UNICODE_ISUPPER(*s)) {
2723 *s = Py_UNICODE_TOLOWER(*s);
2724 status = 1;
2725 } else if (Py_UNICODE_ISLOWER(*s)) {
2726 *s = Py_UNICODE_TOUPPER(*s);
2727 status = 1;
2728 }
2729 s++;
2730 }
2731
2732 return status;
2733}
2734
2735static
2736int fixcapitalize(PyUnicodeObject *self)
2737{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002738 int len = self->length;
2739 Py_UNICODE *s = self->str;
2740 int status = 0;
2741
2742 if (len == 0)
2743 return 0;
2744 if (Py_UNICODE_ISLOWER(*s)) {
2745 *s = Py_UNICODE_TOUPPER(*s);
2746 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002747 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002748 s++;
2749 while (--len > 0) {
2750 if (Py_UNICODE_ISUPPER(*s)) {
2751 *s = Py_UNICODE_TOLOWER(*s);
2752 status = 1;
2753 }
2754 s++;
2755 }
2756 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002757}
2758
2759static
2760int fixtitle(PyUnicodeObject *self)
2761{
2762 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2763 register Py_UNICODE *e;
2764 int previous_is_cased;
2765
2766 /* Shortcut for single character strings */
2767 if (PyUnicode_GET_SIZE(self) == 1) {
2768 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2769 if (*p != ch) {
2770 *p = ch;
2771 return 1;
2772 }
2773 else
2774 return 0;
2775 }
2776
2777 e = p + PyUnicode_GET_SIZE(self);
2778 previous_is_cased = 0;
2779 for (; p < e; p++) {
2780 register const Py_UNICODE ch = *p;
2781
2782 if (previous_is_cased)
2783 *p = Py_UNICODE_TOLOWER(ch);
2784 else
2785 *p = Py_UNICODE_TOTITLE(ch);
2786
2787 if (Py_UNICODE_ISLOWER(ch) ||
2788 Py_UNICODE_ISUPPER(ch) ||
2789 Py_UNICODE_ISTITLE(ch))
2790 previous_is_cased = 1;
2791 else
2792 previous_is_cased = 0;
2793 }
2794 return 1;
2795}
2796
2797PyObject *PyUnicode_Join(PyObject *separator,
2798 PyObject *seq)
2799{
2800 Py_UNICODE *sep;
2801 int seplen;
2802 PyUnicodeObject *res = NULL;
2803 int reslen = 0;
2804 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002805 int sz = 100;
2806 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00002807 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002808
Tim Peters2cfe3682001-05-05 05:36:48 +00002809 it = PyObject_GetIter(seq);
2810 if (it == NULL)
2811 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002812
2813 if (separator == NULL) {
2814 Py_UNICODE blank = ' ';
2815 sep = &blank;
2816 seplen = 1;
2817 }
2818 else {
2819 separator = PyUnicode_FromObject(separator);
2820 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00002821 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822 sep = PyUnicode_AS_UNICODE(separator);
2823 seplen = PyUnicode_GET_SIZE(separator);
2824 }
2825
2826 res = _PyUnicode_New(sz);
2827 if (res == NULL)
2828 goto onError;
2829 p = PyUnicode_AS_UNICODE(res);
2830 reslen = 0;
2831
Tim Peters2cfe3682001-05-05 05:36:48 +00002832 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002833 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00002834 PyObject *item = PyIter_Next(it);
2835 if (item == NULL) {
2836 if (PyErr_Occurred())
2837 goto onError;
2838 break;
2839 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002840 if (!PyUnicode_Check(item)) {
2841 PyObject *v;
2842 v = PyUnicode_FromObject(item);
2843 Py_DECREF(item);
2844 item = v;
2845 if (item == NULL)
2846 goto onError;
2847 }
2848 itemlen = PyUnicode_GET_SIZE(item);
2849 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002850 if (_PyUnicode_Resize(&res, sz*2))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851 goto onError;
2852 sz *= 2;
2853 p = PyUnicode_AS_UNICODE(res) + reslen;
2854 }
2855 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002856 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002857 p += seplen;
2858 reslen += seplen;
2859 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002860 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861 p += itemlen;
2862 reslen += itemlen;
2863 Py_DECREF(item);
2864 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002865 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002866 goto onError;
2867
2868 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002869 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002870 return (PyObject *)res;
2871
2872 onError:
2873 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002874 Py_XDECREF(res);
2875 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002876 return NULL;
2877}
2878
2879static
2880PyUnicodeObject *pad(PyUnicodeObject *self,
2881 int left,
2882 int right,
2883 Py_UNICODE fill)
2884{
2885 PyUnicodeObject *u;
2886
2887 if (left < 0)
2888 left = 0;
2889 if (right < 0)
2890 right = 0;
2891
2892 if (left == 0 && right == 0) {
2893 Py_INCREF(self);
2894 return self;
2895 }
2896
2897 u = _PyUnicode_New(left + self->length + right);
2898 if (u) {
2899 if (left)
2900 Py_UNICODE_FILL(u->str, fill, left);
2901 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2902 if (right)
2903 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2904 }
2905
2906 return u;
2907}
2908
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expansion is a sequence of statements (it is not wrapped in
   do/while), so use it only as a complete statement inside braces.
   It relies on the caller providing a `PyObject *str` scratch
   variable, a `list` variable and an `onError` label, which is jumped
   to when creating or appending the slice fails.  Arguments are
   parenthesized so that expressions can be passed safely. */
#define SPLIT_APPEND(data, left, right)                                 \
    str = PyUnicode_FromUnicode((data) + (left), (right) - (left));     \
    if (!str)                                                           \
        goto onError;                                                   \
    if (PyList_Append(list, str)) {                                     \
        Py_DECREF(str);                                                 \
        goto onError;                                                   \
    }                                                                   \
    else                                                                \
        Py_DECREF(str);
2919
2920static
2921PyObject *split_whitespace(PyUnicodeObject *self,
2922 PyObject *list,
2923 int maxcount)
2924{
2925 register int i;
2926 register int j;
2927 int len = self->length;
2928 PyObject *str;
2929
2930 for (i = j = 0; i < len; ) {
2931 /* find a token */
2932 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2933 i++;
2934 j = i;
2935 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2936 i++;
2937 if (j < i) {
2938 if (maxcount-- <= 0)
2939 break;
2940 SPLIT_APPEND(self->str, j, i);
2941 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2942 i++;
2943 j = i;
2944 }
2945 }
2946 if (j < len) {
2947 SPLIT_APPEND(self->str, j, len);
2948 }
2949 return list;
2950
2951 onError:
2952 Py_DECREF(list);
2953 return NULL;
2954}
2955
2956PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002957 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002958{
2959 register int i;
2960 register int j;
2961 int len;
2962 PyObject *list;
2963 PyObject *str;
2964 Py_UNICODE *data;
2965
2966 string = PyUnicode_FromObject(string);
2967 if (string == NULL)
2968 return NULL;
2969 data = PyUnicode_AS_UNICODE(string);
2970 len = PyUnicode_GET_SIZE(string);
2971
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972 list = PyList_New(0);
2973 if (!list)
2974 goto onError;
2975
2976 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002977 int eol;
2978
Guido van Rossumd57fd912000-03-10 22:53:23 +00002979 /* Find a line and append it */
2980 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2981 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002982
2983 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002984 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002985 if (i < len) {
2986 if (data[i] == '\r' && i + 1 < len &&
2987 data[i+1] == '\n')
2988 i += 2;
2989 else
2990 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002991 if (keepends)
2992 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002993 }
Guido van Rossum86662912000-04-11 15:38:46 +00002994 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002995 j = i;
2996 }
2997 if (j < len) {
2998 SPLIT_APPEND(data, j, len);
2999 }
3000
3001 Py_DECREF(string);
3002 return list;
3003
3004 onError:
3005 Py_DECREF(list);
3006 Py_DECREF(string);
3007 return NULL;
3008}
3009
3010static
3011PyObject *split_char(PyUnicodeObject *self,
3012 PyObject *list,
3013 Py_UNICODE ch,
3014 int maxcount)
3015{
3016 register int i;
3017 register int j;
3018 int len = self->length;
3019 PyObject *str;
3020
3021 for (i = j = 0; i < len; ) {
3022 if (self->str[i] == ch) {
3023 if (maxcount-- <= 0)
3024 break;
3025 SPLIT_APPEND(self->str, j, i);
3026 i = j = i + 1;
3027 } else
3028 i++;
3029 }
3030 if (j <= len) {
3031 SPLIT_APPEND(self->str, j, len);
3032 }
3033 return list;
3034
3035 onError:
3036 Py_DECREF(list);
3037 return NULL;
3038}
3039
3040static
3041PyObject *split_substring(PyUnicodeObject *self,
3042 PyObject *list,
3043 PyUnicodeObject *substring,
3044 int maxcount)
3045{
3046 register int i;
3047 register int j;
3048 int len = self->length;
3049 int sublen = substring->length;
3050 PyObject *str;
3051
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00003052 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00003053 if (Py_UNICODE_MATCH(self, i, substring)) {
3054 if (maxcount-- <= 0)
3055 break;
3056 SPLIT_APPEND(self->str, j, i);
3057 i = j = i + sublen;
3058 } else
3059 i++;
3060 }
3061 if (j <= len) {
3062 SPLIT_APPEND(self->str, j, len);
3063 }
3064 return list;
3065
3066 onError:
3067 Py_DECREF(list);
3068 return NULL;
3069}
3070
3071#undef SPLIT_APPEND
3072
3073static
3074PyObject *split(PyUnicodeObject *self,
3075 PyUnicodeObject *substring,
3076 int maxcount)
3077{
3078 PyObject *list;
3079
3080 if (maxcount < 0)
3081 maxcount = INT_MAX;
3082
3083 list = PyList_New(0);
3084 if (!list)
3085 return NULL;
3086
3087 if (substring == NULL)
3088 return split_whitespace(self,list,maxcount);
3089
3090 else if (substring->length == 1)
3091 return split_char(self,list,substring->str[0],maxcount);
3092
3093 else if (substring->length == 0) {
3094 Py_DECREF(list);
3095 PyErr_SetString(PyExc_ValueError, "empty separator");
3096 return NULL;
3097 }
3098 else
3099 return split_substring(self,list,substring,maxcount);
3100}
3101
3102static
3103PyObject *strip(PyUnicodeObject *self,
3104 int left,
3105 int right)
3106{
3107 Py_UNICODE *p = self->str;
3108 int start = 0;
3109 int end = self->length;
3110
3111 if (left)
3112 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3113 start++;
3114
3115 if (right)
3116 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3117 end--;
3118
3119 if (start == 0 && end == self->length) {
3120 /* couldn't strip anything off, return original string */
3121 Py_INCREF(self);
3122 return (PyObject*) self;
3123 }
3124
3125 return (PyObject*) PyUnicode_FromUnicode(
3126 self->str + start,
3127 end - start
3128 );
3129}
3130
3131static
3132PyObject *replace(PyUnicodeObject *self,
3133 PyUnicodeObject *str1,
3134 PyUnicodeObject *str2,
3135 int maxcount)
3136{
3137 PyUnicodeObject *u;
3138
3139 if (maxcount < 0)
3140 maxcount = INT_MAX;
3141
3142 if (str1->length == 1 && str2->length == 1) {
3143 int i;
3144
3145 /* replace characters */
3146 if (!findchar(self->str, self->length, str1->str[0])) {
3147 /* nothing to replace, return original string */
3148 Py_INCREF(self);
3149 u = self;
3150 } else {
3151 Py_UNICODE u1 = str1->str[0];
3152 Py_UNICODE u2 = str2->str[0];
3153
3154 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003155 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003156 self->length
3157 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003158 if (u != NULL) {
3159 Py_UNICODE_COPY(u->str, self->str,
3160 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003161 for (i = 0; i < u->length; i++)
3162 if (u->str[i] == u1) {
3163 if (--maxcount < 0)
3164 break;
3165 u->str[i] = u2;
3166 }
3167 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003168 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003169
3170 } else {
3171 int n, i;
3172 Py_UNICODE *p;
3173
3174 /* replace strings */
3175 n = count(self, 0, self->length, str1);
3176 if (n > maxcount)
3177 n = maxcount;
3178 if (n == 0) {
3179 /* nothing to replace, return original string */
3180 Py_INCREF(self);
3181 u = self;
3182 } else {
3183 u = _PyUnicode_New(
3184 self->length + n * (str2->length - str1->length));
3185 if (u) {
3186 i = 0;
3187 p = u->str;
3188 while (i <= self->length - str1->length)
3189 if (Py_UNICODE_MATCH(self, i, str1)) {
3190 /* replace string segment */
3191 Py_UNICODE_COPY(p, str2->str, str2->length);
3192 p += str2->length;
3193 i += str1->length;
3194 if (--n <= 0) {
3195 /* copy remaining part */
3196 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3197 break;
3198 }
3199 } else
3200 *p++ = self->str[i++];
3201 }
3202 }
3203 }
3204
3205 return (PyObject *) u;
3206}
3207
3208/* --- Unicode Object Methods --------------------------------------------- */
3209
3210static char title__doc__[] =
3211"S.title() -> unicode\n\
3212\n\
3213Return a titlecased version of S, i.e. words start with title case\n\
3214characters, all remaining cased characters have lower case.";
3215
3216static PyObject*
3217unicode_title(PyUnicodeObject *self, PyObject *args)
3218{
3219 if (!PyArg_NoArgs(args))
3220 return NULL;
3221 return fixup(self, fixtitle);
3222}
3223
3224static char capitalize__doc__[] =
3225"S.capitalize() -> unicode\n\
3226\n\
3227Return a capitalized version of S, i.e. make the first character\n\
3228have upper case.";
3229
3230static PyObject*
3231unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3232{
3233 if (!PyArg_NoArgs(args))
3234 return NULL;
3235 return fixup(self, fixcapitalize);
3236}
3237
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method S.capwords() (currently compiled out): splits S into words,
   runs fixup() with fixcapitalize on each word and joins the words
   again with PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *list;
    PyObject *item;
    int idx = 0;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    list = split(self, NULL, -1);
    if (list == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    while (idx < PyList_GET_SIZE(list)) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, idx),
                     fixcapitalize);
        if (item == NULL) {
            Py_DECREF(list);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(list, idx));
        PyList_SET_ITEM(list, idx, item);
        idx++;
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
3278
3279static char center__doc__[] =
3280"S.center(width) -> unicode\n\
3281\n\
3282Return S centered in a Unicode string of length width. Padding is done\n\
3283using spaces.";
3284
3285static PyObject *
3286unicode_center(PyUnicodeObject *self, PyObject *args)
3287{
3288 int marg, left;
3289 int width;
3290
3291 if (!PyArg_ParseTuple(args, "i:center", &width))
3292 return NULL;
3293
3294 if (self->length >= width) {
3295 Py_INCREF(self);
3296 return (PyObject*) self;
3297 }
3298
3299 marg = width - self->length;
3300 left = marg / 2 + (marg & width & 1);
3301
3302 return (PyObject*) pad(self, left, marg - left, ' ');
3303}
3304
Marc-André Lemburge5034372000-08-08 08:04:29 +00003305#if 0
3306
3307/* This code should go into some future Unicode collation support
3308 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003309 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003310
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003311/* speedy UTF-16 code point order comparison */
3312/* gleaned from: */
3313/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3314
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003315static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003316{
3317 0, 0, 0, 0, 0, 0, 0, 0,
3318 0, 0, 0, 0, 0, 0, 0, 0,
3319 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003320 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003321};
3322
Guido van Rossumd57fd912000-03-10 22:53:23 +00003323static int
3324unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3325{
3326 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003327
Guido van Rossumd57fd912000-03-10 22:53:23 +00003328 Py_UNICODE *s1 = str1->str;
3329 Py_UNICODE *s2 = str2->str;
3330
3331 len1 = str1->length;
3332 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003333
Guido van Rossumd57fd912000-03-10 22:53:23 +00003334 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003335 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003336
3337 c1 = *s1++;
3338 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003339
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003340 if (c1 > (1<<11) * 26)
3341 c1 += utf16Fixup[c1>>11];
3342 if (c2 > (1<<11) * 26)
3343 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003344 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003345
3346 if (c1 != c2)
3347 return (c1 < c2) ? -1 : 1;
3348
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003349 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350 }
3351
3352 return (len1 < len2) ? -1 : (len1 != len2);
3353}
3354
Marc-André Lemburge5034372000-08-08 08:04:29 +00003355#else
3356
3357static int
3358unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3359{
3360 register int len1, len2;
3361
3362 Py_UNICODE *s1 = str1->str;
3363 Py_UNICODE *s2 = str2->str;
3364
3365 len1 = str1->length;
3366 len2 = str2->length;
3367
3368 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003369 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003370
Fredrik Lundh45714e92001-06-26 16:39:36 +00003371 c1 = *s1++;
3372 c2 = *s2++;
3373
3374 if (c1 != c2)
3375 return (c1 < c2) ? -1 : 1;
3376
Marc-André Lemburge5034372000-08-08 08:04:29 +00003377 len1--; len2--;
3378 }
3379
3380 return (len1 < len2) ? -1 : (len1 != len2);
3381}
3382
3383#endif
3384
Guido van Rossumd57fd912000-03-10 22:53:23 +00003385int PyUnicode_Compare(PyObject *left,
3386 PyObject *right)
3387{
3388 PyUnicodeObject *u = NULL, *v = NULL;
3389 int result;
3390
3391 /* Coerce the two arguments */
3392 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3393 if (u == NULL)
3394 goto onError;
3395 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3396 if (v == NULL)
3397 goto onError;
3398
Thomas Wouters7e474022000-07-16 12:04:32 +00003399 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003400 if (v == u) {
3401 Py_DECREF(u);
3402 Py_DECREF(v);
3403 return 0;
3404 }
3405
3406 result = unicode_compare(u, v);
3407
3408 Py_DECREF(u);
3409 Py_DECREF(v);
3410 return result;
3411
3412onError:
3413 Py_XDECREF(u);
3414 Py_XDECREF(v);
3415 return -1;
3416}
3417
Guido van Rossum403d68b2000-03-13 15:55:09 +00003418int PyUnicode_Contains(PyObject *container,
3419 PyObject *element)
3420{
3421 PyUnicodeObject *u = NULL, *v = NULL;
3422 int result;
3423 register const Py_UNICODE *p, *e;
3424 register Py_UNICODE ch;
3425
3426 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003427 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003428 if (v == NULL) {
3429 PyErr_SetString(PyExc_TypeError,
3430 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003431 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003432 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003433 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3434 if (u == NULL) {
3435 Py_DECREF(v);
3436 goto onError;
3437 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003438
3439 /* Check v in u */
3440 if (PyUnicode_GET_SIZE(v) != 1) {
3441 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003442 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003443 goto onError;
3444 }
3445 ch = *PyUnicode_AS_UNICODE(v);
3446 p = PyUnicode_AS_UNICODE(u);
3447 e = p + PyUnicode_GET_SIZE(u);
3448 result = 0;
3449 while (p < e) {
3450 if (*p++ == ch) {
3451 result = 1;
3452 break;
3453 }
3454 }
3455
3456 Py_DECREF(u);
3457 Py_DECREF(v);
3458 return result;
3459
3460onError:
3461 Py_XDECREF(u);
3462 Py_XDECREF(v);
3463 return -1;
3464}
3465
Guido van Rossumd57fd912000-03-10 22:53:23 +00003466/* Concat to string or Unicode object giving a new Unicode object. */
3467
3468PyObject *PyUnicode_Concat(PyObject *left,
3469 PyObject *right)
3470{
3471 PyUnicodeObject *u = NULL, *v = NULL, *w;
3472
3473 /* Coerce the two arguments */
3474 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3475 if (u == NULL)
3476 goto onError;
3477 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3478 if (v == NULL)
3479 goto onError;
3480
3481 /* Shortcuts */
3482 if (v == unicode_empty) {
3483 Py_DECREF(v);
3484 return (PyObject *)u;
3485 }
3486 if (u == unicode_empty) {
3487 Py_DECREF(u);
3488 return (PyObject *)v;
3489 }
3490
3491 /* Concat the two Unicode strings */
3492 w = _PyUnicode_New(u->length + v->length);
3493 if (w == NULL)
3494 goto onError;
3495 Py_UNICODE_COPY(w->str, u->str, u->length);
3496 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3497
3498 Py_DECREF(u);
3499 Py_DECREF(v);
3500 return (PyObject *)w;
3501
3502onError:
3503 Py_XDECREF(u);
3504 Py_XDECREF(v);
3505 return NULL;
3506}
3507
3508static char count__doc__[] =
3509"S.count(sub[, start[, end]]) -> int\n\
3510\n\
3511Return the number of occurrences of substring sub in Unicode string\n\
3512S[start:end]. Optional arguments start and end are\n\
3513interpreted as in slice notation.";
3514
3515static PyObject *
3516unicode_count(PyUnicodeObject *self, PyObject *args)
3517{
3518 PyUnicodeObject *substring;
3519 int start = 0;
3520 int end = INT_MAX;
3521 PyObject *result;
3522
Guido van Rossumb8872e62000-05-09 14:14:27 +00003523 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3524 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003525 return NULL;
3526
3527 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3528 (PyObject *)substring);
3529 if (substring == NULL)
3530 return NULL;
3531
Guido van Rossumd57fd912000-03-10 22:53:23 +00003532 if (start < 0)
3533 start += self->length;
3534 if (start < 0)
3535 start = 0;
3536 if (end > self->length)
3537 end = self->length;
3538 if (end < 0)
3539 end += self->length;
3540 if (end < 0)
3541 end = 0;
3542
3543 result = PyInt_FromLong((long) count(self, start, end, substring));
3544
3545 Py_DECREF(substring);
3546 return result;
3547}
3548
3549static char encode__doc__[] =
3550"S.encode([encoding[,errors]]) -> string\n\
3551\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003552Return an encoded string version of S. Default encoding is the current\n\
3553default string encoding. errors may be given to set a different error\n\
3554handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3555a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556
3557static PyObject *
3558unicode_encode(PyUnicodeObject *self, PyObject *args)
3559{
3560 char *encoding = NULL;
3561 char *errors = NULL;
3562 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3563 return NULL;
3564 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3565}
3566
3567static char expandtabs__doc__[] =
3568"S.expandtabs([tabsize]) -> unicode\n\
3569\n\
3570Return a copy of S where all tab characters are expanded using spaces.\n\
3571If tabsize is not given, a tab size of 8 characters is assumed.";
3572
3573static PyObject*
3574unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3575{
3576 Py_UNICODE *e;
3577 Py_UNICODE *p;
3578 Py_UNICODE *q;
3579 int i, j;
3580 PyUnicodeObject *u;
3581 int tabsize = 8;
3582
3583 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3584 return NULL;
3585
Thomas Wouters7e474022000-07-16 12:04:32 +00003586 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587 i = j = 0;
3588 e = self->str + self->length;
3589 for (p = self->str; p < e; p++)
3590 if (*p == '\t') {
3591 if (tabsize > 0)
3592 j += tabsize - (j % tabsize);
3593 }
3594 else {
3595 j++;
3596 if (*p == '\n' || *p == '\r') {
3597 i += j;
3598 j = 0;
3599 }
3600 }
3601
3602 /* Second pass: create output string and fill it */
3603 u = _PyUnicode_New(i + j);
3604 if (!u)
3605 return NULL;
3606
3607 j = 0;
3608 q = u->str;
3609
3610 for (p = self->str; p < e; p++)
3611 if (*p == '\t') {
3612 if (tabsize > 0) {
3613 i = tabsize - (j % tabsize);
3614 j += i;
3615 while (i--)
3616 *q++ = ' ';
3617 }
3618 }
3619 else {
3620 j++;
3621 *q++ = *p;
3622 if (*p == '\n' || *p == '\r')
3623 j = 0;
3624 }
3625
3626 return (PyObject*) u;
3627}
3628
3629static char find__doc__[] =
3630"S.find(sub [,start [,end]]) -> int\n\
3631\n\
3632Return the lowest index in S where substring sub is found,\n\
3633such that sub is contained within s[start,end]. Optional\n\
3634arguments start and end are interpreted as in slice notation.\n\
3635\n\
3636Return -1 on failure.";
3637
3638static PyObject *
3639unicode_find(PyUnicodeObject *self, PyObject *args)
3640{
3641 PyUnicodeObject *substring;
3642 int start = 0;
3643 int end = INT_MAX;
3644 PyObject *result;
3645
Guido van Rossumb8872e62000-05-09 14:14:27 +00003646 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3647 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003648 return NULL;
3649 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3650 (PyObject *)substring);
3651 if (substring == NULL)
3652 return NULL;
3653
3654 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3655
3656 Py_DECREF(substring);
3657 return result;
3658}
3659
3660static PyObject *
3661unicode_getitem(PyUnicodeObject *self, int index)
3662{
3663 if (index < 0 || index >= self->length) {
3664 PyErr_SetString(PyExc_IndexError, "string index out of range");
3665 return NULL;
3666 }
3667
3668 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3669}
3670
3671static long
3672unicode_hash(PyUnicodeObject *self)
3673{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003674 /* Since Unicode objects compare equal to their ASCII string
3675 counterparts, they should use the individual character values
3676 as basis for their hash value. This is needed to assure that
3677 strings and Unicode objects behave in the same way as
3678 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003679
Fredrik Lundhdde61642000-07-10 18:27:47 +00003680 register int len;
3681 register Py_UNICODE *p;
3682 register long x;
3683
Guido van Rossumd57fd912000-03-10 22:53:23 +00003684 if (self->hash != -1)
3685 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003686 len = PyUnicode_GET_SIZE(self);
3687 p = PyUnicode_AS_UNICODE(self);
3688 x = *p << 7;
3689 while (--len >= 0)
3690 x = (1000003*x) ^ *p++;
3691 x ^= PyUnicode_GET_SIZE(self);
3692 if (x == -1)
3693 x = -2;
3694 self->hash = x;
3695 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003696}
3697
3698static char index__doc__[] =
3699"S.index(sub [,start [,end]]) -> int\n\
3700\n\
3701Like S.find() but raise ValueError when the substring is not found.";
3702
3703static PyObject *
3704unicode_index(PyUnicodeObject *self, PyObject *args)
3705{
3706 int result;
3707 PyUnicodeObject *substring;
3708 int start = 0;
3709 int end = INT_MAX;
3710
Guido van Rossumb8872e62000-05-09 14:14:27 +00003711 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3712 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 return NULL;
3714
3715 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3716 (PyObject *)substring);
3717 if (substring == NULL)
3718 return NULL;
3719
3720 result = findstring(self, substring, start, end, 1);
3721
3722 Py_DECREF(substring);
3723 if (result < 0) {
3724 PyErr_SetString(PyExc_ValueError, "substring not found");
3725 return NULL;
3726 }
3727 return PyInt_FromLong(result);
3728}
3729
3730static char islower__doc__[] =
3731"S.islower() -> int\n\
3732\n\
3733Return 1 if all cased characters in S are lowercase and there is\n\
3734at least one cased character in S, 0 otherwise.";
3735
3736static PyObject*
3737unicode_islower(PyUnicodeObject *self, PyObject *args)
3738{
3739 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3740 register const Py_UNICODE *e;
3741 int cased;
3742
3743 if (!PyArg_NoArgs(args))
3744 return NULL;
3745
3746 /* Shortcut for single character strings */
3747 if (PyUnicode_GET_SIZE(self) == 1)
3748 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3749
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003750 /* Special case for empty strings */
3751 if (PyString_GET_SIZE(self) == 0)
3752 return PyInt_FromLong(0);
3753
Guido van Rossumd57fd912000-03-10 22:53:23 +00003754 e = p + PyUnicode_GET_SIZE(self);
3755 cased = 0;
3756 for (; p < e; p++) {
3757 register const Py_UNICODE ch = *p;
3758
3759 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3760 return PyInt_FromLong(0);
3761 else if (!cased && Py_UNICODE_ISLOWER(ch))
3762 cased = 1;
3763 }
3764 return PyInt_FromLong(cased);
3765}
3766
3767static char isupper__doc__[] =
3768"S.isupper() -> int\n\
3769\n\
3770Return 1 if all cased characters in S are uppercase and there is\n\
3771at least one cased character in S, 0 otherwise.";
3772
3773static PyObject*
3774unicode_isupper(PyUnicodeObject *self, PyObject *args)
3775{
3776 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3777 register const Py_UNICODE *e;
3778 int cased;
3779
3780 if (!PyArg_NoArgs(args))
3781 return NULL;
3782
3783 /* Shortcut for single character strings */
3784 if (PyUnicode_GET_SIZE(self) == 1)
3785 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3786
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003787 /* Special case for empty strings */
3788 if (PyString_GET_SIZE(self) == 0)
3789 return PyInt_FromLong(0);
3790
Guido van Rossumd57fd912000-03-10 22:53:23 +00003791 e = p + PyUnicode_GET_SIZE(self);
3792 cased = 0;
3793 for (; p < e; p++) {
3794 register const Py_UNICODE ch = *p;
3795
3796 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3797 return PyInt_FromLong(0);
3798 else if (!cased && Py_UNICODE_ISUPPER(ch))
3799 cased = 1;
3800 }
3801 return PyInt_FromLong(cased);
3802}
3803
3804static char istitle__doc__[] =
3805"S.istitle() -> int\n\
3806\n\
3807Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3808may only follow uncased characters and lowercase characters only cased\n\
3809ones. Return 0 otherwise.";
3810
3811static PyObject*
3812unicode_istitle(PyUnicodeObject *self, PyObject *args)
3813{
3814 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3815 register const Py_UNICODE *e;
3816 int cased, previous_is_cased;
3817
3818 if (!PyArg_NoArgs(args))
3819 return NULL;
3820
3821 /* Shortcut for single character strings */
3822 if (PyUnicode_GET_SIZE(self) == 1)
3823 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3824 (Py_UNICODE_ISUPPER(*p) != 0));
3825
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003826 /* Special case for empty strings */
3827 if (PyString_GET_SIZE(self) == 0)
3828 return PyInt_FromLong(0);
3829
Guido van Rossumd57fd912000-03-10 22:53:23 +00003830 e = p + PyUnicode_GET_SIZE(self);
3831 cased = 0;
3832 previous_is_cased = 0;
3833 for (; p < e; p++) {
3834 register const Py_UNICODE ch = *p;
3835
3836 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3837 if (previous_is_cased)
3838 return PyInt_FromLong(0);
3839 previous_is_cased = 1;
3840 cased = 1;
3841 }
3842 else if (Py_UNICODE_ISLOWER(ch)) {
3843 if (!previous_is_cased)
3844 return PyInt_FromLong(0);
3845 previous_is_cased = 1;
3846 cased = 1;
3847 }
3848 else
3849 previous_is_cased = 0;
3850 }
3851 return PyInt_FromLong(cased);
3852}
3853
3854static char isspace__doc__[] =
3855"S.isspace() -> int\n\
3856\n\
3857Return 1 if there are only whitespace characters in S,\n\
38580 otherwise.";
3859
3860static PyObject*
3861unicode_isspace(PyUnicodeObject *self, PyObject *args)
3862{
3863 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3864 register const Py_UNICODE *e;
3865
3866 if (!PyArg_NoArgs(args))
3867 return NULL;
3868
3869 /* Shortcut for single character strings */
3870 if (PyUnicode_GET_SIZE(self) == 1 &&
3871 Py_UNICODE_ISSPACE(*p))
3872 return PyInt_FromLong(1);
3873
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003874 /* Special case for empty strings */
3875 if (PyString_GET_SIZE(self) == 0)
3876 return PyInt_FromLong(0);
3877
Guido van Rossumd57fd912000-03-10 22:53:23 +00003878 e = p + PyUnicode_GET_SIZE(self);
3879 for (; p < e; p++) {
3880 if (!Py_UNICODE_ISSPACE(*p))
3881 return PyInt_FromLong(0);
3882 }
3883 return PyInt_FromLong(1);
3884}
3885
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003886static char isalpha__doc__[] =
3887"S.isalpha() -> int\n\
3888\n\
3889Return 1 if all characters in S are alphabetic\n\
3890and there is at least one character in S, 0 otherwise.";
3891
3892static PyObject*
3893unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3894{
3895 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3896 register const Py_UNICODE *e;
3897
3898 if (!PyArg_NoArgs(args))
3899 return NULL;
3900
3901 /* Shortcut for single character strings */
3902 if (PyUnicode_GET_SIZE(self) == 1 &&
3903 Py_UNICODE_ISALPHA(*p))
3904 return PyInt_FromLong(1);
3905
3906 /* Special case for empty strings */
3907 if (PyString_GET_SIZE(self) == 0)
3908 return PyInt_FromLong(0);
3909
3910 e = p + PyUnicode_GET_SIZE(self);
3911 for (; p < e; p++) {
3912 if (!Py_UNICODE_ISALPHA(*p))
3913 return PyInt_FromLong(0);
3914 }
3915 return PyInt_FromLong(1);
3916}
3917
3918static char isalnum__doc__[] =
3919"S.isalnum() -> int\n\
3920\n\
3921Return 1 if all characters in S are alphanumeric\n\
3922and there is at least one character in S, 0 otherwise.";
3923
3924static PyObject*
3925unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3926{
3927 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3928 register const Py_UNICODE *e;
3929
3930 if (!PyArg_NoArgs(args))
3931 return NULL;
3932
3933 /* Shortcut for single character strings */
3934 if (PyUnicode_GET_SIZE(self) == 1 &&
3935 Py_UNICODE_ISALNUM(*p))
3936 return PyInt_FromLong(1);
3937
3938 /* Special case for empty strings */
3939 if (PyString_GET_SIZE(self) == 0)
3940 return PyInt_FromLong(0);
3941
3942 e = p + PyUnicode_GET_SIZE(self);
3943 for (; p < e; p++) {
3944 if (!Py_UNICODE_ISALNUM(*p))
3945 return PyInt_FromLong(0);
3946 }
3947 return PyInt_FromLong(1);
3948}
3949
Guido van Rossumd57fd912000-03-10 22:53:23 +00003950static char isdecimal__doc__[] =
3951"S.isdecimal() -> int\n\
3952\n\
3953Return 1 if there are only decimal characters in S,\n\
39540 otherwise.";
3955
3956static PyObject*
3957unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3958{
3959 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3960 register const Py_UNICODE *e;
3961
3962 if (!PyArg_NoArgs(args))
3963 return NULL;
3964
3965 /* Shortcut for single character strings */
3966 if (PyUnicode_GET_SIZE(self) == 1 &&
3967 Py_UNICODE_ISDECIMAL(*p))
3968 return PyInt_FromLong(1);
3969
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003970 /* Special case for empty strings */
3971 if (PyString_GET_SIZE(self) == 0)
3972 return PyInt_FromLong(0);
3973
Guido van Rossumd57fd912000-03-10 22:53:23 +00003974 e = p + PyUnicode_GET_SIZE(self);
3975 for (; p < e; p++) {
3976 if (!Py_UNICODE_ISDECIMAL(*p))
3977 return PyInt_FromLong(0);
3978 }
3979 return PyInt_FromLong(1);
3980}
3981
3982static char isdigit__doc__[] =
3983"S.isdigit() -> int\n\
3984\n\
3985Return 1 if there are only digit characters in S,\n\
39860 otherwise.";
3987
3988static PyObject*
3989unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3990{
3991 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3992 register const Py_UNICODE *e;
3993
3994 if (!PyArg_NoArgs(args))
3995 return NULL;
3996
3997 /* Shortcut for single character strings */
3998 if (PyUnicode_GET_SIZE(self) == 1 &&
3999 Py_UNICODE_ISDIGIT(*p))
4000 return PyInt_FromLong(1);
4001
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004002 /* Special case for empty strings */
4003 if (PyString_GET_SIZE(self) == 0)
4004 return PyInt_FromLong(0);
4005
Guido van Rossumd57fd912000-03-10 22:53:23 +00004006 e = p + PyUnicode_GET_SIZE(self);
4007 for (; p < e; p++) {
4008 if (!Py_UNICODE_ISDIGIT(*p))
4009 return PyInt_FromLong(0);
4010 }
4011 return PyInt_FromLong(1);
4012}
4013
4014static char isnumeric__doc__[] =
4015"S.isnumeric() -> int\n\
4016\n\
4017Return 1 if there are only numeric characters in S,\n\
40180 otherwise.";
4019
4020static PyObject*
4021unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
4022{
4023 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
4024 register const Py_UNICODE *e;
4025
4026 if (!PyArg_NoArgs(args))
4027 return NULL;
4028
4029 /* Shortcut for single character strings */
4030 if (PyUnicode_GET_SIZE(self) == 1 &&
4031 Py_UNICODE_ISNUMERIC(*p))
4032 return PyInt_FromLong(1);
4033
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00004034 /* Special case for empty strings */
4035 if (PyString_GET_SIZE(self) == 0)
4036 return PyInt_FromLong(0);
4037
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038 e = p + PyUnicode_GET_SIZE(self);
4039 for (; p < e; p++) {
4040 if (!Py_UNICODE_ISNUMERIC(*p))
4041 return PyInt_FromLong(0);
4042 }
4043 return PyInt_FromLong(1);
4044}
4045
4046static char join__doc__[] =
4047"S.join(sequence) -> unicode\n\
4048\n\
4049Return a string which is the concatenation of the strings in the\n\
4050sequence. The separator between elements is S.";
4051
4052static PyObject*
4053unicode_join(PyUnicodeObject *self, PyObject *args)
4054{
4055 PyObject *data;
4056 if (!PyArg_ParseTuple(args, "O:join", &data))
4057 return NULL;
4058
4059 return PyUnicode_Join((PyObject *)self, data);
4060}
4061
4062static int
4063unicode_length(PyUnicodeObject *self)
4064{
4065 return self->length;
4066}
4067
4068static char ljust__doc__[] =
4069"S.ljust(width) -> unicode\n\
4070\n\
4071Return S left justified in a Unicode string of length width. Padding is\n\
4072done using spaces.";
4073
4074static PyObject *
4075unicode_ljust(PyUnicodeObject *self, PyObject *args)
4076{
4077 int width;
4078 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4079 return NULL;
4080
4081 if (self->length >= width) {
4082 Py_INCREF(self);
4083 return (PyObject*) self;
4084 }
4085
4086 return (PyObject*) pad(self, 0, width - self->length, ' ');
4087}
4088
4089static char lower__doc__[] =
4090"S.lower() -> unicode\n\
4091\n\
4092Return a copy of the string S converted to lowercase.";
4093
4094static PyObject*
4095unicode_lower(PyUnicodeObject *self, PyObject *args)
4096{
4097 if (!PyArg_NoArgs(args))
4098 return NULL;
4099 return fixup(self, fixlower);
4100}
4101
4102static char lstrip__doc__[] =
4103"S.lstrip() -> unicode\n\
4104\n\
4105Return a copy of the string S with leading whitespace removed.";
4106
4107static PyObject *
4108unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4109{
4110 if (!PyArg_NoArgs(args))
4111 return NULL;
4112 return strip(self, 1, 0);
4113}
4114
4115static PyObject*
4116unicode_repeat(PyUnicodeObject *str, int len)
4117{
4118 PyUnicodeObject *u;
4119 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004120 int nchars;
4121 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004122
4123 if (len < 0)
4124 len = 0;
4125
4126 if (len == 1) {
4127 /* no repeat, return original string */
4128 Py_INCREF(str);
4129 return (PyObject*) str;
4130 }
Tim Peters8f422462000-09-09 06:13:41 +00004131
4132 /* ensure # of chars needed doesn't overflow int and # of bytes
4133 * needed doesn't overflow size_t
4134 */
4135 nchars = len * str->length;
4136 if (len && nchars / len != str->length) {
4137 PyErr_SetString(PyExc_OverflowError,
4138 "repeated string is too long");
4139 return NULL;
4140 }
4141 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4142 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4143 PyErr_SetString(PyExc_OverflowError,
4144 "repeated string is too long");
4145 return NULL;
4146 }
4147 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004148 if (!u)
4149 return NULL;
4150
4151 p = u->str;
4152
4153 while (len-- > 0) {
4154 Py_UNICODE_COPY(p, str->str, str->length);
4155 p += str->length;
4156 }
4157
4158 return (PyObject*) u;
4159}
4160
4161PyObject *PyUnicode_Replace(PyObject *obj,
4162 PyObject *subobj,
4163 PyObject *replobj,
4164 int maxcount)
4165{
4166 PyObject *self;
4167 PyObject *str1;
4168 PyObject *str2;
4169 PyObject *result;
4170
4171 self = PyUnicode_FromObject(obj);
4172 if (self == NULL)
4173 return NULL;
4174 str1 = PyUnicode_FromObject(subobj);
4175 if (str1 == NULL) {
4176 Py_DECREF(self);
4177 return NULL;
4178 }
4179 str2 = PyUnicode_FromObject(replobj);
4180 if (str2 == NULL) {
4181 Py_DECREF(self);
4182 Py_DECREF(str1);
4183 return NULL;
4184 }
4185 result = replace((PyUnicodeObject *)self,
4186 (PyUnicodeObject *)str1,
4187 (PyUnicodeObject *)str2,
4188 maxcount);
4189 Py_DECREF(self);
4190 Py_DECREF(str1);
4191 Py_DECREF(str2);
4192 return result;
4193}
4194
4195static char replace__doc__[] =
4196"S.replace (old, new[, maxsplit]) -> unicode\n\
4197\n\
4198Return a copy of S with all occurrences of substring\n\
4199old replaced by new. If the optional argument maxsplit is\n\
4200given, only the first maxsplit occurrences are replaced.";
4201
4202static PyObject*
4203unicode_replace(PyUnicodeObject *self, PyObject *args)
4204{
4205 PyUnicodeObject *str1;
4206 PyUnicodeObject *str2;
4207 int maxcount = -1;
4208 PyObject *result;
4209
4210 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4211 return NULL;
4212 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4213 if (str1 == NULL)
4214 return NULL;
4215 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4216 if (str2 == NULL)
4217 return NULL;
4218
4219 result = replace(self, str1, str2, maxcount);
4220
4221 Py_DECREF(str1);
4222 Py_DECREF(str2);
4223 return result;
4224}
4225
4226static
4227PyObject *unicode_repr(PyObject *unicode)
4228{
4229 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4230 PyUnicode_GET_SIZE(unicode),
4231 1);
4232}
4233
4234static char rfind__doc__[] =
4235"S.rfind(sub [,start [,end]]) -> int\n\
4236\n\
4237Return the highest index in S where substring sub is found,\n\
4238such that sub is contained within s[start,end]. Optional\n\
4239arguments start and end are interpreted as in slice notation.\n\
4240\n\
4241Return -1 on failure.";
4242
4243static PyObject *
4244unicode_rfind(PyUnicodeObject *self, PyObject *args)
4245{
4246 PyUnicodeObject *substring;
4247 int start = 0;
4248 int end = INT_MAX;
4249 PyObject *result;
4250
Guido van Rossumb8872e62000-05-09 14:14:27 +00004251 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4252 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004253 return NULL;
4254 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4255 (PyObject *)substring);
4256 if (substring == NULL)
4257 return NULL;
4258
4259 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4260
4261 Py_DECREF(substring);
4262 return result;
4263}
4264
4265static char rindex__doc__[] =
4266"S.rindex(sub [,start [,end]]) -> int\n\
4267\n\
4268Like S.rfind() but raise ValueError when the substring is not found.";
4269
4270static PyObject *
4271unicode_rindex(PyUnicodeObject *self, PyObject *args)
4272{
4273 int result;
4274 PyUnicodeObject *substring;
4275 int start = 0;
4276 int end = INT_MAX;
4277
Guido van Rossumb8872e62000-05-09 14:14:27 +00004278 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4279 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280 return NULL;
4281 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4282 (PyObject *)substring);
4283 if (substring == NULL)
4284 return NULL;
4285
4286 result = findstring(self, substring, start, end, -1);
4287
4288 Py_DECREF(substring);
4289 if (result < 0) {
4290 PyErr_SetString(PyExc_ValueError, "substring not found");
4291 return NULL;
4292 }
4293 return PyInt_FromLong(result);
4294}
4295
4296static char rjust__doc__[] =
4297"S.rjust(width) -> unicode\n\
4298\n\
4299Return S right justified in a Unicode string of length width. Padding is\n\
4300done using spaces.";
4301
4302static PyObject *
4303unicode_rjust(PyUnicodeObject *self, PyObject *args)
4304{
4305 int width;
4306 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4307 return NULL;
4308
4309 if (self->length >= width) {
4310 Py_INCREF(self);
4311 return (PyObject*) self;
4312 }
4313
4314 return (PyObject*) pad(self, width - self->length, 0, ' ');
4315}
4316
4317static char rstrip__doc__[] =
4318"S.rstrip() -> unicode\n\
4319\n\
4320Return a copy of the string S with trailing whitespace removed.";
4321
4322static PyObject *
4323unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4324{
4325 if (!PyArg_NoArgs(args))
4326 return NULL;
4327 return strip(self, 0, 1);
4328}
4329
4330static PyObject*
4331unicode_slice(PyUnicodeObject *self, int start, int end)
4332{
4333 /* standard clamping */
4334 if (start < 0)
4335 start = 0;
4336 if (end < 0)
4337 end = 0;
4338 if (end > self->length)
4339 end = self->length;
4340 if (start == 0 && end == self->length) {
4341 /* full slice, return original string */
4342 Py_INCREF(self);
4343 return (PyObject*) self;
4344 }
4345 if (start > end)
4346 start = end;
4347 /* copy slice */
4348 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4349 end - start);
4350}
4351
4352PyObject *PyUnicode_Split(PyObject *s,
4353 PyObject *sep,
4354 int maxsplit)
4355{
4356 PyObject *result;
4357
4358 s = PyUnicode_FromObject(s);
4359 if (s == NULL)
4360 return NULL;
4361 if (sep != NULL) {
4362 sep = PyUnicode_FromObject(sep);
4363 if (sep == NULL) {
4364 Py_DECREF(s);
4365 return NULL;
4366 }
4367 }
4368
4369 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4370
4371 Py_DECREF(s);
4372 Py_XDECREF(sep);
4373 return result;
4374}
4375
4376static char split__doc__[] =
4377"S.split([sep [,maxsplit]]) -> list of strings\n\
4378\n\
4379Return a list of the words in S, using sep as the\n\
4380delimiter string. If maxsplit is given, at most maxsplit\n\
4381splits are done. If sep is not specified, any whitespace string\n\
4382is a separator.";
4383
4384static PyObject*
4385unicode_split(PyUnicodeObject *self, PyObject *args)
4386{
4387 PyObject *substring = Py_None;
4388 int maxcount = -1;
4389
4390 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4391 return NULL;
4392
4393 if (substring == Py_None)
4394 return split(self, NULL, maxcount);
4395 else if (PyUnicode_Check(substring))
4396 return split(self, (PyUnicodeObject *)substring, maxcount);
4397 else
4398 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4399}
4400
4401static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004402"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004403\n\
4404Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004405Line breaks are not included in the resulting list unless keepends\n\
4406is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004407
4408static PyObject*
4409unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4410{
Guido van Rossum86662912000-04-11 15:38:46 +00004411 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004412
Guido van Rossum86662912000-04-11 15:38:46 +00004413 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004414 return NULL;
4415
Guido van Rossum86662912000-04-11 15:38:46 +00004416 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004417}
4418
4419static
4420PyObject *unicode_str(PyUnicodeObject *self)
4421{
Fred Drakee4315f52000-05-09 19:53:39 +00004422 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423}
4424
4425static char strip__doc__[] =
4426"S.strip() -> unicode\n\
4427\n\
4428Return a copy of S with leading and trailing whitespace removed.";
4429
4430static PyObject *
4431unicode_strip(PyUnicodeObject *self, PyObject *args)
4432{
4433 if (!PyArg_NoArgs(args))
4434 return NULL;
4435 return strip(self, 1, 1);
4436}
4437
4438static char swapcase__doc__[] =
4439"S.swapcase() -> unicode\n\
4440\n\
4441Return a copy of S with uppercase characters converted to lowercase\n\
4442and vice versa.";
4443
4444static PyObject*
4445unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4446{
4447 if (!PyArg_NoArgs(args))
4448 return NULL;
4449 return fixup(self, fixswapcase);
4450}
4451
4452static char translate__doc__[] =
4453"S.translate(table) -> unicode\n\
4454\n\
4455Return a copy of the string S, where all characters have been mapped\n\
4456through the given translation table, which must be a mapping of\n\
4457Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4458are left untouched. Characters mapped to None are deleted.";
4459
4460static PyObject*
4461unicode_translate(PyUnicodeObject *self, PyObject *args)
4462{
4463 PyObject *table;
4464
4465 if (!PyArg_ParseTuple(args, "O:translate", &table))
4466 return NULL;
4467 return PyUnicode_TranslateCharmap(self->str,
4468 self->length,
4469 table,
4470 "ignore");
4471}
4472
4473static char upper__doc__[] =
4474"S.upper() -> unicode\n\
4475\n\
4476Return a copy of S converted to uppercase.";
4477
4478static PyObject*
4479unicode_upper(PyUnicodeObject *self, PyObject *args)
4480{
4481 if (!PyArg_NoArgs(args))
4482 return NULL;
4483 return fixup(self, fixupper);
4484}
4485
4486#if 0
4487static char zfill__doc__[] =
4488"S.zfill(width) -> unicode\n\
4489\n\
4490Pad a numeric string x with zeros on the left, to fill a field\n\
4491of the specified width. The string x is never truncated.";
4492
4493static PyObject *
4494unicode_zfill(PyUnicodeObject *self, PyObject *args)
4495{
4496 int fill;
4497 PyUnicodeObject *u;
4498
4499 int width;
4500 if (!PyArg_ParseTuple(args, "i:zfill", &width))
4501 return NULL;
4502
4503 if (self->length >= width) {
4504 Py_INCREF(self);
4505 return (PyObject*) self;
4506 }
4507
4508 fill = width - self->length;
4509
4510 u = pad(self, fill, 0, '0');
4511
4512 if (u->str[fill] == '+' || u->str[fill] == '-') {
4513 /* move sign to beginning of string */
4514 u->str[0] = u->str[fill];
4515 u->str[fill] = '0';
4516 }
4517
4518 return (PyObject*) u;
4519}
4520#endif
4521
4522#if 0
4523static PyObject*
4524unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
4525{
4526 if (!PyArg_NoArgs(args))
4527 return NULL;
4528 return PyInt_FromLong(unicode_freelist_size);
4529}
4530#endif
4531
4532static char startswith__doc__[] =
4533"S.startswith(prefix[, start[, end]]) -> int\n\
4534\n\
4535Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4536optional start, test S beginning at that position. With optional end, stop\n\
4537comparing S at that position.";
4538
4539static PyObject *
4540unicode_startswith(PyUnicodeObject *self,
4541 PyObject *args)
4542{
4543 PyUnicodeObject *substring;
4544 int start = 0;
4545 int end = INT_MAX;
4546 PyObject *result;
4547
Guido van Rossumb8872e62000-05-09 14:14:27 +00004548 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4549 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550 return NULL;
4551 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4552 (PyObject *)substring);
4553 if (substring == NULL)
4554 return NULL;
4555
4556 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4557
4558 Py_DECREF(substring);
4559 return result;
4560}
4561
4562
4563static char endswith__doc__[] =
4564"S.endswith(suffix[, start[, end]]) -> int\n\
4565\n\
4566Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4567optional start, test S beginning at that position. With optional end, stop\n\
4568comparing S at that position.";
4569
4570static PyObject *
4571unicode_endswith(PyUnicodeObject *self,
4572 PyObject *args)
4573{
4574 PyUnicodeObject *substring;
4575 int start = 0;
4576 int end = INT_MAX;
4577 PyObject *result;
4578
Guido van Rossumb8872e62000-05-09 14:14:27 +00004579 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4580 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004581 return NULL;
4582 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4583 (PyObject *)substring);
4584 if (substring == NULL)
4585 return NULL;
4586
4587 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4588
4589 Py_DECREF(substring);
4590 return result;
4591}
4592
4593
4594static PyMethodDef unicode_methods[] = {
4595
4596 /* Order is according to common usage: often used methods should
4597 appear first, since lookup is done sequentially. */
4598
4599 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4600 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4601 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4602 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4603 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4604 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4605 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4606 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4607 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4608 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4609 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4610 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4611 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4612 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4613/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4614 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4615 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4616 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4617 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4618 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4619 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4620 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4621 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4622 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4623 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4624 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4625 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4626 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4627 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4628 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4629 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4630 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4631 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004632 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4633 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004634#if 0
4635 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4636 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4637#endif
4638
4639#if 0
4640 /* This one is just used for debugging the implementation. */
4641 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4642#endif
4643
4644 {NULL, NULL}
4645};
4646
4647static PyObject *
4648unicode_getattr(PyUnicodeObject *self, char *name)
4649{
4650 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4651}
4652
4653static PySequenceMethods unicode_as_sequence = {
4654 (inquiry) unicode_length, /* sq_length */
4655 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4656 (intargfunc) unicode_repeat, /* sq_repeat */
4657 (intargfunc) unicode_getitem, /* sq_item */
4658 (intintargfunc) unicode_slice, /* sq_slice */
4659 0, /* sq_ass_item */
4660 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004661 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004662};
4663
4664static int
4665unicode_buffer_getreadbuf(PyUnicodeObject *self,
4666 int index,
4667 const void **ptr)
4668{
4669 if (index != 0) {
4670 PyErr_SetString(PyExc_SystemError,
4671 "accessing non-existent unicode segment");
4672 return -1;
4673 }
4674 *ptr = (void *) self->str;
4675 return PyUnicode_GET_DATA_SIZE(self);
4676}
4677
4678static int
4679unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4680 const void **ptr)
4681{
4682 PyErr_SetString(PyExc_TypeError,
4683 "cannot use unicode as modifyable buffer");
4684 return -1;
4685}
4686
4687static int
4688unicode_buffer_getsegcount(PyUnicodeObject *self,
4689 int *lenp)
4690{
4691 if (lenp)
4692 *lenp = PyUnicode_GET_DATA_SIZE(self);
4693 return 1;
4694}
4695
4696static int
4697unicode_buffer_getcharbuf(PyUnicodeObject *self,
4698 int index,
4699 const void **ptr)
4700{
4701 PyObject *str;
4702
4703 if (index != 0) {
4704 PyErr_SetString(PyExc_SystemError,
4705 "accessing non-existent unicode segment");
4706 return -1;
4707 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004708 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004709 if (str == NULL)
4710 return -1;
4711 *ptr = (void *) PyString_AS_STRING(str);
4712 return PyString_GET_SIZE(str);
4713}
4714
4715/* Helpers for PyUnicode_Format() */
4716
4717static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004718getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719{
4720 int argidx = *p_argidx;
4721 if (argidx < arglen) {
4722 (*p_argidx)++;
4723 if (arglen < 0)
4724 return args;
4725 else
4726 return PyTuple_GetItem(args, argidx);
4727 }
4728 PyErr_SetString(PyExc_TypeError,
4729 "not enough arguments for format string");
4730 return NULL;
4731}
4732
/* Bit flags collected from a format specifier's flag characters. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
4738
4739static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004740int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741{
4742 register int i;
4743 int len;
4744 va_list va;
4745 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004746 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747
4748 /* First, format the string as char array, then expand to Py_UNICODE
4749 array. */
4750 charbuffer = (char *)buffer;
4751 len = vsprintf(charbuffer, format, va);
4752 for (i = len - 1; i >= 0; i--)
4753 buffer[i] = (Py_UNICODE) charbuffer[i];
4754
4755 va_end(va);
4756 return len;
4757}
4758
4759static int
4760formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004761 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762 int flags,
4763 int prec,
4764 int type,
4765 PyObject *v)
4766{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004767 /* fmt = '%#.' + `prec` + `type`
4768 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004769 char fmt[20];
4770 double x;
4771
4772 x = PyFloat_AsDouble(v);
4773 if (x == -1.0 && PyErr_Occurred())
4774 return -1;
4775 if (prec < 0)
4776 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4778 type = 'g';
4779 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004780 /* worst case length calc to ensure no buffer overrun:
4781 fmt = %#.<prec>g
4782 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4783 for any double rep.)
4784 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4785 If prec=0 the effective precision is 1 (the leading digit is
4786 always given), therefore increase by one to 10+prec. */
4787 if (buflen <= (size_t)10 + (size_t)prec) {
4788 PyErr_SetString(PyExc_OverflowError,
4789 "formatted float is too long (precision too long?)");
4790 return -1;
4791 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004792 return usprintf(buf, fmt, x);
4793}
4794
Tim Peters38fd5b62000-09-21 05:43:11 +00004795static PyObject*
4796formatlong(PyObject *val, int flags, int prec, int type)
4797{
4798 char *buf;
4799 int i, len;
4800 PyObject *str; /* temporary string object. */
4801 PyUnicodeObject *result;
4802
4803 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4804 if (!str)
4805 return NULL;
4806 result = _PyUnicode_New(len);
4807 for (i = 0; i < len; i++)
4808 result->str[i] = buf[i];
4809 result->str[len] = 0;
4810 Py_DECREF(str);
4811 return (PyObject*)result;
4812}
4813
Guido van Rossumd57fd912000-03-10 22:53:23 +00004814static int
4815formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004816 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004817 int flags,
4818 int prec,
4819 int type,
4820 PyObject *v)
4821{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004822 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004823 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4824 + 1 + 1 = 24*/
4825 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826 long x;
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004827 int use_native_c_format = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004828
4829 x = PyInt_AsLong(v);
4830 if (x == -1 && PyErr_Occurred())
4831 return -1;
4832 if (prec < 0)
4833 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004834 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4835 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4836 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4837 PyErr_SetString(PyExc_OverflowError,
4838 "formatted integer is too long (precision too long?)");
4839 return -1;
4840 }
Tim Petersfff53252001-04-12 18:38:48 +00004841 /* When converting 0 under %#x or %#X, C leaves off the base marker,
4842 * but we want it (for consistency with other %#x conversions, and
4843 * for consistency with Python's hex() function).
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004844 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
4845 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
4846 * So add it only if the platform doesn't already.
Tim Petersfff53252001-04-12 18:38:48 +00004847 */
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004848 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
4849 /* Only way to know what the platform does is to try it. */
4850 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
4851 if (fmt[1] != (char)type) {
4852 /* Supply our own leading 0x/0X -- needed under std C */
4853 use_native_c_format = 0;
4854 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
4855 }
4856 }
4857 if (use_native_c_format)
4858 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859 return usprintf(buf, fmt, x);
4860}
4861
4862static int
4863formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004864 size_t buflen,
4865 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004867 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004868 if (PyUnicode_Check(v)) {
4869 if (PyUnicode_GET_SIZE(v) != 1)
4870 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004872 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004873
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004874 else if (PyString_Check(v)) {
4875 if (PyString_GET_SIZE(v) != 1)
4876 goto onError;
4877 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4878 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879
4880 else {
4881 /* Integer input truncated to a character */
4882 long x;
4883 x = PyInt_AsLong(v);
4884 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004885 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886 buf[0] = (char) x;
4887 }
4888 buf[1] = '\0';
4889 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004890
4891 onError:
4892 PyErr_SetString(PyExc_TypeError,
4893 "%c requires int or char");
4894 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004895}
4896
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used.
*/
#define FORMATBUFLEN ((size_t)120)
4906
Guido van Rossumd57fd912000-03-10 22:53:23 +00004907PyObject *PyUnicode_Format(PyObject *format,
4908 PyObject *args)
4909{
4910 Py_UNICODE *fmt, *res;
4911 int fmtcnt, rescnt, reslen, arglen, argidx;
4912 int args_owned = 0;
4913 PyUnicodeObject *result = NULL;
4914 PyObject *dict = NULL;
4915 PyObject *uformat;
4916
4917 if (format == NULL || args == NULL) {
4918 PyErr_BadInternalCall();
4919 return NULL;
4920 }
4921 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004922 if (uformat == NULL)
4923 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004924 fmt = PyUnicode_AS_UNICODE(uformat);
4925 fmtcnt = PyUnicode_GET_SIZE(uformat);
4926
4927 reslen = rescnt = fmtcnt + 100;
4928 result = _PyUnicode_New(reslen);
4929 if (result == NULL)
4930 goto onError;
4931 res = PyUnicode_AS_UNICODE(result);
4932
4933 if (PyTuple_Check(args)) {
4934 arglen = PyTuple_Size(args);
4935 argidx = 0;
4936 }
4937 else {
4938 arglen = -1;
4939 argidx = -2;
4940 }
4941 if (args->ob_type->tp_as_mapping)
4942 dict = args;
4943
4944 while (--fmtcnt >= 0) {
4945 if (*fmt != '%') {
4946 if (--rescnt < 0) {
4947 rescnt = fmtcnt + 100;
4948 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004949 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950 return NULL;
4951 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4952 --rescnt;
4953 }
4954 *res++ = *fmt++;
4955 }
4956 else {
4957 /* Got a format specifier */
4958 int flags = 0;
4959 int width = -1;
4960 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004961 Py_UNICODE c = '\0';
4962 Py_UNICODE fill;
4963 PyObject *v = NULL;
4964 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004965 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004966 Py_UNICODE sign;
4967 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004968 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004969
4970 fmt++;
4971 if (*fmt == '(') {
4972 Py_UNICODE *keystart;
4973 int keylen;
4974 PyObject *key;
4975 int pcount = 1;
4976
4977 if (dict == NULL) {
4978 PyErr_SetString(PyExc_TypeError,
4979 "format requires a mapping");
4980 goto onError;
4981 }
4982 ++fmt;
4983 --fmtcnt;
4984 keystart = fmt;
4985 /* Skip over balanced parentheses */
4986 while (pcount > 0 && --fmtcnt >= 0) {
4987 if (*fmt == ')')
4988 --pcount;
4989 else if (*fmt == '(')
4990 ++pcount;
4991 fmt++;
4992 }
4993 keylen = fmt - keystart - 1;
4994 if (fmtcnt < 0 || pcount > 0) {
4995 PyErr_SetString(PyExc_ValueError,
4996 "incomplete format key");
4997 goto onError;
4998 }
Fred Drakee4315f52000-05-09 19:53:39 +00004999 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000 then looked up since Python uses strings to hold
5001 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00005002 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00005003 key = PyUnicode_EncodeUTF8(keystart,
5004 keylen,
5005 NULL);
5006 if (key == NULL)
5007 goto onError;
5008 if (args_owned) {
5009 Py_DECREF(args);
5010 args_owned = 0;
5011 }
5012 args = PyObject_GetItem(dict, key);
5013 Py_DECREF(key);
5014 if (args == NULL) {
5015 goto onError;
5016 }
5017 args_owned = 1;
5018 arglen = -1;
5019 argidx = -2;
5020 }
5021 while (--fmtcnt >= 0) {
5022 switch (c = *fmt++) {
5023 case '-': flags |= F_LJUST; continue;
5024 case '+': flags |= F_SIGN; continue;
5025 case ' ': flags |= F_BLANK; continue;
5026 case '#': flags |= F_ALT; continue;
5027 case '0': flags |= F_ZERO; continue;
5028 }
5029 break;
5030 }
5031 if (c == '*') {
5032 v = getnextarg(args, arglen, &argidx);
5033 if (v == NULL)
5034 goto onError;
5035 if (!PyInt_Check(v)) {
5036 PyErr_SetString(PyExc_TypeError,
5037 "* wants int");
5038 goto onError;
5039 }
5040 width = PyInt_AsLong(v);
5041 if (width < 0) {
5042 flags |= F_LJUST;
5043 width = -width;
5044 }
5045 if (--fmtcnt >= 0)
5046 c = *fmt++;
5047 }
5048 else if (c >= '0' && c <= '9') {
5049 width = c - '0';
5050 while (--fmtcnt >= 0) {
5051 c = *fmt++;
5052 if (c < '0' || c > '9')
5053 break;
5054 if ((width*10) / 10 != width) {
5055 PyErr_SetString(PyExc_ValueError,
5056 "width too big");
5057 goto onError;
5058 }
5059 width = width*10 + (c - '0');
5060 }
5061 }
5062 if (c == '.') {
5063 prec = 0;
5064 if (--fmtcnt >= 0)
5065 c = *fmt++;
5066 if (c == '*') {
5067 v = getnextarg(args, arglen, &argidx);
5068 if (v == NULL)
5069 goto onError;
5070 if (!PyInt_Check(v)) {
5071 PyErr_SetString(PyExc_TypeError,
5072 "* wants int");
5073 goto onError;
5074 }
5075 prec = PyInt_AsLong(v);
5076 if (prec < 0)
5077 prec = 0;
5078 if (--fmtcnt >= 0)
5079 c = *fmt++;
5080 }
5081 else if (c >= '0' && c <= '9') {
5082 prec = c - '0';
5083 while (--fmtcnt >= 0) {
5084 c = Py_CHARMASK(*fmt++);
5085 if (c < '0' || c > '9')
5086 break;
5087 if ((prec*10) / 10 != prec) {
5088 PyErr_SetString(PyExc_ValueError,
5089 "prec too big");
5090 goto onError;
5091 }
5092 prec = prec*10 + (c - '0');
5093 }
5094 }
5095 } /* prec */
5096 if (fmtcnt >= 0) {
5097 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005098 if (--fmtcnt >= 0)
5099 c = *fmt++;
5100 }
5101 }
5102 if (fmtcnt < 0) {
5103 PyErr_SetString(PyExc_ValueError,
5104 "incomplete format");
5105 goto onError;
5106 }
5107 if (c != '%') {
5108 v = getnextarg(args, arglen, &argidx);
5109 if (v == NULL)
5110 goto onError;
5111 }
5112 sign = 0;
5113 fill = ' ';
5114 switch (c) {
5115
5116 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005117 pbuf = formatbuf;
5118 /* presume that buffer length is at least 1 */
5119 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120 len = 1;
5121 break;
5122
5123 case 's':
5124 case 'r':
5125 if (PyUnicode_Check(v) && c == 's') {
5126 temp = v;
5127 Py_INCREF(temp);
5128 }
5129 else {
5130 PyObject *unicode;
5131 if (c == 's')
5132 temp = PyObject_Str(v);
5133 else
5134 temp = PyObject_Repr(v);
5135 if (temp == NULL)
5136 goto onError;
5137 if (!PyString_Check(temp)) {
5138 /* XXX Note: this should never happen, since
5139 PyObject_Repr() and PyObject_Str() assure
5140 this */
5141 Py_DECREF(temp);
5142 PyErr_SetString(PyExc_TypeError,
5143 "%s argument has non-string str()");
5144 goto onError;
5145 }
Fred Drakee4315f52000-05-09 19:53:39 +00005146 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005148 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005149 "strict");
5150 Py_DECREF(temp);
5151 temp = unicode;
5152 if (temp == NULL)
5153 goto onError;
5154 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005155 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005156 len = PyUnicode_GET_SIZE(temp);
5157 if (prec >= 0 && len > prec)
5158 len = prec;
5159 break;
5160
5161 case 'i':
5162 case 'd':
5163 case 'u':
5164 case 'o':
5165 case 'x':
5166 case 'X':
5167 if (c == 'i')
5168 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005169 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005170 temp = formatlong(v, flags, prec, c);
5171 if (!temp)
5172 goto onError;
5173 pbuf = PyUnicode_AS_UNICODE(temp);
5174 len = PyUnicode_GET_SIZE(temp);
5175 /* unbounded ints can always produce
5176 a sign character! */
5177 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005178 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005179 else {
5180 pbuf = formatbuf;
5181 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5182 flags, prec, c, v);
5183 if (len < 0)
5184 goto onError;
5185 /* only d conversion is signed */
5186 sign = c == 'd';
5187 }
5188 if (flags & F_ZERO)
5189 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005190 break;
5191
5192 case 'e':
5193 case 'E':
5194 case 'f':
5195 case 'g':
5196 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005197 pbuf = formatbuf;
5198 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5199 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005200 if (len < 0)
5201 goto onError;
5202 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005203 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005204 fill = '0';
5205 break;
5206
5207 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005208 pbuf = formatbuf;
5209 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210 if (len < 0)
5211 goto onError;
5212 break;
5213
5214 default:
5215 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005216 "unsupported format character '%c' (0x%x) "
5217 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005218 (31<=c && c<=126) ? c : '?',
5219 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220 goto onError;
5221 }
5222 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005223 if (*pbuf == '-' || *pbuf == '+') {
5224 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225 len--;
5226 }
5227 else if (flags & F_SIGN)
5228 sign = '+';
5229 else if (flags & F_BLANK)
5230 sign = ' ';
5231 else
5232 sign = 0;
5233 }
5234 if (width < len)
5235 width = len;
5236 if (rescnt < width + (sign != 0)) {
5237 reslen -= rescnt;
5238 rescnt = width + fmtcnt + 100;
5239 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005240 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241 return NULL;
5242 res = PyUnicode_AS_UNICODE(result)
5243 + reslen - rescnt;
5244 }
5245 if (sign) {
5246 if (fill != ' ')
5247 *res++ = sign;
5248 rescnt--;
5249 if (width > len)
5250 width--;
5251 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005252 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5253 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005254 assert(pbuf[1] == c);
5255 if (fill != ' ') {
5256 *res++ = *pbuf++;
5257 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005258 }
Tim Petersfff53252001-04-12 18:38:48 +00005259 rescnt -= 2;
5260 width -= 2;
5261 if (width < 0)
5262 width = 0;
5263 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005264 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005265 if (width > len && !(flags & F_LJUST)) {
5266 do {
5267 --rescnt;
5268 *res++ = fill;
5269 } while (--width > len);
5270 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005271 if (fill == ' ') {
5272 if (sign)
5273 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005274 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005275 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005276 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005277 *res++ = *pbuf++;
5278 *res++ = *pbuf++;
5279 }
5280 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005281 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282 res += len;
5283 rescnt -= len;
5284 while (--width >= len) {
5285 --rescnt;
5286 *res++ = ' ';
5287 }
5288 if (dict && (argidx < arglen) && c != '%') {
5289 PyErr_SetString(PyExc_TypeError,
5290 "not all arguments converted");
5291 goto onError;
5292 }
5293 Py_XDECREF(temp);
5294 } /* '%' */
5295 } /* until end */
5296 if (argidx < arglen && !dict) {
5297 PyErr_SetString(PyExc_TypeError,
5298 "not all arguments converted");
5299 goto onError;
5300 }
5301
5302 if (args_owned) {
5303 Py_DECREF(args);
5304 }
5305 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005306 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005307 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005308 return (PyObject *)result;
5309
5310 onError:
5311 Py_XDECREF(result);
5312 Py_DECREF(uformat);
5313 if (args_owned) {
5314 Py_DECREF(args);
5315 }
5316 return NULL;
5317}
5318
5319static PyBufferProcs unicode_as_buffer = {
5320 (getreadbufferproc) unicode_buffer_getreadbuf,
5321 (getwritebufferproc) unicode_buffer_getwritebuf,
5322 (getsegcountproc) unicode_buffer_getsegcount,
5323 (getcharbufferproc) unicode_buffer_getcharbuf,
5324};
5325
5326PyTypeObject PyUnicode_Type = {
5327 PyObject_HEAD_INIT(&PyType_Type)
5328 0, /* ob_size */
5329 "unicode", /* tp_name */
5330 sizeof(PyUnicodeObject), /* tp_size */
5331 0, /* tp_itemsize */
5332 /* Slots */
5333 (destructor)_PyUnicode_Free, /* tp_dealloc */
5334 0, /* tp_print */
5335 (getattrfunc)unicode_getattr, /* tp_getattr */
5336 0, /* tp_setattr */
5337 (cmpfunc) unicode_compare, /* tp_compare */
5338 (reprfunc) unicode_repr, /* tp_repr */
5339 0, /* tp_as_number */
5340 &unicode_as_sequence, /* tp_as_sequence */
5341 0, /* tp_as_mapping */
5342 (hashfunc) unicode_hash, /* tp_hash*/
5343 0, /* tp_call*/
5344 (reprfunc) unicode_str, /* tp_str */
5345 (getattrofunc) NULL, /* tp_getattro */
5346 (setattrofunc) NULL, /* tp_setattro */
5347 &unicode_as_buffer, /* tp_as_buffer */
5348 Py_TPFLAGS_DEFAULT, /* tp_flags */
5349};
5350
5351/* Initialize the Unicode implementation */
5352
Thomas Wouters78890102000-07-22 19:25:51 +00005353void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005354{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005355 int i;
5356
Fred Drakee4315f52000-05-09 19:53:39 +00005357 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005358 unicode_freelist = NULL;
5359 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005360 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005361 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005362 for (i = 0; i < 256; i++)
5363 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005364}
5365
5366/* Finalize the Unicode implementation */
5367
5368void
Thomas Wouters78890102000-07-22 19:25:51 +00005369_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005371 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005372 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005373
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005374 Py_XDECREF(unicode_empty);
5375 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005376
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005377 for (i = 0; i < 256; i++) {
5378 if (unicode_latin1[i]) {
5379 Py_DECREF(unicode_latin1[i]);
5380 unicode_latin1[i] = NULL;
5381 }
5382 }
5383
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005384 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005385 PyUnicodeObject *v = u;
5386 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005387 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005388 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005389 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005390 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005391 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005392 unicode_freelist = NULL;
5393 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005394}