blob: 475215c25f209c9b337c1650ea00596064397bea [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Maximum number of released Unicode objects kept on the free list */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for the buffers of free-listed Unicode objects.

   Objects that go onto the free list with a length below this limit
   keep their character buffer allocated, which saves malloc() calls
   for small Unicode objects.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory around.

   A limit of 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is set */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* --- Unicode Object ----------------------------------------------------- */
107
108static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000110 int length)
111{
112 void *oldstr;
113
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000114 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000115 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000116 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000117
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000118 /* Resizing shared object (unicode_empty or single character
119 objects) in-place is not allowed. Use PyUnicode_Resize()
120 instead ! */
121 if (unicode == unicode_empty ||
122 (unicode->length == 1 &&
123 unicode->str[0] < 256 &&
124 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000125 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000126 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 return -1;
128 }
129
130 /* We allocate one more byte to make sure the string is
131 Ux0000 terminated -- XXX is this needed ? */
132 oldstr = unicode->str;
133 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
134 if (!unicode->str) {
135 unicode->str = oldstr;
136 PyErr_NoMemory();
137 return -1;
138 }
139 unicode->str[length] = 0;
140 unicode->length = length;
141
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000142 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000143 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000144 if (unicode->defenc) {
145 Py_DECREF(unicode->defenc);
146 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000147 }
148 unicode->hash = -1;
149
150 return 0;
151}
152
153/* We allocate one more byte to make sure the string is
154 Ux0000 terminated -- XXX is this needed ?
155
156 XXX This allocator could further be enhanced by assuring that the
157 free list never reduces its size below 1.
158
159*/
160
161static
162PyUnicodeObject *_PyUnicode_New(int length)
163{
164 register PyUnicodeObject *unicode;
165
166 /* Optimization for empty strings */
167 if (length == 0 && unicode_empty != NULL) {
168 Py_INCREF(unicode_empty);
169 return unicode_empty;
170 }
171
172 /* Unicode freelist & memory allocation */
173 if (unicode_freelist) {
174 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000175 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000176 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 /* Keep-Alive optimization: we only upsize the buffer,
179 never downsize it. */
180 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000182 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000183 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 }
185 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000186 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000188 }
189 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 }
191 else {
192 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
193 if (unicode == NULL)
194 return NULL;
195 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
196 }
197
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000198 if (!unicode->str) {
199 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000200 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 unicode->str[length] = 0;
203 unicode->length = length;
204 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000205 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000207
208 onError:
209 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000210 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212}
213
214static
215void _PyUnicode_Free(register PyUnicodeObject *unicode)
216{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 /* Keep-Alive optimization */
219 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000220 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221 unicode->str = NULL;
222 unicode->length = 0;
223 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000224 if (unicode->defenc) {
225 Py_DECREF(unicode->defenc);
226 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000227 }
228 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 *(PyUnicodeObject **)unicode = unicode_freelist;
230 unicode_freelist = unicode;
231 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000232 }
233 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000234 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000235 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000236 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 }
238}
239
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000240int PyUnicode_Resize(PyObject **unicode,
241 int length)
242{
243 register PyUnicodeObject *v;
244
245 /* Argument checks */
246 if (unicode == NULL) {
247 PyErr_BadInternalCall();
248 return -1;
249 }
250 v = (PyUnicodeObject *)*unicode;
251 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
252 PyErr_BadInternalCall();
253 return -1;
254 }
255
256 /* Resizing unicode_empty and single character objects is not
257 possible since these are being shared. We simply return a fresh
258 copy with the same Unicode content. */
259 if (v->length != length &&
260 (v == unicode_empty || v->length == 1)) {
261 PyUnicodeObject *w = _PyUnicode_New(length);
262 if (w == NULL)
263 return -1;
264 Py_UNICODE_COPY(w->str, v->str,
265 length < v->length ? length : v->length);
266 *unicode = (PyObject *)w;
267 return 0;
268 }
269
270 /* Note that we don't have to modify *unicode for unshared Unicode
271 objects, since we can modify them in-place. */
272 return unicode_resize(v, length);
273}
274
/* Internal API for use in unicodeobject.c only !

   Forwards to PyUnicode_Resize(), casting the address of the caller's
   object variable to PyObject **. */
#define _PyUnicode_Resize(objvar, newlength) \
        PyUnicode_Resize(((PyObject **)(objvar)), newlength)
278
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
280 int size)
281{
282 PyUnicodeObject *unicode;
283
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000284 /* If the Unicode data is known at construction time, we can apply
285 some optimizations which share commonly used objects. */
286 if (u != NULL) {
287
288 /* Optimization for empty strings */
289 if (size == 0 && unicode_empty != NULL) {
290 Py_INCREF(unicode_empty);
291 return (PyObject *)unicode_empty;
292 }
293
294 /* Single character Unicode objects in the Latin-1 range are
295 shared when using this constructor */
296 if (size == 1 && *u < 256) {
297 unicode = unicode_latin1[*u];
298 if (!unicode) {
299 unicode = _PyUnicode_New(1);
300 unicode->str[0] = *u;
301 if (!unicode)
302 return NULL;
303 unicode_latin1[*u] = unicode;
304 }
305 Py_INCREF(unicode);
306 return (PyObject *)unicode;
307 }
308 }
309
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310 unicode = _PyUnicode_New(size);
311 if (!unicode)
312 return NULL;
313
314 /* Copy the Unicode data into the new object */
315 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000317
318 return (PyObject *)unicode;
319}
320
321#ifdef HAVE_WCHAR_H
322
323PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
324 int size)
325{
326 PyUnicodeObject *unicode;
327
328 if (w == NULL) {
329 PyErr_BadInternalCall();
330 return NULL;
331 }
332
333 unicode = _PyUnicode_New(size);
334 if (!unicode)
335 return NULL;
336
337 /* Copy the wchar_t data into the new object */
338#ifdef HAVE_USABLE_WCHAR_T
339 memcpy(unicode->str, w, size * sizeof(wchar_t));
340#else
341 {
342 register Py_UNICODE *u;
343 register int i;
344 u = PyUnicode_AS_UNICODE(unicode);
345 for (i = size; i >= 0; i--)
346 *u++ = *w++;
347 }
348#endif
349
350 return (PyObject *)unicode;
351}
352
353int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
354 register wchar_t *w,
355 int size)
356{
357 if (unicode == NULL) {
358 PyErr_BadInternalCall();
359 return -1;
360 }
361 if (size > PyUnicode_GET_SIZE(unicode))
362 size = PyUnicode_GET_SIZE(unicode);
363#ifdef HAVE_USABLE_WCHAR_T
364 memcpy(w, unicode->str, size * sizeof(wchar_t));
365#else
366 {
367 register Py_UNICODE *u;
368 register int i;
369 u = PyUnicode_AS_UNICODE(unicode);
370 for (i = size; i >= 0; i--)
371 *w++ = *u++;
372 }
373#endif
374
375 return size;
376}
377
378#endif
379
380PyObject *PyUnicode_FromObject(register PyObject *obj)
381{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000382 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
383}
384
385PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
386 const char *encoding,
387 const char *errors)
388{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 const char *s;
390 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000391 int owned = 0;
392 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393
394 if (obj == NULL) {
395 PyErr_BadInternalCall();
396 return NULL;
397 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000398
399 /* Coerce object */
400 if (PyInstance_Check(obj)) {
401 PyObject *func;
402 func = PyObject_GetAttrString(obj, "__str__");
403 if (func == NULL) {
404 PyErr_SetString(PyExc_TypeError,
405 "coercing to Unicode: instance doesn't define __str__");
406 return NULL;
407 }
408 obj = PyEval_CallObject(func, NULL);
409 Py_DECREF(func);
410 if (obj == NULL)
411 return NULL;
412 owned = 1;
413 }
414 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000416 v = obj;
417 if (encoding) {
418 PyErr_SetString(PyExc_TypeError,
419 "decoding Unicode is not supported");
420 return NULL;
421 }
422 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424 else if (PyString_Check(obj)) {
425 s = PyString_AS_STRING(obj);
426 len = PyString_GET_SIZE(obj);
427 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000428 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
429 /* Overwrite the error message with something more useful in
430 case of a TypeError. */
431 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000432 PyErr_Format(PyExc_TypeError,
433 "coercing to Unicode: need string or buffer, "
434 "%.80s found",
435 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000436 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000437 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000438
439 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440 if (len == 0) {
441 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000442 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000443 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000444 else
445 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000446
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000448 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000449 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000450 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000451 return v;
452
453 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000454 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000455 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000456 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000457 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000458}
459
460PyObject *PyUnicode_Decode(const char *s,
461 int size,
462 const char *encoding,
463 const char *errors)
464{
465 PyObject *buffer = NULL, *unicode;
466
Fred Drakee4315f52000-05-09 19:53:39 +0000467 if (encoding == NULL)
468 encoding = PyUnicode_GetDefaultEncoding();
469
470 /* Shortcuts for common default encodings */
471 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000473 else if (strcmp(encoding, "latin-1") == 0)
474 return PyUnicode_DecodeLatin1(s, size, errors);
475 else if (strcmp(encoding, "ascii") == 0)
476 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000477
478 /* Decode via the codec registry */
479 buffer = PyBuffer_FromMemory((void *)s, size);
480 if (buffer == NULL)
481 goto onError;
482 unicode = PyCodec_Decode(buffer, encoding, errors);
483 if (unicode == NULL)
484 goto onError;
485 if (!PyUnicode_Check(unicode)) {
486 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000487 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000488 unicode->ob_type->tp_name);
489 Py_DECREF(unicode);
490 goto onError;
491 }
492 Py_DECREF(buffer);
493 return unicode;
494
495 onError:
496 Py_XDECREF(buffer);
497 return NULL;
498}
499
500PyObject *PyUnicode_Encode(const Py_UNICODE *s,
501 int size,
502 const char *encoding,
503 const char *errors)
504{
505 PyObject *v, *unicode;
506
507 unicode = PyUnicode_FromUnicode(s, size);
508 if (unicode == NULL)
509 return NULL;
510 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
511 Py_DECREF(unicode);
512 return v;
513}
514
515PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
516 const char *encoding,
517 const char *errors)
518{
519 PyObject *v;
520
521 if (!PyUnicode_Check(unicode)) {
522 PyErr_BadArgument();
523 goto onError;
524 }
Fred Drakee4315f52000-05-09 19:53:39 +0000525
526 if (encoding == NULL)
527 encoding = PyUnicode_GetDefaultEncoding();
528
529 /* Shortcuts for common default encodings */
530 if (errors == NULL) {
531 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000532 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000533 else if (strcmp(encoding, "latin-1") == 0)
534 return PyUnicode_AsLatin1String(unicode);
535 else if (strcmp(encoding, "ascii") == 0)
536 return PyUnicode_AsASCIIString(unicode);
537 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000538
539 /* Encode via the codec registry */
540 v = PyCodec_Encode(unicode, encoding, errors);
541 if (v == NULL)
542 goto onError;
543 /* XXX Should we really enforce this ? */
544 if (!PyString_Check(v)) {
545 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000546 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000547 v->ob_type->tp_name);
548 Py_DECREF(v);
549 goto onError;
550 }
551 return v;
552
553 onError:
554 return NULL;
555}
556
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000557/* Return a Python string holding the default encoded value of the
558 Unicode object.
559
560 The resulting string is cached in the Unicode object for subsequent
561 usage by this function. The cached version is needed to implement
562 the character buffer interface and will live (at least) as long as
563 the Unicode object itself.
564
565 The refcount of the string is *not* incremented.
566
567 *** Exported for internal use by the interpreter only !!! ***
568
569*/
570
571PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
572 const char *errors)
573{
574 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
575
576 if (v)
577 return v;
578 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
579 if (v && errors == NULL)
580 ((PyUnicodeObject *)unicode)->defenc = v;
581 return v;
582}
583
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
585{
586 if (!PyUnicode_Check(unicode)) {
587 PyErr_BadArgument();
588 goto onError;
589 }
590 return PyUnicode_AS_UNICODE(unicode);
591
592 onError:
593 return NULL;
594}
595
596int PyUnicode_GetSize(PyObject *unicode)
597{
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
601 }
602 return PyUnicode_GET_SIZE(unicode);
603
604 onError:
605 return -1;
606}
607
Thomas Wouters78890102000-07-22 19:25:51 +0000608const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000609{
610 return unicode_default_encoding;
611}
612
613int PyUnicode_SetDefaultEncoding(const char *encoding)
614{
615 PyObject *v;
616
617 /* Make sure the encoding is valid. As side effect, this also
618 loads the encoding into the codec registry cache. */
619 v = _PyCodec_Lookup(encoding);
620 if (v == NULL)
621 goto onError;
622 Py_DECREF(v);
623 strncpy(unicode_default_encoding,
624 encoding,
625 sizeof(unicode_default_encoding));
626 return 0;
627
628 onError:
629 return -1;
630}
631
Guido van Rossumd57fd912000-03-10 22:53:23 +0000632/* --- UTF-8 Codec -------------------------------------------------------- */
633
/* Map a UTF-8 encoded prefix (first) byte to the length of the
   sequence it starts.  Zero means illegal prefix.  See RFC 2279 for
   details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not valid as first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFD: four, five and six byte sequences;
       0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
655
656static
657int utf8_decoding_error(const char **source,
658 Py_UNICODE **dest,
659 const char *errors,
660 const char *details)
661{
662 if ((errors == NULL) ||
663 (strcmp(errors,"strict") == 0)) {
664 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000665 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000666 details);
667 return -1;
668 }
669 else if (strcmp(errors,"ignore") == 0) {
670 (*source)++;
671 return 0;
672 }
673 else if (strcmp(errors,"replace") == 0) {
674 (*source)++;
675 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
676 (*dest)++;
677 return 0;
678 }
679 else {
680 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000681 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000682 errors);
683 return -1;
684 }
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687PyObject *PyUnicode_DecodeUTF8(const char *s,
688 int size,
689 const char *errors)
690{
691 int n;
692 const char *e;
693 PyUnicodeObject *unicode;
694 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000695 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000696
697 /* Note: size will always be longer than the resulting Unicode
698 character count */
699 unicode = _PyUnicode_New(size);
700 if (!unicode)
701 return NULL;
702 if (size == 0)
703 return (PyObject *)unicode;
704
705 /* Unpack UTF-8 encoded data */
706 p = unicode->str;
707 e = s + size;
708
709 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000710 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000711
712 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000713 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 s++;
715 continue;
716 }
717
718 n = utf8_code_length[ch];
719
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000720 if (s + n > e) {
721 errmsg = "unexpected end of data";
722 goto utf8Error;
723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000724
725 switch (n) {
726
727 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000728 errmsg = "unexpected code byte";
729 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000730
731 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000732 errmsg = "internal error";
733 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000734
735 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000736 if ((s[1] & 0xc0) != 0x80) {
737 errmsg = "invalid data";
738 goto utf8Error;
739 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000741 if (ch < 0x80) {
742 errmsg = "illegal encoding";
743 goto utf8Error;
744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000745 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000746 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000747 break;
748
749 case 3:
750 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000751 (s[2] & 0xc0) != 0x80) {
752 errmsg = "invalid data";
753 goto utf8Error;
754 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000755 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000756 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
757 errmsg = "illegal encoding";
758 goto utf8Error;
759 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000760 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000761 *p++ = (Py_UNICODE)ch;
762 break;
763
764 case 4:
765 if ((s[1] & 0xc0) != 0x80 ||
766 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000767 (s[3] & 0xc0) != 0x80) {
768 errmsg = "invalid data";
769 goto utf8Error;
770 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000771 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
772 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
773 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000774 if ((ch < 0x10000) || /* minimum value allowed for 4
775 byte encoding */
776 (ch > 0x10ffff)) { /* maximum value allowed for
777 UTF-16 */
778 errmsg = "illegal encoding";
779 goto utf8Error;
780 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000781 /* compute and append the two surrogates: */
782
783 /* translate from 10000..10FFFF to 0..FFFF */
784 ch -= 0x10000;
785
786 /* high surrogate = top 10 bits added to D800 */
787 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
788
789 /* low surrogate = bottom 10 bits added to DC00 */
790 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000791 break;
792
793 default:
794 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000795 errmsg = "unsupported Unicode code range";
796 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000797 }
798 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000799 continue;
800
801 utf8Error:
802 if (utf8_decoding_error(&s, &p, errors, errmsg))
803 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000804 }
805
806 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000807 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +0000808 goto onError;
809
810 return (PyObject *)unicode;
811
812onError:
813 Py_DECREF(unicode);
814 return NULL;
815}
816
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the error handling scheme `errors` to a UTF-8 encoding
   problem described by `details`.

   NULL or "strict": raise UnicodeError.
   "ignore":         do nothing.
   "replace":        emit '?' to *dest.
   anything else:    raise ValueError.

   Returns 0 if encoding can continue and -1 if an exception was set. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000850
851PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
852 int size,
853 const char *errors)
854{
855 PyObject *v;
856 char *p;
857 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000858 Py_UCS4 ch2;
859 unsigned int cbAllocated = 3 * size;
860 unsigned int cbWritten = 0;
861 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000862
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000863 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000864 if (v == NULL)
865 return NULL;
866 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000867 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000868
869 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000870 while (i < size) {
871 Py_UCS4 ch = s[i++];
872 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000873 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000874 cbWritten++;
875 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000876 else if (ch < 0x0800) {
877 *p++ = 0xc0 | (ch >> 6);
878 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000879 cbWritten += 2;
880 }
881 else {
882 /* Check for high surrogate */
883 if (0xD800 <= ch && ch <= 0xDBFF) {
884 if (i != size) {
885 ch2 = s[i];
886 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
887
888 if (cbWritten >= (cbAllocated - 4)) {
889 /* Provide enough room for some more
890 surrogates */
891 cbAllocated += 4*10;
892 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000893 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000894 }
895
896 /* combine the two values */
897 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
898
899 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000900 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000901 i++;
902 cbWritten += 4;
903 }
904 }
905 }
906 else {
907 *p++ = (char)(0xe0 | (ch >> 12));
908 cbWritten += 3;
909 }
910 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
911 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000912 }
913 }
914 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000915 if (_PyString_Resize(&v, p - q))
916 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000917 return v;
918
919 onError:
920 Py_DECREF(v);
921 return NULL;
922}
923
Guido van Rossumd57fd912000-03-10 22:53:23 +0000924PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
925{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000926 if (!PyUnicode_Check(unicode)) {
927 PyErr_BadArgument();
928 return NULL;
929 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000930 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
931 PyUnicode_GET_SIZE(unicode),
932 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000933}
934
935/* --- UTF-16 Codec ------------------------------------------------------- */
936
937static
938int utf16_decoding_error(const Py_UNICODE **source,
939 Py_UNICODE **dest,
940 const char *errors,
941 const char *details)
942{
943 if ((errors == NULL) ||
944 (strcmp(errors,"strict") == 0)) {
945 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000946 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947 details);
948 return -1;
949 }
950 else if (strcmp(errors,"ignore") == 0) {
951 return 0;
952 }
953 else if (strcmp(errors,"replace") == 0) {
954 if (dest) {
955 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
956 (*dest)++;
957 }
958 return 0;
959 }
960 else {
961 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000962 "UTF-16 decoding error; "
963 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000964 errors);
965 return -1;
966 }
967}
968
Guido van Rossumd57fd912000-03-10 22:53:23 +0000969PyObject *PyUnicode_DecodeUTF16(const char *s,
970 int size,
971 const char *errors,
972 int *byteorder)
973{
974 PyUnicodeObject *unicode;
975 Py_UNICODE *p;
976 const Py_UNICODE *q, *e;
977 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000978 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979
980 /* size should be an even number */
981 if (size % sizeof(Py_UNICODE) != 0) {
982 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
983 return NULL;
984 /* The remaining input chars are ignored if we fall through
985 here... */
986 }
987
988 /* Note: size will always be longer than the resulting Unicode
989 character count */
990 unicode = _PyUnicode_New(size);
991 if (!unicode)
992 return NULL;
993 if (size == 0)
994 return (PyObject *)unicode;
995
996 /* Unpack UTF-16 encoded data */
997 p = unicode->str;
998 q = (Py_UNICODE *)s;
999 e = q + (size / sizeof(Py_UNICODE));
1000
1001 if (byteorder)
1002 bo = *byteorder;
1003
1004 while (q < e) {
1005 register Py_UNICODE ch = *q++;
1006
1007 /* Check for BOM marks (U+FEFF) in the input and adjust
1008 current byte order setting accordingly. Swap input
1009 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
1010 !) */
1011#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1012 if (ch == 0xFEFF) {
1013 bo = -1;
1014 continue;
1015 } else if (ch == 0xFFFE) {
1016 bo = 1;
1017 continue;
1018 }
1019 if (bo == 1)
1020 ch = (ch >> 8) | (ch << 8);
1021#else
1022 if (ch == 0xFEFF) {
1023 bo = 1;
1024 continue;
1025 } else if (ch == 0xFFFE) {
1026 bo = -1;
1027 continue;
1028 }
1029 if (bo == -1)
1030 ch = (ch >> 8) | (ch << 8);
1031#endif
1032 if (ch < 0xD800 || ch > 0xDFFF) {
1033 *p++ = ch;
1034 continue;
1035 }
1036
1037 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001038 if (q >= e) {
1039 errmsg = "unexpected end of data";
1040 goto utf16Error;
1041 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042 if (0xDC00 <= *q && *q <= 0xDFFF) {
1043 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001044 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045 /* This is valid data (a UTF-16 surrogate pair), but
1046 we are not able to store this information since our
1047 Py_UNICODE type only has 16 bits... this might
1048 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001049 errmsg = "code pairs are not supported";
1050 goto utf16Error;
1051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001052 else
1053 continue;
1054 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001055 errmsg = "illegal encoding";
1056 /* Fall through to report the error */
1057
1058 utf16Error:
1059 if (utf16_decoding_error(&q, &p, errors, errmsg))
1060 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001061 }
1062
1063 if (byteorder)
1064 *byteorder = bo;
1065
1066 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001067 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068 goto onError;
1069
1070 return (PyObject *)unicode;
1071
1072onError:
1073 Py_DECREF(unicode);
1074 return NULL;
1075}
1076
1077#undef UTF16_ERROR
1078
1079PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1080 int size,
1081 const char *errors,
1082 int byteorder)
1083{
1084 PyObject *v;
1085 Py_UNICODE *p;
1086 char *q;
1087
1088 /* We don't create UTF-16 pairs... */
1089 v = PyString_FromStringAndSize(NULL,
1090 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1091 if (v == NULL)
1092 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001093
1094 q = PyString_AS_STRING(v);
1095 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 if (byteorder == 0)
1097 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001098 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001099 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100 if (byteorder == 0 ||
1101#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1102 byteorder == -1
1103#else
1104 byteorder == 1
1105#endif
1106 )
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001107 Py_UNICODE_COPY(p, s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108 else
1109 while (size-- > 0) {
1110 Py_UNICODE ch = *s++;
1111 *p++ = (ch >> 8) | (ch << 8);
1112 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 return v;
1114}
1115
1116PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1117{
1118 if (!PyUnicode_Check(unicode)) {
1119 PyErr_BadArgument();
1120 return NULL;
1121 }
1122 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1123 PyUnicode_GET_SIZE(unicode),
1124 NULL,
1125 0);
1126}
1127
1128/* --- Unicode Escape Codec ----------------------------------------------- */
1129
1130static
1131int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001132 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001133 const char *errors,
1134 const char *details)
1135{
1136 if ((errors == NULL) ||
1137 (strcmp(errors,"strict") == 0)) {
1138 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001139 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140 details);
1141 return -1;
1142 }
1143 else if (strcmp(errors,"ignore") == 0) {
1144 return 0;
1145 }
1146 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001147 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148 return 0;
1149 }
1150 else {
1151 PyErr_Format(PyExc_ValueError,
1152 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001153 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001154 errors);
1155 return -1;
1156 }
1157}
1158
Fredrik Lundh06d12682001-01-24 07:59:11 +00001159static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001160
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1162 int size,
1163 const char *errors)
1164{
1165 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001166 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001168 char* message;
1169 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1170
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171 /* Escaped strings will always be longer than the resulting
1172 Unicode string, so we start with size here and then reduce the
1173 length after conversion to the true value. */
1174 v = _PyUnicode_New(size);
1175 if (v == NULL)
1176 goto onError;
1177 if (size == 0)
1178 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001179
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180 p = buf = PyUnicode_AS_UNICODE(v);
1181 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001182
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183 while (s < end) {
1184 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001185 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001186 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187
1188 /* Non-escape characters are interpreted as Unicode ordinals */
1189 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001190 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191 continue;
1192 }
1193
1194 /* \ - Escapes */
1195 s++;
1196 switch (*s++) {
1197
1198 /* \x escapes */
1199 case '\n': break;
1200 case '\\': *p++ = '\\'; break;
1201 case '\'': *p++ = '\''; break;
1202 case '\"': *p++ = '\"'; break;
1203 case 'b': *p++ = '\b'; break;
1204 case 'f': *p++ = '\014'; break; /* FF */
1205 case 't': *p++ = '\t'; break;
1206 case 'n': *p++ = '\n'; break;
1207 case 'r': *p++ = '\r'; break;
1208 case 'v': *p++ = '\013'; break; /* VT */
1209 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1210
1211 /* \OOO (octal) escapes */
1212 case '0': case '1': case '2': case '3':
1213 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001214 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001216 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001218 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001220 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221 break;
1222
Fredrik Lundhccc74732001-02-18 22:13:49 +00001223 /* hex escapes */
1224 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001226 digits = 2;
1227 message = "truncated \\xXX escape";
1228 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229
Fredrik Lundhccc74732001-02-18 22:13:49 +00001230 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001232 digits = 4;
1233 message = "truncated \\uXXXX escape";
1234 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235
Fredrik Lundhccc74732001-02-18 22:13:49 +00001236 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001237 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001238 digits = 8;
1239 message = "truncated \\UXXXXXXXX escape";
1240 hexescape:
1241 chr = 0;
1242 for (i = 0; i < digits; i++) {
1243 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001244 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001245 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001246 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001247 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001248 i++;
1249 break;
1250 }
1251 chr = (chr<<4) & ~0xF;
1252 if (c >= '0' && c <= '9')
1253 chr += c - '0';
1254 else if (c >= 'a' && c <= 'f')
1255 chr += 10 + c - 'a';
1256 else
1257 chr += 10 + c - 'A';
1258 }
1259 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001260 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001261 /* when we get here, chr is a 32-bit unicode character */
1262 if (chr <= 0xffff)
1263 /* UCS-2 character */
1264 *p++ = (Py_UNICODE) chr;
1265 else if (chr <= 0x10ffff) {
1266 /* UCS-4 character. store as two surrogate characters */
1267 chr -= 0x10000L;
1268 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1269 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1270 } else {
1271 if (unicodeescape_decoding_error(
1272 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001273 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001274 )
1275 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001276 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001277 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001278 break;
1279
1280 /* \N{name} */
1281 case 'N':
1282 message = "malformed \\N character escape";
1283 if (ucnhash_CAPI == NULL) {
1284 /* load the unicode data module */
1285 PyObject *m, *v;
1286 m = PyImport_ImportModule("unicodedata");
1287 if (m == NULL)
1288 goto ucnhashError;
1289 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1290 Py_DECREF(m);
1291 if (v == NULL)
1292 goto ucnhashError;
1293 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1294 Py_DECREF(v);
1295 if (ucnhash_CAPI == NULL)
1296 goto ucnhashError;
1297 }
1298 if (*s == '{') {
1299 const char *start = s+1;
1300 /* look for the closing brace */
1301 while (*s != '}' && s < end)
1302 s++;
1303 if (s > start && s < end && *s == '}') {
1304 /* found a name. look it up in the unicode database */
1305 message = "unknown Unicode character name";
1306 s++;
1307 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1308 goto store;
1309 }
1310 }
1311 if (unicodeescape_decoding_error(&s, &x, errors, message))
1312 goto onError;
1313 *p++ = x;
1314 break;
1315
1316 default:
1317 *p++ = '\\';
1318 *p++ = (unsigned char)s[-1];
1319 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001320 }
1321 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001322 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001323 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001324 return (PyObject *)v;
1325
Fredrik Lundhccc74732001-02-18 22:13:49 +00001326ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001327 PyErr_SetString(
1328 PyExc_UnicodeError,
1329 "\\N escapes not supported (can't load unicodedata module)"
1330 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001331 return NULL;
1332
Fredrik Lundhccc74732001-02-18 22:13:49 +00001333onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334 Py_XDECREF(v);
1335 return NULL;
1336}
1337
1338/* Return a Unicode-Escape string version of the Unicode object.
1339
1340 If quotes is true, the string is enclosed in u"" or u'' quotes as
1341 appropriate.
1342
1343*/
1344
Barry Warsaw51ac5802000-03-20 16:36:48 +00001345static const Py_UNICODE *findchar(const Py_UNICODE *s,
1346 int size,
1347 Py_UNICODE ch);
1348
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349static
1350PyObject *unicodeescape_string(const Py_UNICODE *s,
1351 int size,
1352 int quotes)
1353{
1354 PyObject *repr;
1355 char *p;
1356 char *q;
1357
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001358 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359
1360 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1361 if (repr == NULL)
1362 return NULL;
1363
1364 p = q = PyString_AS_STRING(repr);
1365
1366 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001367 *p++ = 'u';
1368 *p++ = (findchar(s, size, '\'') &&
1369 !findchar(s, size, '"')) ? '"' : '\'';
1370 }
1371 while (size-- > 0) {
1372 Py_UNICODE ch = *s++;
1373 /* Escape quotes */
1374 if (quotes && (ch == q[1] || ch == '\\')) {
1375 *p++ = '\\';
1376 *p++ = (char) ch;
1377 }
1378 /* Map 16-bit characters to '\uxxxx' */
1379 else if (ch >= 256) {
1380 *p++ = '\\';
1381 *p++ = 'u';
1382 *p++ = hexdigit[(ch >> 12) & 0xf];
1383 *p++ = hexdigit[(ch >> 8) & 0xf];
1384 *p++ = hexdigit[(ch >> 4) & 0xf];
1385 *p++ = hexdigit[ch & 15];
1386 }
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001387 /* Map special whitespace to '\t', \n', '\r' */
1388 else if (ch == '\t') {
1389 *p++ = '\\';
1390 *p++ = 't';
1391 }
1392 else if (ch == '\n') {
1393 *p++ = '\\';
1394 *p++ = 'n';
1395 }
1396 else if (ch == '\r') {
1397 *p++ = '\\';
1398 *p++ = 'r';
1399 }
1400 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001401 else if (ch < ' ' || ch >= 128) {
1402 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001403 *p++ = 'x';
1404 *p++ = hexdigit[(ch >> 4) & 0xf];
1405 *p++ = hexdigit[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001406 }
1407 /* Copy everything else as-is */
1408 else
1409 *p++ = (char) ch;
1410 }
1411 if (quotes)
1412 *p++ = q[1];
1413
1414 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001415 if (_PyString_Resize(&repr, p - q))
1416 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417
1418 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001419
1420 onError:
1421 Py_DECREF(repr);
1422 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001423}
1424
1425PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1426 int size)
1427{
1428 return unicodeescape_string(s, size, 0);
1429}
1430
1431PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1432{
1433 if (!PyUnicode_Check(unicode)) {
1434 PyErr_BadArgument();
1435 return NULL;
1436 }
1437 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1438 PyUnicode_GET_SIZE(unicode));
1439}
1440
1441/* --- Raw Unicode Escape Codec ------------------------------------------- */
1442
1443PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1444 int size,
1445 const char *errors)
1446{
1447 PyUnicodeObject *v;
1448 Py_UNICODE *p, *buf;
1449 const char *end;
1450 const char *bs;
1451
1452 /* Escaped strings will always be longer than the resulting
1453 Unicode string, so we start with size here and then reduce the
1454 length after conversion to the true value. */
1455 v = _PyUnicode_New(size);
1456 if (v == NULL)
1457 goto onError;
1458 if (size == 0)
1459 return (PyObject *)v;
1460 p = buf = PyUnicode_AS_UNICODE(v);
1461 end = s + size;
1462 while (s < end) {
1463 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001464 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001465 int i;
1466
1467 /* Non-escape characters are interpreted as Unicode ordinals */
1468 if (*s != '\\') {
1469 *p++ = (unsigned char)*s++;
1470 continue;
1471 }
1472
1473 /* \u-escapes are only interpreted iff the number of leading
1474 backslashes if odd */
1475 bs = s;
1476 for (;s < end;) {
1477 if (*s != '\\')
1478 break;
1479 *p++ = (unsigned char)*s++;
1480 }
1481 if (((s - bs) & 1) == 0 ||
1482 s >= end ||
1483 *s != 'u') {
1484 continue;
1485 }
1486 p--;
1487 s++;
1488
1489 /* \uXXXX with 4 hex digits */
1490 for (x = 0, i = 0; i < 4; i++) {
1491 c = (unsigned char)s[i];
1492 if (!isxdigit(c)) {
1493 if (unicodeescape_decoding_error(&s, &x, errors,
1494 "truncated \\uXXXX"))
1495 goto onError;
1496 i++;
1497 break;
1498 }
1499 x = (x<<4) & ~0xF;
1500 if (c >= '0' && c <= '9')
1501 x += c - '0';
1502 else if (c >= 'a' && c <= 'f')
1503 x += 10 + c - 'a';
1504 else
1505 x += 10 + c - 'A';
1506 }
1507 s += i;
1508 *p++ = x;
1509 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001510 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001511 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001512 return (PyObject *)v;
1513
1514 onError:
1515 Py_XDECREF(v);
1516 return NULL;
1517}
1518
1519PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1520 int size)
1521{
1522 PyObject *repr;
1523 char *p;
1524 char *q;
1525
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001526 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001527
1528 repr = PyString_FromStringAndSize(NULL, 6 * size);
1529 if (repr == NULL)
1530 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001531 if (size == 0)
1532 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001533
1534 p = q = PyString_AS_STRING(repr);
1535 while (size-- > 0) {
1536 Py_UNICODE ch = *s++;
1537 /* Map 16-bit characters to '\uxxxx' */
1538 if (ch >= 256) {
1539 *p++ = '\\';
1540 *p++ = 'u';
1541 *p++ = hexdigit[(ch >> 12) & 0xf];
1542 *p++ = hexdigit[(ch >> 8) & 0xf];
1543 *p++ = hexdigit[(ch >> 4) & 0xf];
1544 *p++ = hexdigit[ch & 15];
1545 }
1546 /* Copy everything else as-is */
1547 else
1548 *p++ = (char) ch;
1549 }
1550 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001551 if (_PyString_Resize(&repr, p - q))
1552 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001553
1554 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001555
1556 onError:
1557 Py_DECREF(repr);
1558 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001559}
1560
1561PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1562{
1563 if (!PyUnicode_Check(unicode)) {
1564 PyErr_BadArgument();
1565 return NULL;
1566 }
1567 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1568 PyUnicode_GET_SIZE(unicode));
1569}
1570
1571/* --- Latin-1 Codec ------------------------------------------------------ */
1572
1573PyObject *PyUnicode_DecodeLatin1(const char *s,
1574 int size,
1575 const char *errors)
1576{
1577 PyUnicodeObject *v;
1578 Py_UNICODE *p;
1579
1580 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001581 if (size == 1 && *(unsigned char*)s < 256) {
1582 Py_UNICODE r = *(unsigned char*)s;
1583 return PyUnicode_FromUnicode(&r, 1);
1584 }
1585
Guido van Rossumd57fd912000-03-10 22:53:23 +00001586 v = _PyUnicode_New(size);
1587 if (v == NULL)
1588 goto onError;
1589 if (size == 0)
1590 return (PyObject *)v;
1591 p = PyUnicode_AS_UNICODE(v);
1592 while (size-- > 0)
1593 *p++ = (unsigned char)*s++;
1594 return (PyObject *)v;
1595
1596 onError:
1597 Py_XDECREF(v);
1598 return NULL;
1599}
1600
1601static
1602int latin1_encoding_error(const Py_UNICODE **source,
1603 char **dest,
1604 const char *errors,
1605 const char *details)
1606{
1607 if ((errors == NULL) ||
1608 (strcmp(errors,"strict") == 0)) {
1609 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001610 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611 details);
1612 return -1;
1613 }
1614 else if (strcmp(errors,"ignore") == 0) {
1615 return 0;
1616 }
1617 else if (strcmp(errors,"replace") == 0) {
1618 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001619 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620 return 0;
1621 }
1622 else {
1623 PyErr_Format(PyExc_ValueError,
1624 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001625 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001626 errors);
1627 return -1;
1628 }
1629}
1630
1631PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1632 int size,
1633 const char *errors)
1634{
1635 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001636 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001637
Guido van Rossumd57fd912000-03-10 22:53:23 +00001638 repr = PyString_FromStringAndSize(NULL, size);
1639 if (repr == NULL)
1640 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001641 if (size == 0)
1642 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001643
1644 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001645 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001646 while (size-- > 0) {
1647 Py_UNICODE ch = *p++;
1648 if (ch >= 256) {
1649 if (latin1_encoding_error(&p, &s, errors,
1650 "ordinal not in range(256)"))
1651 goto onError;
1652 }
1653 else
1654 *s++ = (char)ch;
1655 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001656 /* Resize if error handling skipped some characters */
1657 if (s - start < PyString_GET_SIZE(repr))
1658 if (_PyString_Resize(&repr, s - start))
1659 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001660 return repr;
1661
1662 onError:
1663 Py_DECREF(repr);
1664 return NULL;
1665}
1666
1667PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1668{
1669 if (!PyUnicode_Check(unicode)) {
1670 PyErr_BadArgument();
1671 return NULL;
1672 }
1673 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1674 PyUnicode_GET_SIZE(unicode),
1675 NULL);
1676}
1677
1678/* --- 7-bit ASCII Codec -------------------------------------------------- */
1679
1680static
1681int ascii_decoding_error(const char **source,
1682 Py_UNICODE **dest,
1683 const char *errors,
1684 const char *details)
1685{
1686 if ((errors == NULL) ||
1687 (strcmp(errors,"strict") == 0)) {
1688 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001689 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001690 details);
1691 return -1;
1692 }
1693 else if (strcmp(errors,"ignore") == 0) {
1694 return 0;
1695 }
1696 else if (strcmp(errors,"replace") == 0) {
1697 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1698 (*dest)++;
1699 return 0;
1700 }
1701 else {
1702 PyErr_Format(PyExc_ValueError,
1703 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001704 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001705 errors);
1706 return -1;
1707 }
1708}
1709
1710PyObject *PyUnicode_DecodeASCII(const char *s,
1711 int size,
1712 const char *errors)
1713{
1714 PyUnicodeObject *v;
1715 Py_UNICODE *p;
1716
1717 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001718 if (size == 1 && *(unsigned char*)s < 128) {
1719 Py_UNICODE r = *(unsigned char*)s;
1720 return PyUnicode_FromUnicode(&r, 1);
1721 }
1722
Guido van Rossumd57fd912000-03-10 22:53:23 +00001723 v = _PyUnicode_New(size);
1724 if (v == NULL)
1725 goto onError;
1726 if (size == 0)
1727 return (PyObject *)v;
1728 p = PyUnicode_AS_UNICODE(v);
1729 while (size-- > 0) {
1730 register unsigned char c;
1731
1732 c = (unsigned char)*s++;
1733 if (c < 128)
1734 *p++ = c;
1735 else if (ascii_decoding_error(&s, &p, errors,
1736 "ordinal not in range(128)"))
1737 goto onError;
1738 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001739 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001740 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001741 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001742 return (PyObject *)v;
1743
1744 onError:
1745 Py_XDECREF(v);
1746 return NULL;
1747}
1748
1749static
1750int ascii_encoding_error(const Py_UNICODE **source,
1751 char **dest,
1752 const char *errors,
1753 const char *details)
1754{
1755 if ((errors == NULL) ||
1756 (strcmp(errors,"strict") == 0)) {
1757 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001758 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759 details);
1760 return -1;
1761 }
1762 else if (strcmp(errors,"ignore") == 0) {
1763 return 0;
1764 }
1765 else if (strcmp(errors,"replace") == 0) {
1766 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001767 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001768 return 0;
1769 }
1770 else {
1771 PyErr_Format(PyExc_ValueError,
1772 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001773 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774 errors);
1775 return -1;
1776 }
1777}
1778
1779PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1780 int size,
1781 const char *errors)
1782{
1783 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001784 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001785
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 repr = PyString_FromStringAndSize(NULL, size);
1787 if (repr == NULL)
1788 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001789 if (size == 0)
1790 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001791
1792 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001793 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794 while (size-- > 0) {
1795 Py_UNICODE ch = *p++;
1796 if (ch >= 128) {
1797 if (ascii_encoding_error(&p, &s, errors,
1798 "ordinal not in range(128)"))
1799 goto onError;
1800 }
1801 else
1802 *s++ = (char)ch;
1803 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001804 /* Resize if error handling skipped some characters */
1805 if (s - start < PyString_GET_SIZE(repr))
1806 if (_PyString_Resize(&repr, s - start))
1807 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808 return repr;
1809
1810 onError:
1811 Py_DECREF(repr);
1812 return NULL;
1813}
1814
1815PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1816{
1817 if (!PyUnicode_Check(unicode)) {
1818 PyErr_BadArgument();
1819 return NULL;
1820 }
1821 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1822 PyUnicode_GET_SIZE(unicode),
1823 NULL);
1824}
1825
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001826#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001827
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001828/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001829
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001830PyObject *PyUnicode_DecodeMBCS(const char *s,
1831 int size,
1832 const char *errors)
1833{
1834 PyUnicodeObject *v;
1835 Py_UNICODE *p;
1836
1837 /* First get the size of the result */
1838 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001839 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001840 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1841
1842 v = _PyUnicode_New(usize);
1843 if (v == NULL)
1844 return NULL;
1845 if (usize == 0)
1846 return (PyObject *)v;
1847 p = PyUnicode_AS_UNICODE(v);
1848 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1849 Py_DECREF(v);
1850 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1851 }
1852
1853 return (PyObject *)v;
1854}
1855
1856PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1857 int size,
1858 const char *errors)
1859{
1860 PyObject *repr;
1861 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001862 DWORD mbcssize;
1863
1864 /* If there are no characters, bail now! */
1865 if (size==0)
1866 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001867
1868 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001869 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001870 if (mbcssize==0)
1871 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1872
1873 repr = PyString_FromStringAndSize(NULL, mbcssize);
1874 if (repr == NULL)
1875 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001876 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001877 return repr;
1878
1879 /* Do the conversion */
1880 s = PyString_AS_STRING(repr);
1881 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1882 Py_DECREF(repr);
1883 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1884 }
1885 return repr;
1886}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001887
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001888#endif /* MS_WIN32 */
1889
Guido van Rossumd57fd912000-03-10 22:53:23 +00001890/* --- Character Mapping Codec -------------------------------------------- */
1891
1892static
1893int charmap_decoding_error(const char **source,
1894 Py_UNICODE **dest,
1895 const char *errors,
1896 const char *details)
1897{
1898 if ((errors == NULL) ||
1899 (strcmp(errors,"strict") == 0)) {
1900 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001901 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001902 details);
1903 return -1;
1904 }
1905 else if (strcmp(errors,"ignore") == 0) {
1906 return 0;
1907 }
1908 else if (strcmp(errors,"replace") == 0) {
1909 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1910 (*dest)++;
1911 return 0;
1912 }
1913 else {
1914 PyErr_Format(PyExc_ValueError,
1915 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001916 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917 errors);
1918 return -1;
1919 }
1920}
1921
1922PyObject *PyUnicode_DecodeCharmap(const char *s,
1923 int size,
1924 PyObject *mapping,
1925 const char *errors)
1926{
1927 PyUnicodeObject *v;
1928 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001929 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001930
1931 /* Default to Latin-1 */
1932 if (mapping == NULL)
1933 return PyUnicode_DecodeLatin1(s, size, errors);
1934
1935 v = _PyUnicode_New(size);
1936 if (v == NULL)
1937 goto onError;
1938 if (size == 0)
1939 return (PyObject *)v;
1940 p = PyUnicode_AS_UNICODE(v);
1941 while (size-- > 0) {
1942 unsigned char ch = *s++;
1943 PyObject *w, *x;
1944
1945 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1946 w = PyInt_FromLong((long)ch);
1947 if (w == NULL)
1948 goto onError;
1949 x = PyObject_GetItem(mapping, w);
1950 Py_DECREF(w);
1951 if (x == NULL) {
1952 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00001953 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00001955 x = Py_None;
1956 Py_INCREF(x);
1957 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001958 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959 }
1960
1961 /* Apply mapping */
1962 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001963 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001964 if (value < 0 || value > 65535) {
1965 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001966 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967 Py_DECREF(x);
1968 goto onError;
1969 }
1970 *p++ = (Py_UNICODE)value;
1971 }
1972 else if (x == Py_None) {
1973 /* undefined mapping */
1974 if (charmap_decoding_error(&s, &p, errors,
1975 "character maps to <undefined>")) {
1976 Py_DECREF(x);
1977 goto onError;
1978 }
1979 }
1980 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001981 int targetsize = PyUnicode_GET_SIZE(x);
1982
1983 if (targetsize == 1)
1984 /* 1-1 mapping */
1985 *p++ = *PyUnicode_AS_UNICODE(x);
1986
1987 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001989 if (targetsize > extrachars) {
1990 /* resize first */
1991 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
1992 int needed = (targetsize - extrachars) + \
1993 (targetsize << 2);
1994 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001995 if (_PyUnicode_Resize(&v,
1996 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001997 Py_DECREF(x);
1998 goto onError;
1999 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002000 p = PyUnicode_AS_UNICODE(v) + oldpos;
2001 }
2002 Py_UNICODE_COPY(p,
2003 PyUnicode_AS_UNICODE(x),
2004 targetsize);
2005 p += targetsize;
2006 extrachars -= targetsize;
2007 }
2008 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002009 }
2010 else {
2011 /* wrong return value */
2012 PyErr_SetString(PyExc_TypeError,
2013 "character mapping must return integer, None or unicode");
2014 Py_DECREF(x);
2015 goto onError;
2016 }
2017 Py_DECREF(x);
2018 }
2019 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002020 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 goto onError;
2022 return (PyObject *)v;
2023
2024 onError:
2025 Py_XDECREF(v);
2026 return NULL;
2027}
2028
2029static
2030int charmap_encoding_error(const Py_UNICODE **source,
2031 char **dest,
2032 const char *errors,
2033 const char *details)
2034{
2035 if ((errors == NULL) ||
2036 (strcmp(errors,"strict") == 0)) {
2037 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002038 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039 details);
2040 return -1;
2041 }
2042 else if (strcmp(errors,"ignore") == 0) {
2043 return 0;
2044 }
2045 else if (strcmp(errors,"replace") == 0) {
2046 **dest = '?';
2047 (*dest)++;
2048 return 0;
2049 }
2050 else {
2051 PyErr_Format(PyExc_ValueError,
2052 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002053 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 errors);
2055 return -1;
2056 }
2057}
2058
2059PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2060 int size,
2061 PyObject *mapping,
2062 const char *errors)
2063{
2064 PyObject *v;
2065 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002066 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002067
2068 /* Default to Latin-1 */
2069 if (mapping == NULL)
2070 return PyUnicode_EncodeLatin1(p, size, errors);
2071
2072 v = PyString_FromStringAndSize(NULL, size);
2073 if (v == NULL)
2074 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002075 if (size == 0)
2076 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002077 s = PyString_AS_STRING(v);
2078 while (size-- > 0) {
2079 Py_UNICODE ch = *p++;
2080 PyObject *w, *x;
2081
2082 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2083 w = PyInt_FromLong((long)ch);
2084 if (w == NULL)
2085 goto onError;
2086 x = PyObject_GetItem(mapping, w);
2087 Py_DECREF(w);
2088 if (x == NULL) {
2089 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002090 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002091 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002092 x = Py_None;
2093 Py_INCREF(x);
2094 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002095 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002096 }
2097
2098 /* Apply mapping */
2099 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002100 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002101 if (value < 0 || value > 255) {
2102 PyErr_SetString(PyExc_TypeError,
2103 "character mapping must be in range(256)");
2104 Py_DECREF(x);
2105 goto onError;
2106 }
2107 *s++ = (char)value;
2108 }
2109 else if (x == Py_None) {
2110 /* undefined mapping */
2111 if (charmap_encoding_error(&p, &s, errors,
2112 "character maps to <undefined>")) {
2113 Py_DECREF(x);
2114 goto onError;
2115 }
2116 }
2117 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002118 int targetsize = PyString_GET_SIZE(x);
2119
2120 if (targetsize == 1)
2121 /* 1-1 mapping */
2122 *s++ = *PyString_AS_STRING(x);
2123
2124 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002126 if (targetsize > extrachars) {
2127 /* resize first */
2128 int oldpos = (int)(s - PyString_AS_STRING(v));
2129 int needed = (targetsize - extrachars) + \
2130 (targetsize << 2);
2131 extrachars += needed;
2132 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002133 Py_DECREF(x);
2134 goto onError;
2135 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002136 s = PyString_AS_STRING(v) + oldpos;
2137 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002138 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002139 s += targetsize;
2140 extrachars -= targetsize;
2141 }
2142 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002143 }
2144 else {
2145 /* wrong return value */
2146 PyErr_SetString(PyExc_TypeError,
2147 "character mapping must return integer, None or unicode");
2148 Py_DECREF(x);
2149 goto onError;
2150 }
2151 Py_DECREF(x);
2152 }
2153 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2154 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2155 goto onError;
2156 return v;
2157
2158 onError:
2159 Py_DECREF(v);
2160 return NULL;
2161}
2162
2163PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2164 PyObject *mapping)
2165{
2166 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2167 PyErr_BadArgument();
2168 return NULL;
2169 }
2170 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2171 PyUnicode_GET_SIZE(unicode),
2172 mapping,
2173 NULL);
2174}
2175
2176static
2177int translate_error(const Py_UNICODE **source,
2178 Py_UNICODE **dest,
2179 const char *errors,
2180 const char *details)
2181{
2182 if ((errors == NULL) ||
2183 (strcmp(errors,"strict") == 0)) {
2184 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002185 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 details);
2187 return -1;
2188 }
2189 else if (strcmp(errors,"ignore") == 0) {
2190 return 0;
2191 }
2192 else if (strcmp(errors,"replace") == 0) {
2193 **dest = '?';
2194 (*dest)++;
2195 return 0;
2196 }
2197 else {
2198 PyErr_Format(PyExc_ValueError,
2199 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002200 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201 errors);
2202 return -1;
2203 }
2204}
2205
2206PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2207 int size,
2208 PyObject *mapping,
2209 const char *errors)
2210{
2211 PyUnicodeObject *v;
2212 Py_UNICODE *p;
2213
2214 if (mapping == NULL) {
2215 PyErr_BadArgument();
2216 return NULL;
2217 }
2218
2219 /* Output will never be longer than input */
2220 v = _PyUnicode_New(size);
2221 if (v == NULL)
2222 goto onError;
2223 if (size == 0)
2224 goto done;
2225 p = PyUnicode_AS_UNICODE(v);
2226 while (size-- > 0) {
2227 Py_UNICODE ch = *s++;
2228 PyObject *w, *x;
2229
2230 /* Get mapping */
2231 w = PyInt_FromLong(ch);
2232 if (w == NULL)
2233 goto onError;
2234 x = PyObject_GetItem(mapping, w);
2235 Py_DECREF(w);
2236 if (x == NULL) {
2237 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2238 /* No mapping found: default to 1-1 mapping */
2239 PyErr_Clear();
2240 *p++ = ch;
2241 continue;
2242 }
2243 goto onError;
2244 }
2245
2246 /* Apply mapping */
2247 if (PyInt_Check(x))
2248 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2249 else if (x == Py_None) {
2250 /* undefined mapping */
2251 if (translate_error(&s, &p, errors,
2252 "character maps to <undefined>")) {
2253 Py_DECREF(x);
2254 goto onError;
2255 }
2256 }
2257 else if (PyUnicode_Check(x)) {
2258 if (PyUnicode_GET_SIZE(x) != 1) {
2259 /* 1-n mapping */
2260 PyErr_SetString(PyExc_NotImplementedError,
2261 "1-n mappings are currently not implemented");
2262 Py_DECREF(x);
2263 goto onError;
2264 }
2265 *p++ = *PyUnicode_AS_UNICODE(x);
2266 }
2267 else {
2268 /* wrong return value */
2269 PyErr_SetString(PyExc_TypeError,
2270 "translate mapping must return integer, None or unicode");
2271 Py_DECREF(x);
2272 goto onError;
2273 }
2274 Py_DECREF(x);
2275 }
2276 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002277 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002278 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279
2280 done:
2281 return (PyObject *)v;
2282
2283 onError:
2284 Py_XDECREF(v);
2285 return NULL;
2286}
2287
2288PyObject *PyUnicode_Translate(PyObject *str,
2289 PyObject *mapping,
2290 const char *errors)
2291{
2292 PyObject *result;
2293
2294 str = PyUnicode_FromObject(str);
2295 if (str == NULL)
2296 goto onError;
2297 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2298 PyUnicode_GET_SIZE(str),
2299 mapping,
2300 errors);
2301 Py_DECREF(str);
2302 return result;
2303
2304 onError:
2305 Py_XDECREF(str);
2306 return NULL;
2307}
2308
Guido van Rossum9e896b32000-04-05 20:11:21 +00002309/* --- Decimal Encoder ---------------------------------------------------- */
2310
2311int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2312 int length,
2313 char *output,
2314 const char *errors)
2315{
2316 Py_UNICODE *p, *end;
2317
2318 if (output == NULL) {
2319 PyErr_BadArgument();
2320 return -1;
2321 }
2322
2323 p = s;
2324 end = s + length;
2325 while (p < end) {
2326 register Py_UNICODE ch = *p++;
2327 int decimal;
2328
2329 if (Py_UNICODE_ISSPACE(ch)) {
2330 *output++ = ' ';
2331 continue;
2332 }
2333 decimal = Py_UNICODE_TODECIMAL(ch);
2334 if (decimal >= 0) {
2335 *output++ = '0' + decimal;
2336 continue;
2337 }
Guido van Rossumba477042000-04-06 18:18:10 +00002338 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002339 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002340 continue;
2341 }
2342 /* All other characters are considered invalid */
2343 if (errors == NULL || strcmp(errors, "strict") == 0) {
2344 PyErr_SetString(PyExc_ValueError,
2345 "invalid decimal Unicode string");
2346 goto onError;
2347 }
2348 else if (strcmp(errors, "ignore") == 0)
2349 continue;
2350 else if (strcmp(errors, "replace") == 0) {
2351 *output++ = '?';
2352 continue;
2353 }
2354 }
2355 /* 0-terminate the output string */
2356 *output++ = '\0';
2357 return 0;
2358
2359 onError:
2360 return -1;
2361}
2362
Guido van Rossumd57fd912000-03-10 22:53:23 +00002363/* --- Helpers ------------------------------------------------------------ */
2364
2365static
2366int count(PyUnicodeObject *self,
2367 int start,
2368 int end,
2369 PyUnicodeObject *substring)
2370{
2371 int count = 0;
2372
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002373 if (start < 0)
2374 start += self->length;
2375 if (start < 0)
2376 start = 0;
2377 if (end > self->length)
2378 end = self->length;
2379 if (end < 0)
2380 end += self->length;
2381 if (end < 0)
2382 end = 0;
2383
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002384 if (substring->length == 0)
2385 return (end - start + 1);
2386
Guido van Rossumd57fd912000-03-10 22:53:23 +00002387 end -= substring->length;
2388
2389 while (start <= end)
2390 if (Py_UNICODE_MATCH(self, start, substring)) {
2391 count++;
2392 start += substring->length;
2393 } else
2394 start++;
2395
2396 return count;
2397}
2398
2399int PyUnicode_Count(PyObject *str,
2400 PyObject *substr,
2401 int start,
2402 int end)
2403{
2404 int result;
2405
2406 str = PyUnicode_FromObject(str);
2407 if (str == NULL)
2408 return -1;
2409 substr = PyUnicode_FromObject(substr);
2410 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002411 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002412 return -1;
2413 }
2414
2415 result = count((PyUnicodeObject *)str,
2416 start, end,
2417 (PyUnicodeObject *)substr);
2418
2419 Py_DECREF(str);
2420 Py_DECREF(substr);
2421 return result;
2422}
2423
2424static
2425int findstring(PyUnicodeObject *self,
2426 PyUnicodeObject *substring,
2427 int start,
2428 int end,
2429 int direction)
2430{
2431 if (start < 0)
2432 start += self->length;
2433 if (start < 0)
2434 start = 0;
2435
2436 if (substring->length == 0)
2437 return start;
2438
2439 if (end > self->length)
2440 end = self->length;
2441 if (end < 0)
2442 end += self->length;
2443 if (end < 0)
2444 end = 0;
2445
2446 end -= substring->length;
2447
2448 if (direction < 0) {
2449 for (; end >= start; end--)
2450 if (Py_UNICODE_MATCH(self, end, substring))
2451 return end;
2452 } else {
2453 for (; start <= end; start++)
2454 if (Py_UNICODE_MATCH(self, start, substring))
2455 return start;
2456 }
2457
2458 return -1;
2459}
2460
2461int PyUnicode_Find(PyObject *str,
2462 PyObject *substr,
2463 int start,
2464 int end,
2465 int direction)
2466{
2467 int result;
2468
2469 str = PyUnicode_FromObject(str);
2470 if (str == NULL)
2471 return -1;
2472 substr = PyUnicode_FromObject(substr);
2473 if (substr == NULL) {
2474 Py_DECREF(substr);
2475 return -1;
2476 }
2477
2478 result = findstring((PyUnicodeObject *)str,
2479 (PyUnicodeObject *)substr,
2480 start, end, direction);
2481 Py_DECREF(str);
2482 Py_DECREF(substr);
2483 return result;
2484}
2485
2486static
2487int tailmatch(PyUnicodeObject *self,
2488 PyUnicodeObject *substring,
2489 int start,
2490 int end,
2491 int direction)
2492{
2493 if (start < 0)
2494 start += self->length;
2495 if (start < 0)
2496 start = 0;
2497
2498 if (substring->length == 0)
2499 return 1;
2500
2501 if (end > self->length)
2502 end = self->length;
2503 if (end < 0)
2504 end += self->length;
2505 if (end < 0)
2506 end = 0;
2507
2508 end -= substring->length;
2509 if (end < start)
2510 return 0;
2511
2512 if (direction > 0) {
2513 if (Py_UNICODE_MATCH(self, end, substring))
2514 return 1;
2515 } else {
2516 if (Py_UNICODE_MATCH(self, start, substring))
2517 return 1;
2518 }
2519
2520 return 0;
2521}
2522
2523int PyUnicode_Tailmatch(PyObject *str,
2524 PyObject *substr,
2525 int start,
2526 int end,
2527 int direction)
2528{
2529 int result;
2530
2531 str = PyUnicode_FromObject(str);
2532 if (str == NULL)
2533 return -1;
2534 substr = PyUnicode_FromObject(substr);
2535 if (substr == NULL) {
2536 Py_DECREF(substr);
2537 return -1;
2538 }
2539
2540 result = tailmatch((PyUnicodeObject *)str,
2541 (PyUnicodeObject *)substr,
2542 start, end, direction);
2543 Py_DECREF(str);
2544 Py_DECREF(substr);
2545 return result;
2546}
2547
2548static
2549const Py_UNICODE *findchar(const Py_UNICODE *s,
2550 int size,
2551 Py_UNICODE ch)
2552{
2553 /* like wcschr, but doesn't stop at NULL characters */
2554
2555 while (size-- > 0) {
2556 if (*s == ch)
2557 return s;
2558 s++;
2559 }
2560
2561 return NULL;
2562}
2563
2564/* Apply fixfct filter to the Unicode object self and return a
2565 reference to the modified object */
2566
2567static
2568PyObject *fixup(PyUnicodeObject *self,
2569 int (*fixfct)(PyUnicodeObject *s))
2570{
2571
2572 PyUnicodeObject *u;
2573
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002574 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575 if (u == NULL)
2576 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002577
2578 Py_UNICODE_COPY(u->str, self->str, self->length);
2579
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 if (!fixfct(u)) {
2581 /* fixfct should return TRUE if it modified the buffer. If
2582 FALSE, return a reference to the original buffer instead
2583 (to save space, not time) */
2584 Py_INCREF(self);
2585 Py_DECREF(u);
2586 return (PyObject*) self;
2587 }
2588 return (PyObject*) u;
2589}
2590
2591static
2592int fixupper(PyUnicodeObject *self)
2593{
2594 int len = self->length;
2595 Py_UNICODE *s = self->str;
2596 int status = 0;
2597
2598 while (len-- > 0) {
2599 register Py_UNICODE ch;
2600
2601 ch = Py_UNICODE_TOUPPER(*s);
2602 if (ch != *s) {
2603 status = 1;
2604 *s = ch;
2605 }
2606 s++;
2607 }
2608
2609 return status;
2610}
2611
2612static
2613int fixlower(PyUnicodeObject *self)
2614{
2615 int len = self->length;
2616 Py_UNICODE *s = self->str;
2617 int status = 0;
2618
2619 while (len-- > 0) {
2620 register Py_UNICODE ch;
2621
2622 ch = Py_UNICODE_TOLOWER(*s);
2623 if (ch != *s) {
2624 status = 1;
2625 *s = ch;
2626 }
2627 s++;
2628 }
2629
2630 return status;
2631}
2632
2633static
2634int fixswapcase(PyUnicodeObject *self)
2635{
2636 int len = self->length;
2637 Py_UNICODE *s = self->str;
2638 int status = 0;
2639
2640 while (len-- > 0) {
2641 if (Py_UNICODE_ISUPPER(*s)) {
2642 *s = Py_UNICODE_TOLOWER(*s);
2643 status = 1;
2644 } else if (Py_UNICODE_ISLOWER(*s)) {
2645 *s = Py_UNICODE_TOUPPER(*s);
2646 status = 1;
2647 }
2648 s++;
2649 }
2650
2651 return status;
2652}
2653
2654static
2655int fixcapitalize(PyUnicodeObject *self)
2656{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002657 int len = self->length;
2658 Py_UNICODE *s = self->str;
2659 int status = 0;
2660
2661 if (len == 0)
2662 return 0;
2663 if (Py_UNICODE_ISLOWER(*s)) {
2664 *s = Py_UNICODE_TOUPPER(*s);
2665 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002667 s++;
2668 while (--len > 0) {
2669 if (Py_UNICODE_ISUPPER(*s)) {
2670 *s = Py_UNICODE_TOLOWER(*s);
2671 status = 1;
2672 }
2673 s++;
2674 }
2675 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676}
2677
2678static
2679int fixtitle(PyUnicodeObject *self)
2680{
2681 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2682 register Py_UNICODE *e;
2683 int previous_is_cased;
2684
2685 /* Shortcut for single character strings */
2686 if (PyUnicode_GET_SIZE(self) == 1) {
2687 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2688 if (*p != ch) {
2689 *p = ch;
2690 return 1;
2691 }
2692 else
2693 return 0;
2694 }
2695
2696 e = p + PyUnicode_GET_SIZE(self);
2697 previous_is_cased = 0;
2698 for (; p < e; p++) {
2699 register const Py_UNICODE ch = *p;
2700
2701 if (previous_is_cased)
2702 *p = Py_UNICODE_TOLOWER(ch);
2703 else
2704 *p = Py_UNICODE_TOTITLE(ch);
2705
2706 if (Py_UNICODE_ISLOWER(ch) ||
2707 Py_UNICODE_ISUPPER(ch) ||
2708 Py_UNICODE_ISTITLE(ch))
2709 previous_is_cased = 1;
2710 else
2711 previous_is_cased = 0;
2712 }
2713 return 1;
2714}
2715
2716PyObject *PyUnicode_Join(PyObject *separator,
2717 PyObject *seq)
2718{
2719 Py_UNICODE *sep;
2720 int seplen;
2721 PyUnicodeObject *res = NULL;
2722 int reslen = 0;
2723 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002724 int sz = 100;
2725 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00002726 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002727
Tim Peters2cfe3682001-05-05 05:36:48 +00002728 it = PyObject_GetIter(seq);
2729 if (it == NULL)
2730 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002731
2732 if (separator == NULL) {
2733 Py_UNICODE blank = ' ';
2734 sep = &blank;
2735 seplen = 1;
2736 }
2737 else {
2738 separator = PyUnicode_FromObject(separator);
2739 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00002740 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002741 sep = PyUnicode_AS_UNICODE(separator);
2742 seplen = PyUnicode_GET_SIZE(separator);
2743 }
2744
2745 res = _PyUnicode_New(sz);
2746 if (res == NULL)
2747 goto onError;
2748 p = PyUnicode_AS_UNICODE(res);
2749 reslen = 0;
2750
Tim Peters2cfe3682001-05-05 05:36:48 +00002751 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002752 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00002753 PyObject *item = PyIter_Next(it);
2754 if (item == NULL) {
2755 if (PyErr_Occurred())
2756 goto onError;
2757 break;
2758 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002759 if (!PyUnicode_Check(item)) {
2760 PyObject *v;
2761 v = PyUnicode_FromObject(item);
2762 Py_DECREF(item);
2763 item = v;
2764 if (item == NULL)
2765 goto onError;
2766 }
2767 itemlen = PyUnicode_GET_SIZE(item);
2768 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002769 if (_PyUnicode_Resize(&res, sz*2))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002770 goto onError;
2771 sz *= 2;
2772 p = PyUnicode_AS_UNICODE(res) + reslen;
2773 }
2774 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002775 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002776 p += seplen;
2777 reslen += seplen;
2778 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002779 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002780 p += itemlen;
2781 reslen += itemlen;
2782 Py_DECREF(item);
2783 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002784 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002785 goto onError;
2786
2787 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002788 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002789 return (PyObject *)res;
2790
2791 onError:
2792 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002793 Py_XDECREF(res);
2794 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002795 return NULL;
2796}
2797
2798static
2799PyUnicodeObject *pad(PyUnicodeObject *self,
2800 int left,
2801 int right,
2802 Py_UNICODE fill)
2803{
2804 PyUnicodeObject *u;
2805
2806 if (left < 0)
2807 left = 0;
2808 if (right < 0)
2809 right = 0;
2810
2811 if (left == 0 && right == 0) {
2812 Py_INCREF(self);
2813 return self;
2814 }
2815
2816 u = _PyUnicode_New(left + self->length + right);
2817 if (u) {
2818 if (left)
2819 Py_UNICODE_FILL(u->str, fill, left);
2820 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2821 if (right)
2822 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2823 }
2824
2825 return u;
2826}
2827
/* Append data[left:right] to `list` as a new Unicode object.

   Relies on the expanding function providing a `PyObject *str`
   scratch variable, a `list` variable and an `onError` label, to
   which control jumps if the object cannot be created or appended.
   Wrapped in do { } while (0) so that it behaves as one statement;
   use it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left),                    \
                                    (right) - (left));                  \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2838
2839static
2840PyObject *split_whitespace(PyUnicodeObject *self,
2841 PyObject *list,
2842 int maxcount)
2843{
2844 register int i;
2845 register int j;
2846 int len = self->length;
2847 PyObject *str;
2848
2849 for (i = j = 0; i < len; ) {
2850 /* find a token */
2851 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2852 i++;
2853 j = i;
2854 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2855 i++;
2856 if (j < i) {
2857 if (maxcount-- <= 0)
2858 break;
2859 SPLIT_APPEND(self->str, j, i);
2860 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2861 i++;
2862 j = i;
2863 }
2864 }
2865 if (j < len) {
2866 SPLIT_APPEND(self->str, j, len);
2867 }
2868 return list;
2869
2870 onError:
2871 Py_DECREF(list);
2872 return NULL;
2873}
2874
2875PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002876 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002877{
2878 register int i;
2879 register int j;
2880 int len;
2881 PyObject *list;
2882 PyObject *str;
2883 Py_UNICODE *data;
2884
2885 string = PyUnicode_FromObject(string);
2886 if (string == NULL)
2887 return NULL;
2888 data = PyUnicode_AS_UNICODE(string);
2889 len = PyUnicode_GET_SIZE(string);
2890
Guido van Rossumd57fd912000-03-10 22:53:23 +00002891 list = PyList_New(0);
2892 if (!list)
2893 goto onError;
2894
2895 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002896 int eol;
2897
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898 /* Find a line and append it */
2899 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2900 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901
2902 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002903 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002904 if (i < len) {
2905 if (data[i] == '\r' && i + 1 < len &&
2906 data[i+1] == '\n')
2907 i += 2;
2908 else
2909 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002910 if (keepends)
2911 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002912 }
Guido van Rossum86662912000-04-11 15:38:46 +00002913 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002914 j = i;
2915 }
2916 if (j < len) {
2917 SPLIT_APPEND(data, j, len);
2918 }
2919
2920 Py_DECREF(string);
2921 return list;
2922
2923 onError:
2924 Py_DECREF(list);
2925 Py_DECREF(string);
2926 return NULL;
2927}
2928
2929static
2930PyObject *split_char(PyUnicodeObject *self,
2931 PyObject *list,
2932 Py_UNICODE ch,
2933 int maxcount)
2934{
2935 register int i;
2936 register int j;
2937 int len = self->length;
2938 PyObject *str;
2939
2940 for (i = j = 0; i < len; ) {
2941 if (self->str[i] == ch) {
2942 if (maxcount-- <= 0)
2943 break;
2944 SPLIT_APPEND(self->str, j, i);
2945 i = j = i + 1;
2946 } else
2947 i++;
2948 }
2949 if (j <= len) {
2950 SPLIT_APPEND(self->str, j, len);
2951 }
2952 return list;
2953
2954 onError:
2955 Py_DECREF(list);
2956 return NULL;
2957}
2958
2959static
2960PyObject *split_substring(PyUnicodeObject *self,
2961 PyObject *list,
2962 PyUnicodeObject *substring,
2963 int maxcount)
2964{
2965 register int i;
2966 register int j;
2967 int len = self->length;
2968 int sublen = substring->length;
2969 PyObject *str;
2970
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00002971 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002972 if (Py_UNICODE_MATCH(self, i, substring)) {
2973 if (maxcount-- <= 0)
2974 break;
2975 SPLIT_APPEND(self->str, j, i);
2976 i = j = i + sublen;
2977 } else
2978 i++;
2979 }
2980 if (j <= len) {
2981 SPLIT_APPEND(self->str, j, len);
2982 }
2983 return list;
2984
2985 onError:
2986 Py_DECREF(list);
2987 return NULL;
2988}
2989
2990#undef SPLIT_APPEND
2991
2992static
2993PyObject *split(PyUnicodeObject *self,
2994 PyUnicodeObject *substring,
2995 int maxcount)
2996{
2997 PyObject *list;
2998
2999 if (maxcount < 0)
3000 maxcount = INT_MAX;
3001
3002 list = PyList_New(0);
3003 if (!list)
3004 return NULL;
3005
3006 if (substring == NULL)
3007 return split_whitespace(self,list,maxcount);
3008
3009 else if (substring->length == 1)
3010 return split_char(self,list,substring->str[0],maxcount);
3011
3012 else if (substring->length == 0) {
3013 Py_DECREF(list);
3014 PyErr_SetString(PyExc_ValueError, "empty separator");
3015 return NULL;
3016 }
3017 else
3018 return split_substring(self,list,substring,maxcount);
3019}
3020
3021static
3022PyObject *strip(PyUnicodeObject *self,
3023 int left,
3024 int right)
3025{
3026 Py_UNICODE *p = self->str;
3027 int start = 0;
3028 int end = self->length;
3029
3030 if (left)
3031 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3032 start++;
3033
3034 if (right)
3035 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3036 end--;
3037
3038 if (start == 0 && end == self->length) {
3039 /* couldn't strip anything off, return original string */
3040 Py_INCREF(self);
3041 return (PyObject*) self;
3042 }
3043
3044 return (PyObject*) PyUnicode_FromUnicode(
3045 self->str + start,
3046 end - start
3047 );
3048}
3049
3050static
3051PyObject *replace(PyUnicodeObject *self,
3052 PyUnicodeObject *str1,
3053 PyUnicodeObject *str2,
3054 int maxcount)
3055{
3056 PyUnicodeObject *u;
3057
3058 if (maxcount < 0)
3059 maxcount = INT_MAX;
3060
3061 if (str1->length == 1 && str2->length == 1) {
3062 int i;
3063
3064 /* replace characters */
3065 if (!findchar(self->str, self->length, str1->str[0])) {
3066 /* nothing to replace, return original string */
3067 Py_INCREF(self);
3068 u = self;
3069 } else {
3070 Py_UNICODE u1 = str1->str[0];
3071 Py_UNICODE u2 = str2->str[0];
3072
3073 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003074 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003075 self->length
3076 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003077 if (u != NULL) {
3078 Py_UNICODE_COPY(u->str, self->str,
3079 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003080 for (i = 0; i < u->length; i++)
3081 if (u->str[i] == u1) {
3082 if (--maxcount < 0)
3083 break;
3084 u->str[i] = u2;
3085 }
3086 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003087 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088
3089 } else {
3090 int n, i;
3091 Py_UNICODE *p;
3092
3093 /* replace strings */
3094 n = count(self, 0, self->length, str1);
3095 if (n > maxcount)
3096 n = maxcount;
3097 if (n == 0) {
3098 /* nothing to replace, return original string */
3099 Py_INCREF(self);
3100 u = self;
3101 } else {
3102 u = _PyUnicode_New(
3103 self->length + n * (str2->length - str1->length));
3104 if (u) {
3105 i = 0;
3106 p = u->str;
3107 while (i <= self->length - str1->length)
3108 if (Py_UNICODE_MATCH(self, i, str1)) {
3109 /* replace string segment */
3110 Py_UNICODE_COPY(p, str2->str, str2->length);
3111 p += str2->length;
3112 i += str1->length;
3113 if (--n <= 0) {
3114 /* copy remaining part */
3115 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3116 break;
3117 }
3118 } else
3119 *p++ = self->str[i++];
3120 }
3121 }
3122 }
3123
3124 return (PyObject *) u;
3125}
3126
3127/* --- Unicode Object Methods --------------------------------------------- */
3128
3129static char title__doc__[] =
3130"S.title() -> unicode\n\
3131\n\
3132Return a titlecased version of S, i.e. words start with title case\n\
3133characters, all remaining cased characters have lower case.";
3134
3135static PyObject*
3136unicode_title(PyUnicodeObject *self, PyObject *args)
3137{
3138 if (!PyArg_NoArgs(args))
3139 return NULL;
3140 return fixup(self, fixtitle);
3141}
3142
3143static char capitalize__doc__[] =
3144"S.capitalize() -> unicode\n\
3145\n\
3146Return a capitalized version of S, i.e. make the first character\n\
3147have upper case.";
3148
3149static PyObject*
3150unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3151{
3152 if (!PyArg_NoArgs(args))
3153 return NULL;
3154 return fixup(self, fixcapitalize);
3155}
3156
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* Disabled: S.capwords().  Splits S with split(self, NULL, -1),
   replaces every list item by its fixup(..., fixcapitalize) result and
   joins the list again with PyUnicode_Join(NULL, list). */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result;
    int n;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (n = 0; n < PyList_GET_SIZE(words); n++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, n),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Drop the old item before storing its replacement */
        Py_DECREF(PyList_GET_ITEM(words, n));
        PyList_SET_ITEM(words, n, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
3197
3198static char center__doc__[] =
3199"S.center(width) -> unicode\n\
3200\n\
3201Return S centered in a Unicode string of length width. Padding is done\n\
3202using spaces.";
3203
3204static PyObject *
3205unicode_center(PyUnicodeObject *self, PyObject *args)
3206{
3207 int marg, left;
3208 int width;
3209
3210 if (!PyArg_ParseTuple(args, "i:center", &width))
3211 return NULL;
3212
3213 if (self->length >= width) {
3214 Py_INCREF(self);
3215 return (PyObject*) self;
3216 }
3217
3218 marg = width - self->length;
3219 left = marg / 2 + (marg & width & 1);
3220
3221 return (PyObject*) pad(self, left, marg - left, ' ');
3222}
3223
Marc-André Lemburge5034372000-08-08 08:04:29 +00003224#if 0
3225
3226/* This code should go into some future Unicode collation support
3227 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003228 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003229
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003230/* speedy UTF-16 code point order comparison */
3231/* gleaned from: */
3232/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3233
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003234static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003235{
3236 0, 0, 0, 0, 0, 0, 0, 0,
3237 0, 0, 0, 0, 0, 0, 0, 0,
3238 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003239 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003240};
3241
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242static int
3243unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3244{
3245 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003246
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 Py_UNICODE *s1 = str1->str;
3248 Py_UNICODE *s2 = str2->str;
3249
3250 len1 = str1->length;
3251 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003252
Guido van Rossumd57fd912000-03-10 22:53:23 +00003253 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003254 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003255 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003256
3257 c1 = *s1++;
3258 c2 = *s2++;
3259 if (c1 > (1<<11) * 26)
3260 c1 += utf16Fixup[c1>>11];
3261 if (c2 > (1<<11) * 26)
3262 c2 += utf16Fixup[c2>>11];
3263
3264 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003265 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003266 if (diff)
3267 return (diff < 0) ? -1 : (diff != 0);
3268 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003269 }
3270
3271 return (len1 < len2) ? -1 : (len1 != len2);
3272}
3273
Marc-André Lemburge5034372000-08-08 08:04:29 +00003274#else
3275
3276static int
3277unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3278{
3279 register int len1, len2;
3280
3281 Py_UNICODE *s1 = str1->str;
3282 Py_UNICODE *s2 = str2->str;
3283
3284 len1 = str1->length;
3285 len2 = str2->length;
3286
3287 while (len1 > 0 && len2 > 0) {
3288 register long diff;
3289
3290 diff = (long)*s1++ - (long)*s2++;
3291 if (diff)
3292 return (diff < 0) ? -1 : (diff != 0);
3293 len1--; len2--;
3294 }
3295
3296 return (len1 < len2) ? -1 : (len1 != len2);
3297}
3298
3299#endif
3300
Guido van Rossumd57fd912000-03-10 22:53:23 +00003301int PyUnicode_Compare(PyObject *left,
3302 PyObject *right)
3303{
3304 PyUnicodeObject *u = NULL, *v = NULL;
3305 int result;
3306
3307 /* Coerce the two arguments */
3308 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3309 if (u == NULL)
3310 goto onError;
3311 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3312 if (v == NULL)
3313 goto onError;
3314
Thomas Wouters7e474022000-07-16 12:04:32 +00003315 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003316 if (v == u) {
3317 Py_DECREF(u);
3318 Py_DECREF(v);
3319 return 0;
3320 }
3321
3322 result = unicode_compare(u, v);
3323
3324 Py_DECREF(u);
3325 Py_DECREF(v);
3326 return result;
3327
3328onError:
3329 Py_XDECREF(u);
3330 Py_XDECREF(v);
3331 return -1;
3332}
3333
Guido van Rossum403d68b2000-03-13 15:55:09 +00003334int PyUnicode_Contains(PyObject *container,
3335 PyObject *element)
3336{
3337 PyUnicodeObject *u = NULL, *v = NULL;
3338 int result;
3339 register const Py_UNICODE *p, *e;
3340 register Py_UNICODE ch;
3341
3342 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003343 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003344 if (v == NULL) {
3345 PyErr_SetString(PyExc_TypeError,
3346 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003347 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003348 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003349 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3350 if (u == NULL) {
3351 Py_DECREF(v);
3352 goto onError;
3353 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003354
3355 /* Check v in u */
3356 if (PyUnicode_GET_SIZE(v) != 1) {
3357 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003358 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003359 goto onError;
3360 }
3361 ch = *PyUnicode_AS_UNICODE(v);
3362 p = PyUnicode_AS_UNICODE(u);
3363 e = p + PyUnicode_GET_SIZE(u);
3364 result = 0;
3365 while (p < e) {
3366 if (*p++ == ch) {
3367 result = 1;
3368 break;
3369 }
3370 }
3371
3372 Py_DECREF(u);
3373 Py_DECREF(v);
3374 return result;
3375
3376onError:
3377 Py_XDECREF(u);
3378 Py_XDECREF(v);
3379 return -1;
3380}
3381
Guido van Rossumd57fd912000-03-10 22:53:23 +00003382/* Concat to string or Unicode object giving a new Unicode object. */
3383
3384PyObject *PyUnicode_Concat(PyObject *left,
3385 PyObject *right)
3386{
3387 PyUnicodeObject *u = NULL, *v = NULL, *w;
3388
3389 /* Coerce the two arguments */
3390 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3391 if (u == NULL)
3392 goto onError;
3393 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3394 if (v == NULL)
3395 goto onError;
3396
3397 /* Shortcuts */
3398 if (v == unicode_empty) {
3399 Py_DECREF(v);
3400 return (PyObject *)u;
3401 }
3402 if (u == unicode_empty) {
3403 Py_DECREF(u);
3404 return (PyObject *)v;
3405 }
3406
3407 /* Concat the two Unicode strings */
3408 w = _PyUnicode_New(u->length + v->length);
3409 if (w == NULL)
3410 goto onError;
3411 Py_UNICODE_COPY(w->str, u->str, u->length);
3412 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3413
3414 Py_DECREF(u);
3415 Py_DECREF(v);
3416 return (PyObject *)w;
3417
3418onError:
3419 Py_XDECREF(u);
3420 Py_XDECREF(v);
3421 return NULL;
3422}
3423
3424static char count__doc__[] =
3425"S.count(sub[, start[, end]]) -> int\n\
3426\n\
3427Return the number of occurrences of substring sub in Unicode string\n\
3428S[start:end]. Optional arguments start and end are\n\
3429interpreted as in slice notation.";
3430
3431static PyObject *
3432unicode_count(PyUnicodeObject *self, PyObject *args)
3433{
3434 PyUnicodeObject *substring;
3435 int start = 0;
3436 int end = INT_MAX;
3437 PyObject *result;
3438
Guido van Rossumb8872e62000-05-09 14:14:27 +00003439 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3440 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003441 return NULL;
3442
3443 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3444 (PyObject *)substring);
3445 if (substring == NULL)
3446 return NULL;
3447
Guido van Rossumd57fd912000-03-10 22:53:23 +00003448 if (start < 0)
3449 start += self->length;
3450 if (start < 0)
3451 start = 0;
3452 if (end > self->length)
3453 end = self->length;
3454 if (end < 0)
3455 end += self->length;
3456 if (end < 0)
3457 end = 0;
3458
3459 result = PyInt_FromLong((long) count(self, start, end, substring));
3460
3461 Py_DECREF(substring);
3462 return result;
3463}
3464
3465static char encode__doc__[] =
3466"S.encode([encoding[,errors]]) -> string\n\
3467\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003468Return an encoded string version of S. Default encoding is the current\n\
3469default string encoding. errors may be given to set a different error\n\
3470handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3471a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003472
3473static PyObject *
3474unicode_encode(PyUnicodeObject *self, PyObject *args)
3475{
3476 char *encoding = NULL;
3477 char *errors = NULL;
3478 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3479 return NULL;
3480 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3481}
3482
3483static char expandtabs__doc__[] =
3484"S.expandtabs([tabsize]) -> unicode\n\
3485\n\
3486Return a copy of S where all tab characters are expanded using spaces.\n\
3487If tabsize is not given, a tab size of 8 characters is assumed.";
3488
3489static PyObject*
3490unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3491{
3492 Py_UNICODE *e;
3493 Py_UNICODE *p;
3494 Py_UNICODE *q;
3495 int i, j;
3496 PyUnicodeObject *u;
3497 int tabsize = 8;
3498
3499 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3500 return NULL;
3501
Thomas Wouters7e474022000-07-16 12:04:32 +00003502 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003503 i = j = 0;
3504 e = self->str + self->length;
3505 for (p = self->str; p < e; p++)
3506 if (*p == '\t') {
3507 if (tabsize > 0)
3508 j += tabsize - (j % tabsize);
3509 }
3510 else {
3511 j++;
3512 if (*p == '\n' || *p == '\r') {
3513 i += j;
3514 j = 0;
3515 }
3516 }
3517
3518 /* Second pass: create output string and fill it */
3519 u = _PyUnicode_New(i + j);
3520 if (!u)
3521 return NULL;
3522
3523 j = 0;
3524 q = u->str;
3525
3526 for (p = self->str; p < e; p++)
3527 if (*p == '\t') {
3528 if (tabsize > 0) {
3529 i = tabsize - (j % tabsize);
3530 j += i;
3531 while (i--)
3532 *q++ = ' ';
3533 }
3534 }
3535 else {
3536 j++;
3537 *q++ = *p;
3538 if (*p == '\n' || *p == '\r')
3539 j = 0;
3540 }
3541
3542 return (PyObject*) u;
3543}
3544
3545static char find__doc__[] =
3546"S.find(sub [,start [,end]]) -> int\n\
3547\n\
3548Return the lowest index in S where substring sub is found,\n\
3549such that sub is contained within s[start,end]. Optional\n\
3550arguments start and end are interpreted as in slice notation.\n\
3551\n\
3552Return -1 on failure.";
3553
3554static PyObject *
3555unicode_find(PyUnicodeObject *self, PyObject *args)
3556{
3557 PyUnicodeObject *substring;
3558 int start = 0;
3559 int end = INT_MAX;
3560 PyObject *result;
3561
Guido van Rossumb8872e62000-05-09 14:14:27 +00003562 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3563 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003564 return NULL;
3565 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3566 (PyObject *)substring);
3567 if (substring == NULL)
3568 return NULL;
3569
3570 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3571
3572 Py_DECREF(substring);
3573 return result;
3574}
3575
3576static PyObject *
3577unicode_getitem(PyUnicodeObject *self, int index)
3578{
3579 if (index < 0 || index >= self->length) {
3580 PyErr_SetString(PyExc_IndexError, "string index out of range");
3581 return NULL;
3582 }
3583
3584 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3585}
3586
3587static long
3588unicode_hash(PyUnicodeObject *self)
3589{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003590 /* Since Unicode objects compare equal to their ASCII string
3591 counterparts, they should use the individual character values
3592 as basis for their hash value. This is needed to assure that
3593 strings and Unicode objects behave in the same way as
3594 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003595
Fredrik Lundhdde61642000-07-10 18:27:47 +00003596 register int len;
3597 register Py_UNICODE *p;
3598 register long x;
3599
Guido van Rossumd57fd912000-03-10 22:53:23 +00003600 if (self->hash != -1)
3601 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003602 len = PyUnicode_GET_SIZE(self);
3603 p = PyUnicode_AS_UNICODE(self);
3604 x = *p << 7;
3605 while (--len >= 0)
3606 x = (1000003*x) ^ *p++;
3607 x ^= PyUnicode_GET_SIZE(self);
3608 if (x == -1)
3609 x = -2;
3610 self->hash = x;
3611 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003612}
3613
3614static char index__doc__[] =
3615"S.index(sub [,start [,end]]) -> int\n\
3616\n\
3617Like S.find() but raise ValueError when the substring is not found.";
3618
3619static PyObject *
3620unicode_index(PyUnicodeObject *self, PyObject *args)
3621{
3622 int result;
3623 PyUnicodeObject *substring;
3624 int start = 0;
3625 int end = INT_MAX;
3626
Guido van Rossumb8872e62000-05-09 14:14:27 +00003627 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3628 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003629 return NULL;
3630
3631 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3632 (PyObject *)substring);
3633 if (substring == NULL)
3634 return NULL;
3635
3636 result = findstring(self, substring, start, end, 1);
3637
3638 Py_DECREF(substring);
3639 if (result < 0) {
3640 PyErr_SetString(PyExc_ValueError, "substring not found");
3641 return NULL;
3642 }
3643 return PyInt_FromLong(result);
3644}
3645
3646static char islower__doc__[] =
3647"S.islower() -> int\n\
3648\n\
3649Return 1 if all cased characters in S are lowercase and there is\n\
3650at least one cased character in S, 0 otherwise.";
3651
3652static PyObject*
3653unicode_islower(PyUnicodeObject *self, PyObject *args)
3654{
3655 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3656 register const Py_UNICODE *e;
3657 int cased;
3658
3659 if (!PyArg_NoArgs(args))
3660 return NULL;
3661
3662 /* Shortcut for single character strings */
3663 if (PyUnicode_GET_SIZE(self) == 1)
3664 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3665
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003666 /* Special case for empty strings */
3667 if (PyString_GET_SIZE(self) == 0)
3668 return PyInt_FromLong(0);
3669
Guido van Rossumd57fd912000-03-10 22:53:23 +00003670 e = p + PyUnicode_GET_SIZE(self);
3671 cased = 0;
3672 for (; p < e; p++) {
3673 register const Py_UNICODE ch = *p;
3674
3675 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3676 return PyInt_FromLong(0);
3677 else if (!cased && Py_UNICODE_ISLOWER(ch))
3678 cased = 1;
3679 }
3680 return PyInt_FromLong(cased);
3681}
3682
3683static char isupper__doc__[] =
3684"S.isupper() -> int\n\
3685\n\
3686Return 1 if all cased characters in S are uppercase and there is\n\
3687at least one cased character in S, 0 otherwise.";
3688
3689static PyObject*
3690unicode_isupper(PyUnicodeObject *self, PyObject *args)
3691{
3692 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3693 register const Py_UNICODE *e;
3694 int cased;
3695
3696 if (!PyArg_NoArgs(args))
3697 return NULL;
3698
3699 /* Shortcut for single character strings */
3700 if (PyUnicode_GET_SIZE(self) == 1)
3701 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3702
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003703 /* Special case for empty strings */
3704 if (PyString_GET_SIZE(self) == 0)
3705 return PyInt_FromLong(0);
3706
Guido van Rossumd57fd912000-03-10 22:53:23 +00003707 e = p + PyUnicode_GET_SIZE(self);
3708 cased = 0;
3709 for (; p < e; p++) {
3710 register const Py_UNICODE ch = *p;
3711
3712 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3713 return PyInt_FromLong(0);
3714 else if (!cased && Py_UNICODE_ISUPPER(ch))
3715 cased = 1;
3716 }
3717 return PyInt_FromLong(cased);
3718}
3719
3720static char istitle__doc__[] =
3721"S.istitle() -> int\n\
3722\n\
3723Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3724may only follow uncased characters and lowercase characters only cased\n\
3725ones. Return 0 otherwise.";
3726
3727static PyObject*
3728unicode_istitle(PyUnicodeObject *self, PyObject *args)
3729{
3730 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3731 register const Py_UNICODE *e;
3732 int cased, previous_is_cased;
3733
3734 if (!PyArg_NoArgs(args))
3735 return NULL;
3736
3737 /* Shortcut for single character strings */
3738 if (PyUnicode_GET_SIZE(self) == 1)
3739 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3740 (Py_UNICODE_ISUPPER(*p) != 0));
3741
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003742 /* Special case for empty strings */
3743 if (PyString_GET_SIZE(self) == 0)
3744 return PyInt_FromLong(0);
3745
Guido van Rossumd57fd912000-03-10 22:53:23 +00003746 e = p + PyUnicode_GET_SIZE(self);
3747 cased = 0;
3748 previous_is_cased = 0;
3749 for (; p < e; p++) {
3750 register const Py_UNICODE ch = *p;
3751
3752 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3753 if (previous_is_cased)
3754 return PyInt_FromLong(0);
3755 previous_is_cased = 1;
3756 cased = 1;
3757 }
3758 else if (Py_UNICODE_ISLOWER(ch)) {
3759 if (!previous_is_cased)
3760 return PyInt_FromLong(0);
3761 previous_is_cased = 1;
3762 cased = 1;
3763 }
3764 else
3765 previous_is_cased = 0;
3766 }
3767 return PyInt_FromLong(cased);
3768}
3769
3770static char isspace__doc__[] =
3771"S.isspace() -> int\n\
3772\n\
3773Return 1 if there are only whitespace characters in S,\n\
37740 otherwise.";
3775
3776static PyObject*
3777unicode_isspace(PyUnicodeObject *self, PyObject *args)
3778{
3779 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3780 register const Py_UNICODE *e;
3781
3782 if (!PyArg_NoArgs(args))
3783 return NULL;
3784
3785 /* Shortcut for single character strings */
3786 if (PyUnicode_GET_SIZE(self) == 1 &&
3787 Py_UNICODE_ISSPACE(*p))
3788 return PyInt_FromLong(1);
3789
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003790 /* Special case for empty strings */
3791 if (PyString_GET_SIZE(self) == 0)
3792 return PyInt_FromLong(0);
3793
Guido van Rossumd57fd912000-03-10 22:53:23 +00003794 e = p + PyUnicode_GET_SIZE(self);
3795 for (; p < e; p++) {
3796 if (!Py_UNICODE_ISSPACE(*p))
3797 return PyInt_FromLong(0);
3798 }
3799 return PyInt_FromLong(1);
3800}
3801
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003802static char isalpha__doc__[] =
3803"S.isalpha() -> int\n\
3804\n\
3805Return 1 if all characters in S are alphabetic\n\
3806and there is at least one character in S, 0 otherwise.";
3807
3808static PyObject*
3809unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3810{
3811 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3812 register const Py_UNICODE *e;
3813
3814 if (!PyArg_NoArgs(args))
3815 return NULL;
3816
3817 /* Shortcut for single character strings */
3818 if (PyUnicode_GET_SIZE(self) == 1 &&
3819 Py_UNICODE_ISALPHA(*p))
3820 return PyInt_FromLong(1);
3821
3822 /* Special case for empty strings */
3823 if (PyString_GET_SIZE(self) == 0)
3824 return PyInt_FromLong(0);
3825
3826 e = p + PyUnicode_GET_SIZE(self);
3827 for (; p < e; p++) {
3828 if (!Py_UNICODE_ISALPHA(*p))
3829 return PyInt_FromLong(0);
3830 }
3831 return PyInt_FromLong(1);
3832}
3833
3834static char isalnum__doc__[] =
3835"S.isalnum() -> int\n\
3836\n\
3837Return 1 if all characters in S are alphanumeric\n\
3838and there is at least one character in S, 0 otherwise.";
3839
3840static PyObject*
3841unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3842{
3843 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3844 register const Py_UNICODE *e;
3845
3846 if (!PyArg_NoArgs(args))
3847 return NULL;
3848
3849 /* Shortcut for single character strings */
3850 if (PyUnicode_GET_SIZE(self) == 1 &&
3851 Py_UNICODE_ISALNUM(*p))
3852 return PyInt_FromLong(1);
3853
3854 /* Special case for empty strings */
3855 if (PyString_GET_SIZE(self) == 0)
3856 return PyInt_FromLong(0);
3857
3858 e = p + PyUnicode_GET_SIZE(self);
3859 for (; p < e; p++) {
3860 if (!Py_UNICODE_ISALNUM(*p))
3861 return PyInt_FromLong(0);
3862 }
3863 return PyInt_FromLong(1);
3864}
3865
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866static char isdecimal__doc__[] =
3867"S.isdecimal() -> int\n\
3868\n\
3869Return 1 if there are only decimal characters in S,\n\
38700 otherwise.";
3871
3872static PyObject*
3873unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3874{
3875 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3876 register const Py_UNICODE *e;
3877
3878 if (!PyArg_NoArgs(args))
3879 return NULL;
3880
3881 /* Shortcut for single character strings */
3882 if (PyUnicode_GET_SIZE(self) == 1 &&
3883 Py_UNICODE_ISDECIMAL(*p))
3884 return PyInt_FromLong(1);
3885
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003886 /* Special case for empty strings */
3887 if (PyString_GET_SIZE(self) == 0)
3888 return PyInt_FromLong(0);
3889
Guido van Rossumd57fd912000-03-10 22:53:23 +00003890 e = p + PyUnicode_GET_SIZE(self);
3891 for (; p < e; p++) {
3892 if (!Py_UNICODE_ISDECIMAL(*p))
3893 return PyInt_FromLong(0);
3894 }
3895 return PyInt_FromLong(1);
3896}
3897
3898static char isdigit__doc__[] =
3899"S.isdigit() -> int\n\
3900\n\
3901Return 1 if there are only digit characters in S,\n\
39020 otherwise.";
3903
3904static PyObject*
3905unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3906{
3907 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3908 register const Py_UNICODE *e;
3909
3910 if (!PyArg_NoArgs(args))
3911 return NULL;
3912
3913 /* Shortcut for single character strings */
3914 if (PyUnicode_GET_SIZE(self) == 1 &&
3915 Py_UNICODE_ISDIGIT(*p))
3916 return PyInt_FromLong(1);
3917
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003918 /* Special case for empty strings */
3919 if (PyString_GET_SIZE(self) == 0)
3920 return PyInt_FromLong(0);
3921
Guido van Rossumd57fd912000-03-10 22:53:23 +00003922 e = p + PyUnicode_GET_SIZE(self);
3923 for (; p < e; p++) {
3924 if (!Py_UNICODE_ISDIGIT(*p))
3925 return PyInt_FromLong(0);
3926 }
3927 return PyInt_FromLong(1);
3928}
3929
3930static char isnumeric__doc__[] =
3931"S.isnumeric() -> int\n\
3932\n\
3933Return 1 if there are only numeric characters in S,\n\
39340 otherwise.";
3935
3936static PyObject*
3937unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3938{
3939 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3940 register const Py_UNICODE *e;
3941
3942 if (!PyArg_NoArgs(args))
3943 return NULL;
3944
3945 /* Shortcut for single character strings */
3946 if (PyUnicode_GET_SIZE(self) == 1 &&
3947 Py_UNICODE_ISNUMERIC(*p))
3948 return PyInt_FromLong(1);
3949
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003950 /* Special case for empty strings */
3951 if (PyString_GET_SIZE(self) == 0)
3952 return PyInt_FromLong(0);
3953
Guido van Rossumd57fd912000-03-10 22:53:23 +00003954 e = p + PyUnicode_GET_SIZE(self);
3955 for (; p < e; p++) {
3956 if (!Py_UNICODE_ISNUMERIC(*p))
3957 return PyInt_FromLong(0);
3958 }
3959 return PyInt_FromLong(1);
3960}
3961
3962static char join__doc__[] =
3963"S.join(sequence) -> unicode\n\
3964\n\
3965Return a string which is the concatenation of the strings in the\n\
3966sequence. The separator between elements is S.";
3967
3968static PyObject*
3969unicode_join(PyUnicodeObject *self, PyObject *args)
3970{
3971 PyObject *data;
3972 if (!PyArg_ParseTuple(args, "O:join", &data))
3973 return NULL;
3974
3975 return PyUnicode_Join((PyObject *)self, data);
3976}
3977
3978static int
3979unicode_length(PyUnicodeObject *self)
3980{
3981 return self->length;
3982}
3983
3984static char ljust__doc__[] =
3985"S.ljust(width) -> unicode\n\
3986\n\
3987Return S left justified in a Unicode string of length width. Padding is\n\
3988done using spaces.";
3989
3990static PyObject *
3991unicode_ljust(PyUnicodeObject *self, PyObject *args)
3992{
3993 int width;
3994 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3995 return NULL;
3996
3997 if (self->length >= width) {
3998 Py_INCREF(self);
3999 return (PyObject*) self;
4000 }
4001
4002 return (PyObject*) pad(self, 0, width - self->length, ' ');
4003}
4004
4005static char lower__doc__[] =
4006"S.lower() -> unicode\n\
4007\n\
4008Return a copy of the string S converted to lowercase.";
4009
4010static PyObject*
4011unicode_lower(PyUnicodeObject *self, PyObject *args)
4012{
4013 if (!PyArg_NoArgs(args))
4014 return NULL;
4015 return fixup(self, fixlower);
4016}
4017
4018static char lstrip__doc__[] =
4019"S.lstrip() -> unicode\n\
4020\n\
4021Return a copy of the string S with leading whitespace removed.";
4022
4023static PyObject *
4024unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4025{
4026 if (!PyArg_NoArgs(args))
4027 return NULL;
4028 return strip(self, 1, 0);
4029}
4030
4031static PyObject*
4032unicode_repeat(PyUnicodeObject *str, int len)
4033{
4034 PyUnicodeObject *u;
4035 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004036 int nchars;
4037 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038
4039 if (len < 0)
4040 len = 0;
4041
4042 if (len == 1) {
4043 /* no repeat, return original string */
4044 Py_INCREF(str);
4045 return (PyObject*) str;
4046 }
Tim Peters8f422462000-09-09 06:13:41 +00004047
4048 /* ensure # of chars needed doesn't overflow int and # of bytes
4049 * needed doesn't overflow size_t
4050 */
4051 nchars = len * str->length;
4052 if (len && nchars / len != str->length) {
4053 PyErr_SetString(PyExc_OverflowError,
4054 "repeated string is too long");
4055 return NULL;
4056 }
4057 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4058 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4059 PyErr_SetString(PyExc_OverflowError,
4060 "repeated string is too long");
4061 return NULL;
4062 }
4063 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004064 if (!u)
4065 return NULL;
4066
4067 p = u->str;
4068
4069 while (len-- > 0) {
4070 Py_UNICODE_COPY(p, str->str, str->length);
4071 p += str->length;
4072 }
4073
4074 return (PyObject*) u;
4075}
4076
4077PyObject *PyUnicode_Replace(PyObject *obj,
4078 PyObject *subobj,
4079 PyObject *replobj,
4080 int maxcount)
4081{
4082 PyObject *self;
4083 PyObject *str1;
4084 PyObject *str2;
4085 PyObject *result;
4086
4087 self = PyUnicode_FromObject(obj);
4088 if (self == NULL)
4089 return NULL;
4090 str1 = PyUnicode_FromObject(subobj);
4091 if (str1 == NULL) {
4092 Py_DECREF(self);
4093 return NULL;
4094 }
4095 str2 = PyUnicode_FromObject(replobj);
4096 if (str2 == NULL) {
4097 Py_DECREF(self);
4098 Py_DECREF(str1);
4099 return NULL;
4100 }
4101 result = replace((PyUnicodeObject *)self,
4102 (PyUnicodeObject *)str1,
4103 (PyUnicodeObject *)str2,
4104 maxcount);
4105 Py_DECREF(self);
4106 Py_DECREF(str1);
4107 Py_DECREF(str2);
4108 return result;
4109}
4110
4111static char replace__doc__[] =
4112"S.replace (old, new[, maxsplit]) -> unicode\n\
4113\n\
4114Return a copy of S with all occurrences of substring\n\
4115old replaced by new. If the optional argument maxsplit is\n\
4116given, only the first maxsplit occurrences are replaced.";
4117
4118static PyObject*
4119unicode_replace(PyUnicodeObject *self, PyObject *args)
4120{
4121 PyUnicodeObject *str1;
4122 PyUnicodeObject *str2;
4123 int maxcount = -1;
4124 PyObject *result;
4125
4126 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4127 return NULL;
4128 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4129 if (str1 == NULL)
4130 return NULL;
4131 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4132 if (str2 == NULL)
4133 return NULL;
4134
4135 result = replace(self, str1, str2, maxcount);
4136
4137 Py_DECREF(str1);
4138 Py_DECREF(str2);
4139 return result;
4140}
4141
4142static
4143PyObject *unicode_repr(PyObject *unicode)
4144{
4145 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4146 PyUnicode_GET_SIZE(unicode),
4147 1);
4148}
4149
4150static char rfind__doc__[] =
4151"S.rfind(sub [,start [,end]]) -> int\n\
4152\n\
4153Return the highest index in S where substring sub is found,\n\
4154such that sub is contained within s[start,end]. Optional\n\
4155arguments start and end are interpreted as in slice notation.\n\
4156\n\
4157Return -1 on failure.";
4158
4159static PyObject *
4160unicode_rfind(PyUnicodeObject *self, PyObject *args)
4161{
4162 PyUnicodeObject *substring;
4163 int start = 0;
4164 int end = INT_MAX;
4165 PyObject *result;
4166
Guido van Rossumb8872e62000-05-09 14:14:27 +00004167 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4168 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004169 return NULL;
4170 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4171 (PyObject *)substring);
4172 if (substring == NULL)
4173 return NULL;
4174
4175 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4176
4177 Py_DECREF(substring);
4178 return result;
4179}
4180
4181static char rindex__doc__[] =
4182"S.rindex(sub [,start [,end]]) -> int\n\
4183\n\
4184Like S.rfind() but raise ValueError when the substring is not found.";
4185
4186static PyObject *
4187unicode_rindex(PyUnicodeObject *self, PyObject *args)
4188{
4189 int result;
4190 PyUnicodeObject *substring;
4191 int start = 0;
4192 int end = INT_MAX;
4193
Guido van Rossumb8872e62000-05-09 14:14:27 +00004194 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4195 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004196 return NULL;
4197 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4198 (PyObject *)substring);
4199 if (substring == NULL)
4200 return NULL;
4201
4202 result = findstring(self, substring, start, end, -1);
4203
4204 Py_DECREF(substring);
4205 if (result < 0) {
4206 PyErr_SetString(PyExc_ValueError, "substring not found");
4207 return NULL;
4208 }
4209 return PyInt_FromLong(result);
4210}
4211
4212static char rjust__doc__[] =
4213"S.rjust(width) -> unicode\n\
4214\n\
4215Return S right justified in a Unicode string of length width. Padding is\n\
4216done using spaces.";
4217
4218static PyObject *
4219unicode_rjust(PyUnicodeObject *self, PyObject *args)
4220{
4221 int width;
4222 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4223 return NULL;
4224
4225 if (self->length >= width) {
4226 Py_INCREF(self);
4227 return (PyObject*) self;
4228 }
4229
4230 return (PyObject*) pad(self, width - self->length, 0, ' ');
4231}
4232
4233static char rstrip__doc__[] =
4234"S.rstrip() -> unicode\n\
4235\n\
4236Return a copy of the string S with trailing whitespace removed.";
4237
4238static PyObject *
4239unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4240{
4241 if (!PyArg_NoArgs(args))
4242 return NULL;
4243 return strip(self, 0, 1);
4244}
4245
4246static PyObject*
4247unicode_slice(PyUnicodeObject *self, int start, int end)
4248{
4249 /* standard clamping */
4250 if (start < 0)
4251 start = 0;
4252 if (end < 0)
4253 end = 0;
4254 if (end > self->length)
4255 end = self->length;
4256 if (start == 0 && end == self->length) {
4257 /* full slice, return original string */
4258 Py_INCREF(self);
4259 return (PyObject*) self;
4260 }
4261 if (start > end)
4262 start = end;
4263 /* copy slice */
4264 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4265 end - start);
4266}
4267
4268PyObject *PyUnicode_Split(PyObject *s,
4269 PyObject *sep,
4270 int maxsplit)
4271{
4272 PyObject *result;
4273
4274 s = PyUnicode_FromObject(s);
4275 if (s == NULL)
4276 return NULL;
4277 if (sep != NULL) {
4278 sep = PyUnicode_FromObject(sep);
4279 if (sep == NULL) {
4280 Py_DECREF(s);
4281 return NULL;
4282 }
4283 }
4284
4285 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4286
4287 Py_DECREF(s);
4288 Py_XDECREF(sep);
4289 return result;
4290}
4291
4292static char split__doc__[] =
4293"S.split([sep [,maxsplit]]) -> list of strings\n\
4294\n\
4295Return a list of the words in S, using sep as the\n\
4296delimiter string. If maxsplit is given, at most maxsplit\n\
4297splits are done. If sep is not specified, any whitespace string\n\
4298is a separator.";
4299
4300static PyObject*
4301unicode_split(PyUnicodeObject *self, PyObject *args)
4302{
4303 PyObject *substring = Py_None;
4304 int maxcount = -1;
4305
4306 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4307 return NULL;
4308
4309 if (substring == Py_None)
4310 return split(self, NULL, maxcount);
4311 else if (PyUnicode_Check(substring))
4312 return split(self, (PyUnicodeObject *)substring, maxcount);
4313 else
4314 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4315}
4316
4317static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004318"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004319\n\
4320Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004321Line breaks are not included in the resulting list unless keepends\n\
4322is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004323
4324static PyObject*
4325unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4326{
Guido van Rossum86662912000-04-11 15:38:46 +00004327 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004328
Guido van Rossum86662912000-04-11 15:38:46 +00004329 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330 return NULL;
4331
Guido van Rossum86662912000-04-11 15:38:46 +00004332 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004333}
4334
4335static
4336PyObject *unicode_str(PyUnicodeObject *self)
4337{
Fred Drakee4315f52000-05-09 19:53:39 +00004338 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339}
4340
4341static char strip__doc__[] =
4342"S.strip() -> unicode\n\
4343\n\
4344Return a copy of S with leading and trailing whitespace removed.";
4345
4346static PyObject *
4347unicode_strip(PyUnicodeObject *self, PyObject *args)
4348{
4349 if (!PyArg_NoArgs(args))
4350 return NULL;
4351 return strip(self, 1, 1);
4352}
4353
4354static char swapcase__doc__[] =
4355"S.swapcase() -> unicode\n\
4356\n\
4357Return a copy of S with uppercase characters converted to lowercase\n\
4358and vice versa.";
4359
4360static PyObject*
4361unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4362{
4363 if (!PyArg_NoArgs(args))
4364 return NULL;
4365 return fixup(self, fixswapcase);
4366}
4367
4368static char translate__doc__[] =
4369"S.translate(table) -> unicode\n\
4370\n\
4371Return a copy of the string S, where all characters have been mapped\n\
4372through the given translation table, which must be a mapping of\n\
4373Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4374are left untouched. Characters mapped to None are deleted.";
4375
4376static PyObject*
4377unicode_translate(PyUnicodeObject *self, PyObject *args)
4378{
4379 PyObject *table;
4380
4381 if (!PyArg_ParseTuple(args, "O:translate", &table))
4382 return NULL;
4383 return PyUnicode_TranslateCharmap(self->str,
4384 self->length,
4385 table,
4386 "ignore");
4387}
4388
4389static char upper__doc__[] =
4390"S.upper() -> unicode\n\
4391\n\
4392Return a copy of S converted to uppercase.";
4393
4394static PyObject*
4395unicode_upper(PyUnicodeObject *self, PyObject *args)
4396{
4397 if (!PyArg_NoArgs(args))
4398 return NULL;
4399 return fixup(self, fixupper);
4400}
4401
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method implementation of S.zfill(width) (currently compiled out).

   When S is already at least `width` long, S itself is returned.
   Otherwise S is left-padded with '0' via pad(); if the first original
   character (now at index `fill`) is '+' or '-', it is moved to the
   front and its old slot overwritten with '0'. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() allocates a new object and presumably returns NULL when
       that fails (confirm against pad()); never dereference it then. */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4437
#if 0
/* Debugging aid (compiled out): takes no arguments and returns the
   current value of unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
4447
4448static char startswith__doc__[] =
4449"S.startswith(prefix[, start[, end]]) -> int\n\
4450\n\
4451Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4452optional start, test S beginning at that position. With optional end, stop\n\
4453comparing S at that position.";
4454
4455static PyObject *
4456unicode_startswith(PyUnicodeObject *self,
4457 PyObject *args)
4458{
4459 PyUnicodeObject *substring;
4460 int start = 0;
4461 int end = INT_MAX;
4462 PyObject *result;
4463
Guido van Rossumb8872e62000-05-09 14:14:27 +00004464 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4465 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004466 return NULL;
4467 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4468 (PyObject *)substring);
4469 if (substring == NULL)
4470 return NULL;
4471
4472 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4473
4474 Py_DECREF(substring);
4475 return result;
4476}
4477
4478
4479static char endswith__doc__[] =
4480"S.endswith(suffix[, start[, end]]) -> int\n\
4481\n\
4482Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4483optional start, test S beginning at that position. With optional end, stop\n\
4484comparing S at that position.";
4485
4486static PyObject *
4487unicode_endswith(PyUnicodeObject *self,
4488 PyObject *args)
4489{
4490 PyUnicodeObject *substring;
4491 int start = 0;
4492 int end = INT_MAX;
4493 PyObject *result;
4494
Guido van Rossumb8872e62000-05-09 14:14:27 +00004495 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4496 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004497 return NULL;
4498 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4499 (PyObject *)substring);
4500 if (substring == NULL)
4501 return NULL;
4502
4503 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4504
4505 Py_DECREF(substring);
4506 return result;
4507}
4508
4509
4510static PyMethodDef unicode_methods[] = {
4511
4512 /* Order is according to common usage: often used methods should
4513 appear first, since lookup is done sequentially. */
4514
4515 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4516 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4517 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4518 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4519 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4520 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4521 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4522 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4523 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4524 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4525 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4526 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4527 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4528 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4529/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4530 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4531 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4532 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4533 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4534 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4535 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4536 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4537 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4538 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4539 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4540 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4541 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4542 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4543 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4544 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4545 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4546 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4547 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004548 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4549 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004550#if 0
4551 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4552 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4553#endif
4554
4555#if 0
4556 /* This one is just used for debugging the implementation. */
4557 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4558#endif
4559
4560 {NULL, NULL}
4561};
4562
4563static PyObject *
4564unicode_getattr(PyUnicodeObject *self, char *name)
4565{
4566 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4567}
4568
4569static PySequenceMethods unicode_as_sequence = {
4570 (inquiry) unicode_length, /* sq_length */
4571 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4572 (intargfunc) unicode_repeat, /* sq_repeat */
4573 (intargfunc) unicode_getitem, /* sq_item */
4574 (intintargfunc) unicode_slice, /* sq_slice */
4575 0, /* sq_ass_item */
4576 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004577 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004578};
4579
4580static int
4581unicode_buffer_getreadbuf(PyUnicodeObject *self,
4582 int index,
4583 const void **ptr)
4584{
4585 if (index != 0) {
4586 PyErr_SetString(PyExc_SystemError,
4587 "accessing non-existent unicode segment");
4588 return -1;
4589 }
4590 *ptr = (void *) self->str;
4591 return PyUnicode_GET_DATA_SIZE(self);
4592}
4593
4594static int
4595unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4596 const void **ptr)
4597{
4598 PyErr_SetString(PyExc_TypeError,
4599 "cannot use unicode as modifyable buffer");
4600 return -1;
4601}
4602
4603static int
4604unicode_buffer_getsegcount(PyUnicodeObject *self,
4605 int *lenp)
4606{
4607 if (lenp)
4608 *lenp = PyUnicode_GET_DATA_SIZE(self);
4609 return 1;
4610}
4611
4612static int
4613unicode_buffer_getcharbuf(PyUnicodeObject *self,
4614 int index,
4615 const void **ptr)
4616{
4617 PyObject *str;
4618
4619 if (index != 0) {
4620 PyErr_SetString(PyExc_SystemError,
4621 "accessing non-existent unicode segment");
4622 return -1;
4623 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004624 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004625 if (str == NULL)
4626 return -1;
4627 *ptr = (void *) PyString_AS_STRING(str);
4628 return PyString_GET_SIZE(str);
4629}
4630
4631/* Helpers for PyUnicode_Format() */
4632
4633static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004634getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004635{
4636 int argidx = *p_argidx;
4637 if (argidx < arglen) {
4638 (*p_argidx)++;
4639 if (arglen < 0)
4640 return args;
4641 else
4642 return PyTuple_GetItem(args, argidx);
4643 }
4644 PyErr_SetString(PyExc_TypeError,
4645 "not enough arguments for format string");
4646 return NULL;
4647}
4648
/* Bit flags collected by PyUnicode_Format() while scanning a format
   specifier; the character that sets each flag is noted alongside. */
#define F_LJUST (1<<0)  /* '-' (also set for a negative '*' width) */
#define F_SIGN  (1<<1)  /* '+' */
#define F_BLANK (1<<2)  /* ' ' */
#define F_ALT   (1<<3)  /* '#' */
#define F_ZERO  (1<<4)  /* '0' */
4654
4655static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004656int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657{
4658 register int i;
4659 int len;
4660 va_list va;
4661 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004662 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663
4664 /* First, format the string as char array, then expand to Py_UNICODE
4665 array. */
4666 charbuffer = (char *)buffer;
4667 len = vsprintf(charbuffer, format, va);
4668 for (i = len - 1; i >= 0; i--)
4669 buffer[i] = (Py_UNICODE) charbuffer[i];
4670
4671 va_end(va);
4672 return len;
4673}
4674
4675static int
4676formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004677 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004678 int flags,
4679 int prec,
4680 int type,
4681 PyObject *v)
4682{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004683 /* fmt = '%#.' + `prec` + `type`
4684 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004685 char fmt[20];
4686 double x;
4687
4688 x = PyFloat_AsDouble(v);
4689 if (x == -1.0 && PyErr_Occurred())
4690 return -1;
4691 if (prec < 0)
4692 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004693 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4694 type = 'g';
4695 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004696 /* worst case length calc to ensure no buffer overrun:
4697 fmt = %#.<prec>g
4698 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4699 for any double rep.)
4700 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4701 If prec=0 the effective precision is 1 (the leading digit is
4702 always given), therefore increase by one to 10+prec. */
4703 if (buflen <= (size_t)10 + (size_t)prec) {
4704 PyErr_SetString(PyExc_OverflowError,
4705 "formatted float is too long (precision too long?)");
4706 return -1;
4707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004708 return usprintf(buf, fmt, x);
4709}
4710
Tim Peters38fd5b62000-09-21 05:43:11 +00004711static PyObject*
4712formatlong(PyObject *val, int flags, int prec, int type)
4713{
4714 char *buf;
4715 int i, len;
4716 PyObject *str; /* temporary string object. */
4717 PyUnicodeObject *result;
4718
4719 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4720 if (!str)
4721 return NULL;
4722 result = _PyUnicode_New(len);
4723 for (i = 0; i < len; i++)
4724 result->str[i] = buf[i];
4725 result->str[len] = 0;
4726 Py_DECREF(str);
4727 return (PyObject*)result;
4728}
4729
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730static int
4731formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004732 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004733 int flags,
4734 int prec,
4735 int type,
4736 PyObject *v)
4737{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004738 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004739 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4740 + 1 + 1 = 24*/
4741 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004742 long x;
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004743 int use_native_c_format = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744
4745 x = PyInt_AsLong(v);
4746 if (x == -1 && PyErr_Occurred())
4747 return -1;
4748 if (prec < 0)
4749 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004750 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4751 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4752 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4753 PyErr_SetString(PyExc_OverflowError,
4754 "formatted integer is too long (precision too long?)");
4755 return -1;
4756 }
Tim Petersfff53252001-04-12 18:38:48 +00004757 /* When converting 0 under %#x or %#X, C leaves off the base marker,
4758 * but we want it (for consistency with other %#x conversions, and
4759 * for consistency with Python's hex() function).
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004760 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
4761 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
4762 * So add it only if the platform doesn't already.
Tim Petersfff53252001-04-12 18:38:48 +00004763 */
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004764 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
4765 /* Only way to know what the platform does is to try it. */
4766 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
4767 if (fmt[1] != (char)type) {
4768 /* Supply our own leading 0x/0X -- needed under std C */
4769 use_native_c_format = 0;
4770 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
4771 }
4772 }
4773 if (use_native_c_format)
4774 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775 return usprintf(buf, fmt, x);
4776}
4777
4778static int
4779formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004780 size_t buflen,
4781 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004782{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004783 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004784 if (PyUnicode_Check(v)) {
4785 if (PyUnicode_GET_SIZE(v) != 1)
4786 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004787 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004788 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004789
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004790 else if (PyString_Check(v)) {
4791 if (PyString_GET_SIZE(v) != 1)
4792 goto onError;
4793 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4794 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004795
4796 else {
4797 /* Integer input truncated to a character */
4798 long x;
4799 x = PyInt_AsLong(v);
4800 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004801 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004802 buf[0] = (char) x;
4803 }
4804 buf[1] = '\0';
4805 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004806
4807 onError:
4808 PyErr_SetString(PyExc_TypeError,
4809 "%c requires int or char");
4810 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004811}
4812
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the number of Py_UNICODE elements in the scratch
   buffer in which floats, ints and chars are formatted.

   XXX This is a magic number.  The formatting routines perform their
   own bounds checks against it, but allocating a buffer of the
   appropriate size for each conversion may be the better solution.
   For now, the current solution is sufficient.
*/
#define FORMATBUFLEN ((size_t)120)
4822
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823PyObject *PyUnicode_Format(PyObject *format,
4824 PyObject *args)
4825{
4826 Py_UNICODE *fmt, *res;
4827 int fmtcnt, rescnt, reslen, arglen, argidx;
4828 int args_owned = 0;
4829 PyUnicodeObject *result = NULL;
4830 PyObject *dict = NULL;
4831 PyObject *uformat;
4832
4833 if (format == NULL || args == NULL) {
4834 PyErr_BadInternalCall();
4835 return NULL;
4836 }
4837 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004838 if (uformat == NULL)
4839 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004840 fmt = PyUnicode_AS_UNICODE(uformat);
4841 fmtcnt = PyUnicode_GET_SIZE(uformat);
4842
4843 reslen = rescnt = fmtcnt + 100;
4844 result = _PyUnicode_New(reslen);
4845 if (result == NULL)
4846 goto onError;
4847 res = PyUnicode_AS_UNICODE(result);
4848
4849 if (PyTuple_Check(args)) {
4850 arglen = PyTuple_Size(args);
4851 argidx = 0;
4852 }
4853 else {
4854 arglen = -1;
4855 argidx = -2;
4856 }
4857 if (args->ob_type->tp_as_mapping)
4858 dict = args;
4859
4860 while (--fmtcnt >= 0) {
4861 if (*fmt != '%') {
4862 if (--rescnt < 0) {
4863 rescnt = fmtcnt + 100;
4864 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004865 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866 return NULL;
4867 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4868 --rescnt;
4869 }
4870 *res++ = *fmt++;
4871 }
4872 else {
4873 /* Got a format specifier */
4874 int flags = 0;
4875 int width = -1;
4876 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877 Py_UNICODE c = '\0';
4878 Py_UNICODE fill;
4879 PyObject *v = NULL;
4880 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004881 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004882 Py_UNICODE sign;
4883 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004884 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004885
4886 fmt++;
4887 if (*fmt == '(') {
4888 Py_UNICODE *keystart;
4889 int keylen;
4890 PyObject *key;
4891 int pcount = 1;
4892
4893 if (dict == NULL) {
4894 PyErr_SetString(PyExc_TypeError,
4895 "format requires a mapping");
4896 goto onError;
4897 }
4898 ++fmt;
4899 --fmtcnt;
4900 keystart = fmt;
4901 /* Skip over balanced parentheses */
4902 while (pcount > 0 && --fmtcnt >= 0) {
4903 if (*fmt == ')')
4904 --pcount;
4905 else if (*fmt == '(')
4906 ++pcount;
4907 fmt++;
4908 }
4909 keylen = fmt - keystart - 1;
4910 if (fmtcnt < 0 || pcount > 0) {
4911 PyErr_SetString(PyExc_ValueError,
4912 "incomplete format key");
4913 goto onError;
4914 }
Fred Drakee4315f52000-05-09 19:53:39 +00004915 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004916 then looked up since Python uses strings to hold
4917 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004918 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004919 key = PyUnicode_EncodeUTF8(keystart,
4920 keylen,
4921 NULL);
4922 if (key == NULL)
4923 goto onError;
4924 if (args_owned) {
4925 Py_DECREF(args);
4926 args_owned = 0;
4927 }
4928 args = PyObject_GetItem(dict, key);
4929 Py_DECREF(key);
4930 if (args == NULL) {
4931 goto onError;
4932 }
4933 args_owned = 1;
4934 arglen = -1;
4935 argidx = -2;
4936 }
4937 while (--fmtcnt >= 0) {
4938 switch (c = *fmt++) {
4939 case '-': flags |= F_LJUST; continue;
4940 case '+': flags |= F_SIGN; continue;
4941 case ' ': flags |= F_BLANK; continue;
4942 case '#': flags |= F_ALT; continue;
4943 case '0': flags |= F_ZERO; continue;
4944 }
4945 break;
4946 }
4947 if (c == '*') {
4948 v = getnextarg(args, arglen, &argidx);
4949 if (v == NULL)
4950 goto onError;
4951 if (!PyInt_Check(v)) {
4952 PyErr_SetString(PyExc_TypeError,
4953 "* wants int");
4954 goto onError;
4955 }
4956 width = PyInt_AsLong(v);
4957 if (width < 0) {
4958 flags |= F_LJUST;
4959 width = -width;
4960 }
4961 if (--fmtcnt >= 0)
4962 c = *fmt++;
4963 }
4964 else if (c >= '0' && c <= '9') {
4965 width = c - '0';
4966 while (--fmtcnt >= 0) {
4967 c = *fmt++;
4968 if (c < '0' || c > '9')
4969 break;
4970 if ((width*10) / 10 != width) {
4971 PyErr_SetString(PyExc_ValueError,
4972 "width too big");
4973 goto onError;
4974 }
4975 width = width*10 + (c - '0');
4976 }
4977 }
4978 if (c == '.') {
4979 prec = 0;
4980 if (--fmtcnt >= 0)
4981 c = *fmt++;
4982 if (c == '*') {
4983 v = getnextarg(args, arglen, &argidx);
4984 if (v == NULL)
4985 goto onError;
4986 if (!PyInt_Check(v)) {
4987 PyErr_SetString(PyExc_TypeError,
4988 "* wants int");
4989 goto onError;
4990 }
4991 prec = PyInt_AsLong(v);
4992 if (prec < 0)
4993 prec = 0;
4994 if (--fmtcnt >= 0)
4995 c = *fmt++;
4996 }
4997 else if (c >= '0' && c <= '9') {
4998 prec = c - '0';
4999 while (--fmtcnt >= 0) {
5000 c = Py_CHARMASK(*fmt++);
5001 if (c < '0' || c > '9')
5002 break;
5003 if ((prec*10) / 10 != prec) {
5004 PyErr_SetString(PyExc_ValueError,
5005 "prec too big");
5006 goto onError;
5007 }
5008 prec = prec*10 + (c - '0');
5009 }
5010 }
5011 } /* prec */
5012 if (fmtcnt >= 0) {
5013 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005014 if (--fmtcnt >= 0)
5015 c = *fmt++;
5016 }
5017 }
5018 if (fmtcnt < 0) {
5019 PyErr_SetString(PyExc_ValueError,
5020 "incomplete format");
5021 goto onError;
5022 }
5023 if (c != '%') {
5024 v = getnextarg(args, arglen, &argidx);
5025 if (v == NULL)
5026 goto onError;
5027 }
5028 sign = 0;
5029 fill = ' ';
5030 switch (c) {
5031
5032 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005033 pbuf = formatbuf;
5034 /* presume that buffer length is at least 1 */
5035 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036 len = 1;
5037 break;
5038
5039 case 's':
5040 case 'r':
5041 if (PyUnicode_Check(v) && c == 's') {
5042 temp = v;
5043 Py_INCREF(temp);
5044 }
5045 else {
5046 PyObject *unicode;
5047 if (c == 's')
5048 temp = PyObject_Str(v);
5049 else
5050 temp = PyObject_Repr(v);
5051 if (temp == NULL)
5052 goto onError;
5053 if (!PyString_Check(temp)) {
5054 /* XXX Note: this should never happen, since
5055 PyObject_Repr() and PyObject_Str() assure
5056 this */
5057 Py_DECREF(temp);
5058 PyErr_SetString(PyExc_TypeError,
5059 "%s argument has non-string str()");
5060 goto onError;
5061 }
Fred Drakee4315f52000-05-09 19:53:39 +00005062 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005063 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005064 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005065 "strict");
5066 Py_DECREF(temp);
5067 temp = unicode;
5068 if (temp == NULL)
5069 goto onError;
5070 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005071 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005072 len = PyUnicode_GET_SIZE(temp);
5073 if (prec >= 0 && len > prec)
5074 len = prec;
5075 break;
5076
5077 case 'i':
5078 case 'd':
5079 case 'u':
5080 case 'o':
5081 case 'x':
5082 case 'X':
5083 if (c == 'i')
5084 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005085 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005086 temp = formatlong(v, flags, prec, c);
5087 if (!temp)
5088 goto onError;
5089 pbuf = PyUnicode_AS_UNICODE(temp);
5090 len = PyUnicode_GET_SIZE(temp);
5091 /* unbounded ints can always produce
5092 a sign character! */
5093 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005094 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005095 else {
5096 pbuf = formatbuf;
5097 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5098 flags, prec, c, v);
5099 if (len < 0)
5100 goto onError;
5101 /* only d conversion is signed */
5102 sign = c == 'd';
5103 }
5104 if (flags & F_ZERO)
5105 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106 break;
5107
5108 case 'e':
5109 case 'E':
5110 case 'f':
5111 case 'g':
5112 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005113 pbuf = formatbuf;
5114 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5115 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005116 if (len < 0)
5117 goto onError;
5118 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005119 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005120 fill = '0';
5121 break;
5122
5123 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005124 pbuf = formatbuf;
5125 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005126 if (len < 0)
5127 goto onError;
5128 break;
5129
5130 default:
5131 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005132 "unsupported format character '%c' (0x%x) "
5133 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005134 (31<=c && c<=126) ? c : '?',
5135 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005136 goto onError;
5137 }
5138 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005139 if (*pbuf == '-' || *pbuf == '+') {
5140 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005141 len--;
5142 }
5143 else if (flags & F_SIGN)
5144 sign = '+';
5145 else if (flags & F_BLANK)
5146 sign = ' ';
5147 else
5148 sign = 0;
5149 }
5150 if (width < len)
5151 width = len;
5152 if (rescnt < width + (sign != 0)) {
5153 reslen -= rescnt;
5154 rescnt = width + fmtcnt + 100;
5155 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005156 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005157 return NULL;
5158 res = PyUnicode_AS_UNICODE(result)
5159 + reslen - rescnt;
5160 }
5161 if (sign) {
5162 if (fill != ' ')
5163 *res++ = sign;
5164 rescnt--;
5165 if (width > len)
5166 width--;
5167 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005168 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5169 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005170 assert(pbuf[1] == c);
5171 if (fill != ' ') {
5172 *res++ = *pbuf++;
5173 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005174 }
Tim Petersfff53252001-04-12 18:38:48 +00005175 rescnt -= 2;
5176 width -= 2;
5177 if (width < 0)
5178 width = 0;
5179 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005180 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005181 if (width > len && !(flags & F_LJUST)) {
5182 do {
5183 --rescnt;
5184 *res++ = fill;
5185 } while (--width > len);
5186 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005187 if (fill == ' ') {
5188 if (sign)
5189 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005190 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005191 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005192 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005193 *res++ = *pbuf++;
5194 *res++ = *pbuf++;
5195 }
5196 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005197 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198 res += len;
5199 rescnt -= len;
5200 while (--width >= len) {
5201 --rescnt;
5202 *res++ = ' ';
5203 }
5204 if (dict && (argidx < arglen) && c != '%') {
5205 PyErr_SetString(PyExc_TypeError,
5206 "not all arguments converted");
5207 goto onError;
5208 }
5209 Py_XDECREF(temp);
5210 } /* '%' */
5211 } /* until end */
5212 if (argidx < arglen && !dict) {
5213 PyErr_SetString(PyExc_TypeError,
5214 "not all arguments converted");
5215 goto onError;
5216 }
5217
5218 if (args_owned) {
5219 Py_DECREF(args);
5220 }
5221 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005222 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005223 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005224 return (PyObject *)result;
5225
5226 onError:
5227 Py_XDECREF(result);
5228 Py_DECREF(uformat);
5229 if (args_owned) {
5230 Py_DECREF(args);
5231 }
5232 return NULL;
5233}
5234
5235static PyBufferProcs unicode_as_buffer = {
5236 (getreadbufferproc) unicode_buffer_getreadbuf,
5237 (getwritebufferproc) unicode_buffer_getwritebuf,
5238 (getsegcountproc) unicode_buffer_getsegcount,
5239 (getcharbufferproc) unicode_buffer_getcharbuf,
5240};
5241
5242PyTypeObject PyUnicode_Type = {
5243 PyObject_HEAD_INIT(&PyType_Type)
5244 0, /* ob_size */
5245 "unicode", /* tp_name */
5246 sizeof(PyUnicodeObject), /* tp_size */
5247 0, /* tp_itemsize */
5248 /* Slots */
5249 (destructor)_PyUnicode_Free, /* tp_dealloc */
5250 0, /* tp_print */
5251 (getattrfunc)unicode_getattr, /* tp_getattr */
5252 0, /* tp_setattr */
5253 (cmpfunc) unicode_compare, /* tp_compare */
5254 (reprfunc) unicode_repr, /* tp_repr */
5255 0, /* tp_as_number */
5256 &unicode_as_sequence, /* tp_as_sequence */
5257 0, /* tp_as_mapping */
5258 (hashfunc) unicode_hash, /* tp_hash*/
5259 0, /* tp_call*/
5260 (reprfunc) unicode_str, /* tp_str */
5261 (getattrofunc) NULL, /* tp_getattro */
5262 (setattrofunc) NULL, /* tp_setattro */
5263 &unicode_as_buffer, /* tp_as_buffer */
5264 Py_TPFLAGS_DEFAULT, /* tp_flags */
5265};
5266
5267/* Initialize the Unicode implementation */
5268
Thomas Wouters78890102000-07-22 19:25:51 +00005269void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005270{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005271 int i;
5272
Guido van Rossumd57fd912000-03-10 22:53:23 +00005273 /* Doublecheck the configuration... */
5274 if (sizeof(Py_UNICODE) != 2)
5275 Py_FatalError("Unicode configuration error: "
5276 "sizeof(Py_UNICODE) != 2 bytes");
5277
Fred Drakee4315f52000-05-09 19:53:39 +00005278 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005279 unicode_freelist = NULL;
5280 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005282 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005283 for (i = 0; i < 256; i++)
5284 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285}
5286
5287/* Finalize the Unicode implementation */
5288
5289void
Thomas Wouters78890102000-07-22 19:25:51 +00005290_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005291{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005292 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005293 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005294
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005295 Py_XDECREF(unicode_empty);
5296 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005297
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005298 for (i = 0; i < 256; i++) {
5299 if (unicode_latin1[i]) {
5300 Py_DECREF(unicode_latin1[i]);
5301 unicode_latin1[i] = NULL;
5302 }
5303 }
5304
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005305 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005306 PyUnicodeObject *v = u;
5307 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005308 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005309 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005310 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005311 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005312 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005313 unicode_freelist = NULL;
5314 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005315}