blob: 5da4d2f032efd33fa3d850cff0207780aa5b0a3b [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep alive" optimization.

   A Unicode object that is put on the free list keeps its character
   buffer as long as its length is below this limit. Reusing such an
   object then saves a malloc() call for small Unicode strings.

   In the worst case this pins MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory.

   A limit of 0 effectively switches the optimization off.

   Note: This is an experimental feature. If you get core dumps when
   using Unicode objects, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order switches; little endian is assumed unless
   WORDS_BIGENDIAN is defined. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* --- Unicode Object ----------------------------------------------------- */
107
108static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000110 int length)
111{
112 void *oldstr;
113
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000114 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000115 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000116 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000117
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000118 /* Resizing shared object (unicode_empty or single character
119 objects) in-place is not allowed. Use PyUnicode_Resize()
120 instead ! */
121 if (unicode == unicode_empty ||
122 (unicode->length == 1 &&
123 unicode->str[0] < 256 &&
124 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000125 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000126 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 return -1;
128 }
129
130 /* We allocate one more byte to make sure the string is
131 Ux0000 terminated -- XXX is this needed ? */
132 oldstr = unicode->str;
133 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
134 if (!unicode->str) {
135 unicode->str = oldstr;
136 PyErr_NoMemory();
137 return -1;
138 }
139 unicode->str[length] = 0;
140 unicode->length = length;
141
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000142 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000143 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000144 if (unicode->defenc) {
145 Py_DECREF(unicode->defenc);
146 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000147 }
148 unicode->hash = -1;
149
150 return 0;
151}
152
153/* We allocate one more byte to make sure the string is
154 Ux0000 terminated -- XXX is this needed ?
155
156 XXX This allocator could further be enhanced by assuring that the
157 free list never reduces its size below 1.
158
159*/
160
161static
162PyUnicodeObject *_PyUnicode_New(int length)
163{
164 register PyUnicodeObject *unicode;
165
166 /* Optimization for empty strings */
167 if (length == 0 && unicode_empty != NULL) {
168 Py_INCREF(unicode_empty);
169 return unicode_empty;
170 }
171
172 /* Unicode freelist & memory allocation */
173 if (unicode_freelist) {
174 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000175 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000176 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 /* Keep-Alive optimization: we only upsize the buffer,
179 never downsize it. */
180 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000182 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000183 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 }
185 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000186 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000188 }
189 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 }
191 else {
192 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
193 if (unicode == NULL)
194 return NULL;
195 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
196 }
197
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000198 if (!unicode->str) {
199 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000200 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 unicode->str[length] = 0;
203 unicode->length = length;
204 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000205 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000207
208 onError:
209 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000210 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212}
213
214static
215void _PyUnicode_Free(register PyUnicodeObject *unicode)
216{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 /* Keep-Alive optimization */
219 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000220 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221 unicode->str = NULL;
222 unicode->length = 0;
223 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000224 if (unicode->defenc) {
225 Py_DECREF(unicode->defenc);
226 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000227 }
228 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 *(PyUnicodeObject **)unicode = unicode_freelist;
230 unicode_freelist = unicode;
231 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000232 }
233 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000234 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000235 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000236 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 }
238}
239
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000240int PyUnicode_Resize(PyObject **unicode,
241 int length)
242{
243 register PyUnicodeObject *v;
244
245 /* Argument checks */
246 if (unicode == NULL) {
247 PyErr_BadInternalCall();
248 return -1;
249 }
250 v = (PyUnicodeObject *)*unicode;
251 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
252 PyErr_BadInternalCall();
253 return -1;
254 }
255
256 /* Resizing unicode_empty and single character objects is not
257 possible since these are being shared. We simply return a fresh
258 copy with the same Unicode content. */
259 if (v->length != length &&
260 (v == unicode_empty || v->length == 1)) {
261 PyUnicodeObject *w = _PyUnicode_New(length);
262 if (w == NULL)
263 return -1;
264 Py_UNICODE_COPY(w->str, v->str,
265 length < v->length ? length : v->length);
266 *unicode = (PyObject *)w;
267 return 0;
268 }
269
270 /* Note that we don't have to modify *unicode for unshared Unicode
271 objects, since we can modify them in-place. */
272 return unicode_resize(v, length);
273}
274
275/* Internal API for use in unicodeobject.c only ! */
276#define _PyUnicode_Resize(unicodevar, length) \
277 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
278
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
280 int size)
281{
282 PyUnicodeObject *unicode;
283
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000284 /* If the Unicode data is known at construction time, we can apply
285 some optimizations which share commonly used objects. */
286 if (u != NULL) {
287
288 /* Optimization for empty strings */
289 if (size == 0 && unicode_empty != NULL) {
290 Py_INCREF(unicode_empty);
291 return (PyObject *)unicode_empty;
292 }
293
294 /* Single character Unicode objects in the Latin-1 range are
295 shared when using this constructor */
296 if (size == 1 && *u < 256) {
297 unicode = unicode_latin1[*u];
298 if (!unicode) {
299 unicode = _PyUnicode_New(1);
300 unicode->str[0] = *u;
301 if (!unicode)
302 return NULL;
303 unicode_latin1[*u] = unicode;
304 }
305 Py_INCREF(unicode);
306 return (PyObject *)unicode;
307 }
308 }
309
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310 unicode = _PyUnicode_New(size);
311 if (!unicode)
312 return NULL;
313
314 /* Copy the Unicode data into the new object */
315 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000317
318 return (PyObject *)unicode;
319}
320
321#ifdef HAVE_WCHAR_H
322
323PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
324 int size)
325{
326 PyUnicodeObject *unicode;
327
328 if (w == NULL) {
329 PyErr_BadInternalCall();
330 return NULL;
331 }
332
333 unicode = _PyUnicode_New(size);
334 if (!unicode)
335 return NULL;
336
337 /* Copy the wchar_t data into the new object */
338#ifdef HAVE_USABLE_WCHAR_T
339 memcpy(unicode->str, w, size * sizeof(wchar_t));
340#else
341 {
342 register Py_UNICODE *u;
343 register int i;
344 u = PyUnicode_AS_UNICODE(unicode);
345 for (i = size; i >= 0; i--)
346 *u++ = *w++;
347 }
348#endif
349
350 return (PyObject *)unicode;
351}
352
353int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
354 register wchar_t *w,
355 int size)
356{
357 if (unicode == NULL) {
358 PyErr_BadInternalCall();
359 return -1;
360 }
361 if (size > PyUnicode_GET_SIZE(unicode))
362 size = PyUnicode_GET_SIZE(unicode);
363#ifdef HAVE_USABLE_WCHAR_T
364 memcpy(w, unicode->str, size * sizeof(wchar_t));
365#else
366 {
367 register Py_UNICODE *u;
368 register int i;
369 u = PyUnicode_AS_UNICODE(unicode);
370 for (i = size; i >= 0; i--)
371 *w++ = *u++;
372 }
373#endif
374
375 return size;
376}
377
378#endif
379
380PyObject *PyUnicode_FromObject(register PyObject *obj)
381{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000382 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
383}
384
385PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
386 const char *encoding,
387 const char *errors)
388{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 const char *s;
390 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000391 int owned = 0;
392 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393
394 if (obj == NULL) {
395 PyErr_BadInternalCall();
396 return NULL;
397 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000398
399 /* Coerce object */
400 if (PyInstance_Check(obj)) {
401 PyObject *func;
402 func = PyObject_GetAttrString(obj, "__str__");
403 if (func == NULL) {
404 PyErr_SetString(PyExc_TypeError,
405 "coercing to Unicode: instance doesn't define __str__");
406 return NULL;
407 }
408 obj = PyEval_CallObject(func, NULL);
409 Py_DECREF(func);
410 if (obj == NULL)
411 return NULL;
412 owned = 1;
413 }
414 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000416 v = obj;
417 if (encoding) {
418 PyErr_SetString(PyExc_TypeError,
419 "decoding Unicode is not supported");
420 return NULL;
421 }
422 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424 else if (PyString_Check(obj)) {
425 s = PyString_AS_STRING(obj);
426 len = PyString_GET_SIZE(obj);
427 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000428 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
429 /* Overwrite the error message with something more useful in
430 case of a TypeError. */
431 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000432 PyErr_Format(PyExc_TypeError,
433 "coercing to Unicode: need string or buffer, "
434 "%.80s found",
435 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000436 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000437 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000438
439 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440 if (len == 0) {
441 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000442 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000443 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000444 else
445 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000446
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000448 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000449 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000450 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000451 return v;
452
453 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000454 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000455 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000456 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000457 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000458}
459
460PyObject *PyUnicode_Decode(const char *s,
461 int size,
462 const char *encoding,
463 const char *errors)
464{
465 PyObject *buffer = NULL, *unicode;
466
Fred Drakee4315f52000-05-09 19:53:39 +0000467 if (encoding == NULL)
468 encoding = PyUnicode_GetDefaultEncoding();
469
470 /* Shortcuts for common default encodings */
471 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000473 else if (strcmp(encoding, "latin-1") == 0)
474 return PyUnicode_DecodeLatin1(s, size, errors);
475 else if (strcmp(encoding, "ascii") == 0)
476 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000477
478 /* Decode via the codec registry */
479 buffer = PyBuffer_FromMemory((void *)s, size);
480 if (buffer == NULL)
481 goto onError;
482 unicode = PyCodec_Decode(buffer, encoding, errors);
483 if (unicode == NULL)
484 goto onError;
485 if (!PyUnicode_Check(unicode)) {
486 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000487 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000488 unicode->ob_type->tp_name);
489 Py_DECREF(unicode);
490 goto onError;
491 }
492 Py_DECREF(buffer);
493 return unicode;
494
495 onError:
496 Py_XDECREF(buffer);
497 return NULL;
498}
499
500PyObject *PyUnicode_Encode(const Py_UNICODE *s,
501 int size,
502 const char *encoding,
503 const char *errors)
504{
505 PyObject *v, *unicode;
506
507 unicode = PyUnicode_FromUnicode(s, size);
508 if (unicode == NULL)
509 return NULL;
510 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
511 Py_DECREF(unicode);
512 return v;
513}
514
515PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
516 const char *encoding,
517 const char *errors)
518{
519 PyObject *v;
520
521 if (!PyUnicode_Check(unicode)) {
522 PyErr_BadArgument();
523 goto onError;
524 }
Fred Drakee4315f52000-05-09 19:53:39 +0000525
526 if (encoding == NULL)
527 encoding = PyUnicode_GetDefaultEncoding();
528
529 /* Shortcuts for common default encodings */
530 if (errors == NULL) {
531 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000532 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000533 else if (strcmp(encoding, "latin-1") == 0)
534 return PyUnicode_AsLatin1String(unicode);
535 else if (strcmp(encoding, "ascii") == 0)
536 return PyUnicode_AsASCIIString(unicode);
537 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000538
539 /* Encode via the codec registry */
540 v = PyCodec_Encode(unicode, encoding, errors);
541 if (v == NULL)
542 goto onError;
543 /* XXX Should we really enforce this ? */
544 if (!PyString_Check(v)) {
545 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000546 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000547 v->ob_type->tp_name);
548 Py_DECREF(v);
549 goto onError;
550 }
551 return v;
552
553 onError:
554 return NULL;
555}
556
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000557/* Return a Python string holding the default encoded value of the
558 Unicode object.
559
560 The resulting string is cached in the Unicode object for subsequent
561 usage by this function. The cached version is needed to implement
562 the character buffer interface and will live (at least) as long as
563 the Unicode object itself.
564
565 The refcount of the string is *not* incremented.
566
567 *** Exported for internal use by the interpreter only !!! ***
568
569*/
570
571PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
572 const char *errors)
573{
574 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
575
576 if (v)
577 return v;
578 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
579 if (v && errors == NULL)
580 ((PyUnicodeObject *)unicode)->defenc = v;
581 return v;
582}
583
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
585{
586 if (!PyUnicode_Check(unicode)) {
587 PyErr_BadArgument();
588 goto onError;
589 }
590 return PyUnicode_AS_UNICODE(unicode);
591
592 onError:
593 return NULL;
594}
595
596int PyUnicode_GetSize(PyObject *unicode)
597{
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
601 }
602 return PyUnicode_GET_SIZE(unicode);
603
604 onError:
605 return -1;
606}
607
Thomas Wouters78890102000-07-22 19:25:51 +0000608const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000609{
610 return unicode_default_encoding;
611}
612
613int PyUnicode_SetDefaultEncoding(const char *encoding)
614{
615 PyObject *v;
616
617 /* Make sure the encoding is valid. As side effect, this also
618 loads the encoding into the codec registry cache. */
619 v = _PyCodec_Lookup(encoding);
620 if (v == NULL)
621 goto onError;
622 Py_DECREF(v);
623 strncpy(unicode_default_encoding,
624 encoding,
625 sizeof(unicode_default_encoding));
626 return 0;
627
628 onError:
629 return -1;
630}
631
Guido van Rossumd57fd912000-03-10 22:53:23 +0000632/* --- UTF-8 Codec -------------------------------------------------------- */
633
/* Map a UTF-8 encoded prefix (first) byte to the length of the
   sequence it starts. Zero means the byte is not a legal prefix.
   See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFD: four, five and six byte sequences;
       0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
655
656static
657int utf8_decoding_error(const char **source,
658 Py_UNICODE **dest,
659 const char *errors,
660 const char *details)
661{
662 if ((errors == NULL) ||
663 (strcmp(errors,"strict") == 0)) {
664 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000665 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000666 details);
667 return -1;
668 }
669 else if (strcmp(errors,"ignore") == 0) {
670 (*source)++;
671 return 0;
672 }
673 else if (strcmp(errors,"replace") == 0) {
674 (*source)++;
675 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
676 (*dest)++;
677 return 0;
678 }
679 else {
680 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000681 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000682 errors);
683 return -1;
684 }
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687PyObject *PyUnicode_DecodeUTF8(const char *s,
688 int size,
689 const char *errors)
690{
691 int n;
692 const char *e;
693 PyUnicodeObject *unicode;
694 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000695 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000696
697 /* Note: size will always be longer than the resulting Unicode
698 character count */
699 unicode = _PyUnicode_New(size);
700 if (!unicode)
701 return NULL;
702 if (size == 0)
703 return (PyObject *)unicode;
704
705 /* Unpack UTF-8 encoded data */
706 p = unicode->str;
707 e = s + size;
708
709 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000710 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000711
712 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000713 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 s++;
715 continue;
716 }
717
718 n = utf8_code_length[ch];
719
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000720 if (s + n > e) {
721 errmsg = "unexpected end of data";
722 goto utf8Error;
723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000724
725 switch (n) {
726
727 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000728 errmsg = "unexpected code byte";
729 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000730
731 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000732 errmsg = "internal error";
733 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000734
735 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000736 if ((s[1] & 0xc0) != 0x80) {
737 errmsg = "invalid data";
738 goto utf8Error;
739 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000741 if (ch < 0x80) {
742 errmsg = "illegal encoding";
743 goto utf8Error;
744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000745 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000746 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000747 break;
748
749 case 3:
750 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000751 (s[2] & 0xc0) != 0x80) {
752 errmsg = "invalid data";
753 goto utf8Error;
754 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000755 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000756 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
757 errmsg = "illegal encoding";
758 goto utf8Error;
759 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000760 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000761 *p++ = (Py_UNICODE)ch;
762 break;
763
764 case 4:
765 if ((s[1] & 0xc0) != 0x80 ||
766 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000767 (s[3] & 0xc0) != 0x80) {
768 errmsg = "invalid data";
769 goto utf8Error;
770 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000771 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
772 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
773 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000774 if ((ch < 0x10000) || /* minimum value allowed for 4
775 byte encoding */
776 (ch > 0x10ffff)) { /* maximum value allowed for
777 UTF-16 */
778 errmsg = "illegal encoding";
779 goto utf8Error;
780 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000781 /* compute and append the two surrogates: */
782
783 /* translate from 10000..10FFFF to 0..FFFF */
784 ch -= 0x10000;
785
786 /* high surrogate = top 10 bits added to D800 */
787 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
788
789 /* low surrogate = bottom 10 bits added to DC00 */
790 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000791 break;
792
793 default:
794 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000795 errmsg = "unsupported Unicode code range";
796 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000797 }
798 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000799 continue;
800
801 utf8Error:
802 if (utf8_decoding_error(&s, &p, errors, errmsg))
803 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000804 }
805
806 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000807 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +0000808 goto onError;
809
810 return (PyObject *)unicode;
811
812onError:
813 Py_DECREF(unicode);
814 return NULL;
815}
816
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. Kept disabled for reference. */
#if 0
/* Handle a UTF-8 encoding error according to the errors setting:
   NULL or "strict" set UnicodeError, "ignore" does nothing,
   "replace" stores a '?' at *dest and advances *dest, anything else
   sets ValueError. Returns 0 to continue, -1 on error. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000850
851PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
852 int size,
853 const char *errors)
854{
855 PyObject *v;
856 char *p;
857 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000858 Py_UCS4 ch2;
859 unsigned int cbAllocated = 3 * size;
860 unsigned int cbWritten = 0;
861 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000862
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000863 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000864 if (v == NULL)
865 return NULL;
866 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000867 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000868
869 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000870 while (i < size) {
871 Py_UCS4 ch = s[i++];
872 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000873 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000874 cbWritten++;
875 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000876 else if (ch < 0x0800) {
877 *p++ = 0xc0 | (ch >> 6);
878 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000879 cbWritten += 2;
880 }
881 else {
882 /* Check for high surrogate */
883 if (0xD800 <= ch && ch <= 0xDBFF) {
884 if (i != size) {
885 ch2 = s[i];
886 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
887
888 if (cbWritten >= (cbAllocated - 4)) {
889 /* Provide enough room for some more
890 surrogates */
891 cbAllocated += 4*10;
892 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000893 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000894 }
895
896 /* combine the two values */
897 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
898
899 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000900 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000901 i++;
902 cbWritten += 4;
903 }
904 }
905 }
906 else {
907 *p++ = (char)(0xe0 | (ch >> 12));
908 cbWritten += 3;
909 }
910 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
911 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000912 }
913 }
914 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000915 if (_PyString_Resize(&v, p - q))
916 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000917 return v;
918
919 onError:
920 Py_DECREF(v);
921 return NULL;
922}
923
Guido van Rossumd57fd912000-03-10 22:53:23 +0000924PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
925{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000926 if (!PyUnicode_Check(unicode)) {
927 PyErr_BadArgument();
928 return NULL;
929 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000930 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
931 PyUnicode_GET_SIZE(unicode),
932 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000933}
934
935/* --- UTF-16 Codec ------------------------------------------------------- */
936
937static
938int utf16_decoding_error(const Py_UNICODE **source,
939 Py_UNICODE **dest,
940 const char *errors,
941 const char *details)
942{
943 if ((errors == NULL) ||
944 (strcmp(errors,"strict") == 0)) {
945 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000946 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947 details);
948 return -1;
949 }
950 else if (strcmp(errors,"ignore") == 0) {
951 return 0;
952 }
953 else if (strcmp(errors,"replace") == 0) {
954 if (dest) {
955 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
956 (*dest)++;
957 }
958 return 0;
959 }
960 else {
961 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000962 "UTF-16 decoding error; "
963 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000964 errors);
965 return -1;
966 }
967}
968
Guido van Rossumd57fd912000-03-10 22:53:23 +0000969PyObject *PyUnicode_DecodeUTF16(const char *s,
970 int size,
971 const char *errors,
972 int *byteorder)
973{
974 PyUnicodeObject *unicode;
975 Py_UNICODE *p;
976 const Py_UNICODE *q, *e;
977 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000978 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979
980 /* size should be an even number */
981 if (size % sizeof(Py_UNICODE) != 0) {
982 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
983 return NULL;
984 /* The remaining input chars are ignored if we fall through
985 here... */
986 }
987
988 /* Note: size will always be longer than the resulting Unicode
989 character count */
990 unicode = _PyUnicode_New(size);
991 if (!unicode)
992 return NULL;
993 if (size == 0)
994 return (PyObject *)unicode;
995
996 /* Unpack UTF-16 encoded data */
997 p = unicode->str;
998 q = (Py_UNICODE *)s;
999 e = q + (size / sizeof(Py_UNICODE));
1000
1001 if (byteorder)
1002 bo = *byteorder;
1003
1004 while (q < e) {
1005 register Py_UNICODE ch = *q++;
1006
1007 /* Check for BOM marks (U+FEFF) in the input and adjust
1008 current byte order setting accordingly. Swap input
1009 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
1010 !) */
1011#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1012 if (ch == 0xFEFF) {
1013 bo = -1;
1014 continue;
1015 } else if (ch == 0xFFFE) {
1016 bo = 1;
1017 continue;
1018 }
1019 if (bo == 1)
1020 ch = (ch >> 8) | (ch << 8);
1021#else
1022 if (ch == 0xFEFF) {
1023 bo = 1;
1024 continue;
1025 } else if (ch == 0xFFFE) {
1026 bo = -1;
1027 continue;
1028 }
1029 if (bo == -1)
1030 ch = (ch >> 8) | (ch << 8);
1031#endif
1032 if (ch < 0xD800 || ch > 0xDFFF) {
1033 *p++ = ch;
1034 continue;
1035 }
1036
1037 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001038 if (q >= e) {
1039 errmsg = "unexpected end of data";
1040 goto utf16Error;
1041 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042 if (0xDC00 <= *q && *q <= 0xDFFF) {
1043 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001044 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045 /* This is valid data (a UTF-16 surrogate pair), but
1046 we are not able to store this information since our
1047 Py_UNICODE type only has 16 bits... this might
1048 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001049 errmsg = "code pairs are not supported";
1050 goto utf16Error;
1051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001052 else
1053 continue;
1054 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001055 errmsg = "illegal encoding";
1056 /* Fall through to report the error */
1057
1058 utf16Error:
1059 if (utf16_decoding_error(&q, &p, errors, errmsg))
1060 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001061 }
1062
1063 if (byteorder)
1064 *byteorder = bo;
1065
1066 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001067 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068 goto onError;
1069
1070 return (PyObject *)unicode;
1071
1072onError:
1073 Py_DECREF(unicode);
1074 return NULL;
1075}
1076
1077#undef UTF16_ERROR
1078
1079PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1080 int size,
1081 const char *errors,
1082 int byteorder)
1083{
1084 PyObject *v;
1085 Py_UNICODE *p;
1086 char *q;
1087
1088 /* We don't create UTF-16 pairs... */
1089 v = PyString_FromStringAndSize(NULL,
1090 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1091 if (v == NULL)
1092 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001093
1094 q = PyString_AS_STRING(v);
1095 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 if (byteorder == 0)
1097 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001098 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001099 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100 if (byteorder == 0 ||
1101#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1102 byteorder == -1
1103#else
1104 byteorder == 1
1105#endif
1106 )
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001107 Py_UNICODE_COPY(p, s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108 else
1109 while (size-- > 0) {
1110 Py_UNICODE ch = *s++;
1111 *p++ = (ch >> 8) | (ch << 8);
1112 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 return v;
1114}
1115
1116PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1117{
1118 if (!PyUnicode_Check(unicode)) {
1119 PyErr_BadArgument();
1120 return NULL;
1121 }
1122 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1123 PyUnicode_GET_SIZE(unicode),
1124 NULL,
1125 0);
1126}
1127
1128/* --- Unicode Escape Codec ----------------------------------------------- */
1129
1130static
1131int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001132 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001133 const char *errors,
1134 const char *details)
1135{
1136 if ((errors == NULL) ||
1137 (strcmp(errors,"strict") == 0)) {
1138 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001139 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140 details);
1141 return -1;
1142 }
1143 else if (strcmp(errors,"ignore") == 0) {
1144 return 0;
1145 }
1146 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001147 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148 return 0;
1149 }
1150 else {
1151 PyErr_Format(PyExc_ValueError,
1152 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001153 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001154 errors);
1155 return -1;
1156 }
1157}
1158
Fredrik Lundh06d12682001-01-24 07:59:11 +00001159static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001160
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1162 int size,
1163 const char *errors)
1164{
1165 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001166 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001168 char* message;
1169 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1170
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171 /* Escaped strings will always be longer than the resulting
1172 Unicode string, so we start with size here and then reduce the
1173 length after conversion to the true value. */
1174 v = _PyUnicode_New(size);
1175 if (v == NULL)
1176 goto onError;
1177 if (size == 0)
1178 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001179
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180 p = buf = PyUnicode_AS_UNICODE(v);
1181 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001182
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183 while (s < end) {
1184 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001185 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001186 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187
1188 /* Non-escape characters are interpreted as Unicode ordinals */
1189 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001190 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191 continue;
1192 }
1193
1194 /* \ - Escapes */
1195 s++;
1196 switch (*s++) {
1197
1198 /* \x escapes */
1199 case '\n': break;
1200 case '\\': *p++ = '\\'; break;
1201 case '\'': *p++ = '\''; break;
1202 case '\"': *p++ = '\"'; break;
1203 case 'b': *p++ = '\b'; break;
1204 case 'f': *p++ = '\014'; break; /* FF */
1205 case 't': *p++ = '\t'; break;
1206 case 'n': *p++ = '\n'; break;
1207 case 'r': *p++ = '\r'; break;
1208 case 'v': *p++ = '\013'; break; /* VT */
1209 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1210
1211 /* \OOO (octal) escapes */
1212 case '0': case '1': case '2': case '3':
1213 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001214 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001216 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001218 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001220 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221 break;
1222
Fredrik Lundhccc74732001-02-18 22:13:49 +00001223 /* hex escapes */
1224 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001226 digits = 2;
1227 message = "truncated \\xXX escape";
1228 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229
Fredrik Lundhccc74732001-02-18 22:13:49 +00001230 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001232 digits = 4;
1233 message = "truncated \\uXXXX escape";
1234 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235
Fredrik Lundhccc74732001-02-18 22:13:49 +00001236 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001237 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001238 digits = 8;
1239 message = "truncated \\UXXXXXXXX escape";
1240 hexescape:
1241 chr = 0;
1242 for (i = 0; i < digits; i++) {
1243 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001244 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001245 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001246 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001247 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001248 i++;
1249 break;
1250 }
1251 chr = (chr<<4) & ~0xF;
1252 if (c >= '0' && c <= '9')
1253 chr += c - '0';
1254 else if (c >= 'a' && c <= 'f')
1255 chr += 10 + c - 'a';
1256 else
1257 chr += 10 + c - 'A';
1258 }
1259 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001260 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001261 /* when we get here, chr is a 32-bit unicode character */
1262 if (chr <= 0xffff)
1263 /* UCS-2 character */
1264 *p++ = (Py_UNICODE) chr;
1265 else if (chr <= 0x10ffff) {
1266 /* UCS-4 character. store as two surrogate characters */
1267 chr -= 0x10000L;
1268 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1269 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1270 } else {
1271 if (unicodeescape_decoding_error(
1272 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001273 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001274 )
1275 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001276 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001277 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001278 break;
1279
1280 /* \N{name} */
1281 case 'N':
1282 message = "malformed \\N character escape";
1283 if (ucnhash_CAPI == NULL) {
1284 /* load the unicode data module */
1285 PyObject *m, *v;
1286 m = PyImport_ImportModule("unicodedata");
1287 if (m == NULL)
1288 goto ucnhashError;
1289 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1290 Py_DECREF(m);
1291 if (v == NULL)
1292 goto ucnhashError;
1293 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1294 Py_DECREF(v);
1295 if (ucnhash_CAPI == NULL)
1296 goto ucnhashError;
1297 }
1298 if (*s == '{') {
1299 const char *start = s+1;
1300 /* look for the closing brace */
1301 while (*s != '}' && s < end)
1302 s++;
1303 if (s > start && s < end && *s == '}') {
1304 /* found a name. look it up in the unicode database */
1305 message = "unknown Unicode character name";
1306 s++;
1307 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1308 goto store;
1309 }
1310 }
1311 if (unicodeescape_decoding_error(&s, &x, errors, message))
1312 goto onError;
1313 *p++ = x;
1314 break;
1315
1316 default:
1317 *p++ = '\\';
1318 *p++ = (unsigned char)s[-1];
1319 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001320 }
1321 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001322 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001323 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001324 return (PyObject *)v;
1325
Fredrik Lundhccc74732001-02-18 22:13:49 +00001326ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001327 PyErr_SetString(
1328 PyExc_UnicodeError,
1329 "\\N escapes not supported (can't load unicodedata module)"
1330 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001331 return NULL;
1332
Fredrik Lundhccc74732001-02-18 22:13:49 +00001333onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334 Py_XDECREF(v);
1335 return NULL;
1336}
1337
1338/* Return a Unicode-Escape string version of the Unicode object.
1339
1340 If quotes is true, the string is enclosed in u"" or u'' quotes as
1341 appropriate.
1342
1343*/
1344
Barry Warsaw51ac5802000-03-20 16:36:48 +00001345static const Py_UNICODE *findchar(const Py_UNICODE *s,
1346 int size,
1347 Py_UNICODE ch);
1348
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349static
1350PyObject *unicodeescape_string(const Py_UNICODE *s,
1351 int size,
1352 int quotes)
1353{
1354 PyObject *repr;
1355 char *p;
1356 char *q;
1357
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001358 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359
1360 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1361 if (repr == NULL)
1362 return NULL;
1363
1364 p = q = PyString_AS_STRING(repr);
1365
1366 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001367 *p++ = 'u';
1368 *p++ = (findchar(s, size, '\'') &&
1369 !findchar(s, size, '"')) ? '"' : '\'';
1370 }
1371 while (size-- > 0) {
1372 Py_UNICODE ch = *s++;
1373 /* Escape quotes */
1374 if (quotes && (ch == q[1] || ch == '\\')) {
1375 *p++ = '\\';
1376 *p++ = (char) ch;
1377 }
1378 /* Map 16-bit characters to '\uxxxx' */
1379 else if (ch >= 256) {
1380 *p++ = '\\';
1381 *p++ = 'u';
1382 *p++ = hexdigit[(ch >> 12) & 0xf];
1383 *p++ = hexdigit[(ch >> 8) & 0xf];
1384 *p++ = hexdigit[(ch >> 4) & 0xf];
1385 *p++ = hexdigit[ch & 15];
1386 }
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001387 /* Map special whitespace to '\t', \n', '\r' */
1388 else if (ch == '\t') {
1389 *p++ = '\\';
1390 *p++ = 't';
1391 }
1392 else if (ch == '\n') {
1393 *p++ = '\\';
1394 *p++ = 'n';
1395 }
1396 else if (ch == '\r') {
1397 *p++ = '\\';
1398 *p++ = 'r';
1399 }
1400 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001401 else if (ch < ' ' || ch >= 128) {
1402 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001403 *p++ = 'x';
1404 *p++ = hexdigit[(ch >> 4) & 0xf];
1405 *p++ = hexdigit[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001406 }
1407 /* Copy everything else as-is */
1408 else
1409 *p++ = (char) ch;
1410 }
1411 if (quotes)
1412 *p++ = q[1];
1413
1414 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001415 if (_PyString_Resize(&repr, p - q))
1416 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417
1418 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001419
1420 onError:
1421 Py_DECREF(repr);
1422 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001423}
1424
1425PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1426 int size)
1427{
1428 return unicodeescape_string(s, size, 0);
1429}
1430
1431PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1432{
1433 if (!PyUnicode_Check(unicode)) {
1434 PyErr_BadArgument();
1435 return NULL;
1436 }
1437 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1438 PyUnicode_GET_SIZE(unicode));
1439}
1440
1441/* --- Raw Unicode Escape Codec ------------------------------------------- */
1442
1443PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1444 int size,
1445 const char *errors)
1446{
1447 PyUnicodeObject *v;
1448 Py_UNICODE *p, *buf;
1449 const char *end;
1450 const char *bs;
1451
1452 /* Escaped strings will always be longer than the resulting
1453 Unicode string, so we start with size here and then reduce the
1454 length after conversion to the true value. */
1455 v = _PyUnicode_New(size);
1456 if (v == NULL)
1457 goto onError;
1458 if (size == 0)
1459 return (PyObject *)v;
1460 p = buf = PyUnicode_AS_UNICODE(v);
1461 end = s + size;
1462 while (s < end) {
1463 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001464 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001465 int i;
1466
1467 /* Non-escape characters are interpreted as Unicode ordinals */
1468 if (*s != '\\') {
1469 *p++ = (unsigned char)*s++;
1470 continue;
1471 }
1472
1473 /* \u-escapes are only interpreted iff the number of leading
1474 backslashes if odd */
1475 bs = s;
1476 for (;s < end;) {
1477 if (*s != '\\')
1478 break;
1479 *p++ = (unsigned char)*s++;
1480 }
1481 if (((s - bs) & 1) == 0 ||
1482 s >= end ||
1483 *s != 'u') {
1484 continue;
1485 }
1486 p--;
1487 s++;
1488
1489 /* \uXXXX with 4 hex digits */
1490 for (x = 0, i = 0; i < 4; i++) {
1491 c = (unsigned char)s[i];
1492 if (!isxdigit(c)) {
1493 if (unicodeescape_decoding_error(&s, &x, errors,
1494 "truncated \\uXXXX"))
1495 goto onError;
1496 i++;
1497 break;
1498 }
1499 x = (x<<4) & ~0xF;
1500 if (c >= '0' && c <= '9')
1501 x += c - '0';
1502 else if (c >= 'a' && c <= 'f')
1503 x += 10 + c - 'a';
1504 else
1505 x += 10 + c - 'A';
1506 }
1507 s += i;
1508 *p++ = x;
1509 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001510 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001511 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001512 return (PyObject *)v;
1513
1514 onError:
1515 Py_XDECREF(v);
1516 return NULL;
1517}
1518
1519PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1520 int size)
1521{
1522 PyObject *repr;
1523 char *p;
1524 char *q;
1525
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001526 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001527
1528 repr = PyString_FromStringAndSize(NULL, 6 * size);
1529 if (repr == NULL)
1530 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001531 if (size == 0)
1532 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001533
1534 p = q = PyString_AS_STRING(repr);
1535 while (size-- > 0) {
1536 Py_UNICODE ch = *s++;
1537 /* Map 16-bit characters to '\uxxxx' */
1538 if (ch >= 256) {
1539 *p++ = '\\';
1540 *p++ = 'u';
1541 *p++ = hexdigit[(ch >> 12) & 0xf];
1542 *p++ = hexdigit[(ch >> 8) & 0xf];
1543 *p++ = hexdigit[(ch >> 4) & 0xf];
1544 *p++ = hexdigit[ch & 15];
1545 }
1546 /* Copy everything else as-is */
1547 else
1548 *p++ = (char) ch;
1549 }
1550 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001551 if (_PyString_Resize(&repr, p - q))
1552 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001553
1554 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001555
1556 onError:
1557 Py_DECREF(repr);
1558 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001559}
1560
1561PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1562{
1563 if (!PyUnicode_Check(unicode)) {
1564 PyErr_BadArgument();
1565 return NULL;
1566 }
1567 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1568 PyUnicode_GET_SIZE(unicode));
1569}
1570
1571/* --- Latin-1 Codec ------------------------------------------------------ */
1572
1573PyObject *PyUnicode_DecodeLatin1(const char *s,
1574 int size,
1575 const char *errors)
1576{
1577 PyUnicodeObject *v;
1578 Py_UNICODE *p;
1579
1580 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001581 if (size == 1 && *(unsigned char*)s < 256) {
1582 Py_UNICODE r = *(unsigned char*)s;
1583 return PyUnicode_FromUnicode(&r, 1);
1584 }
1585
Guido van Rossumd57fd912000-03-10 22:53:23 +00001586 v = _PyUnicode_New(size);
1587 if (v == NULL)
1588 goto onError;
1589 if (size == 0)
1590 return (PyObject *)v;
1591 p = PyUnicode_AS_UNICODE(v);
1592 while (size-- > 0)
1593 *p++ = (unsigned char)*s++;
1594 return (PyObject *)v;
1595
1596 onError:
1597 Py_XDECREF(v);
1598 return NULL;
1599}
1600
1601static
1602int latin1_encoding_error(const Py_UNICODE **source,
1603 char **dest,
1604 const char *errors,
1605 const char *details)
1606{
1607 if ((errors == NULL) ||
1608 (strcmp(errors,"strict") == 0)) {
1609 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001610 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611 details);
1612 return -1;
1613 }
1614 else if (strcmp(errors,"ignore") == 0) {
1615 return 0;
1616 }
1617 else if (strcmp(errors,"replace") == 0) {
1618 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001619 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620 return 0;
1621 }
1622 else {
1623 PyErr_Format(PyExc_ValueError,
1624 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001625 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001626 errors);
1627 return -1;
1628 }
1629}
1630
1631PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1632 int size,
1633 const char *errors)
1634{
1635 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001636 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001637
Guido van Rossumd57fd912000-03-10 22:53:23 +00001638 repr = PyString_FromStringAndSize(NULL, size);
1639 if (repr == NULL)
1640 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001641 if (size == 0)
1642 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001643
1644 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001645 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001646 while (size-- > 0) {
1647 Py_UNICODE ch = *p++;
1648 if (ch >= 256) {
1649 if (latin1_encoding_error(&p, &s, errors,
1650 "ordinal not in range(256)"))
1651 goto onError;
1652 }
1653 else
1654 *s++ = (char)ch;
1655 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001656 /* Resize if error handling skipped some characters */
1657 if (s - start < PyString_GET_SIZE(repr))
1658 if (_PyString_Resize(&repr, s - start))
1659 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001660 return repr;
1661
1662 onError:
1663 Py_DECREF(repr);
1664 return NULL;
1665}
1666
1667PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1668{
1669 if (!PyUnicode_Check(unicode)) {
1670 PyErr_BadArgument();
1671 return NULL;
1672 }
1673 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1674 PyUnicode_GET_SIZE(unicode),
1675 NULL);
1676}
1677
1678/* --- 7-bit ASCII Codec -------------------------------------------------- */
1679
1680static
1681int ascii_decoding_error(const char **source,
1682 Py_UNICODE **dest,
1683 const char *errors,
1684 const char *details)
1685{
1686 if ((errors == NULL) ||
1687 (strcmp(errors,"strict") == 0)) {
1688 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001689 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001690 details);
1691 return -1;
1692 }
1693 else if (strcmp(errors,"ignore") == 0) {
1694 return 0;
1695 }
1696 else if (strcmp(errors,"replace") == 0) {
1697 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1698 (*dest)++;
1699 return 0;
1700 }
1701 else {
1702 PyErr_Format(PyExc_ValueError,
1703 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001704 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001705 errors);
1706 return -1;
1707 }
1708}
1709
1710PyObject *PyUnicode_DecodeASCII(const char *s,
1711 int size,
1712 const char *errors)
1713{
1714 PyUnicodeObject *v;
1715 Py_UNICODE *p;
1716
1717 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001718 if (size == 1 && *(unsigned char*)s < 128) {
1719 Py_UNICODE r = *(unsigned char*)s;
1720 return PyUnicode_FromUnicode(&r, 1);
1721 }
1722
Guido van Rossumd57fd912000-03-10 22:53:23 +00001723 v = _PyUnicode_New(size);
1724 if (v == NULL)
1725 goto onError;
1726 if (size == 0)
1727 return (PyObject *)v;
1728 p = PyUnicode_AS_UNICODE(v);
1729 while (size-- > 0) {
1730 register unsigned char c;
1731
1732 c = (unsigned char)*s++;
1733 if (c < 128)
1734 *p++ = c;
1735 else if (ascii_decoding_error(&s, &p, errors,
1736 "ordinal not in range(128)"))
1737 goto onError;
1738 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001739 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001740 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001741 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001742 return (PyObject *)v;
1743
1744 onError:
1745 Py_XDECREF(v);
1746 return NULL;
1747}
1748
1749static
1750int ascii_encoding_error(const Py_UNICODE **source,
1751 char **dest,
1752 const char *errors,
1753 const char *details)
1754{
1755 if ((errors == NULL) ||
1756 (strcmp(errors,"strict") == 0)) {
1757 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001758 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759 details);
1760 return -1;
1761 }
1762 else if (strcmp(errors,"ignore") == 0) {
1763 return 0;
1764 }
1765 else if (strcmp(errors,"replace") == 0) {
1766 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001767 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001768 return 0;
1769 }
1770 else {
1771 PyErr_Format(PyExc_ValueError,
1772 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001773 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774 errors);
1775 return -1;
1776 }
1777}
1778
1779PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1780 int size,
1781 const char *errors)
1782{
1783 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001784 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001785
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 repr = PyString_FromStringAndSize(NULL, size);
1787 if (repr == NULL)
1788 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001789 if (size == 0)
1790 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001791
1792 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001793 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794 while (size-- > 0) {
1795 Py_UNICODE ch = *p++;
1796 if (ch >= 128) {
1797 if (ascii_encoding_error(&p, &s, errors,
1798 "ordinal not in range(128)"))
1799 goto onError;
1800 }
1801 else
1802 *s++ = (char)ch;
1803 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001804 /* Resize if error handling skipped some characters */
1805 if (s - start < PyString_GET_SIZE(repr))
1806 if (_PyString_Resize(&repr, s - start))
1807 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808 return repr;
1809
1810 onError:
1811 Py_DECREF(repr);
1812 return NULL;
1813}
1814
1815PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1816{
1817 if (!PyUnicode_Check(unicode)) {
1818 PyErr_BadArgument();
1819 return NULL;
1820 }
1821 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1822 PyUnicode_GET_SIZE(unicode),
1823 NULL);
1824}
1825
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001826#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001827
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001828/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001829
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001830PyObject *PyUnicode_DecodeMBCS(const char *s,
1831 int size,
1832 const char *errors)
1833{
1834 PyUnicodeObject *v;
1835 Py_UNICODE *p;
1836
1837 /* First get the size of the result */
1838 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001839 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001840 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1841
1842 v = _PyUnicode_New(usize);
1843 if (v == NULL)
1844 return NULL;
1845 if (usize == 0)
1846 return (PyObject *)v;
1847 p = PyUnicode_AS_UNICODE(v);
1848 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1849 Py_DECREF(v);
1850 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1851 }
1852
1853 return (PyObject *)v;
1854}
1855
1856PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1857 int size,
1858 const char *errors)
1859{
1860 PyObject *repr;
1861 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001862 DWORD mbcssize;
1863
1864 /* If there are no characters, bail now! */
1865 if (size==0)
1866 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001867
1868 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001869 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001870 if (mbcssize==0)
1871 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1872
1873 repr = PyString_FromStringAndSize(NULL, mbcssize);
1874 if (repr == NULL)
1875 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001876 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001877 return repr;
1878
1879 /* Do the conversion */
1880 s = PyString_AS_STRING(repr);
1881 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1882 Py_DECREF(repr);
1883 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1884 }
1885 return repr;
1886}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001887
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001888#endif /* MS_WIN32 */
1889
Guido van Rossumd57fd912000-03-10 22:53:23 +00001890/* --- Character Mapping Codec -------------------------------------------- */
1891
1892static
1893int charmap_decoding_error(const char **source,
1894 Py_UNICODE **dest,
1895 const char *errors,
1896 const char *details)
1897{
1898 if ((errors == NULL) ||
1899 (strcmp(errors,"strict") == 0)) {
1900 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001901 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001902 details);
1903 return -1;
1904 }
1905 else if (strcmp(errors,"ignore") == 0) {
1906 return 0;
1907 }
1908 else if (strcmp(errors,"replace") == 0) {
1909 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1910 (*dest)++;
1911 return 0;
1912 }
1913 else {
1914 PyErr_Format(PyExc_ValueError,
1915 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001916 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917 errors);
1918 return -1;
1919 }
1920}
1921
1922PyObject *PyUnicode_DecodeCharmap(const char *s,
1923 int size,
1924 PyObject *mapping,
1925 const char *errors)
1926{
1927 PyUnicodeObject *v;
1928 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001929 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001930
1931 /* Default to Latin-1 */
1932 if (mapping == NULL)
1933 return PyUnicode_DecodeLatin1(s, size, errors);
1934
1935 v = _PyUnicode_New(size);
1936 if (v == NULL)
1937 goto onError;
1938 if (size == 0)
1939 return (PyObject *)v;
1940 p = PyUnicode_AS_UNICODE(v);
1941 while (size-- > 0) {
1942 unsigned char ch = *s++;
1943 PyObject *w, *x;
1944
1945 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1946 w = PyInt_FromLong((long)ch);
1947 if (w == NULL)
1948 goto onError;
1949 x = PyObject_GetItem(mapping, w);
1950 Py_DECREF(w);
1951 if (x == NULL) {
1952 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00001953 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00001955 x = Py_None;
1956 Py_INCREF(x);
1957 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001958 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959 }
1960
1961 /* Apply mapping */
1962 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001963 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001964 if (value < 0 || value > 65535) {
1965 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001966 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967 Py_DECREF(x);
1968 goto onError;
1969 }
1970 *p++ = (Py_UNICODE)value;
1971 }
1972 else if (x == Py_None) {
1973 /* undefined mapping */
1974 if (charmap_decoding_error(&s, &p, errors,
1975 "character maps to <undefined>")) {
1976 Py_DECREF(x);
1977 goto onError;
1978 }
1979 }
1980 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001981 int targetsize = PyUnicode_GET_SIZE(x);
1982
1983 if (targetsize == 1)
1984 /* 1-1 mapping */
1985 *p++ = *PyUnicode_AS_UNICODE(x);
1986
1987 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001989 if (targetsize > extrachars) {
1990 /* resize first */
1991 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
1992 int needed = (targetsize - extrachars) + \
1993 (targetsize << 2);
1994 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001995 if (_PyUnicode_Resize(&v,
1996 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001997 Py_DECREF(x);
1998 goto onError;
1999 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002000 p = PyUnicode_AS_UNICODE(v) + oldpos;
2001 }
2002 Py_UNICODE_COPY(p,
2003 PyUnicode_AS_UNICODE(x),
2004 targetsize);
2005 p += targetsize;
2006 extrachars -= targetsize;
2007 }
2008 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002009 }
2010 else {
2011 /* wrong return value */
2012 PyErr_SetString(PyExc_TypeError,
2013 "character mapping must return integer, None or unicode");
2014 Py_DECREF(x);
2015 goto onError;
2016 }
2017 Py_DECREF(x);
2018 }
2019 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002020 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 goto onError;
2022 return (PyObject *)v;
2023
2024 onError:
2025 Py_XDECREF(v);
2026 return NULL;
2027}
2028
2029static
2030int charmap_encoding_error(const Py_UNICODE **source,
2031 char **dest,
2032 const char *errors,
2033 const char *details)
2034{
2035 if ((errors == NULL) ||
2036 (strcmp(errors,"strict") == 0)) {
2037 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002038 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039 details);
2040 return -1;
2041 }
2042 else if (strcmp(errors,"ignore") == 0) {
2043 return 0;
2044 }
2045 else if (strcmp(errors,"replace") == 0) {
2046 **dest = '?';
2047 (*dest)++;
2048 return 0;
2049 }
2050 else {
2051 PyErr_Format(PyExc_ValueError,
2052 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002053 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 errors);
2055 return -1;
2056 }
2057}
2058
2059PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2060 int size,
2061 PyObject *mapping,
2062 const char *errors)
2063{
2064 PyObject *v;
2065 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002066 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002067
2068 /* Default to Latin-1 */
2069 if (mapping == NULL)
2070 return PyUnicode_EncodeLatin1(p, size, errors);
2071
2072 v = PyString_FromStringAndSize(NULL, size);
2073 if (v == NULL)
2074 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002075 if (size == 0)
2076 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002077 s = PyString_AS_STRING(v);
2078 while (size-- > 0) {
2079 Py_UNICODE ch = *p++;
2080 PyObject *w, *x;
2081
2082 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2083 w = PyInt_FromLong((long)ch);
2084 if (w == NULL)
2085 goto onError;
2086 x = PyObject_GetItem(mapping, w);
2087 Py_DECREF(w);
2088 if (x == NULL) {
2089 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002090 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002091 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002092 x = Py_None;
2093 Py_INCREF(x);
2094 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002095 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002096 }
2097
2098 /* Apply mapping */
2099 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002100 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002101 if (value < 0 || value > 255) {
2102 PyErr_SetString(PyExc_TypeError,
2103 "character mapping must be in range(256)");
2104 Py_DECREF(x);
2105 goto onError;
2106 }
2107 *s++ = (char)value;
2108 }
2109 else if (x == Py_None) {
2110 /* undefined mapping */
2111 if (charmap_encoding_error(&p, &s, errors,
2112 "character maps to <undefined>")) {
2113 Py_DECREF(x);
2114 goto onError;
2115 }
2116 }
2117 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002118 int targetsize = PyString_GET_SIZE(x);
2119
2120 if (targetsize == 1)
2121 /* 1-1 mapping */
2122 *s++ = *PyString_AS_STRING(x);
2123
2124 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002126 if (targetsize > extrachars) {
2127 /* resize first */
2128 int oldpos = (int)(s - PyString_AS_STRING(v));
2129 int needed = (targetsize - extrachars) + \
2130 (targetsize << 2);
2131 extrachars += needed;
2132 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002133 Py_DECREF(x);
2134 goto onError;
2135 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002136 s = PyString_AS_STRING(v) + oldpos;
2137 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002138 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002139 s += targetsize;
2140 extrachars -= targetsize;
2141 }
2142 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002143 }
2144 else {
2145 /* wrong return value */
2146 PyErr_SetString(PyExc_TypeError,
2147 "character mapping must return integer, None or unicode");
2148 Py_DECREF(x);
2149 goto onError;
2150 }
2151 Py_DECREF(x);
2152 }
2153 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2154 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2155 goto onError;
2156 return v;
2157
2158 onError:
2159 Py_DECREF(v);
2160 return NULL;
2161}
2162
2163PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2164 PyObject *mapping)
2165{
2166 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2167 PyErr_BadArgument();
2168 return NULL;
2169 }
2170 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2171 PyUnicode_GET_SIZE(unicode),
2172 mapping,
2173 NULL);
2174}
2175
2176static
2177int translate_error(const Py_UNICODE **source,
2178 Py_UNICODE **dest,
2179 const char *errors,
2180 const char *details)
2181{
2182 if ((errors == NULL) ||
2183 (strcmp(errors,"strict") == 0)) {
2184 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002185 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 details);
2187 return -1;
2188 }
2189 else if (strcmp(errors,"ignore") == 0) {
2190 return 0;
2191 }
2192 else if (strcmp(errors,"replace") == 0) {
2193 **dest = '?';
2194 (*dest)++;
2195 return 0;
2196 }
2197 else {
2198 PyErr_Format(PyExc_ValueError,
2199 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002200 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201 errors);
2202 return -1;
2203 }
2204}
2205
2206PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2207 int size,
2208 PyObject *mapping,
2209 const char *errors)
2210{
2211 PyUnicodeObject *v;
2212 Py_UNICODE *p;
2213
2214 if (mapping == NULL) {
2215 PyErr_BadArgument();
2216 return NULL;
2217 }
2218
2219 /* Output will never be longer than input */
2220 v = _PyUnicode_New(size);
2221 if (v == NULL)
2222 goto onError;
2223 if (size == 0)
2224 goto done;
2225 p = PyUnicode_AS_UNICODE(v);
2226 while (size-- > 0) {
2227 Py_UNICODE ch = *s++;
2228 PyObject *w, *x;
2229
2230 /* Get mapping */
2231 w = PyInt_FromLong(ch);
2232 if (w == NULL)
2233 goto onError;
2234 x = PyObject_GetItem(mapping, w);
2235 Py_DECREF(w);
2236 if (x == NULL) {
2237 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2238 /* No mapping found: default to 1-1 mapping */
2239 PyErr_Clear();
2240 *p++ = ch;
2241 continue;
2242 }
2243 goto onError;
2244 }
2245
2246 /* Apply mapping */
2247 if (PyInt_Check(x))
2248 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2249 else if (x == Py_None) {
2250 /* undefined mapping */
2251 if (translate_error(&s, &p, errors,
2252 "character maps to <undefined>")) {
2253 Py_DECREF(x);
2254 goto onError;
2255 }
2256 }
2257 else if (PyUnicode_Check(x)) {
2258 if (PyUnicode_GET_SIZE(x) != 1) {
2259 /* 1-n mapping */
2260 PyErr_SetString(PyExc_NotImplementedError,
2261 "1-n mappings are currently not implemented");
2262 Py_DECREF(x);
2263 goto onError;
2264 }
2265 *p++ = *PyUnicode_AS_UNICODE(x);
2266 }
2267 else {
2268 /* wrong return value */
2269 PyErr_SetString(PyExc_TypeError,
2270 "translate mapping must return integer, None or unicode");
2271 Py_DECREF(x);
2272 goto onError;
2273 }
2274 Py_DECREF(x);
2275 }
2276 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002277 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002278 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279
2280 done:
2281 return (PyObject *)v;
2282
2283 onError:
2284 Py_XDECREF(v);
2285 return NULL;
2286}
2287
2288PyObject *PyUnicode_Translate(PyObject *str,
2289 PyObject *mapping,
2290 const char *errors)
2291{
2292 PyObject *result;
2293
2294 str = PyUnicode_FromObject(str);
2295 if (str == NULL)
2296 goto onError;
2297 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2298 PyUnicode_GET_SIZE(str),
2299 mapping,
2300 errors);
2301 Py_DECREF(str);
2302 return result;
2303
2304 onError:
2305 Py_XDECREF(str);
2306 return NULL;
2307}
2308
Guido van Rossum9e896b32000-04-05 20:11:21 +00002309/* --- Decimal Encoder ---------------------------------------------------- */
2310
2311int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2312 int length,
2313 char *output,
2314 const char *errors)
2315{
2316 Py_UNICODE *p, *end;
2317
2318 if (output == NULL) {
2319 PyErr_BadArgument();
2320 return -1;
2321 }
2322
2323 p = s;
2324 end = s + length;
2325 while (p < end) {
2326 register Py_UNICODE ch = *p++;
2327 int decimal;
2328
2329 if (Py_UNICODE_ISSPACE(ch)) {
2330 *output++ = ' ';
2331 continue;
2332 }
2333 decimal = Py_UNICODE_TODECIMAL(ch);
2334 if (decimal >= 0) {
2335 *output++ = '0' + decimal;
2336 continue;
2337 }
Guido van Rossumba477042000-04-06 18:18:10 +00002338 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002339 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002340 continue;
2341 }
2342 /* All other characters are considered invalid */
2343 if (errors == NULL || strcmp(errors, "strict") == 0) {
2344 PyErr_SetString(PyExc_ValueError,
2345 "invalid decimal Unicode string");
2346 goto onError;
2347 }
2348 else if (strcmp(errors, "ignore") == 0)
2349 continue;
2350 else if (strcmp(errors, "replace") == 0) {
2351 *output++ = '?';
2352 continue;
2353 }
2354 }
2355 /* 0-terminate the output string */
2356 *output++ = '\0';
2357 return 0;
2358
2359 onError:
2360 return -1;
2361}
2362
Guido van Rossumd57fd912000-03-10 22:53:23 +00002363/* --- Helpers ------------------------------------------------------------ */
2364
2365static
2366int count(PyUnicodeObject *self,
2367 int start,
2368 int end,
2369 PyUnicodeObject *substring)
2370{
2371 int count = 0;
2372
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002373 if (start < 0)
2374 start += self->length;
2375 if (start < 0)
2376 start = 0;
2377 if (end > self->length)
2378 end = self->length;
2379 if (end < 0)
2380 end += self->length;
2381 if (end < 0)
2382 end = 0;
2383
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002384 if (substring->length == 0)
2385 return (end - start + 1);
2386
Guido van Rossumd57fd912000-03-10 22:53:23 +00002387 end -= substring->length;
2388
2389 while (start <= end)
2390 if (Py_UNICODE_MATCH(self, start, substring)) {
2391 count++;
2392 start += substring->length;
2393 } else
2394 start++;
2395
2396 return count;
2397}
2398
2399int PyUnicode_Count(PyObject *str,
2400 PyObject *substr,
2401 int start,
2402 int end)
2403{
2404 int result;
2405
2406 str = PyUnicode_FromObject(str);
2407 if (str == NULL)
2408 return -1;
2409 substr = PyUnicode_FromObject(substr);
2410 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002411 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002412 return -1;
2413 }
2414
2415 result = count((PyUnicodeObject *)str,
2416 start, end,
2417 (PyUnicodeObject *)substr);
2418
2419 Py_DECREF(str);
2420 Py_DECREF(substr);
2421 return result;
2422}
2423
2424static
2425int findstring(PyUnicodeObject *self,
2426 PyUnicodeObject *substring,
2427 int start,
2428 int end,
2429 int direction)
2430{
2431 if (start < 0)
2432 start += self->length;
2433 if (start < 0)
2434 start = 0;
2435
2436 if (substring->length == 0)
2437 return start;
2438
2439 if (end > self->length)
2440 end = self->length;
2441 if (end < 0)
2442 end += self->length;
2443 if (end < 0)
2444 end = 0;
2445
2446 end -= substring->length;
2447
2448 if (direction < 0) {
2449 for (; end >= start; end--)
2450 if (Py_UNICODE_MATCH(self, end, substring))
2451 return end;
2452 } else {
2453 for (; start <= end; start++)
2454 if (Py_UNICODE_MATCH(self, start, substring))
2455 return start;
2456 }
2457
2458 return -1;
2459}
2460
2461int PyUnicode_Find(PyObject *str,
2462 PyObject *substr,
2463 int start,
2464 int end,
2465 int direction)
2466{
2467 int result;
2468
2469 str = PyUnicode_FromObject(str);
2470 if (str == NULL)
2471 return -1;
2472 substr = PyUnicode_FromObject(substr);
2473 if (substr == NULL) {
2474 Py_DECREF(substr);
2475 return -1;
2476 }
2477
2478 result = findstring((PyUnicodeObject *)str,
2479 (PyUnicodeObject *)substr,
2480 start, end, direction);
2481 Py_DECREF(str);
2482 Py_DECREF(substr);
2483 return result;
2484}
2485
2486static
2487int tailmatch(PyUnicodeObject *self,
2488 PyUnicodeObject *substring,
2489 int start,
2490 int end,
2491 int direction)
2492{
2493 if (start < 0)
2494 start += self->length;
2495 if (start < 0)
2496 start = 0;
2497
2498 if (substring->length == 0)
2499 return 1;
2500
2501 if (end > self->length)
2502 end = self->length;
2503 if (end < 0)
2504 end += self->length;
2505 if (end < 0)
2506 end = 0;
2507
2508 end -= substring->length;
2509 if (end < start)
2510 return 0;
2511
2512 if (direction > 0) {
2513 if (Py_UNICODE_MATCH(self, end, substring))
2514 return 1;
2515 } else {
2516 if (Py_UNICODE_MATCH(self, start, substring))
2517 return 1;
2518 }
2519
2520 return 0;
2521}
2522
2523int PyUnicode_Tailmatch(PyObject *str,
2524 PyObject *substr,
2525 int start,
2526 int end,
2527 int direction)
2528{
2529 int result;
2530
2531 str = PyUnicode_FromObject(str);
2532 if (str == NULL)
2533 return -1;
2534 substr = PyUnicode_FromObject(substr);
2535 if (substr == NULL) {
2536 Py_DECREF(substr);
2537 return -1;
2538 }
2539
2540 result = tailmatch((PyUnicodeObject *)str,
2541 (PyUnicodeObject *)substr,
2542 start, end, direction);
2543 Py_DECREF(str);
2544 Py_DECREF(substr);
2545 return result;
2546}
2547
2548static
2549const Py_UNICODE *findchar(const Py_UNICODE *s,
2550 int size,
2551 Py_UNICODE ch)
2552{
2553 /* like wcschr, but doesn't stop at NULL characters */
2554
2555 while (size-- > 0) {
2556 if (*s == ch)
2557 return s;
2558 s++;
2559 }
2560
2561 return NULL;
2562}
2563
2564/* Apply fixfct filter to the Unicode object self and return a
2565 reference to the modified object */
2566
2567static
2568PyObject *fixup(PyUnicodeObject *self,
2569 int (*fixfct)(PyUnicodeObject *s))
2570{
2571
2572 PyUnicodeObject *u;
2573
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002574 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575 if (u == NULL)
2576 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002577
2578 Py_UNICODE_COPY(u->str, self->str, self->length);
2579
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 if (!fixfct(u)) {
2581 /* fixfct should return TRUE if it modified the buffer. If
2582 FALSE, return a reference to the original buffer instead
2583 (to save space, not time) */
2584 Py_INCREF(self);
2585 Py_DECREF(u);
2586 return (PyObject*) self;
2587 }
2588 return (PyObject*) u;
2589}
2590
2591static
2592int fixupper(PyUnicodeObject *self)
2593{
2594 int len = self->length;
2595 Py_UNICODE *s = self->str;
2596 int status = 0;
2597
2598 while (len-- > 0) {
2599 register Py_UNICODE ch;
2600
2601 ch = Py_UNICODE_TOUPPER(*s);
2602 if (ch != *s) {
2603 status = 1;
2604 *s = ch;
2605 }
2606 s++;
2607 }
2608
2609 return status;
2610}
2611
2612static
2613int fixlower(PyUnicodeObject *self)
2614{
2615 int len = self->length;
2616 Py_UNICODE *s = self->str;
2617 int status = 0;
2618
2619 while (len-- > 0) {
2620 register Py_UNICODE ch;
2621
2622 ch = Py_UNICODE_TOLOWER(*s);
2623 if (ch != *s) {
2624 status = 1;
2625 *s = ch;
2626 }
2627 s++;
2628 }
2629
2630 return status;
2631}
2632
2633static
2634int fixswapcase(PyUnicodeObject *self)
2635{
2636 int len = self->length;
2637 Py_UNICODE *s = self->str;
2638 int status = 0;
2639
2640 while (len-- > 0) {
2641 if (Py_UNICODE_ISUPPER(*s)) {
2642 *s = Py_UNICODE_TOLOWER(*s);
2643 status = 1;
2644 } else if (Py_UNICODE_ISLOWER(*s)) {
2645 *s = Py_UNICODE_TOUPPER(*s);
2646 status = 1;
2647 }
2648 s++;
2649 }
2650
2651 return status;
2652}
2653
2654static
2655int fixcapitalize(PyUnicodeObject *self)
2656{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002657 int len = self->length;
2658 Py_UNICODE *s = self->str;
2659 int status = 0;
2660
2661 if (len == 0)
2662 return 0;
2663 if (Py_UNICODE_ISLOWER(*s)) {
2664 *s = Py_UNICODE_TOUPPER(*s);
2665 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002667 s++;
2668 while (--len > 0) {
2669 if (Py_UNICODE_ISUPPER(*s)) {
2670 *s = Py_UNICODE_TOLOWER(*s);
2671 status = 1;
2672 }
2673 s++;
2674 }
2675 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676}
2677
2678static
2679int fixtitle(PyUnicodeObject *self)
2680{
2681 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2682 register Py_UNICODE *e;
2683 int previous_is_cased;
2684
2685 /* Shortcut for single character strings */
2686 if (PyUnicode_GET_SIZE(self) == 1) {
2687 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2688 if (*p != ch) {
2689 *p = ch;
2690 return 1;
2691 }
2692 else
2693 return 0;
2694 }
2695
2696 e = p + PyUnicode_GET_SIZE(self);
2697 previous_is_cased = 0;
2698 for (; p < e; p++) {
2699 register const Py_UNICODE ch = *p;
2700
2701 if (previous_is_cased)
2702 *p = Py_UNICODE_TOLOWER(ch);
2703 else
2704 *p = Py_UNICODE_TOTITLE(ch);
2705
2706 if (Py_UNICODE_ISLOWER(ch) ||
2707 Py_UNICODE_ISUPPER(ch) ||
2708 Py_UNICODE_ISTITLE(ch))
2709 previous_is_cased = 1;
2710 else
2711 previous_is_cased = 0;
2712 }
2713 return 1;
2714}
2715
2716PyObject *PyUnicode_Join(PyObject *separator,
2717 PyObject *seq)
2718{
2719 Py_UNICODE *sep;
2720 int seplen;
2721 PyUnicodeObject *res = NULL;
2722 int reslen = 0;
2723 Py_UNICODE *p;
2724 int seqlen = 0;
2725 int sz = 100;
2726 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00002727 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002728
Tim Peters2cfe3682001-05-05 05:36:48 +00002729 it = PyObject_GetIter(seq);
2730 if (it == NULL)
2731 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732
2733 if (separator == NULL) {
2734 Py_UNICODE blank = ' ';
2735 sep = &blank;
2736 seplen = 1;
2737 }
2738 else {
2739 separator = PyUnicode_FromObject(separator);
2740 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00002741 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002742 sep = PyUnicode_AS_UNICODE(separator);
2743 seplen = PyUnicode_GET_SIZE(separator);
2744 }
2745
2746 res = _PyUnicode_New(sz);
2747 if (res == NULL)
2748 goto onError;
2749 p = PyUnicode_AS_UNICODE(res);
2750 reslen = 0;
2751
Tim Peters2cfe3682001-05-05 05:36:48 +00002752 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002753 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00002754 PyObject *item = PyIter_Next(it);
2755 if (item == NULL) {
2756 if (PyErr_Occurred())
2757 goto onError;
2758 break;
2759 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002760 if (!PyUnicode_Check(item)) {
2761 PyObject *v;
2762 v = PyUnicode_FromObject(item);
2763 Py_DECREF(item);
2764 item = v;
2765 if (item == NULL)
2766 goto onError;
2767 }
2768 itemlen = PyUnicode_GET_SIZE(item);
2769 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002770 if (_PyUnicode_Resize(&res, sz*2))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002771 goto onError;
2772 sz *= 2;
2773 p = PyUnicode_AS_UNICODE(res) + reslen;
2774 }
2775 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002776 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002777 p += seplen;
2778 reslen += seplen;
2779 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002780 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002781 p += itemlen;
2782 reslen += itemlen;
2783 Py_DECREF(item);
2784 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002785 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786 goto onError;
2787
2788 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002789 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002790 return (PyObject *)res;
2791
2792 onError:
2793 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002794 Py_XDECREF(res);
2795 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002796 return NULL;
2797}
2798
2799static
2800PyUnicodeObject *pad(PyUnicodeObject *self,
2801 int left,
2802 int right,
2803 Py_UNICODE fill)
2804{
2805 PyUnicodeObject *u;
2806
2807 if (left < 0)
2808 left = 0;
2809 if (right < 0)
2810 right = 0;
2811
2812 if (left == 0 && right == 0) {
2813 Py_INCREF(self);
2814 return self;
2815 }
2816
2817 u = _PyUnicode_New(left + self->length + right);
2818 if (u) {
2819 if (left)
2820 Py_UNICODE_FILL(u->str, fill, left);
2821 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2822 if (right)
2823 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2824 }
2825
2826 return u;
2827}
2828
/* Append the slice data[left:right] to the list being built.

   This macro relies on names from the function that uses it: a
   PyObject *str scratch variable, the PyObject *list receiving the
   items, and an onError label that is jumped to when creating the
   slice or appending it fails.  The list keeps its own reference to
   the new string, so the local one is always dropped.

   The body is wrapped in do { } while (0) so that the macro behaves as
   a single statement; invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2839
2840static
2841PyObject *split_whitespace(PyUnicodeObject *self,
2842 PyObject *list,
2843 int maxcount)
2844{
2845 register int i;
2846 register int j;
2847 int len = self->length;
2848 PyObject *str;
2849
2850 for (i = j = 0; i < len; ) {
2851 /* find a token */
2852 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2853 i++;
2854 j = i;
2855 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2856 i++;
2857 if (j < i) {
2858 if (maxcount-- <= 0)
2859 break;
2860 SPLIT_APPEND(self->str, j, i);
2861 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2862 i++;
2863 j = i;
2864 }
2865 }
2866 if (j < len) {
2867 SPLIT_APPEND(self->str, j, len);
2868 }
2869 return list;
2870
2871 onError:
2872 Py_DECREF(list);
2873 return NULL;
2874}
2875
2876PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002877 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002878{
2879 register int i;
2880 register int j;
2881 int len;
2882 PyObject *list;
2883 PyObject *str;
2884 Py_UNICODE *data;
2885
2886 string = PyUnicode_FromObject(string);
2887 if (string == NULL)
2888 return NULL;
2889 data = PyUnicode_AS_UNICODE(string);
2890 len = PyUnicode_GET_SIZE(string);
2891
Guido van Rossumd57fd912000-03-10 22:53:23 +00002892 list = PyList_New(0);
2893 if (!list)
2894 goto onError;
2895
2896 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002897 int eol;
2898
Guido van Rossumd57fd912000-03-10 22:53:23 +00002899 /* Find a line and append it */
2900 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2901 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002902
2903 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002904 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905 if (i < len) {
2906 if (data[i] == '\r' && i + 1 < len &&
2907 data[i+1] == '\n')
2908 i += 2;
2909 else
2910 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002911 if (keepends)
2912 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002913 }
Guido van Rossum86662912000-04-11 15:38:46 +00002914 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002915 j = i;
2916 }
2917 if (j < len) {
2918 SPLIT_APPEND(data, j, len);
2919 }
2920
2921 Py_DECREF(string);
2922 return list;
2923
2924 onError:
2925 Py_DECREF(list);
2926 Py_DECREF(string);
2927 return NULL;
2928}
2929
2930static
2931PyObject *split_char(PyUnicodeObject *self,
2932 PyObject *list,
2933 Py_UNICODE ch,
2934 int maxcount)
2935{
2936 register int i;
2937 register int j;
2938 int len = self->length;
2939 PyObject *str;
2940
2941 for (i = j = 0; i < len; ) {
2942 if (self->str[i] == ch) {
2943 if (maxcount-- <= 0)
2944 break;
2945 SPLIT_APPEND(self->str, j, i);
2946 i = j = i + 1;
2947 } else
2948 i++;
2949 }
2950 if (j <= len) {
2951 SPLIT_APPEND(self->str, j, len);
2952 }
2953 return list;
2954
2955 onError:
2956 Py_DECREF(list);
2957 return NULL;
2958}
2959
2960static
2961PyObject *split_substring(PyUnicodeObject *self,
2962 PyObject *list,
2963 PyUnicodeObject *substring,
2964 int maxcount)
2965{
2966 register int i;
2967 register int j;
2968 int len = self->length;
2969 int sublen = substring->length;
2970 PyObject *str;
2971
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00002972 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002973 if (Py_UNICODE_MATCH(self, i, substring)) {
2974 if (maxcount-- <= 0)
2975 break;
2976 SPLIT_APPEND(self->str, j, i);
2977 i = j = i + sublen;
2978 } else
2979 i++;
2980 }
2981 if (j <= len) {
2982 SPLIT_APPEND(self->str, j, len);
2983 }
2984 return list;
2985
2986 onError:
2987 Py_DECREF(list);
2988 return NULL;
2989}
2990
2991#undef SPLIT_APPEND
2992
2993static
2994PyObject *split(PyUnicodeObject *self,
2995 PyUnicodeObject *substring,
2996 int maxcount)
2997{
2998 PyObject *list;
2999
3000 if (maxcount < 0)
3001 maxcount = INT_MAX;
3002
3003 list = PyList_New(0);
3004 if (!list)
3005 return NULL;
3006
3007 if (substring == NULL)
3008 return split_whitespace(self,list,maxcount);
3009
3010 else if (substring->length == 1)
3011 return split_char(self,list,substring->str[0],maxcount);
3012
3013 else if (substring->length == 0) {
3014 Py_DECREF(list);
3015 PyErr_SetString(PyExc_ValueError, "empty separator");
3016 return NULL;
3017 }
3018 else
3019 return split_substring(self,list,substring,maxcount);
3020}
3021
3022static
3023PyObject *strip(PyUnicodeObject *self,
3024 int left,
3025 int right)
3026{
3027 Py_UNICODE *p = self->str;
3028 int start = 0;
3029 int end = self->length;
3030
3031 if (left)
3032 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3033 start++;
3034
3035 if (right)
3036 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3037 end--;
3038
3039 if (start == 0 && end == self->length) {
3040 /* couldn't strip anything off, return original string */
3041 Py_INCREF(self);
3042 return (PyObject*) self;
3043 }
3044
3045 return (PyObject*) PyUnicode_FromUnicode(
3046 self->str + start,
3047 end - start
3048 );
3049}
3050
3051static
3052PyObject *replace(PyUnicodeObject *self,
3053 PyUnicodeObject *str1,
3054 PyUnicodeObject *str2,
3055 int maxcount)
3056{
3057 PyUnicodeObject *u;
3058
3059 if (maxcount < 0)
3060 maxcount = INT_MAX;
3061
3062 if (str1->length == 1 && str2->length == 1) {
3063 int i;
3064
3065 /* replace characters */
3066 if (!findchar(self->str, self->length, str1->str[0])) {
3067 /* nothing to replace, return original string */
3068 Py_INCREF(self);
3069 u = self;
3070 } else {
3071 Py_UNICODE u1 = str1->str[0];
3072 Py_UNICODE u2 = str2->str[0];
3073
3074 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003075 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003076 self->length
3077 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003078 if (u != NULL) {
3079 Py_UNICODE_COPY(u->str, self->str,
3080 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081 for (i = 0; i < u->length; i++)
3082 if (u->str[i] == u1) {
3083 if (--maxcount < 0)
3084 break;
3085 u->str[i] = u2;
3086 }
3087 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003088 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003089
3090 } else {
3091 int n, i;
3092 Py_UNICODE *p;
3093
3094 /* replace strings */
3095 n = count(self, 0, self->length, str1);
3096 if (n > maxcount)
3097 n = maxcount;
3098 if (n == 0) {
3099 /* nothing to replace, return original string */
3100 Py_INCREF(self);
3101 u = self;
3102 } else {
3103 u = _PyUnicode_New(
3104 self->length + n * (str2->length - str1->length));
3105 if (u) {
3106 i = 0;
3107 p = u->str;
3108 while (i <= self->length - str1->length)
3109 if (Py_UNICODE_MATCH(self, i, str1)) {
3110 /* replace string segment */
3111 Py_UNICODE_COPY(p, str2->str, str2->length);
3112 p += str2->length;
3113 i += str1->length;
3114 if (--n <= 0) {
3115 /* copy remaining part */
3116 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3117 break;
3118 }
3119 } else
3120 *p++ = self->str[i++];
3121 }
3122 }
3123 }
3124
3125 return (PyObject *) u;
3126}
3127
3128/* --- Unicode Object Methods --------------------------------------------- */
3129
3130static char title__doc__[] =
3131"S.title() -> unicode\n\
3132\n\
3133Return a titlecased version of S, i.e. words start with title case\n\
3134characters, all remaining cased characters have lower case.";
3135
3136static PyObject*
3137unicode_title(PyUnicodeObject *self, PyObject *args)
3138{
3139 if (!PyArg_NoArgs(args))
3140 return NULL;
3141 return fixup(self, fixtitle);
3142}
3143
3144static char capitalize__doc__[] =
3145"S.capitalize() -> unicode\n\
3146\n\
3147Return a capitalized version of S, i.e. make the first character\n\
3148have upper case.";
3149
3150static PyObject*
3151unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3152{
3153 if (!PyArg_NoArgs(args))
3154 return NULL;
3155 return fixup(self, fixcapitalize);
3156}
3157
#if 0
/* Disabled: this method is compiled out. */
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *joined;
    int idx;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Break the string into whitespace-separated words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap every word for its capitalized version */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *fixed = fixup(
            (PyUnicodeObject *)PyList_GET_ITEM(words, idx),
            fixcapitalize);
        if (fixed == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, fixed);
    }

    /* Glue the words back together */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
3198
3199static char center__doc__[] =
3200"S.center(width) -> unicode\n\
3201\n\
3202Return S centered in a Unicode string of length width. Padding is done\n\
3203using spaces.";
3204
3205static PyObject *
3206unicode_center(PyUnicodeObject *self, PyObject *args)
3207{
3208 int marg, left;
3209 int width;
3210
3211 if (!PyArg_ParseTuple(args, "i:center", &width))
3212 return NULL;
3213
3214 if (self->length >= width) {
3215 Py_INCREF(self);
3216 return (PyObject*) self;
3217 }
3218
3219 marg = width - self->length;
3220 left = marg / 2 + (marg & width & 1);
3221
3222 return (PyObject*) pad(self, left, marg - left, ' ');
3223}
3224
Marc-André Lemburge5034372000-08-08 08:04:29 +00003225#if 0
3226
3227/* This code should go into some future Unicode collation support
3228 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003229 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003230
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003231/* speedy UTF-16 code point order comparison */
3232/* gleaned from: */
3233/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3234
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003235static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003236{
3237 0, 0, 0, 0, 0, 0, 0, 0,
3238 0, 0, 0, 0, 0, 0, 0, 0,
3239 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003240 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003241};
3242
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243static int
3244unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3245{
3246 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003247
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248 Py_UNICODE *s1 = str1->str;
3249 Py_UNICODE *s2 = str2->str;
3250
3251 len1 = str1->length;
3252 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003253
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003255 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003256 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003257
3258 c1 = *s1++;
3259 c2 = *s2++;
3260 if (c1 > (1<<11) * 26)
3261 c1 += utf16Fixup[c1>>11];
3262 if (c2 > (1<<11) * 26)
3263 c2 += utf16Fixup[c2>>11];
3264
3265 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003266 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003267 if (diff)
3268 return (diff < 0) ? -1 : (diff != 0);
3269 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270 }
3271
3272 return (len1 < len2) ? -1 : (len1 != len2);
3273}
3274
Marc-André Lemburge5034372000-08-08 08:04:29 +00003275#else
3276
3277static int
3278unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3279{
3280 register int len1, len2;
3281
3282 Py_UNICODE *s1 = str1->str;
3283 Py_UNICODE *s2 = str2->str;
3284
3285 len1 = str1->length;
3286 len2 = str2->length;
3287
3288 while (len1 > 0 && len2 > 0) {
3289 register long diff;
3290
3291 diff = (long)*s1++ - (long)*s2++;
3292 if (diff)
3293 return (diff < 0) ? -1 : (diff != 0);
3294 len1--; len2--;
3295 }
3296
3297 return (len1 < len2) ? -1 : (len1 != len2);
3298}
3299
3300#endif
3301
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302int PyUnicode_Compare(PyObject *left,
3303 PyObject *right)
3304{
3305 PyUnicodeObject *u = NULL, *v = NULL;
3306 int result;
3307
3308 /* Coerce the two arguments */
3309 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3310 if (u == NULL)
3311 goto onError;
3312 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3313 if (v == NULL)
3314 goto onError;
3315
Thomas Wouters7e474022000-07-16 12:04:32 +00003316 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317 if (v == u) {
3318 Py_DECREF(u);
3319 Py_DECREF(v);
3320 return 0;
3321 }
3322
3323 result = unicode_compare(u, v);
3324
3325 Py_DECREF(u);
3326 Py_DECREF(v);
3327 return result;
3328
3329onError:
3330 Py_XDECREF(u);
3331 Py_XDECREF(v);
3332 return -1;
3333}
3334
Guido van Rossum403d68b2000-03-13 15:55:09 +00003335int PyUnicode_Contains(PyObject *container,
3336 PyObject *element)
3337{
3338 PyUnicodeObject *u = NULL, *v = NULL;
3339 int result;
3340 register const Py_UNICODE *p, *e;
3341 register Py_UNICODE ch;
3342
3343 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003344 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003345 if (v == NULL) {
3346 PyErr_SetString(PyExc_TypeError,
3347 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003348 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003349 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003350 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3351 if (u == NULL) {
3352 Py_DECREF(v);
3353 goto onError;
3354 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003355
3356 /* Check v in u */
3357 if (PyUnicode_GET_SIZE(v) != 1) {
3358 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003359 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003360 goto onError;
3361 }
3362 ch = *PyUnicode_AS_UNICODE(v);
3363 p = PyUnicode_AS_UNICODE(u);
3364 e = p + PyUnicode_GET_SIZE(u);
3365 result = 0;
3366 while (p < e) {
3367 if (*p++ == ch) {
3368 result = 1;
3369 break;
3370 }
3371 }
3372
3373 Py_DECREF(u);
3374 Py_DECREF(v);
3375 return result;
3376
3377onError:
3378 Py_XDECREF(u);
3379 Py_XDECREF(v);
3380 return -1;
3381}
3382
Guido van Rossumd57fd912000-03-10 22:53:23 +00003383/* Concat to string or Unicode object giving a new Unicode object. */
3384
3385PyObject *PyUnicode_Concat(PyObject *left,
3386 PyObject *right)
3387{
3388 PyUnicodeObject *u = NULL, *v = NULL, *w;
3389
3390 /* Coerce the two arguments */
3391 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3392 if (u == NULL)
3393 goto onError;
3394 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3395 if (v == NULL)
3396 goto onError;
3397
3398 /* Shortcuts */
3399 if (v == unicode_empty) {
3400 Py_DECREF(v);
3401 return (PyObject *)u;
3402 }
3403 if (u == unicode_empty) {
3404 Py_DECREF(u);
3405 return (PyObject *)v;
3406 }
3407
3408 /* Concat the two Unicode strings */
3409 w = _PyUnicode_New(u->length + v->length);
3410 if (w == NULL)
3411 goto onError;
3412 Py_UNICODE_COPY(w->str, u->str, u->length);
3413 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3414
3415 Py_DECREF(u);
3416 Py_DECREF(v);
3417 return (PyObject *)w;
3418
3419onError:
3420 Py_XDECREF(u);
3421 Py_XDECREF(v);
3422 return NULL;
3423}
3424
3425static char count__doc__[] =
3426"S.count(sub[, start[, end]]) -> int\n\
3427\n\
3428Return the number of occurrences of substring sub in Unicode string\n\
3429S[start:end]. Optional arguments start and end are\n\
3430interpreted as in slice notation.";
3431
3432static PyObject *
3433unicode_count(PyUnicodeObject *self, PyObject *args)
3434{
3435 PyUnicodeObject *substring;
3436 int start = 0;
3437 int end = INT_MAX;
3438 PyObject *result;
3439
Guido van Rossumb8872e62000-05-09 14:14:27 +00003440 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3441 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003442 return NULL;
3443
3444 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3445 (PyObject *)substring);
3446 if (substring == NULL)
3447 return NULL;
3448
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449 if (start < 0)
3450 start += self->length;
3451 if (start < 0)
3452 start = 0;
3453 if (end > self->length)
3454 end = self->length;
3455 if (end < 0)
3456 end += self->length;
3457 if (end < 0)
3458 end = 0;
3459
3460 result = PyInt_FromLong((long) count(self, start, end, substring));
3461
3462 Py_DECREF(substring);
3463 return result;
3464}
3465
3466static char encode__doc__[] =
3467"S.encode([encoding[,errors]]) -> string\n\
3468\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003469Return an encoded string version of S. Default encoding is the current\n\
3470default string encoding. errors may be given to set a different error\n\
3471handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3472a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003473
3474static PyObject *
3475unicode_encode(PyUnicodeObject *self, PyObject *args)
3476{
3477 char *encoding = NULL;
3478 char *errors = NULL;
3479 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3480 return NULL;
3481 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3482}
3483
3484static char expandtabs__doc__[] =
3485"S.expandtabs([tabsize]) -> unicode\n\
3486\n\
3487Return a copy of S where all tab characters are expanded using spaces.\n\
3488If tabsize is not given, a tab size of 8 characters is assumed.";
3489
3490static PyObject*
3491unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3492{
3493 Py_UNICODE *e;
3494 Py_UNICODE *p;
3495 Py_UNICODE *q;
3496 int i, j;
3497 PyUnicodeObject *u;
3498 int tabsize = 8;
3499
3500 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3501 return NULL;
3502
Thomas Wouters7e474022000-07-16 12:04:32 +00003503 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003504 i = j = 0;
3505 e = self->str + self->length;
3506 for (p = self->str; p < e; p++)
3507 if (*p == '\t') {
3508 if (tabsize > 0)
3509 j += tabsize - (j % tabsize);
3510 }
3511 else {
3512 j++;
3513 if (*p == '\n' || *p == '\r') {
3514 i += j;
3515 j = 0;
3516 }
3517 }
3518
3519 /* Second pass: create output string and fill it */
3520 u = _PyUnicode_New(i + j);
3521 if (!u)
3522 return NULL;
3523
3524 j = 0;
3525 q = u->str;
3526
3527 for (p = self->str; p < e; p++)
3528 if (*p == '\t') {
3529 if (tabsize > 0) {
3530 i = tabsize - (j % tabsize);
3531 j += i;
3532 while (i--)
3533 *q++ = ' ';
3534 }
3535 }
3536 else {
3537 j++;
3538 *q++ = *p;
3539 if (*p == '\n' || *p == '\r')
3540 j = 0;
3541 }
3542
3543 return (PyObject*) u;
3544}
3545
3546static char find__doc__[] =
3547"S.find(sub [,start [,end]]) -> int\n\
3548\n\
3549Return the lowest index in S where substring sub is found,\n\
3550such that sub is contained within s[start,end]. Optional\n\
3551arguments start and end are interpreted as in slice notation.\n\
3552\n\
3553Return -1 on failure.";
3554
3555static PyObject *
3556unicode_find(PyUnicodeObject *self, PyObject *args)
3557{
3558 PyUnicodeObject *substring;
3559 int start = 0;
3560 int end = INT_MAX;
3561 PyObject *result;
3562
Guido van Rossumb8872e62000-05-09 14:14:27 +00003563 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3564 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003565 return NULL;
3566 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3567 (PyObject *)substring);
3568 if (substring == NULL)
3569 return NULL;
3570
3571 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3572
3573 Py_DECREF(substring);
3574 return result;
3575}
3576
3577static PyObject *
3578unicode_getitem(PyUnicodeObject *self, int index)
3579{
3580 if (index < 0 || index >= self->length) {
3581 PyErr_SetString(PyExc_IndexError, "string index out of range");
3582 return NULL;
3583 }
3584
3585 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3586}
3587
3588static long
3589unicode_hash(PyUnicodeObject *self)
3590{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003591 /* Since Unicode objects compare equal to their ASCII string
3592 counterparts, they should use the individual character values
3593 as basis for their hash value. This is needed to assure that
3594 strings and Unicode objects behave in the same way as
3595 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596
Fredrik Lundhdde61642000-07-10 18:27:47 +00003597 register int len;
3598 register Py_UNICODE *p;
3599 register long x;
3600
Guido van Rossumd57fd912000-03-10 22:53:23 +00003601 if (self->hash != -1)
3602 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003603 len = PyUnicode_GET_SIZE(self);
3604 p = PyUnicode_AS_UNICODE(self);
3605 x = *p << 7;
3606 while (--len >= 0)
3607 x = (1000003*x) ^ *p++;
3608 x ^= PyUnicode_GET_SIZE(self);
3609 if (x == -1)
3610 x = -2;
3611 self->hash = x;
3612 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003613}
3614
3615static char index__doc__[] =
3616"S.index(sub [,start [,end]]) -> int\n\
3617\n\
3618Like S.find() but raise ValueError when the substring is not found.";
3619
3620static PyObject *
3621unicode_index(PyUnicodeObject *self, PyObject *args)
3622{
3623 int result;
3624 PyUnicodeObject *substring;
3625 int start = 0;
3626 int end = INT_MAX;
3627
Guido van Rossumb8872e62000-05-09 14:14:27 +00003628 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3629 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630 return NULL;
3631
3632 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3633 (PyObject *)substring);
3634 if (substring == NULL)
3635 return NULL;
3636
3637 result = findstring(self, substring, start, end, 1);
3638
3639 Py_DECREF(substring);
3640 if (result < 0) {
3641 PyErr_SetString(PyExc_ValueError, "substring not found");
3642 return NULL;
3643 }
3644 return PyInt_FromLong(result);
3645}
3646
3647static char islower__doc__[] =
3648"S.islower() -> int\n\
3649\n\
3650Return 1 if all cased characters in S are lowercase and there is\n\
3651at least one cased character in S, 0 otherwise.";
3652
3653static PyObject*
3654unicode_islower(PyUnicodeObject *self, PyObject *args)
3655{
3656 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3657 register const Py_UNICODE *e;
3658 int cased;
3659
3660 if (!PyArg_NoArgs(args))
3661 return NULL;
3662
3663 /* Shortcut for single character strings */
3664 if (PyUnicode_GET_SIZE(self) == 1)
3665 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3666
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003667 /* Special case for empty strings */
3668 if (PyString_GET_SIZE(self) == 0)
3669 return PyInt_FromLong(0);
3670
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671 e = p + PyUnicode_GET_SIZE(self);
3672 cased = 0;
3673 for (; p < e; p++) {
3674 register const Py_UNICODE ch = *p;
3675
3676 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3677 return PyInt_FromLong(0);
3678 else if (!cased && Py_UNICODE_ISLOWER(ch))
3679 cased = 1;
3680 }
3681 return PyInt_FromLong(cased);
3682}
3683
3684static char isupper__doc__[] =
3685"S.isupper() -> int\n\
3686\n\
3687Return 1 if all cased characters in S are uppercase and there is\n\
3688at least one cased character in S, 0 otherwise.";
3689
3690static PyObject*
3691unicode_isupper(PyUnicodeObject *self, PyObject *args)
3692{
3693 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3694 register const Py_UNICODE *e;
3695 int cased;
3696
3697 if (!PyArg_NoArgs(args))
3698 return NULL;
3699
3700 /* Shortcut for single character strings */
3701 if (PyUnicode_GET_SIZE(self) == 1)
3702 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3703
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003704 /* Special case for empty strings */
3705 if (PyString_GET_SIZE(self) == 0)
3706 return PyInt_FromLong(0);
3707
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 e = p + PyUnicode_GET_SIZE(self);
3709 cased = 0;
3710 for (; p < e; p++) {
3711 register const Py_UNICODE ch = *p;
3712
3713 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3714 return PyInt_FromLong(0);
3715 else if (!cased && Py_UNICODE_ISUPPER(ch))
3716 cased = 1;
3717 }
3718 return PyInt_FromLong(cased);
3719}
3720
3721static char istitle__doc__[] =
3722"S.istitle() -> int\n\
3723\n\
3724Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3725may only follow uncased characters and lowercase characters only cased\n\
3726ones. Return 0 otherwise.";
3727
3728static PyObject*
3729unicode_istitle(PyUnicodeObject *self, PyObject *args)
3730{
3731 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3732 register const Py_UNICODE *e;
3733 int cased, previous_is_cased;
3734
3735 if (!PyArg_NoArgs(args))
3736 return NULL;
3737
3738 /* Shortcut for single character strings */
3739 if (PyUnicode_GET_SIZE(self) == 1)
3740 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3741 (Py_UNICODE_ISUPPER(*p) != 0));
3742
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003743 /* Special case for empty strings */
3744 if (PyString_GET_SIZE(self) == 0)
3745 return PyInt_FromLong(0);
3746
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747 e = p + PyUnicode_GET_SIZE(self);
3748 cased = 0;
3749 previous_is_cased = 0;
3750 for (; p < e; p++) {
3751 register const Py_UNICODE ch = *p;
3752
3753 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3754 if (previous_is_cased)
3755 return PyInt_FromLong(0);
3756 previous_is_cased = 1;
3757 cased = 1;
3758 }
3759 else if (Py_UNICODE_ISLOWER(ch)) {
3760 if (!previous_is_cased)
3761 return PyInt_FromLong(0);
3762 previous_is_cased = 1;
3763 cased = 1;
3764 }
3765 else
3766 previous_is_cased = 0;
3767 }
3768 return PyInt_FromLong(cased);
3769}
3770
3771static char isspace__doc__[] =
3772"S.isspace() -> int\n\
3773\n\
3774Return 1 if there are only whitespace characters in S,\n\
37750 otherwise.";
3776
3777static PyObject*
3778unicode_isspace(PyUnicodeObject *self, PyObject *args)
3779{
3780 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3781 register const Py_UNICODE *e;
3782
3783 if (!PyArg_NoArgs(args))
3784 return NULL;
3785
3786 /* Shortcut for single character strings */
3787 if (PyUnicode_GET_SIZE(self) == 1 &&
3788 Py_UNICODE_ISSPACE(*p))
3789 return PyInt_FromLong(1);
3790
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003791 /* Special case for empty strings */
3792 if (PyString_GET_SIZE(self) == 0)
3793 return PyInt_FromLong(0);
3794
Guido van Rossumd57fd912000-03-10 22:53:23 +00003795 e = p + PyUnicode_GET_SIZE(self);
3796 for (; p < e; p++) {
3797 if (!Py_UNICODE_ISSPACE(*p))
3798 return PyInt_FromLong(0);
3799 }
3800 return PyInt_FromLong(1);
3801}
3802
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003803static char isalpha__doc__[] =
3804"S.isalpha() -> int\n\
3805\n\
3806Return 1 if all characters in S are alphabetic\n\
3807and there is at least one character in S, 0 otherwise.";
3808
3809static PyObject*
3810unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3811{
3812 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3813 register const Py_UNICODE *e;
3814
3815 if (!PyArg_NoArgs(args))
3816 return NULL;
3817
3818 /* Shortcut for single character strings */
3819 if (PyUnicode_GET_SIZE(self) == 1 &&
3820 Py_UNICODE_ISALPHA(*p))
3821 return PyInt_FromLong(1);
3822
3823 /* Special case for empty strings */
3824 if (PyString_GET_SIZE(self) == 0)
3825 return PyInt_FromLong(0);
3826
3827 e = p + PyUnicode_GET_SIZE(self);
3828 for (; p < e; p++) {
3829 if (!Py_UNICODE_ISALPHA(*p))
3830 return PyInt_FromLong(0);
3831 }
3832 return PyInt_FromLong(1);
3833}
3834
3835static char isalnum__doc__[] =
3836"S.isalnum() -> int\n\
3837\n\
3838Return 1 if all characters in S are alphanumeric\n\
3839and there is at least one character in S, 0 otherwise.";
3840
3841static PyObject*
3842unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3843{
3844 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3845 register const Py_UNICODE *e;
3846
3847 if (!PyArg_NoArgs(args))
3848 return NULL;
3849
3850 /* Shortcut for single character strings */
3851 if (PyUnicode_GET_SIZE(self) == 1 &&
3852 Py_UNICODE_ISALNUM(*p))
3853 return PyInt_FromLong(1);
3854
3855 /* Special case for empty strings */
3856 if (PyString_GET_SIZE(self) == 0)
3857 return PyInt_FromLong(0);
3858
3859 e = p + PyUnicode_GET_SIZE(self);
3860 for (; p < e; p++) {
3861 if (!Py_UNICODE_ISALNUM(*p))
3862 return PyInt_FromLong(0);
3863 }
3864 return PyInt_FromLong(1);
3865}
3866
Guido van Rossumd57fd912000-03-10 22:53:23 +00003867static char isdecimal__doc__[] =
3868"S.isdecimal() -> int\n\
3869\n\
3870Return 1 if there are only decimal characters in S,\n\
38710 otherwise.";
3872
3873static PyObject*
3874unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3875{
3876 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3877 register const Py_UNICODE *e;
3878
3879 if (!PyArg_NoArgs(args))
3880 return NULL;
3881
3882 /* Shortcut for single character strings */
3883 if (PyUnicode_GET_SIZE(self) == 1 &&
3884 Py_UNICODE_ISDECIMAL(*p))
3885 return PyInt_FromLong(1);
3886
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003887 /* Special case for empty strings */
3888 if (PyString_GET_SIZE(self) == 0)
3889 return PyInt_FromLong(0);
3890
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891 e = p + PyUnicode_GET_SIZE(self);
3892 for (; p < e; p++) {
3893 if (!Py_UNICODE_ISDECIMAL(*p))
3894 return PyInt_FromLong(0);
3895 }
3896 return PyInt_FromLong(1);
3897}
3898
3899static char isdigit__doc__[] =
3900"S.isdigit() -> int\n\
3901\n\
3902Return 1 if there are only digit characters in S,\n\
39030 otherwise.";
3904
3905static PyObject*
3906unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3907{
3908 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3909 register const Py_UNICODE *e;
3910
3911 if (!PyArg_NoArgs(args))
3912 return NULL;
3913
3914 /* Shortcut for single character strings */
3915 if (PyUnicode_GET_SIZE(self) == 1 &&
3916 Py_UNICODE_ISDIGIT(*p))
3917 return PyInt_FromLong(1);
3918
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003919 /* Special case for empty strings */
3920 if (PyString_GET_SIZE(self) == 0)
3921 return PyInt_FromLong(0);
3922
Guido van Rossumd57fd912000-03-10 22:53:23 +00003923 e = p + PyUnicode_GET_SIZE(self);
3924 for (; p < e; p++) {
3925 if (!Py_UNICODE_ISDIGIT(*p))
3926 return PyInt_FromLong(0);
3927 }
3928 return PyInt_FromLong(1);
3929}
3930
3931static char isnumeric__doc__[] =
3932"S.isnumeric() -> int\n\
3933\n\
3934Return 1 if there are only numeric characters in S,\n\
39350 otherwise.";
3936
3937static PyObject*
3938unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3939{
3940 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3941 register const Py_UNICODE *e;
3942
3943 if (!PyArg_NoArgs(args))
3944 return NULL;
3945
3946 /* Shortcut for single character strings */
3947 if (PyUnicode_GET_SIZE(self) == 1 &&
3948 Py_UNICODE_ISNUMERIC(*p))
3949 return PyInt_FromLong(1);
3950
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003951 /* Special case for empty strings */
3952 if (PyString_GET_SIZE(self) == 0)
3953 return PyInt_FromLong(0);
3954
Guido van Rossumd57fd912000-03-10 22:53:23 +00003955 e = p + PyUnicode_GET_SIZE(self);
3956 for (; p < e; p++) {
3957 if (!Py_UNICODE_ISNUMERIC(*p))
3958 return PyInt_FromLong(0);
3959 }
3960 return PyInt_FromLong(1);
3961}
3962
3963static char join__doc__[] =
3964"S.join(sequence) -> unicode\n\
3965\n\
3966Return a string which is the concatenation of the strings in the\n\
3967sequence. The separator between elements is S.";
3968
3969static PyObject*
3970unicode_join(PyUnicodeObject *self, PyObject *args)
3971{
3972 PyObject *data;
3973 if (!PyArg_ParseTuple(args, "O:join", &data))
3974 return NULL;
3975
3976 return PyUnicode_Join((PyObject *)self, data);
3977}
3978
3979static int
3980unicode_length(PyUnicodeObject *self)
3981{
3982 return self->length;
3983}
3984
3985static char ljust__doc__[] =
3986"S.ljust(width) -> unicode\n\
3987\n\
3988Return S left justified in a Unicode string of length width. Padding is\n\
3989done using spaces.";
3990
3991static PyObject *
3992unicode_ljust(PyUnicodeObject *self, PyObject *args)
3993{
3994 int width;
3995 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3996 return NULL;
3997
3998 if (self->length >= width) {
3999 Py_INCREF(self);
4000 return (PyObject*) self;
4001 }
4002
4003 return (PyObject*) pad(self, 0, width - self->length, ' ');
4004}
4005
4006static char lower__doc__[] =
4007"S.lower() -> unicode\n\
4008\n\
4009Return a copy of the string S converted to lowercase.";
4010
4011static PyObject*
4012unicode_lower(PyUnicodeObject *self, PyObject *args)
4013{
4014 if (!PyArg_NoArgs(args))
4015 return NULL;
4016 return fixup(self, fixlower);
4017}
4018
4019static char lstrip__doc__[] =
4020"S.lstrip() -> unicode\n\
4021\n\
4022Return a copy of the string S with leading whitespace removed.";
4023
4024static PyObject *
4025unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4026{
4027 if (!PyArg_NoArgs(args))
4028 return NULL;
4029 return strip(self, 1, 0);
4030}
4031
4032static PyObject*
4033unicode_repeat(PyUnicodeObject *str, int len)
4034{
4035 PyUnicodeObject *u;
4036 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004037 int nchars;
4038 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004039
4040 if (len < 0)
4041 len = 0;
4042
4043 if (len == 1) {
4044 /* no repeat, return original string */
4045 Py_INCREF(str);
4046 return (PyObject*) str;
4047 }
Tim Peters8f422462000-09-09 06:13:41 +00004048
4049 /* ensure # of chars needed doesn't overflow int and # of bytes
4050 * needed doesn't overflow size_t
4051 */
4052 nchars = len * str->length;
4053 if (len && nchars / len != str->length) {
4054 PyErr_SetString(PyExc_OverflowError,
4055 "repeated string is too long");
4056 return NULL;
4057 }
4058 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4059 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4060 PyErr_SetString(PyExc_OverflowError,
4061 "repeated string is too long");
4062 return NULL;
4063 }
4064 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065 if (!u)
4066 return NULL;
4067
4068 p = u->str;
4069
4070 while (len-- > 0) {
4071 Py_UNICODE_COPY(p, str->str, str->length);
4072 p += str->length;
4073 }
4074
4075 return (PyObject*) u;
4076}
4077
4078PyObject *PyUnicode_Replace(PyObject *obj,
4079 PyObject *subobj,
4080 PyObject *replobj,
4081 int maxcount)
4082{
4083 PyObject *self;
4084 PyObject *str1;
4085 PyObject *str2;
4086 PyObject *result;
4087
4088 self = PyUnicode_FromObject(obj);
4089 if (self == NULL)
4090 return NULL;
4091 str1 = PyUnicode_FromObject(subobj);
4092 if (str1 == NULL) {
4093 Py_DECREF(self);
4094 return NULL;
4095 }
4096 str2 = PyUnicode_FromObject(replobj);
4097 if (str2 == NULL) {
4098 Py_DECREF(self);
4099 Py_DECREF(str1);
4100 return NULL;
4101 }
4102 result = replace((PyUnicodeObject *)self,
4103 (PyUnicodeObject *)str1,
4104 (PyUnicodeObject *)str2,
4105 maxcount);
4106 Py_DECREF(self);
4107 Py_DECREF(str1);
4108 Py_DECREF(str2);
4109 return result;
4110}
4111
4112static char replace__doc__[] =
4113"S.replace (old, new[, maxsplit]) -> unicode\n\
4114\n\
4115Return a copy of S with all occurrences of substring\n\
4116old replaced by new. If the optional argument maxsplit is\n\
4117given, only the first maxsplit occurrences are replaced.";
4118
4119static PyObject*
4120unicode_replace(PyUnicodeObject *self, PyObject *args)
4121{
4122 PyUnicodeObject *str1;
4123 PyUnicodeObject *str2;
4124 int maxcount = -1;
4125 PyObject *result;
4126
4127 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4128 return NULL;
4129 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4130 if (str1 == NULL)
4131 return NULL;
4132 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4133 if (str2 == NULL)
4134 return NULL;
4135
4136 result = replace(self, str1, str2, maxcount);
4137
4138 Py_DECREF(str1);
4139 Py_DECREF(str2);
4140 return result;
4141}
4142
4143static
4144PyObject *unicode_repr(PyObject *unicode)
4145{
4146 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4147 PyUnicode_GET_SIZE(unicode),
4148 1);
4149}
4150
/* Docstring for the "rfind" entry of unicode_methods. */
static char rfind__doc__[] =
    "S.rfind(sub [,start [,end]]) -> int\n"
    "\n"
    "Return the highest index in S where substring sub is found,\n"
    "such that sub is contained within s[start,end]. Optional\n"
    "arguments start and end are interpreted as in slice notation.\n"
    "\n"
    "Return -1 on failure.";
4159
4160static PyObject *
4161unicode_rfind(PyUnicodeObject *self, PyObject *args)
4162{
4163 PyUnicodeObject *substring;
4164 int start = 0;
4165 int end = INT_MAX;
4166 PyObject *result;
4167
Guido van Rossumb8872e62000-05-09 14:14:27 +00004168 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4169 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170 return NULL;
4171 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4172 (PyObject *)substring);
4173 if (substring == NULL)
4174 return NULL;
4175
4176 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4177
4178 Py_DECREF(substring);
4179 return result;
4180}
4181
/* Docstring for the "rindex" entry of unicode_methods. */
static char rindex__doc__[] =
    "S.rindex(sub [,start [,end]]) -> int\n"
    "\n"
    "Like S.rfind() but raise ValueError when the substring is not found.";
4186
4187static PyObject *
4188unicode_rindex(PyUnicodeObject *self, PyObject *args)
4189{
4190 int result;
4191 PyUnicodeObject *substring;
4192 int start = 0;
4193 int end = INT_MAX;
4194
Guido van Rossumb8872e62000-05-09 14:14:27 +00004195 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4196 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197 return NULL;
4198 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4199 (PyObject *)substring);
4200 if (substring == NULL)
4201 return NULL;
4202
4203 result = findstring(self, substring, start, end, -1);
4204
4205 Py_DECREF(substring);
4206 if (result < 0) {
4207 PyErr_SetString(PyExc_ValueError, "substring not found");
4208 return NULL;
4209 }
4210 return PyInt_FromLong(result);
4211}
4212
/* Docstring for the "rjust" entry of unicode_methods. */
static char rjust__doc__[] =
    "S.rjust(width) -> unicode\n"
    "\n"
    "Return S right justified in a Unicode string of length width. Padding is\n"
    "done using spaces.";
4218
4219static PyObject *
4220unicode_rjust(PyUnicodeObject *self, PyObject *args)
4221{
4222 int width;
4223 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4224 return NULL;
4225
4226 if (self->length >= width) {
4227 Py_INCREF(self);
4228 return (PyObject*) self;
4229 }
4230
4231 return (PyObject*) pad(self, width - self->length, 0, ' ');
4232}
4233
/* Docstring for the "rstrip" entry of unicode_methods. */
static char rstrip__doc__[] =
    "S.rstrip() -> unicode\n"
    "\n"
    "Return a copy of the string S with trailing whitespace removed.";
4238
4239static PyObject *
4240unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4241{
4242 if (!PyArg_NoArgs(args))
4243 return NULL;
4244 return strip(self, 0, 1);
4245}
4246
4247static PyObject*
4248unicode_slice(PyUnicodeObject *self, int start, int end)
4249{
4250 /* standard clamping */
4251 if (start < 0)
4252 start = 0;
4253 if (end < 0)
4254 end = 0;
4255 if (end > self->length)
4256 end = self->length;
4257 if (start == 0 && end == self->length) {
4258 /* full slice, return original string */
4259 Py_INCREF(self);
4260 return (PyObject*) self;
4261 }
4262 if (start > end)
4263 start = end;
4264 /* copy slice */
4265 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4266 end - start);
4267}
4268
4269PyObject *PyUnicode_Split(PyObject *s,
4270 PyObject *sep,
4271 int maxsplit)
4272{
4273 PyObject *result;
4274
4275 s = PyUnicode_FromObject(s);
4276 if (s == NULL)
4277 return NULL;
4278 if (sep != NULL) {
4279 sep = PyUnicode_FromObject(sep);
4280 if (sep == NULL) {
4281 Py_DECREF(s);
4282 return NULL;
4283 }
4284 }
4285
4286 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4287
4288 Py_DECREF(s);
4289 Py_XDECREF(sep);
4290 return result;
4291}
4292
/* Docstring for the "split" entry of unicode_methods. */
static char split__doc__[] =
    "S.split([sep [,maxsplit]]) -> list of strings\n"
    "\n"
    "Return a list of the words in S, using sep as the\n"
    "delimiter string. If maxsplit is given, at most maxsplit\n"
    "splits are done. If sep is not specified, any whitespace string\n"
    "is a separator.";
4300
4301static PyObject*
4302unicode_split(PyUnicodeObject *self, PyObject *args)
4303{
4304 PyObject *substring = Py_None;
4305 int maxcount = -1;
4306
4307 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4308 return NULL;
4309
4310 if (substring == Py_None)
4311 return split(self, NULL, maxcount);
4312 else if (PyUnicode_Check(substring))
4313 return split(self, (PyUnicodeObject *)substring, maxcount);
4314 else
4315 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4316}
4317
/* Docstring for the "splitlines" entry of unicode_methods.  The
   signature line has balanced brackets: keepends is the single
   optional argument parsed by unicode_splitlines(). */
static char splitlines__doc__[] =
    "S.splitlines([keepends]) -> list of strings\n"
    "\n"
    "Return a list of the lines in S, breaking at line boundaries.\n"
    "Line breaks are not included in the resulting list unless keepends\n"
    "is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324
4325static PyObject*
4326unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4327{
Guido van Rossum86662912000-04-11 15:38:46 +00004328 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004329
Guido van Rossum86662912000-04-11 15:38:46 +00004330 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331 return NULL;
4332
Guido van Rossum86662912000-04-11 15:38:46 +00004333 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334}
4335
4336static
4337PyObject *unicode_str(PyUnicodeObject *self)
4338{
Fred Drakee4315f52000-05-09 19:53:39 +00004339 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340}
4341
/* Docstring for the "strip" entry of unicode_methods. */
static char strip__doc__[] =
    "S.strip() -> unicode\n"
    "\n"
    "Return a copy of S with leading and trailing whitespace removed.";
4346
4347static PyObject *
4348unicode_strip(PyUnicodeObject *self, PyObject *args)
4349{
4350 if (!PyArg_NoArgs(args))
4351 return NULL;
4352 return strip(self, 1, 1);
4353}
4354
/* Docstring for the "swapcase" entry of unicode_methods. */
static char swapcase__doc__[] =
    "S.swapcase() -> unicode\n"
    "\n"
    "Return a copy of S with uppercase characters converted to lowercase\n"
    "and vice versa.";
4360
4361static PyObject*
4362unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4363{
4364 if (!PyArg_NoArgs(args))
4365 return NULL;
4366 return fixup(self, fixswapcase);
4367}
4368
/* Docstring for the "translate" entry of unicode_methods. */
static char translate__doc__[] =
    "S.translate(table) -> unicode\n"
    "\n"
    "Return a copy of the string S, where all characters have been mapped\n"
    "through the given translation table, which must be a mapping of\n"
    "Unicode ordinals to Unicode ordinals or None. Unmapped characters\n"
    "are left untouched. Characters mapped to None are deleted.";
4376
4377static PyObject*
4378unicode_translate(PyUnicodeObject *self, PyObject *args)
4379{
4380 PyObject *table;
4381
4382 if (!PyArg_ParseTuple(args, "O:translate", &table))
4383 return NULL;
4384 return PyUnicode_TranslateCharmap(self->str,
4385 self->length,
4386 table,
4387 "ignore");
4388}
4389
/* Docstring for the "upper" entry of unicode_methods. */
static char upper__doc__[] =
    "S.upper() -> unicode\n"
    "\n"
    "Return a copy of S converted to uppercase.";
4394
4395static PyObject*
4396unicode_upper(PyUnicodeObject *self, PyObject *args)
4397{
4398 if (!PyArg_NoArgs(args))
4399 return NULL;
4400 return fixup(self, fixupper);
4401}
4402
#if 0
/* Disabled: zfill() is compiled out together with its method-table
   entry. */
static char zfill__doc__[] =
    "S.zfill(width) -> unicode\n"
    "\n"
    "Pad a numeric string x with zeros on the left, to fill a field\n"
    "of the specified width. The string x is never truncated.";

/* Method S.zfill(width).

   Returns self (new reference) when it is already at least width
   characters long.  Otherwise pads with '0' via pad() and, if the
   original string began with '+' or '-', moves that sign in front of
   the zeros. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() presumably returns NULL on failure (e.g. out of memory);
       do not dereference the result in that case. */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4438
#if 0
/* Disabled debugging helper: takes no arguments and returns the
   current value of unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    return PyArg_NoArgs(args) ? PyInt_FromLong(unicode_freelist_size) : NULL;
}
#endif
4448
/* Docstring for the "startswith" entry of unicode_methods. */
static char startswith__doc__[] =
    "S.startswith(prefix[, start[, end]]) -> int\n"
    "\n"
    "Return 1 if S starts with the specified prefix, otherwise return 0. With\n"
    "optional start, test S beginning at that position. With optional end, stop\n"
    "comparing S at that position.";
4455
4456static PyObject *
4457unicode_startswith(PyUnicodeObject *self,
4458 PyObject *args)
4459{
4460 PyUnicodeObject *substring;
4461 int start = 0;
4462 int end = INT_MAX;
4463 PyObject *result;
4464
Guido van Rossumb8872e62000-05-09 14:14:27 +00004465 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4466 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467 return NULL;
4468 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4469 (PyObject *)substring);
4470 if (substring == NULL)
4471 return NULL;
4472
4473 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4474
4475 Py_DECREF(substring);
4476 return result;
4477}
4478
4479
/* Docstring for the "endswith" entry of unicode_methods. */
static char endswith__doc__[] =
    "S.endswith(suffix[, start[, end]]) -> int\n"
    "\n"
    "Return 1 if S ends with the specified suffix, otherwise return 0. With\n"
    "optional start, test S beginning at that position. With optional end, stop\n"
    "comparing S at that position.";
4486
4487static PyObject *
4488unicode_endswith(PyUnicodeObject *self,
4489 PyObject *args)
4490{
4491 PyUnicodeObject *substring;
4492 int start = 0;
4493 int end = INT_MAX;
4494 PyObject *result;
4495
Guido van Rossumb8872e62000-05-09 14:14:27 +00004496 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4497 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498 return NULL;
4499 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4500 (PyObject *)substring);
4501 if (substring == NULL)
4502 return NULL;
4503
4504 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4505
4506 Py_DECREF(substring);
4507 return result;
4508}
4509
4510
4511static PyMethodDef unicode_methods[] = {
4512
4513 /* Order is according to common usage: often used methods should
4514 appear first, since lookup is done sequentially. */
4515
4516 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4517 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4518 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4519 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4520 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4521 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4522 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4523 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4524 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4525 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4526 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4527 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4528 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4529 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4530/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4531 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4532 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4533 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4534 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4535 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4536 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4537 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4538 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4539 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4540 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4541 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4542 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4543 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4544 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4545 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4546 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4547 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4548 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004549 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4550 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004551#if 0
4552 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4553 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4554#endif
4555
4556#if 0
4557 /* This one is just used for debugging the implementation. */
4558 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4559#endif
4560
4561 {NULL, NULL}
4562};
4563
4564static PyObject *
4565unicode_getattr(PyUnicodeObject *self, char *name)
4566{
4567 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4568}
4569
4570static PySequenceMethods unicode_as_sequence = {
4571 (inquiry) unicode_length, /* sq_length */
4572 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4573 (intargfunc) unicode_repeat, /* sq_repeat */
4574 (intargfunc) unicode_getitem, /* sq_item */
4575 (intintargfunc) unicode_slice, /* sq_slice */
4576 0, /* sq_ass_item */
4577 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004578 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579};
4580
4581static int
4582unicode_buffer_getreadbuf(PyUnicodeObject *self,
4583 int index,
4584 const void **ptr)
4585{
4586 if (index != 0) {
4587 PyErr_SetString(PyExc_SystemError,
4588 "accessing non-existent unicode segment");
4589 return -1;
4590 }
4591 *ptr = (void *) self->str;
4592 return PyUnicode_GET_DATA_SIZE(self);
4593}
4594
4595static int
4596unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4597 const void **ptr)
4598{
4599 PyErr_SetString(PyExc_TypeError,
4600 "cannot use unicode as modifyable buffer");
4601 return -1;
4602}
4603
4604static int
4605unicode_buffer_getsegcount(PyUnicodeObject *self,
4606 int *lenp)
4607{
4608 if (lenp)
4609 *lenp = PyUnicode_GET_DATA_SIZE(self);
4610 return 1;
4611}
4612
4613static int
4614unicode_buffer_getcharbuf(PyUnicodeObject *self,
4615 int index,
4616 const void **ptr)
4617{
4618 PyObject *str;
4619
4620 if (index != 0) {
4621 PyErr_SetString(PyExc_SystemError,
4622 "accessing non-existent unicode segment");
4623 return -1;
4624 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004625 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004626 if (str == NULL)
4627 return -1;
4628 *ptr = (void *) PyString_AS_STRING(str);
4629 return PyString_GET_SIZE(str);
4630}
4631
4632/* Helpers for PyUnicode_Format() */
4633
4634static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004635getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004636{
4637 int argidx = *p_argidx;
4638 if (argidx < arglen) {
4639 (*p_argidx)++;
4640 if (arglen < 0)
4641 return args;
4642 else
4643 return PyTuple_GetItem(args, argidx);
4644 }
4645 PyErr_SetString(PyExc_TypeError,
4646 "not enough arguments for format string");
4647 return NULL;
4648}
4649
/* Conversion flags collected by PyUnicode_Format(); each is one bit. */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
4655
4656static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004658{
4659 register int i;
4660 int len;
4661 va_list va;
4662 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004664
4665 /* First, format the string as char array, then expand to Py_UNICODE
4666 array. */
4667 charbuffer = (char *)buffer;
4668 len = vsprintf(charbuffer, format, va);
4669 for (i = len - 1; i >= 0; i--)
4670 buffer[i] = (Py_UNICODE) charbuffer[i];
4671
4672 va_end(va);
4673 return len;
4674}
4675
4676static int
4677formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004678 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004679 int flags,
4680 int prec,
4681 int type,
4682 PyObject *v)
4683{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004684 /* fmt = '%#.' + `prec` + `type`
4685 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004686 char fmt[20];
4687 double x;
4688
4689 x = PyFloat_AsDouble(v);
4690 if (x == -1.0 && PyErr_Occurred())
4691 return -1;
4692 if (prec < 0)
4693 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4695 type = 'g';
4696 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004697 /* worst case length calc to ensure no buffer overrun:
4698 fmt = %#.<prec>g
4699 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4700 for any double rep.)
4701 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4702 If prec=0 the effective precision is 1 (the leading digit is
4703 always given), therefore increase by one to 10+prec. */
4704 if (buflen <= (size_t)10 + (size_t)prec) {
4705 PyErr_SetString(PyExc_OverflowError,
4706 "formatted float is too long (precision too long?)");
4707 return -1;
4708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004709 return usprintf(buf, fmt, x);
4710}
4711
Tim Peters38fd5b62000-09-21 05:43:11 +00004712static PyObject*
4713formatlong(PyObject *val, int flags, int prec, int type)
4714{
4715 char *buf;
4716 int i, len;
4717 PyObject *str; /* temporary string object. */
4718 PyUnicodeObject *result;
4719
4720 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4721 if (!str)
4722 return NULL;
4723 result = _PyUnicode_New(len);
4724 for (i = 0; i < len; i++)
4725 result->str[i] = buf[i];
4726 result->str[len] = 0;
4727 Py_DECREF(str);
4728 return (PyObject*)result;
4729}
4730
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731static int
4732formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004733 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734 int flags,
4735 int prec,
4736 int type,
4737 PyObject *v)
4738{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004739 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004740 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4741 + 1 + 1 = 24*/
4742 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743 long x;
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004744 int use_native_c_format = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004745
4746 x = PyInt_AsLong(v);
4747 if (x == -1 && PyErr_Occurred())
4748 return -1;
4749 if (prec < 0)
4750 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004751 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4752 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4753 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4754 PyErr_SetString(PyExc_OverflowError,
4755 "formatted integer is too long (precision too long?)");
4756 return -1;
4757 }
Tim Petersfff53252001-04-12 18:38:48 +00004758 /* When converting 0 under %#x or %#X, C leaves off the base marker,
4759 * but we want it (for consistency with other %#x conversions, and
4760 * for consistency with Python's hex() function).
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004761 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
4762 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
4763 * So add it only if the platform doesn't already.
Tim Petersfff53252001-04-12 18:38:48 +00004764 */
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004765 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
4766 /* Only way to know what the platform does is to try it. */
4767 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
4768 if (fmt[1] != (char)type) {
4769 /* Supply our own leading 0x/0X -- needed under std C */
4770 use_native_c_format = 0;
4771 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
4772 }
4773 }
4774 if (use_native_c_format)
4775 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004776 return usprintf(buf, fmt, x);
4777}
4778
4779static int
4780formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004781 size_t buflen,
4782 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004783{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004784 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004785 if (PyUnicode_Check(v)) {
4786 if (PyUnicode_GET_SIZE(v) != 1)
4787 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004789 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004790
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004791 else if (PyString_Check(v)) {
4792 if (PyString_GET_SIZE(v) != 1)
4793 goto onError;
4794 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4795 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004796
4797 else {
4798 /* Integer input truncated to a character */
4799 long x;
4800 x = PyInt_AsLong(v);
4801 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004802 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004803 buf[0] = (char) x;
4804 }
4805 buf[1] = '\0';
4806 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004807
4808 onError:
4809 PyErr_SetString(PyExc_TypeError,
4810 "%c requires int or char");
4811 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812}
4813
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesised so that it stays a single
   operand wherever the macro is used.
*/
#define FORMATBUFLEN ((size_t)120)
4823
Guido van Rossumd57fd912000-03-10 22:53:23 +00004824PyObject *PyUnicode_Format(PyObject *format,
4825 PyObject *args)
4826{
4827 Py_UNICODE *fmt, *res;
4828 int fmtcnt, rescnt, reslen, arglen, argidx;
4829 int args_owned = 0;
4830 PyUnicodeObject *result = NULL;
4831 PyObject *dict = NULL;
4832 PyObject *uformat;
4833
4834 if (format == NULL || args == NULL) {
4835 PyErr_BadInternalCall();
4836 return NULL;
4837 }
4838 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004839 if (uformat == NULL)
4840 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 fmt = PyUnicode_AS_UNICODE(uformat);
4842 fmtcnt = PyUnicode_GET_SIZE(uformat);
4843
4844 reslen = rescnt = fmtcnt + 100;
4845 result = _PyUnicode_New(reslen);
4846 if (result == NULL)
4847 goto onError;
4848 res = PyUnicode_AS_UNICODE(result);
4849
4850 if (PyTuple_Check(args)) {
4851 arglen = PyTuple_Size(args);
4852 argidx = 0;
4853 }
4854 else {
4855 arglen = -1;
4856 argidx = -2;
4857 }
4858 if (args->ob_type->tp_as_mapping)
4859 dict = args;
4860
4861 while (--fmtcnt >= 0) {
4862 if (*fmt != '%') {
4863 if (--rescnt < 0) {
4864 rescnt = fmtcnt + 100;
4865 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004866 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004867 return NULL;
4868 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4869 --rescnt;
4870 }
4871 *res++ = *fmt++;
4872 }
4873 else {
4874 /* Got a format specifier */
4875 int flags = 0;
4876 int width = -1;
4877 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004878 Py_UNICODE c = '\0';
4879 Py_UNICODE fill;
4880 PyObject *v = NULL;
4881 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004882 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004883 Py_UNICODE sign;
4884 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004885 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004886
4887 fmt++;
4888 if (*fmt == '(') {
4889 Py_UNICODE *keystart;
4890 int keylen;
4891 PyObject *key;
4892 int pcount = 1;
4893
4894 if (dict == NULL) {
4895 PyErr_SetString(PyExc_TypeError,
4896 "format requires a mapping");
4897 goto onError;
4898 }
4899 ++fmt;
4900 --fmtcnt;
4901 keystart = fmt;
4902 /* Skip over balanced parentheses */
4903 while (pcount > 0 && --fmtcnt >= 0) {
4904 if (*fmt == ')')
4905 --pcount;
4906 else if (*fmt == '(')
4907 ++pcount;
4908 fmt++;
4909 }
4910 keylen = fmt - keystart - 1;
4911 if (fmtcnt < 0 || pcount > 0) {
4912 PyErr_SetString(PyExc_ValueError,
4913 "incomplete format key");
4914 goto onError;
4915 }
Fred Drakee4315f52000-05-09 19:53:39 +00004916 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004917 then looked up since Python uses strings to hold
4918 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004919 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004920 key = PyUnicode_EncodeUTF8(keystart,
4921 keylen,
4922 NULL);
4923 if (key == NULL)
4924 goto onError;
4925 if (args_owned) {
4926 Py_DECREF(args);
4927 args_owned = 0;
4928 }
4929 args = PyObject_GetItem(dict, key);
4930 Py_DECREF(key);
4931 if (args == NULL) {
4932 goto onError;
4933 }
4934 args_owned = 1;
4935 arglen = -1;
4936 argidx = -2;
4937 }
4938 while (--fmtcnt >= 0) {
4939 switch (c = *fmt++) {
4940 case '-': flags |= F_LJUST; continue;
4941 case '+': flags |= F_SIGN; continue;
4942 case ' ': flags |= F_BLANK; continue;
4943 case '#': flags |= F_ALT; continue;
4944 case '0': flags |= F_ZERO; continue;
4945 }
4946 break;
4947 }
4948 if (c == '*') {
4949 v = getnextarg(args, arglen, &argidx);
4950 if (v == NULL)
4951 goto onError;
4952 if (!PyInt_Check(v)) {
4953 PyErr_SetString(PyExc_TypeError,
4954 "* wants int");
4955 goto onError;
4956 }
4957 width = PyInt_AsLong(v);
4958 if (width < 0) {
4959 flags |= F_LJUST;
4960 width = -width;
4961 }
4962 if (--fmtcnt >= 0)
4963 c = *fmt++;
4964 }
4965 else if (c >= '0' && c <= '9') {
4966 width = c - '0';
4967 while (--fmtcnt >= 0) {
4968 c = *fmt++;
4969 if (c < '0' || c > '9')
4970 break;
4971 if ((width*10) / 10 != width) {
4972 PyErr_SetString(PyExc_ValueError,
4973 "width too big");
4974 goto onError;
4975 }
4976 width = width*10 + (c - '0');
4977 }
4978 }
4979 if (c == '.') {
4980 prec = 0;
4981 if (--fmtcnt >= 0)
4982 c = *fmt++;
4983 if (c == '*') {
4984 v = getnextarg(args, arglen, &argidx);
4985 if (v == NULL)
4986 goto onError;
4987 if (!PyInt_Check(v)) {
4988 PyErr_SetString(PyExc_TypeError,
4989 "* wants int");
4990 goto onError;
4991 }
4992 prec = PyInt_AsLong(v);
4993 if (prec < 0)
4994 prec = 0;
4995 if (--fmtcnt >= 0)
4996 c = *fmt++;
4997 }
4998 else if (c >= '0' && c <= '9') {
4999 prec = c - '0';
5000 while (--fmtcnt >= 0) {
5001 c = Py_CHARMASK(*fmt++);
5002 if (c < '0' || c > '9')
5003 break;
5004 if ((prec*10) / 10 != prec) {
5005 PyErr_SetString(PyExc_ValueError,
5006 "prec too big");
5007 goto onError;
5008 }
5009 prec = prec*10 + (c - '0');
5010 }
5011 }
5012 } /* prec */
5013 if (fmtcnt >= 0) {
5014 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005015 if (--fmtcnt >= 0)
5016 c = *fmt++;
5017 }
5018 }
5019 if (fmtcnt < 0) {
5020 PyErr_SetString(PyExc_ValueError,
5021 "incomplete format");
5022 goto onError;
5023 }
5024 if (c != '%') {
5025 v = getnextarg(args, arglen, &argidx);
5026 if (v == NULL)
5027 goto onError;
5028 }
5029 sign = 0;
5030 fill = ' ';
5031 switch (c) {
5032
5033 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005034 pbuf = formatbuf;
5035 /* presume that buffer length is at least 1 */
5036 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005037 len = 1;
5038 break;
5039
5040 case 's':
5041 case 'r':
5042 if (PyUnicode_Check(v) && c == 's') {
5043 temp = v;
5044 Py_INCREF(temp);
5045 }
5046 else {
5047 PyObject *unicode;
5048 if (c == 's')
5049 temp = PyObject_Str(v);
5050 else
5051 temp = PyObject_Repr(v);
5052 if (temp == NULL)
5053 goto onError;
5054 if (!PyString_Check(temp)) {
5055 /* XXX Note: this should never happen, since
5056 PyObject_Repr() and PyObject_Str() assure
5057 this */
5058 Py_DECREF(temp);
5059 PyErr_SetString(PyExc_TypeError,
5060 "%s argument has non-string str()");
5061 goto onError;
5062 }
Fred Drakee4315f52000-05-09 19:53:39 +00005063 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005065 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005066 "strict");
5067 Py_DECREF(temp);
5068 temp = unicode;
5069 if (temp == NULL)
5070 goto onError;
5071 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005072 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005073 len = PyUnicode_GET_SIZE(temp);
5074 if (prec >= 0 && len > prec)
5075 len = prec;
5076 break;
5077
5078 case 'i':
5079 case 'd':
5080 case 'u':
5081 case 'o':
5082 case 'x':
5083 case 'X':
5084 if (c == 'i')
5085 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005086 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005087 temp = formatlong(v, flags, prec, c);
5088 if (!temp)
5089 goto onError;
5090 pbuf = PyUnicode_AS_UNICODE(temp);
5091 len = PyUnicode_GET_SIZE(temp);
5092 /* unbounded ints can always produce
5093 a sign character! */
5094 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005096 else {
5097 pbuf = formatbuf;
5098 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5099 flags, prec, c, v);
5100 if (len < 0)
5101 goto onError;
5102 /* only d conversion is signed */
5103 sign = c == 'd';
5104 }
5105 if (flags & F_ZERO)
5106 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107 break;
5108
5109 case 'e':
5110 case 'E':
5111 case 'f':
5112 case 'g':
5113 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005114 pbuf = formatbuf;
5115 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5116 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005117 if (len < 0)
5118 goto onError;
5119 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005120 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121 fill = '0';
5122 break;
5123
5124 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005125 pbuf = formatbuf;
5126 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127 if (len < 0)
5128 goto onError;
5129 break;
5130
5131 default:
5132 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005133 "unsupported format character '%c' (0x%x) "
5134 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005135 (31<=c && c<=126) ? c : '?',
5136 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137 goto onError;
5138 }
5139 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005140 if (*pbuf == '-' || *pbuf == '+') {
5141 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005142 len--;
5143 }
5144 else if (flags & F_SIGN)
5145 sign = '+';
5146 else if (flags & F_BLANK)
5147 sign = ' ';
5148 else
5149 sign = 0;
5150 }
5151 if (width < len)
5152 width = len;
5153 if (rescnt < width + (sign != 0)) {
5154 reslen -= rescnt;
5155 rescnt = width + fmtcnt + 100;
5156 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005157 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005158 return NULL;
5159 res = PyUnicode_AS_UNICODE(result)
5160 + reslen - rescnt;
5161 }
5162 if (sign) {
5163 if (fill != ' ')
5164 *res++ = sign;
5165 rescnt--;
5166 if (width > len)
5167 width--;
5168 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005169 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5170 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005171 assert(pbuf[1] == c);
5172 if (fill != ' ') {
5173 *res++ = *pbuf++;
5174 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005175 }
Tim Petersfff53252001-04-12 18:38:48 +00005176 rescnt -= 2;
5177 width -= 2;
5178 if (width < 0)
5179 width = 0;
5180 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005181 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005182 if (width > len && !(flags & F_LJUST)) {
5183 do {
5184 --rescnt;
5185 *res++ = fill;
5186 } while (--width > len);
5187 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005188 if (fill == ' ') {
5189 if (sign)
5190 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005191 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005192 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005193 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005194 *res++ = *pbuf++;
5195 *res++ = *pbuf++;
5196 }
5197 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005198 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005199 res += len;
5200 rescnt -= len;
5201 while (--width >= len) {
5202 --rescnt;
5203 *res++ = ' ';
5204 }
5205 if (dict && (argidx < arglen) && c != '%') {
5206 PyErr_SetString(PyExc_TypeError,
5207 "not all arguments converted");
5208 goto onError;
5209 }
5210 Py_XDECREF(temp);
5211 } /* '%' */
5212 } /* until end */
5213 if (argidx < arglen && !dict) {
5214 PyErr_SetString(PyExc_TypeError,
5215 "not all arguments converted");
5216 goto onError;
5217 }
5218
5219 if (args_owned) {
5220 Py_DECREF(args);
5221 }
5222 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005223 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005224 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005225 return (PyObject *)result;
5226
5227 onError:
5228 Py_XDECREF(result);
5229 Py_DECREF(uformat);
5230 if (args_owned) {
5231 Py_DECREF(args);
5232 }
5233 return NULL;
5234}
5235
5236static PyBufferProcs unicode_as_buffer = {
5237 (getreadbufferproc) unicode_buffer_getreadbuf,
5238 (getwritebufferproc) unicode_buffer_getwritebuf,
5239 (getsegcountproc) unicode_buffer_getsegcount,
5240 (getcharbufferproc) unicode_buffer_getcharbuf,
5241};
5242
5243PyTypeObject PyUnicode_Type = {
5244 PyObject_HEAD_INIT(&PyType_Type)
5245 0, /* ob_size */
5246 "unicode", /* tp_name */
5247 sizeof(PyUnicodeObject), /* tp_size */
5248 0, /* tp_itemsize */
5249 /* Slots */
5250 (destructor)_PyUnicode_Free, /* tp_dealloc */
5251 0, /* tp_print */
5252 (getattrfunc)unicode_getattr, /* tp_getattr */
5253 0, /* tp_setattr */
5254 (cmpfunc) unicode_compare, /* tp_compare */
5255 (reprfunc) unicode_repr, /* tp_repr */
5256 0, /* tp_as_number */
5257 &unicode_as_sequence, /* tp_as_sequence */
5258 0, /* tp_as_mapping */
5259 (hashfunc) unicode_hash, /* tp_hash*/
5260 0, /* tp_call*/
5261 (reprfunc) unicode_str, /* tp_str */
5262 (getattrofunc) NULL, /* tp_getattro */
5263 (setattrofunc) NULL, /* tp_setattro */
5264 &unicode_as_buffer, /* tp_as_buffer */
5265 Py_TPFLAGS_DEFAULT, /* tp_flags */
5266};
5267
5268/* Initialize the Unicode implementation */
5269
Thomas Wouters78890102000-07-22 19:25:51 +00005270void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005272 int i;
5273
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274 /* Doublecheck the configuration... */
5275 if (sizeof(Py_UNICODE) != 2)
5276 Py_FatalError("Unicode configuration error: "
5277 "sizeof(Py_UNICODE) != 2 bytes");
5278
Fred Drakee4315f52000-05-09 19:53:39 +00005279 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005280 unicode_freelist = NULL;
5281 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005282 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005283 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005284 for (i = 0; i < 256; i++)
5285 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005286}
5287
5288/* Finalize the Unicode implementation */
5289
5290void
Thomas Wouters78890102000-07-22 19:25:51 +00005291_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005293 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005294 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005295
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005296 Py_XDECREF(unicode_empty);
5297 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005298
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005299 for (i = 0; i < 256; i++) {
5300 if (unicode_latin1[i]) {
5301 Py_DECREF(unicode_latin1[i]);
5302 unicode_latin1[i] = NULL;
5303 }
5304 }
5305
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005306 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307 PyUnicodeObject *v = u;
5308 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005309 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005310 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005311 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005312 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005313 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005314 unicode_freelist = NULL;
5315 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005316}