blob: 1d72c0d7bd28f4b1718bf5eefa7649635972a5d5 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* --- Unicode Object ----------------------------------------------------- */
107
108static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000110 int length)
111{
112 void *oldstr;
113
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000114 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000115 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000116 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000117
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000118 /* Resizing shared object (unicode_empty or single character
119 objects) in-place is not allowed. Use PyUnicode_Resize()
120 instead ! */
121 if (unicode == unicode_empty ||
122 (unicode->length == 1 &&
123 unicode->str[0] < 256 &&
124 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000125 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000126 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 return -1;
128 }
129
130 /* We allocate one more byte to make sure the string is
131 Ux0000 terminated -- XXX is this needed ? */
132 oldstr = unicode->str;
133 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
134 if (!unicode->str) {
135 unicode->str = oldstr;
136 PyErr_NoMemory();
137 return -1;
138 }
139 unicode->str[length] = 0;
140 unicode->length = length;
141
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000142 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000143 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000144 if (unicode->defenc) {
145 Py_DECREF(unicode->defenc);
146 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000147 }
148 unicode->hash = -1;
149
150 return 0;
151}
152
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
160
161static
162PyUnicodeObject *_PyUnicode_New(int length)
163{
164 register PyUnicodeObject *unicode;
165
166 /* Optimization for empty strings */
167 if (length == 0 && unicode_empty != NULL) {
168 Py_INCREF(unicode_empty);
169 return unicode_empty;
170 }
171
172 /* Unicode freelist & memory allocation */
173 if (unicode_freelist) {
174 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000175 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000176 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 /* Keep-Alive optimization: we only upsize the buffer,
179 never downsize it. */
180 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000182 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000183 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 }
185 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000186 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000188 }
189 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 }
191 else {
192 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
193 if (unicode == NULL)
194 return NULL;
195 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
196 }
197
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000198 if (!unicode->str) {
199 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000200 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 unicode->str[length] = 0;
203 unicode->length = length;
204 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000205 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000207
208 onError:
209 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000210 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212}
213
214static
215void _PyUnicode_Free(register PyUnicodeObject *unicode)
216{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 /* Keep-Alive optimization */
219 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000220 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221 unicode->str = NULL;
222 unicode->length = 0;
223 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000224 if (unicode->defenc) {
225 Py_DECREF(unicode->defenc);
226 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000227 }
228 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 *(PyUnicodeObject **)unicode = unicode_freelist;
230 unicode_freelist = unicode;
231 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000232 }
233 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000234 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000235 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000236 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 }
238}
239
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000240int PyUnicode_Resize(PyObject **unicode,
241 int length)
242{
243 register PyUnicodeObject *v;
244
245 /* Argument checks */
246 if (unicode == NULL) {
247 PyErr_BadInternalCall();
248 return -1;
249 }
250 v = (PyUnicodeObject *)*unicode;
251 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
252 PyErr_BadInternalCall();
253 return -1;
254 }
255
256 /* Resizing unicode_empty and single character objects is not
257 possible since these are being shared. We simply return a fresh
258 copy with the same Unicode content. */
259 if (v->length != length &&
260 (v == unicode_empty || v->length == 1)) {
261 PyUnicodeObject *w = _PyUnicode_New(length);
262 if (w == NULL)
263 return -1;
264 Py_UNICODE_COPY(w->str, v->str,
265 length < v->length ? length : v->length);
266 *unicode = (PyObject *)w;
267 return 0;
268 }
269
270 /* Note that we don't have to modify *unicode for unshared Unicode
271 objects, since we can modify them in-place. */
272 return unicode_resize(v, length);
273}
274
275/* Internal API for use in unicodeobject.c only ! */
276#define _PyUnicode_Resize(unicodevar, length) \
277 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
278
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
280 int size)
281{
282 PyUnicodeObject *unicode;
283
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000284 /* If the Unicode data is known at construction time, we can apply
285 some optimizations which share commonly used objects. */
286 if (u != NULL) {
287
288 /* Optimization for empty strings */
289 if (size == 0 && unicode_empty != NULL) {
290 Py_INCREF(unicode_empty);
291 return (PyObject *)unicode_empty;
292 }
293
294 /* Single character Unicode objects in the Latin-1 range are
295 shared when using this constructor */
296 if (size == 1 && *u < 256) {
297 unicode = unicode_latin1[*u];
298 if (!unicode) {
299 unicode = _PyUnicode_New(1);
300 unicode->str[0] = *u;
301 if (!unicode)
302 return NULL;
303 unicode_latin1[*u] = unicode;
304 }
305 Py_INCREF(unicode);
306 return (PyObject *)unicode;
307 }
308 }
309
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310 unicode = _PyUnicode_New(size);
311 if (!unicode)
312 return NULL;
313
314 /* Copy the Unicode data into the new object */
315 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000317
318 return (PyObject *)unicode;
319}
320
321#ifdef HAVE_WCHAR_H
322
323PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
324 int size)
325{
326 PyUnicodeObject *unicode;
327
328 if (w == NULL) {
329 PyErr_BadInternalCall();
330 return NULL;
331 }
332
333 unicode = _PyUnicode_New(size);
334 if (!unicode)
335 return NULL;
336
337 /* Copy the wchar_t data into the new object */
338#ifdef HAVE_USABLE_WCHAR_T
339 memcpy(unicode->str, w, size * sizeof(wchar_t));
340#else
341 {
342 register Py_UNICODE *u;
343 register int i;
344 u = PyUnicode_AS_UNICODE(unicode);
345 for (i = size; i >= 0; i--)
346 *u++ = *w++;
347 }
348#endif
349
350 return (PyObject *)unicode;
351}
352
353int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
354 register wchar_t *w,
355 int size)
356{
357 if (unicode == NULL) {
358 PyErr_BadInternalCall();
359 return -1;
360 }
361 if (size > PyUnicode_GET_SIZE(unicode))
362 size = PyUnicode_GET_SIZE(unicode);
363#ifdef HAVE_USABLE_WCHAR_T
364 memcpy(w, unicode->str, size * sizeof(wchar_t));
365#else
366 {
367 register Py_UNICODE *u;
368 register int i;
369 u = PyUnicode_AS_UNICODE(unicode);
370 for (i = size; i >= 0; i--)
371 *w++ = *u++;
372 }
373#endif
374
375 return size;
376}
377
378#endif
379
380PyObject *PyUnicode_FromObject(register PyObject *obj)
381{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000382 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
383}
384
385PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
386 const char *encoding,
387 const char *errors)
388{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 const char *s;
390 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000391 int owned = 0;
392 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393
394 if (obj == NULL) {
395 PyErr_BadInternalCall();
396 return NULL;
397 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000398
399 /* Coerce object */
400 if (PyInstance_Check(obj)) {
401 PyObject *func;
402 func = PyObject_GetAttrString(obj, "__str__");
403 if (func == NULL) {
404 PyErr_SetString(PyExc_TypeError,
405 "coercing to Unicode: instance doesn't define __str__");
406 return NULL;
407 }
408 obj = PyEval_CallObject(func, NULL);
409 Py_DECREF(func);
410 if (obj == NULL)
411 return NULL;
412 owned = 1;
413 }
414 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000416 v = obj;
417 if (encoding) {
418 PyErr_SetString(PyExc_TypeError,
419 "decoding Unicode is not supported");
420 return NULL;
421 }
422 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424 else if (PyString_Check(obj)) {
425 s = PyString_AS_STRING(obj);
426 len = PyString_GET_SIZE(obj);
427 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000428 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
429 /* Overwrite the error message with something more useful in
430 case of a TypeError. */
431 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000432 PyErr_Format(PyExc_TypeError,
433 "coercing to Unicode: need string or buffer, "
434 "%.80s found",
435 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000436 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000437 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000438
439 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440 if (len == 0) {
441 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000442 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000443 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000444 else
445 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000446
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000448 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000449 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000450 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000451 return v;
452
453 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000454 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000455 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000456 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000457 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000458}
459
460PyObject *PyUnicode_Decode(const char *s,
461 int size,
462 const char *encoding,
463 const char *errors)
464{
465 PyObject *buffer = NULL, *unicode;
466
Fred Drakee4315f52000-05-09 19:53:39 +0000467 if (encoding == NULL)
468 encoding = PyUnicode_GetDefaultEncoding();
469
470 /* Shortcuts for common default encodings */
471 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000473 else if (strcmp(encoding, "latin-1") == 0)
474 return PyUnicode_DecodeLatin1(s, size, errors);
475 else if (strcmp(encoding, "ascii") == 0)
476 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000477
478 /* Decode via the codec registry */
479 buffer = PyBuffer_FromMemory((void *)s, size);
480 if (buffer == NULL)
481 goto onError;
482 unicode = PyCodec_Decode(buffer, encoding, errors);
483 if (unicode == NULL)
484 goto onError;
485 if (!PyUnicode_Check(unicode)) {
486 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000487 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000488 unicode->ob_type->tp_name);
489 Py_DECREF(unicode);
490 goto onError;
491 }
492 Py_DECREF(buffer);
493 return unicode;
494
495 onError:
496 Py_XDECREF(buffer);
497 return NULL;
498}
499
500PyObject *PyUnicode_Encode(const Py_UNICODE *s,
501 int size,
502 const char *encoding,
503 const char *errors)
504{
505 PyObject *v, *unicode;
506
507 unicode = PyUnicode_FromUnicode(s, size);
508 if (unicode == NULL)
509 return NULL;
510 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
511 Py_DECREF(unicode);
512 return v;
513}
514
515PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
516 const char *encoding,
517 const char *errors)
518{
519 PyObject *v;
520
521 if (!PyUnicode_Check(unicode)) {
522 PyErr_BadArgument();
523 goto onError;
524 }
Fred Drakee4315f52000-05-09 19:53:39 +0000525
526 if (encoding == NULL)
527 encoding = PyUnicode_GetDefaultEncoding();
528
529 /* Shortcuts for common default encodings */
530 if (errors == NULL) {
531 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000532 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000533 else if (strcmp(encoding, "latin-1") == 0)
534 return PyUnicode_AsLatin1String(unicode);
535 else if (strcmp(encoding, "ascii") == 0)
536 return PyUnicode_AsASCIIString(unicode);
537 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000538
539 /* Encode via the codec registry */
540 v = PyCodec_Encode(unicode, encoding, errors);
541 if (v == NULL)
542 goto onError;
543 /* XXX Should we really enforce this ? */
544 if (!PyString_Check(v)) {
545 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000546 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000547 v->ob_type->tp_name);
548 Py_DECREF(v);
549 goto onError;
550 }
551 return v;
552
553 onError:
554 return NULL;
555}
556
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000557/* Return a Python string holding the default encoded value of the
558 Unicode object.
559
560 The resulting string is cached in the Unicode object for subsequent
561 usage by this function. The cached version is needed to implement
562 the character buffer interface and will live (at least) as long as
563 the Unicode object itself.
564
565 The refcount of the string is *not* incremented.
566
567 *** Exported for internal use by the interpreter only !!! ***
568
569*/
570
571PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
572 const char *errors)
573{
574 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
575
576 if (v)
577 return v;
578 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
579 if (v && errors == NULL)
580 ((PyUnicodeObject *)unicode)->defenc = v;
581 return v;
582}
583
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
585{
586 if (!PyUnicode_Check(unicode)) {
587 PyErr_BadArgument();
588 goto onError;
589 }
590 return PyUnicode_AS_UNICODE(unicode);
591
592 onError:
593 return NULL;
594}
595
596int PyUnicode_GetSize(PyObject *unicode)
597{
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
601 }
602 return PyUnicode_GET_SIZE(unicode);
603
604 onError:
605 return -1;
606}
607
Thomas Wouters78890102000-07-22 19:25:51 +0000608const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000609{
610 return unicode_default_encoding;
611}
612
613int PyUnicode_SetDefaultEncoding(const char *encoding)
614{
615 PyObject *v;
616
617 /* Make sure the encoding is valid. As side effect, this also
618 loads the encoding into the codec registry cache. */
619 v = _PyCodec_Lookup(encoding);
620 if (v == NULL)
621 goto onError;
622 Py_DECREF(v);
623 strncpy(unicode_default_encoding,
624 encoding,
625 sizeof(unicode_default_encoding));
626 return 0;
627
628 onError:
629 return -1;
630}
631
Guido van Rossumd57fd912000-03-10 22:53:23 +0000632/* --- UTF-8 Codec -------------------------------------------------------- */
633
/* Map a UTF-8 lead byte to the length of the sequence it starts.
   Zero marks a byte that is not a legal lead byte.  See RFC 2279 for
   details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as lead byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six byte sequences; 0xFE and 0xFF
       are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
655
656static
657int utf8_decoding_error(const char **source,
658 Py_UNICODE **dest,
659 const char *errors,
660 const char *details)
661{
662 if ((errors == NULL) ||
663 (strcmp(errors,"strict") == 0)) {
664 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000665 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000666 details);
667 return -1;
668 }
669 else if (strcmp(errors,"ignore") == 0) {
670 (*source)++;
671 return 0;
672 }
673 else if (strcmp(errors,"replace") == 0) {
674 (*source)++;
675 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
676 (*dest)++;
677 return 0;
678 }
679 else {
680 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000681 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000682 errors);
683 return -1;
684 }
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687PyObject *PyUnicode_DecodeUTF8(const char *s,
688 int size,
689 const char *errors)
690{
691 int n;
692 const char *e;
693 PyUnicodeObject *unicode;
694 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000695 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000696
697 /* Note: size will always be longer than the resulting Unicode
698 character count */
699 unicode = _PyUnicode_New(size);
700 if (!unicode)
701 return NULL;
702 if (size == 0)
703 return (PyObject *)unicode;
704
705 /* Unpack UTF-8 encoded data */
706 p = unicode->str;
707 e = s + size;
708
709 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000710 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000711
712 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000713 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 s++;
715 continue;
716 }
717
718 n = utf8_code_length[ch];
719
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000720 if (s + n > e) {
721 errmsg = "unexpected end of data";
722 goto utf8Error;
723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000724
725 switch (n) {
726
727 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000728 errmsg = "unexpected code byte";
729 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000730
731 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000732 errmsg = "internal error";
733 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000734
735 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000736 if ((s[1] & 0xc0) != 0x80) {
737 errmsg = "invalid data";
738 goto utf8Error;
739 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000741 if (ch < 0x80) {
742 errmsg = "illegal encoding";
743 goto utf8Error;
744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000745 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000746 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000747 break;
748
749 case 3:
750 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000751 (s[2] & 0xc0) != 0x80) {
752 errmsg = "invalid data";
753 goto utf8Error;
754 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000755 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000756 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
757 errmsg = "illegal encoding";
758 goto utf8Error;
759 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000760 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000761 *p++ = (Py_UNICODE)ch;
762 break;
763
764 case 4:
765 if ((s[1] & 0xc0) != 0x80 ||
766 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000767 (s[3] & 0xc0) != 0x80) {
768 errmsg = "invalid data";
769 goto utf8Error;
770 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000771 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
772 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
773 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000774 if ((ch < 0x10000) || /* minimum value allowed for 4
775 byte encoding */
776 (ch > 0x10ffff)) { /* maximum value allowed for
777 UTF-16 */
778 errmsg = "illegal encoding";
779 goto utf8Error;
780 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000781 /* compute and append the two surrogates: */
782
783 /* translate from 10000..10FFFF to 0..FFFF */
784 ch -= 0x10000;
785
786 /* high surrogate = top 10 bits added to D800 */
787 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
788
789 /* low surrogate = bottom 10 bits added to DC00 */
790 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000791 break;
792
793 default:
794 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000795 errmsg = "unsupported Unicode code range";
796 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000797 }
798 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000799 continue;
800
801 utf8Error:
802 if (utf8_decoding_error(&s, &p, errors, errmsg))
803 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000804 }
805
806 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000807 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +0000808 goto onError;
809
810 return (PyObject *)unicode;
811
812onError:
813 Py_DECREF(unicode);
814 return NULL;
815}
816
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000817/* Not used anymore, now that the encoder supports UTF-16
818 surrogates. */
Greg Steinaf36a3a2000-07-17 09:04:43 +0000819#if 0
Guido van Rossumd57fd912000-03-10 22:53:23 +0000820static
821int utf8_encoding_error(const Py_UNICODE **source,
822 char **dest,
823 const char *errors,
824 const char *details)
825{
826 if ((errors == NULL) ||
827 (strcmp(errors,"strict") == 0)) {
828 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000829 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000830 details);
831 return -1;
832 }
833 else if (strcmp(errors,"ignore") == 0) {
834 return 0;
835 }
836 else if (strcmp(errors,"replace") == 0) {
837 **dest = '?';
838 (*dest)++;
839 return 0;
840 }
841 else {
842 PyErr_Format(PyExc_ValueError,
843 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000844 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000845 errors);
846 return -1;
847 }
848}
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000849#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000850
851PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
852 int size,
853 const char *errors)
854{
855 PyObject *v;
856 char *p;
857 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000858 Py_UCS4 ch2;
859 unsigned int cbAllocated = 3 * size;
860 unsigned int cbWritten = 0;
861 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000862
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000863 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000864 if (v == NULL)
865 return NULL;
866 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000867 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000868
869 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000870 while (i < size) {
871 Py_UCS4 ch = s[i++];
872 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000873 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000874 cbWritten++;
875 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000876 else if (ch < 0x0800) {
877 *p++ = 0xc0 | (ch >> 6);
878 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000879 cbWritten += 2;
880 }
881 else {
882 /* Check for high surrogate */
883 if (0xD800 <= ch && ch <= 0xDBFF) {
884 if (i != size) {
885 ch2 = s[i];
886 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
887
888 if (cbWritten >= (cbAllocated - 4)) {
889 /* Provide enough room for some more
890 surrogates */
891 cbAllocated += 4*10;
892 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000893 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000894 }
895
896 /* combine the two values */
897 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
898
899 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000900 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000901 i++;
902 cbWritten += 4;
903 }
904 }
905 }
906 else {
907 *p++ = (char)(0xe0 | (ch >> 12));
908 cbWritten += 3;
909 }
910 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
911 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000912 }
913 }
914 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000915 if (_PyString_Resize(&v, p - q))
916 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000917 return v;
918
919 onError:
920 Py_DECREF(v);
921 return NULL;
922}
923
Guido van Rossumd57fd912000-03-10 22:53:23 +0000924PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
925{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000926 if (!PyUnicode_Check(unicode)) {
927 PyErr_BadArgument();
928 return NULL;
929 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000930 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
931 PyUnicode_GET_SIZE(unicode),
932 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000933}
934
935/* --- UTF-16 Codec ------------------------------------------------------- */
936
937static
938int utf16_decoding_error(const Py_UNICODE **source,
939 Py_UNICODE **dest,
940 const char *errors,
941 const char *details)
942{
943 if ((errors == NULL) ||
944 (strcmp(errors,"strict") == 0)) {
945 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000946 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947 details);
948 return -1;
949 }
950 else if (strcmp(errors,"ignore") == 0) {
951 return 0;
952 }
953 else if (strcmp(errors,"replace") == 0) {
954 if (dest) {
955 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
956 (*dest)++;
957 }
958 return 0;
959 }
960 else {
961 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000962 "UTF-16 decoding error; "
963 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000964 errors);
965 return -1;
966 }
967}
968
Guido van Rossumd57fd912000-03-10 22:53:23 +0000969PyObject *PyUnicode_DecodeUTF16(const char *s,
970 int size,
971 const char *errors,
972 int *byteorder)
973{
974 PyUnicodeObject *unicode;
975 Py_UNICODE *p;
976 const Py_UNICODE *q, *e;
977 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000978 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979
980 /* size should be an even number */
981 if (size % sizeof(Py_UNICODE) != 0) {
982 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
983 return NULL;
984 /* The remaining input chars are ignored if we fall through
985 here... */
986 }
987
988 /* Note: size will always be longer than the resulting Unicode
989 character count */
990 unicode = _PyUnicode_New(size);
991 if (!unicode)
992 return NULL;
993 if (size == 0)
994 return (PyObject *)unicode;
995
996 /* Unpack UTF-16 encoded data */
997 p = unicode->str;
998 q = (Py_UNICODE *)s;
999 e = q + (size / sizeof(Py_UNICODE));
1000
1001 if (byteorder)
1002 bo = *byteorder;
1003
1004 while (q < e) {
1005 register Py_UNICODE ch = *q++;
1006
1007 /* Check for BOM marks (U+FEFF) in the input and adjust
1008 current byte order setting accordingly. Swap input
1009 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
1010 !) */
1011#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1012 if (ch == 0xFEFF) {
1013 bo = -1;
1014 continue;
1015 } else if (ch == 0xFFFE) {
1016 bo = 1;
1017 continue;
1018 }
1019 if (bo == 1)
1020 ch = (ch >> 8) | (ch << 8);
1021#else
1022 if (ch == 0xFEFF) {
1023 bo = 1;
1024 continue;
1025 } else if (ch == 0xFFFE) {
1026 bo = -1;
1027 continue;
1028 }
1029 if (bo == -1)
1030 ch = (ch >> 8) | (ch << 8);
1031#endif
1032 if (ch < 0xD800 || ch > 0xDFFF) {
1033 *p++ = ch;
1034 continue;
1035 }
1036
1037 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001038 if (q >= e) {
1039 errmsg = "unexpected end of data";
1040 goto utf16Error;
1041 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001042 if (0xDC00 <= *q && *q <= 0xDFFF) {
1043 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001044 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001045 /* This is valid data (a UTF-16 surrogate pair), but
1046 we are not able to store this information since our
1047 Py_UNICODE type only has 16 bits... this might
1048 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001049 errmsg = "code pairs are not supported";
1050 goto utf16Error;
1051 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001052 else
1053 continue;
1054 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001055 errmsg = "illegal encoding";
1056 /* Fall through to report the error */
1057
1058 utf16Error:
1059 if (utf16_decoding_error(&q, &p, errors, errmsg))
1060 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001061 }
1062
1063 if (byteorder)
1064 *byteorder = bo;
1065
1066 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001067 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068 goto onError;
1069
1070 return (PyObject *)unicode;
1071
1072onError:
1073 Py_DECREF(unicode);
1074 return NULL;
1075}
1076
1077#undef UTF16_ERROR
1078
1079PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1080 int size,
1081 const char *errors,
1082 int byteorder)
1083{
1084 PyObject *v;
1085 Py_UNICODE *p;
1086 char *q;
1087
1088 /* We don't create UTF-16 pairs... */
1089 v = PyString_FromStringAndSize(NULL,
1090 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1091 if (v == NULL)
1092 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001093
1094 q = PyString_AS_STRING(v);
1095 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001096 if (byteorder == 0)
1097 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001098 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001099 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001100 if (byteorder == 0 ||
1101#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1102 byteorder == -1
1103#else
1104 byteorder == 1
1105#endif
1106 )
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001107 Py_UNICODE_COPY(p, s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108 else
1109 while (size-- > 0) {
1110 Py_UNICODE ch = *s++;
1111 *p++ = (ch >> 8) | (ch << 8);
1112 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 return v;
1114}
1115
1116PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1117{
1118 if (!PyUnicode_Check(unicode)) {
1119 PyErr_BadArgument();
1120 return NULL;
1121 }
1122 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1123 PyUnicode_GET_SIZE(unicode),
1124 NULL,
1125 0);
1126}
1127
1128/* --- Unicode Escape Codec ----------------------------------------------- */
1129
1130static
1131int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001132 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001133 const char *errors,
1134 const char *details)
1135{
1136 if ((errors == NULL) ||
1137 (strcmp(errors,"strict") == 0)) {
1138 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001139 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001140 details);
1141 return -1;
1142 }
1143 else if (strcmp(errors,"ignore") == 0) {
1144 return 0;
1145 }
1146 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001147 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148 return 0;
1149 }
1150 else {
1151 PyErr_Format(PyExc_ValueError,
1152 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001153 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001154 errors);
1155 return -1;
1156 }
1157}
1158
Fredrik Lundh06d12682001-01-24 07:59:11 +00001159static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001160
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1162 int size,
1163 const char *errors)
1164{
1165 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001166 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001168 char* message;
1169 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1170
Guido van Rossumd57fd912000-03-10 22:53:23 +00001171 /* Escaped strings will always be longer than the resulting
1172 Unicode string, so we start with size here and then reduce the
1173 length after conversion to the true value. */
1174 v = _PyUnicode_New(size);
1175 if (v == NULL)
1176 goto onError;
1177 if (size == 0)
1178 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001179
Guido van Rossumd57fd912000-03-10 22:53:23 +00001180 p = buf = PyUnicode_AS_UNICODE(v);
1181 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001182
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183 while (s < end) {
1184 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001185 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001186 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001187
1188 /* Non-escape characters are interpreted as Unicode ordinals */
1189 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001190 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191 continue;
1192 }
1193
1194 /* \ - Escapes */
1195 s++;
1196 switch (*s++) {
1197
1198 /* \x escapes */
1199 case '\n': break;
1200 case '\\': *p++ = '\\'; break;
1201 case '\'': *p++ = '\''; break;
1202 case '\"': *p++ = '\"'; break;
1203 case 'b': *p++ = '\b'; break;
1204 case 'f': *p++ = '\014'; break; /* FF */
1205 case 't': *p++ = '\t'; break;
1206 case 'n': *p++ = '\n'; break;
1207 case 'r': *p++ = '\r'; break;
1208 case 'v': *p++ = '\013'; break; /* VT */
1209 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1210
1211 /* \OOO (octal) escapes */
1212 case '0': case '1': case '2': case '3':
1213 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001214 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001215 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001216 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001217 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001218 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001219 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001220 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001221 break;
1222
Fredrik Lundhccc74732001-02-18 22:13:49 +00001223 /* hex escapes */
1224 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001226 digits = 2;
1227 message = "truncated \\xXX escape";
1228 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229
Fredrik Lundhccc74732001-02-18 22:13:49 +00001230 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001232 digits = 4;
1233 message = "truncated \\uXXXX escape";
1234 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001235
Fredrik Lundhccc74732001-02-18 22:13:49 +00001236 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001237 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001238 digits = 8;
1239 message = "truncated \\UXXXXXXXX escape";
1240 hexescape:
1241 chr = 0;
1242 for (i = 0; i < digits; i++) {
1243 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001244 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001245 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001246 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001247 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001248 i++;
1249 break;
1250 }
1251 chr = (chr<<4) & ~0xF;
1252 if (c >= '0' && c <= '9')
1253 chr += c - '0';
1254 else if (c >= 'a' && c <= 'f')
1255 chr += 10 + c - 'a';
1256 else
1257 chr += 10 + c - 'A';
1258 }
1259 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001260 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001261 /* when we get here, chr is a 32-bit unicode character */
1262 if (chr <= 0xffff)
1263 /* UCS-2 character */
1264 *p++ = (Py_UNICODE) chr;
1265 else if (chr <= 0x10ffff) {
1266 /* UCS-4 character. store as two surrogate characters */
1267 chr -= 0x10000L;
1268 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1269 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1270 } else {
1271 if (unicodeescape_decoding_error(
1272 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001273 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001274 )
1275 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001276 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001277 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001278 break;
1279
1280 /* \N{name} */
1281 case 'N':
1282 message = "malformed \\N character escape";
1283 if (ucnhash_CAPI == NULL) {
1284 /* load the unicode data module */
1285 PyObject *m, *v;
1286 m = PyImport_ImportModule("unicodedata");
1287 if (m == NULL)
1288 goto ucnhashError;
1289 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1290 Py_DECREF(m);
1291 if (v == NULL)
1292 goto ucnhashError;
1293 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1294 Py_DECREF(v);
1295 if (ucnhash_CAPI == NULL)
1296 goto ucnhashError;
1297 }
1298 if (*s == '{') {
1299 const char *start = s+1;
1300 /* look for the closing brace */
1301 while (*s != '}' && s < end)
1302 s++;
1303 if (s > start && s < end && *s == '}') {
1304 /* found a name. look it up in the unicode database */
1305 message = "unknown Unicode character name";
1306 s++;
1307 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1308 goto store;
1309 }
1310 }
1311 if (unicodeescape_decoding_error(&s, &x, errors, message))
1312 goto onError;
1313 *p++ = x;
1314 break;
1315
1316 default:
1317 *p++ = '\\';
1318 *p++ = (unsigned char)s[-1];
1319 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001320 }
1321 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001322 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001323 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001324 return (PyObject *)v;
1325
Fredrik Lundhccc74732001-02-18 22:13:49 +00001326ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001327 PyErr_SetString(
1328 PyExc_UnicodeError,
1329 "\\N escapes not supported (can't load unicodedata module)"
1330 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001331 return NULL;
1332
Fredrik Lundhccc74732001-02-18 22:13:49 +00001333onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001334 Py_XDECREF(v);
1335 return NULL;
1336}
1337
1338/* Return a Unicode-Escape string version of the Unicode object.
1339
1340 If quotes is true, the string is enclosed in u"" or u'' quotes as
1341 appropriate.
1342
1343*/
1344
Barry Warsaw51ac5802000-03-20 16:36:48 +00001345static const Py_UNICODE *findchar(const Py_UNICODE *s,
1346 int size,
1347 Py_UNICODE ch);
1348
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349static
1350PyObject *unicodeescape_string(const Py_UNICODE *s,
1351 int size,
1352 int quotes)
1353{
1354 PyObject *repr;
1355 char *p;
1356 char *q;
1357
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001358 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001359
1360 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1361 if (repr == NULL)
1362 return NULL;
1363
1364 p = q = PyString_AS_STRING(repr);
1365
1366 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001367 *p++ = 'u';
1368 *p++ = (findchar(s, size, '\'') &&
1369 !findchar(s, size, '"')) ? '"' : '\'';
1370 }
1371 while (size-- > 0) {
1372 Py_UNICODE ch = *s++;
1373 /* Escape quotes */
1374 if (quotes && (ch == q[1] || ch == '\\')) {
1375 *p++ = '\\';
1376 *p++ = (char) ch;
1377 }
1378 /* Map 16-bit characters to '\uxxxx' */
1379 else if (ch >= 256) {
1380 *p++ = '\\';
1381 *p++ = 'u';
1382 *p++ = hexdigit[(ch >> 12) & 0xf];
1383 *p++ = hexdigit[(ch >> 8) & 0xf];
1384 *p++ = hexdigit[(ch >> 4) & 0xf];
1385 *p++ = hexdigit[ch & 15];
1386 }
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001387 /* Map special whitespace to '\t', \n', '\r' */
1388 else if (ch == '\t') {
1389 *p++ = '\\';
1390 *p++ = 't';
1391 }
1392 else if (ch == '\n') {
1393 *p++ = '\\';
1394 *p++ = 'n';
1395 }
1396 else if (ch == '\r') {
1397 *p++ = '\\';
1398 *p++ = 'r';
1399 }
1400 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001401 else if (ch < ' ' || ch >= 128) {
1402 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001403 *p++ = 'x';
1404 *p++ = hexdigit[(ch >> 4) & 0xf];
1405 *p++ = hexdigit[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001406 }
1407 /* Copy everything else as-is */
1408 else
1409 *p++ = (char) ch;
1410 }
1411 if (quotes)
1412 *p++ = q[1];
1413
1414 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001415 if (_PyString_Resize(&repr, p - q))
1416 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417
1418 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001419
1420 onError:
1421 Py_DECREF(repr);
1422 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001423}
1424
1425PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1426 int size)
1427{
1428 return unicodeescape_string(s, size, 0);
1429}
1430
1431PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1432{
1433 if (!PyUnicode_Check(unicode)) {
1434 PyErr_BadArgument();
1435 return NULL;
1436 }
1437 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1438 PyUnicode_GET_SIZE(unicode));
1439}
1440
1441/* --- Raw Unicode Escape Codec ------------------------------------------- */
1442
1443PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1444 int size,
1445 const char *errors)
1446{
1447 PyUnicodeObject *v;
1448 Py_UNICODE *p, *buf;
1449 const char *end;
1450 const char *bs;
1451
1452 /* Escaped strings will always be longer than the resulting
1453 Unicode string, so we start with size here and then reduce the
1454 length after conversion to the true value. */
1455 v = _PyUnicode_New(size);
1456 if (v == NULL)
1457 goto onError;
1458 if (size == 0)
1459 return (PyObject *)v;
1460 p = buf = PyUnicode_AS_UNICODE(v);
1461 end = s + size;
1462 while (s < end) {
1463 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001464 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001465 int i;
1466
1467 /* Non-escape characters are interpreted as Unicode ordinals */
1468 if (*s != '\\') {
1469 *p++ = (unsigned char)*s++;
1470 continue;
1471 }
1472
1473 /* \u-escapes are only interpreted iff the number of leading
1474 backslashes if odd */
1475 bs = s;
1476 for (;s < end;) {
1477 if (*s != '\\')
1478 break;
1479 *p++ = (unsigned char)*s++;
1480 }
1481 if (((s - bs) & 1) == 0 ||
1482 s >= end ||
1483 *s != 'u') {
1484 continue;
1485 }
1486 p--;
1487 s++;
1488
1489 /* \uXXXX with 4 hex digits */
1490 for (x = 0, i = 0; i < 4; i++) {
1491 c = (unsigned char)s[i];
1492 if (!isxdigit(c)) {
1493 if (unicodeescape_decoding_error(&s, &x, errors,
1494 "truncated \\uXXXX"))
1495 goto onError;
1496 i++;
1497 break;
1498 }
1499 x = (x<<4) & ~0xF;
1500 if (c >= '0' && c <= '9')
1501 x += c - '0';
1502 else if (c >= 'a' && c <= 'f')
1503 x += 10 + c - 'a';
1504 else
1505 x += 10 + c - 'A';
1506 }
1507 s += i;
1508 *p++ = x;
1509 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001510 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001511 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001512 return (PyObject *)v;
1513
1514 onError:
1515 Py_XDECREF(v);
1516 return NULL;
1517}
1518
1519PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1520 int size)
1521{
1522 PyObject *repr;
1523 char *p;
1524 char *q;
1525
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001526 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001527
1528 repr = PyString_FromStringAndSize(NULL, 6 * size);
1529 if (repr == NULL)
1530 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001531 if (size == 0)
1532 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001533
1534 p = q = PyString_AS_STRING(repr);
1535 while (size-- > 0) {
1536 Py_UNICODE ch = *s++;
1537 /* Map 16-bit characters to '\uxxxx' */
1538 if (ch >= 256) {
1539 *p++ = '\\';
1540 *p++ = 'u';
1541 *p++ = hexdigit[(ch >> 12) & 0xf];
1542 *p++ = hexdigit[(ch >> 8) & 0xf];
1543 *p++ = hexdigit[(ch >> 4) & 0xf];
1544 *p++ = hexdigit[ch & 15];
1545 }
1546 /* Copy everything else as-is */
1547 else
1548 *p++ = (char) ch;
1549 }
1550 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001551 if (_PyString_Resize(&repr, p - q))
1552 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001553
1554 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001555
1556 onError:
1557 Py_DECREF(repr);
1558 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001559}
1560
1561PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1562{
1563 if (!PyUnicode_Check(unicode)) {
1564 PyErr_BadArgument();
1565 return NULL;
1566 }
1567 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1568 PyUnicode_GET_SIZE(unicode));
1569}
1570
1571/* --- Latin-1 Codec ------------------------------------------------------ */
1572
1573PyObject *PyUnicode_DecodeLatin1(const char *s,
1574 int size,
1575 const char *errors)
1576{
1577 PyUnicodeObject *v;
1578 Py_UNICODE *p;
1579
1580 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001581 if (size == 1 && *(unsigned char*)s < 256) {
1582 Py_UNICODE r = *(unsigned char*)s;
1583 return PyUnicode_FromUnicode(&r, 1);
1584 }
1585
Guido van Rossumd57fd912000-03-10 22:53:23 +00001586 v = _PyUnicode_New(size);
1587 if (v == NULL)
1588 goto onError;
1589 if (size == 0)
1590 return (PyObject *)v;
1591 p = PyUnicode_AS_UNICODE(v);
1592 while (size-- > 0)
1593 *p++ = (unsigned char)*s++;
1594 return (PyObject *)v;
1595
1596 onError:
1597 Py_XDECREF(v);
1598 return NULL;
1599}
1600
1601static
1602int latin1_encoding_error(const Py_UNICODE **source,
1603 char **dest,
1604 const char *errors,
1605 const char *details)
1606{
1607 if ((errors == NULL) ||
1608 (strcmp(errors,"strict") == 0)) {
1609 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001610 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611 details);
1612 return -1;
1613 }
1614 else if (strcmp(errors,"ignore") == 0) {
1615 return 0;
1616 }
1617 else if (strcmp(errors,"replace") == 0) {
1618 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001619 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620 return 0;
1621 }
1622 else {
1623 PyErr_Format(PyExc_ValueError,
1624 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001625 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001626 errors);
1627 return -1;
1628 }
1629}
1630
1631PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1632 int size,
1633 const char *errors)
1634{
1635 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001636 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001637
Guido van Rossumd57fd912000-03-10 22:53:23 +00001638 repr = PyString_FromStringAndSize(NULL, size);
1639 if (repr == NULL)
1640 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001641 if (size == 0)
1642 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001643
1644 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001645 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001646 while (size-- > 0) {
1647 Py_UNICODE ch = *p++;
1648 if (ch >= 256) {
1649 if (latin1_encoding_error(&p, &s, errors,
1650 "ordinal not in range(256)"))
1651 goto onError;
1652 }
1653 else
1654 *s++ = (char)ch;
1655 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001656 /* Resize if error handling skipped some characters */
1657 if (s - start < PyString_GET_SIZE(repr))
1658 if (_PyString_Resize(&repr, s - start))
1659 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001660 return repr;
1661
1662 onError:
1663 Py_DECREF(repr);
1664 return NULL;
1665}
1666
1667PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1668{
1669 if (!PyUnicode_Check(unicode)) {
1670 PyErr_BadArgument();
1671 return NULL;
1672 }
1673 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1674 PyUnicode_GET_SIZE(unicode),
1675 NULL);
1676}
1677
1678/* --- 7-bit ASCII Codec -------------------------------------------------- */
1679
1680static
1681int ascii_decoding_error(const char **source,
1682 Py_UNICODE **dest,
1683 const char *errors,
1684 const char *details)
1685{
1686 if ((errors == NULL) ||
1687 (strcmp(errors,"strict") == 0)) {
1688 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001689 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001690 details);
1691 return -1;
1692 }
1693 else if (strcmp(errors,"ignore") == 0) {
1694 return 0;
1695 }
1696 else if (strcmp(errors,"replace") == 0) {
1697 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1698 (*dest)++;
1699 return 0;
1700 }
1701 else {
1702 PyErr_Format(PyExc_ValueError,
1703 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001704 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001705 errors);
1706 return -1;
1707 }
1708}
1709
1710PyObject *PyUnicode_DecodeASCII(const char *s,
1711 int size,
1712 const char *errors)
1713{
1714 PyUnicodeObject *v;
1715 Py_UNICODE *p;
1716
1717 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001718 if (size == 1 && *(unsigned char*)s < 128) {
1719 Py_UNICODE r = *(unsigned char*)s;
1720 return PyUnicode_FromUnicode(&r, 1);
1721 }
1722
Guido van Rossumd57fd912000-03-10 22:53:23 +00001723 v = _PyUnicode_New(size);
1724 if (v == NULL)
1725 goto onError;
1726 if (size == 0)
1727 return (PyObject *)v;
1728 p = PyUnicode_AS_UNICODE(v);
1729 while (size-- > 0) {
1730 register unsigned char c;
1731
1732 c = (unsigned char)*s++;
1733 if (c < 128)
1734 *p++ = c;
1735 else if (ascii_decoding_error(&s, &p, errors,
1736 "ordinal not in range(128)"))
1737 goto onError;
1738 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001739 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001740 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001741 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001742 return (PyObject *)v;
1743
1744 onError:
1745 Py_XDECREF(v);
1746 return NULL;
1747}
1748
1749static
1750int ascii_encoding_error(const Py_UNICODE **source,
1751 char **dest,
1752 const char *errors,
1753 const char *details)
1754{
1755 if ((errors == NULL) ||
1756 (strcmp(errors,"strict") == 0)) {
1757 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001758 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759 details);
1760 return -1;
1761 }
1762 else if (strcmp(errors,"ignore") == 0) {
1763 return 0;
1764 }
1765 else if (strcmp(errors,"replace") == 0) {
1766 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001767 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001768 return 0;
1769 }
1770 else {
1771 PyErr_Format(PyExc_ValueError,
1772 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001773 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001774 errors);
1775 return -1;
1776 }
1777}
1778
1779PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1780 int size,
1781 const char *errors)
1782{
1783 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001784 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001785
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 repr = PyString_FromStringAndSize(NULL, size);
1787 if (repr == NULL)
1788 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001789 if (size == 0)
1790 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001791
1792 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001793 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794 while (size-- > 0) {
1795 Py_UNICODE ch = *p++;
1796 if (ch >= 128) {
1797 if (ascii_encoding_error(&p, &s, errors,
1798 "ordinal not in range(128)"))
1799 goto onError;
1800 }
1801 else
1802 *s++ = (char)ch;
1803 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001804 /* Resize if error handling skipped some characters */
1805 if (s - start < PyString_GET_SIZE(repr))
1806 if (_PyString_Resize(&repr, s - start))
1807 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808 return repr;
1809
1810 onError:
1811 Py_DECREF(repr);
1812 return NULL;
1813}
1814
1815PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1816{
1817 if (!PyUnicode_Check(unicode)) {
1818 PyErr_BadArgument();
1819 return NULL;
1820 }
1821 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1822 PyUnicode_GET_SIZE(unicode),
1823 NULL);
1824}
1825
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001826#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001827
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001828/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001829
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001830PyObject *PyUnicode_DecodeMBCS(const char *s,
1831 int size,
1832 const char *errors)
1833{
1834 PyUnicodeObject *v;
1835 Py_UNICODE *p;
1836
1837 /* First get the size of the result */
1838 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001839 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001840 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1841
1842 v = _PyUnicode_New(usize);
1843 if (v == NULL)
1844 return NULL;
1845 if (usize == 0)
1846 return (PyObject *)v;
1847 p = PyUnicode_AS_UNICODE(v);
1848 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1849 Py_DECREF(v);
1850 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1851 }
1852
1853 return (PyObject *)v;
1854}
1855
1856PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1857 int size,
1858 const char *errors)
1859{
1860 PyObject *repr;
1861 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001862 DWORD mbcssize;
1863
1864 /* If there are no characters, bail now! */
1865 if (size==0)
1866 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001867
1868 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001869 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001870 if (mbcssize==0)
1871 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1872
1873 repr = PyString_FromStringAndSize(NULL, mbcssize);
1874 if (repr == NULL)
1875 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001876 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001877 return repr;
1878
1879 /* Do the conversion */
1880 s = PyString_AS_STRING(repr);
1881 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1882 Py_DECREF(repr);
1883 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1884 }
1885 return repr;
1886}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001887
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001888#endif /* MS_WIN32 */
1889
Guido van Rossumd57fd912000-03-10 22:53:23 +00001890/* --- Character Mapping Codec -------------------------------------------- */
1891
1892static
1893int charmap_decoding_error(const char **source,
1894 Py_UNICODE **dest,
1895 const char *errors,
1896 const char *details)
1897{
1898 if ((errors == NULL) ||
1899 (strcmp(errors,"strict") == 0)) {
1900 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001901 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001902 details);
1903 return -1;
1904 }
1905 else if (strcmp(errors,"ignore") == 0) {
1906 return 0;
1907 }
1908 else if (strcmp(errors,"replace") == 0) {
1909 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1910 (*dest)++;
1911 return 0;
1912 }
1913 else {
1914 PyErr_Format(PyExc_ValueError,
1915 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001916 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917 errors);
1918 return -1;
1919 }
1920}
1921
1922PyObject *PyUnicode_DecodeCharmap(const char *s,
1923 int size,
1924 PyObject *mapping,
1925 const char *errors)
1926{
1927 PyUnicodeObject *v;
1928 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001929 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001930
1931 /* Default to Latin-1 */
1932 if (mapping == NULL)
1933 return PyUnicode_DecodeLatin1(s, size, errors);
1934
1935 v = _PyUnicode_New(size);
1936 if (v == NULL)
1937 goto onError;
1938 if (size == 0)
1939 return (PyObject *)v;
1940 p = PyUnicode_AS_UNICODE(v);
1941 while (size-- > 0) {
1942 unsigned char ch = *s++;
1943 PyObject *w, *x;
1944
1945 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1946 w = PyInt_FromLong((long)ch);
1947 if (w == NULL)
1948 goto onError;
1949 x = PyObject_GetItem(mapping, w);
1950 Py_DECREF(w);
1951 if (x == NULL) {
1952 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00001953 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001954 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00001955 x = Py_None;
1956 Py_INCREF(x);
1957 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001958 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001959 }
1960
1961 /* Apply mapping */
1962 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001963 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001964 if (value < 0 || value > 65535) {
1965 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001966 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967 Py_DECREF(x);
1968 goto onError;
1969 }
1970 *p++ = (Py_UNICODE)value;
1971 }
1972 else if (x == Py_None) {
1973 /* undefined mapping */
1974 if (charmap_decoding_error(&s, &p, errors,
1975 "character maps to <undefined>")) {
1976 Py_DECREF(x);
1977 goto onError;
1978 }
1979 }
1980 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001981 int targetsize = PyUnicode_GET_SIZE(x);
1982
1983 if (targetsize == 1)
1984 /* 1-1 mapping */
1985 *p++ = *PyUnicode_AS_UNICODE(x);
1986
1987 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001988 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001989 if (targetsize > extrachars) {
1990 /* resize first */
1991 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
1992 int needed = (targetsize - extrachars) + \
1993 (targetsize << 2);
1994 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001995 if (_PyUnicode_Resize(&v,
1996 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001997 Py_DECREF(x);
1998 goto onError;
1999 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002000 p = PyUnicode_AS_UNICODE(v) + oldpos;
2001 }
2002 Py_UNICODE_COPY(p,
2003 PyUnicode_AS_UNICODE(x),
2004 targetsize);
2005 p += targetsize;
2006 extrachars -= targetsize;
2007 }
2008 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002009 }
2010 else {
2011 /* wrong return value */
2012 PyErr_SetString(PyExc_TypeError,
2013 "character mapping must return integer, None or unicode");
2014 Py_DECREF(x);
2015 goto onError;
2016 }
2017 Py_DECREF(x);
2018 }
2019 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002020 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002021 goto onError;
2022 return (PyObject *)v;
2023
2024 onError:
2025 Py_XDECREF(v);
2026 return NULL;
2027}
2028
2029static
2030int charmap_encoding_error(const Py_UNICODE **source,
2031 char **dest,
2032 const char *errors,
2033 const char *details)
2034{
2035 if ((errors == NULL) ||
2036 (strcmp(errors,"strict") == 0)) {
2037 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002038 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039 details);
2040 return -1;
2041 }
2042 else if (strcmp(errors,"ignore") == 0) {
2043 return 0;
2044 }
2045 else if (strcmp(errors,"replace") == 0) {
2046 **dest = '?';
2047 (*dest)++;
2048 return 0;
2049 }
2050 else {
2051 PyErr_Format(PyExc_ValueError,
2052 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002053 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 errors);
2055 return -1;
2056 }
2057}
2058
2059PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2060 int size,
2061 PyObject *mapping,
2062 const char *errors)
2063{
2064 PyObject *v;
2065 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002066 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002067
2068 /* Default to Latin-1 */
2069 if (mapping == NULL)
2070 return PyUnicode_EncodeLatin1(p, size, errors);
2071
2072 v = PyString_FromStringAndSize(NULL, size);
2073 if (v == NULL)
2074 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002075 if (size == 0)
2076 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002077 s = PyString_AS_STRING(v);
2078 while (size-- > 0) {
2079 Py_UNICODE ch = *p++;
2080 PyObject *w, *x;
2081
2082 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2083 w = PyInt_FromLong((long)ch);
2084 if (w == NULL)
2085 goto onError;
2086 x = PyObject_GetItem(mapping, w);
2087 Py_DECREF(w);
2088 if (x == NULL) {
2089 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002090 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002091 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002092 x = Py_None;
2093 Py_INCREF(x);
2094 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002095 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002096 }
2097
2098 /* Apply mapping */
2099 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002100 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002101 if (value < 0 || value > 255) {
2102 PyErr_SetString(PyExc_TypeError,
2103 "character mapping must be in range(256)");
2104 Py_DECREF(x);
2105 goto onError;
2106 }
2107 *s++ = (char)value;
2108 }
2109 else if (x == Py_None) {
2110 /* undefined mapping */
2111 if (charmap_encoding_error(&p, &s, errors,
2112 "character maps to <undefined>")) {
2113 Py_DECREF(x);
2114 goto onError;
2115 }
2116 }
2117 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002118 int targetsize = PyString_GET_SIZE(x);
2119
2120 if (targetsize == 1)
2121 /* 1-1 mapping */
2122 *s++ = *PyString_AS_STRING(x);
2123
2124 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002125 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002126 if (targetsize > extrachars) {
2127 /* resize first */
2128 int oldpos = (int)(s - PyString_AS_STRING(v));
2129 int needed = (targetsize - extrachars) + \
2130 (targetsize << 2);
2131 extrachars += needed;
2132 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002133 Py_DECREF(x);
2134 goto onError;
2135 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002136 s = PyString_AS_STRING(v) + oldpos;
2137 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002138 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002139 s += targetsize;
2140 extrachars -= targetsize;
2141 }
2142 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002143 }
2144 else {
2145 /* wrong return value */
2146 PyErr_SetString(PyExc_TypeError,
2147 "character mapping must return integer, None or unicode");
2148 Py_DECREF(x);
2149 goto onError;
2150 }
2151 Py_DECREF(x);
2152 }
2153 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2154 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2155 goto onError;
2156 return v;
2157
2158 onError:
2159 Py_DECREF(v);
2160 return NULL;
2161}
2162
2163PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2164 PyObject *mapping)
2165{
2166 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2167 PyErr_BadArgument();
2168 return NULL;
2169 }
2170 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2171 PyUnicode_GET_SIZE(unicode),
2172 mapping,
2173 NULL);
2174}
2175
2176static
2177int translate_error(const Py_UNICODE **source,
2178 Py_UNICODE **dest,
2179 const char *errors,
2180 const char *details)
2181{
2182 if ((errors == NULL) ||
2183 (strcmp(errors,"strict") == 0)) {
2184 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002185 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002186 details);
2187 return -1;
2188 }
2189 else if (strcmp(errors,"ignore") == 0) {
2190 return 0;
2191 }
2192 else if (strcmp(errors,"replace") == 0) {
2193 **dest = '?';
2194 (*dest)++;
2195 return 0;
2196 }
2197 else {
2198 PyErr_Format(PyExc_ValueError,
2199 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002200 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002201 errors);
2202 return -1;
2203 }
2204}
2205
2206PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2207 int size,
2208 PyObject *mapping,
2209 const char *errors)
2210{
2211 PyUnicodeObject *v;
2212 Py_UNICODE *p;
2213
2214 if (mapping == NULL) {
2215 PyErr_BadArgument();
2216 return NULL;
2217 }
2218
2219 /* Output will never be longer than input */
2220 v = _PyUnicode_New(size);
2221 if (v == NULL)
2222 goto onError;
2223 if (size == 0)
2224 goto done;
2225 p = PyUnicode_AS_UNICODE(v);
2226 while (size-- > 0) {
2227 Py_UNICODE ch = *s++;
2228 PyObject *w, *x;
2229
2230 /* Get mapping */
2231 w = PyInt_FromLong(ch);
2232 if (w == NULL)
2233 goto onError;
2234 x = PyObject_GetItem(mapping, w);
2235 Py_DECREF(w);
2236 if (x == NULL) {
2237 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2238 /* No mapping found: default to 1-1 mapping */
2239 PyErr_Clear();
2240 *p++ = ch;
2241 continue;
2242 }
2243 goto onError;
2244 }
2245
2246 /* Apply mapping */
2247 if (PyInt_Check(x))
2248 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2249 else if (x == Py_None) {
2250 /* undefined mapping */
2251 if (translate_error(&s, &p, errors,
2252 "character maps to <undefined>")) {
2253 Py_DECREF(x);
2254 goto onError;
2255 }
2256 }
2257 else if (PyUnicode_Check(x)) {
2258 if (PyUnicode_GET_SIZE(x) != 1) {
2259 /* 1-n mapping */
2260 PyErr_SetString(PyExc_NotImplementedError,
2261 "1-n mappings are currently not implemented");
2262 Py_DECREF(x);
2263 goto onError;
2264 }
2265 *p++ = *PyUnicode_AS_UNICODE(x);
2266 }
2267 else {
2268 /* wrong return value */
2269 PyErr_SetString(PyExc_TypeError,
2270 "translate mapping must return integer, None or unicode");
2271 Py_DECREF(x);
2272 goto onError;
2273 }
2274 Py_DECREF(x);
2275 }
2276 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002277 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002278 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002279
2280 done:
2281 return (PyObject *)v;
2282
2283 onError:
2284 Py_XDECREF(v);
2285 return NULL;
2286}
2287
2288PyObject *PyUnicode_Translate(PyObject *str,
2289 PyObject *mapping,
2290 const char *errors)
2291{
2292 PyObject *result;
2293
2294 str = PyUnicode_FromObject(str);
2295 if (str == NULL)
2296 goto onError;
2297 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2298 PyUnicode_GET_SIZE(str),
2299 mapping,
2300 errors);
2301 Py_DECREF(str);
2302 return result;
2303
2304 onError:
2305 Py_XDECREF(str);
2306 return NULL;
2307}
2308
Guido van Rossum9e896b32000-04-05 20:11:21 +00002309/* --- Decimal Encoder ---------------------------------------------------- */
2310
2311int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2312 int length,
2313 char *output,
2314 const char *errors)
2315{
2316 Py_UNICODE *p, *end;
2317
2318 if (output == NULL) {
2319 PyErr_BadArgument();
2320 return -1;
2321 }
2322
2323 p = s;
2324 end = s + length;
2325 while (p < end) {
2326 register Py_UNICODE ch = *p++;
2327 int decimal;
2328
2329 if (Py_UNICODE_ISSPACE(ch)) {
2330 *output++ = ' ';
2331 continue;
2332 }
2333 decimal = Py_UNICODE_TODECIMAL(ch);
2334 if (decimal >= 0) {
2335 *output++ = '0' + decimal;
2336 continue;
2337 }
Guido van Rossumba477042000-04-06 18:18:10 +00002338 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002339 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002340 continue;
2341 }
2342 /* All other characters are considered invalid */
2343 if (errors == NULL || strcmp(errors, "strict") == 0) {
2344 PyErr_SetString(PyExc_ValueError,
2345 "invalid decimal Unicode string");
2346 goto onError;
2347 }
2348 else if (strcmp(errors, "ignore") == 0)
2349 continue;
2350 else if (strcmp(errors, "replace") == 0) {
2351 *output++ = '?';
2352 continue;
2353 }
2354 }
2355 /* 0-terminate the output string */
2356 *output++ = '\0';
2357 return 0;
2358
2359 onError:
2360 return -1;
2361}
2362
Guido van Rossumd57fd912000-03-10 22:53:23 +00002363/* --- Helpers ------------------------------------------------------------ */
2364
2365static
2366int count(PyUnicodeObject *self,
2367 int start,
2368 int end,
2369 PyUnicodeObject *substring)
2370{
2371 int count = 0;
2372
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002373 if (start < 0)
2374 start += self->length;
2375 if (start < 0)
2376 start = 0;
2377 if (end > self->length)
2378 end = self->length;
2379 if (end < 0)
2380 end += self->length;
2381 if (end < 0)
2382 end = 0;
2383
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002384 if (substring->length == 0)
2385 return (end - start + 1);
2386
Guido van Rossumd57fd912000-03-10 22:53:23 +00002387 end -= substring->length;
2388
2389 while (start <= end)
2390 if (Py_UNICODE_MATCH(self, start, substring)) {
2391 count++;
2392 start += substring->length;
2393 } else
2394 start++;
2395
2396 return count;
2397}
2398
2399int PyUnicode_Count(PyObject *str,
2400 PyObject *substr,
2401 int start,
2402 int end)
2403{
2404 int result;
2405
2406 str = PyUnicode_FromObject(str);
2407 if (str == NULL)
2408 return -1;
2409 substr = PyUnicode_FromObject(substr);
2410 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002411 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002412 return -1;
2413 }
2414
2415 result = count((PyUnicodeObject *)str,
2416 start, end,
2417 (PyUnicodeObject *)substr);
2418
2419 Py_DECREF(str);
2420 Py_DECREF(substr);
2421 return result;
2422}
2423
2424static
2425int findstring(PyUnicodeObject *self,
2426 PyUnicodeObject *substring,
2427 int start,
2428 int end,
2429 int direction)
2430{
2431 if (start < 0)
2432 start += self->length;
2433 if (start < 0)
2434 start = 0;
2435
2436 if (substring->length == 0)
2437 return start;
2438
2439 if (end > self->length)
2440 end = self->length;
2441 if (end < 0)
2442 end += self->length;
2443 if (end < 0)
2444 end = 0;
2445
2446 end -= substring->length;
2447
2448 if (direction < 0) {
2449 for (; end >= start; end--)
2450 if (Py_UNICODE_MATCH(self, end, substring))
2451 return end;
2452 } else {
2453 for (; start <= end; start++)
2454 if (Py_UNICODE_MATCH(self, start, substring))
2455 return start;
2456 }
2457
2458 return -1;
2459}
2460
2461int PyUnicode_Find(PyObject *str,
2462 PyObject *substr,
2463 int start,
2464 int end,
2465 int direction)
2466{
2467 int result;
2468
2469 str = PyUnicode_FromObject(str);
2470 if (str == NULL)
2471 return -1;
2472 substr = PyUnicode_FromObject(substr);
2473 if (substr == NULL) {
2474 Py_DECREF(substr);
2475 return -1;
2476 }
2477
2478 result = findstring((PyUnicodeObject *)str,
2479 (PyUnicodeObject *)substr,
2480 start, end, direction);
2481 Py_DECREF(str);
2482 Py_DECREF(substr);
2483 return result;
2484}
2485
2486static
2487int tailmatch(PyUnicodeObject *self,
2488 PyUnicodeObject *substring,
2489 int start,
2490 int end,
2491 int direction)
2492{
2493 if (start < 0)
2494 start += self->length;
2495 if (start < 0)
2496 start = 0;
2497
2498 if (substring->length == 0)
2499 return 1;
2500
2501 if (end > self->length)
2502 end = self->length;
2503 if (end < 0)
2504 end += self->length;
2505 if (end < 0)
2506 end = 0;
2507
2508 end -= substring->length;
2509 if (end < start)
2510 return 0;
2511
2512 if (direction > 0) {
2513 if (Py_UNICODE_MATCH(self, end, substring))
2514 return 1;
2515 } else {
2516 if (Py_UNICODE_MATCH(self, start, substring))
2517 return 1;
2518 }
2519
2520 return 0;
2521}
2522
2523int PyUnicode_Tailmatch(PyObject *str,
2524 PyObject *substr,
2525 int start,
2526 int end,
2527 int direction)
2528{
2529 int result;
2530
2531 str = PyUnicode_FromObject(str);
2532 if (str == NULL)
2533 return -1;
2534 substr = PyUnicode_FromObject(substr);
2535 if (substr == NULL) {
2536 Py_DECREF(substr);
2537 return -1;
2538 }
2539
2540 result = tailmatch((PyUnicodeObject *)str,
2541 (PyUnicodeObject *)substr,
2542 start, end, direction);
2543 Py_DECREF(str);
2544 Py_DECREF(substr);
2545 return result;
2546}
2547
2548static
2549const Py_UNICODE *findchar(const Py_UNICODE *s,
2550 int size,
2551 Py_UNICODE ch)
2552{
2553 /* like wcschr, but doesn't stop at NULL characters */
2554
2555 while (size-- > 0) {
2556 if (*s == ch)
2557 return s;
2558 s++;
2559 }
2560
2561 return NULL;
2562}
2563
2564/* Apply fixfct filter to the Unicode object self and return a
2565 reference to the modified object */
2566
2567static
2568PyObject *fixup(PyUnicodeObject *self,
2569 int (*fixfct)(PyUnicodeObject *s))
2570{
2571
2572 PyUnicodeObject *u;
2573
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002574 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575 if (u == NULL)
2576 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002577
2578 Py_UNICODE_COPY(u->str, self->str, self->length);
2579
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580 if (!fixfct(u)) {
2581 /* fixfct should return TRUE if it modified the buffer. If
2582 FALSE, return a reference to the original buffer instead
2583 (to save space, not time) */
2584 Py_INCREF(self);
2585 Py_DECREF(u);
2586 return (PyObject*) self;
2587 }
2588 return (PyObject*) u;
2589}
2590
2591static
2592int fixupper(PyUnicodeObject *self)
2593{
2594 int len = self->length;
2595 Py_UNICODE *s = self->str;
2596 int status = 0;
2597
2598 while (len-- > 0) {
2599 register Py_UNICODE ch;
2600
2601 ch = Py_UNICODE_TOUPPER(*s);
2602 if (ch != *s) {
2603 status = 1;
2604 *s = ch;
2605 }
2606 s++;
2607 }
2608
2609 return status;
2610}
2611
2612static
2613int fixlower(PyUnicodeObject *self)
2614{
2615 int len = self->length;
2616 Py_UNICODE *s = self->str;
2617 int status = 0;
2618
2619 while (len-- > 0) {
2620 register Py_UNICODE ch;
2621
2622 ch = Py_UNICODE_TOLOWER(*s);
2623 if (ch != *s) {
2624 status = 1;
2625 *s = ch;
2626 }
2627 s++;
2628 }
2629
2630 return status;
2631}
2632
2633static
2634int fixswapcase(PyUnicodeObject *self)
2635{
2636 int len = self->length;
2637 Py_UNICODE *s = self->str;
2638 int status = 0;
2639
2640 while (len-- > 0) {
2641 if (Py_UNICODE_ISUPPER(*s)) {
2642 *s = Py_UNICODE_TOLOWER(*s);
2643 status = 1;
2644 } else if (Py_UNICODE_ISLOWER(*s)) {
2645 *s = Py_UNICODE_TOUPPER(*s);
2646 status = 1;
2647 }
2648 s++;
2649 }
2650
2651 return status;
2652}
2653
2654static
2655int fixcapitalize(PyUnicodeObject *self)
2656{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002657 int len = self->length;
2658 Py_UNICODE *s = self->str;
2659 int status = 0;
2660
2661 if (len == 0)
2662 return 0;
2663 if (Py_UNICODE_ISLOWER(*s)) {
2664 *s = Py_UNICODE_TOUPPER(*s);
2665 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002666 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002667 s++;
2668 while (--len > 0) {
2669 if (Py_UNICODE_ISUPPER(*s)) {
2670 *s = Py_UNICODE_TOLOWER(*s);
2671 status = 1;
2672 }
2673 s++;
2674 }
2675 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002676}
2677
2678static
2679int fixtitle(PyUnicodeObject *self)
2680{
2681 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2682 register Py_UNICODE *e;
2683 int previous_is_cased;
2684
2685 /* Shortcut for single character strings */
2686 if (PyUnicode_GET_SIZE(self) == 1) {
2687 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2688 if (*p != ch) {
2689 *p = ch;
2690 return 1;
2691 }
2692 else
2693 return 0;
2694 }
2695
2696 e = p + PyUnicode_GET_SIZE(self);
2697 previous_is_cased = 0;
2698 for (; p < e; p++) {
2699 register const Py_UNICODE ch = *p;
2700
2701 if (previous_is_cased)
2702 *p = Py_UNICODE_TOLOWER(ch);
2703 else
2704 *p = Py_UNICODE_TOTITLE(ch);
2705
2706 if (Py_UNICODE_ISLOWER(ch) ||
2707 Py_UNICODE_ISUPPER(ch) ||
2708 Py_UNICODE_ISTITLE(ch))
2709 previous_is_cased = 1;
2710 else
2711 previous_is_cased = 0;
2712 }
2713 return 1;
2714}
2715
2716PyObject *PyUnicode_Join(PyObject *separator,
2717 PyObject *seq)
2718{
2719 Py_UNICODE *sep;
2720 int seplen;
2721 PyUnicodeObject *res = NULL;
2722 int reslen = 0;
2723 Py_UNICODE *p;
2724 int seqlen = 0;
2725 int sz = 100;
2726 int i;
2727
Jeremy Hylton03657cf2000-07-12 13:05:33 +00002728 seqlen = PySequence_Size(seq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002729 if (seqlen < 0 && PyErr_Occurred())
2730 return NULL;
2731
2732 if (separator == NULL) {
2733 Py_UNICODE blank = ' ';
2734 sep = &blank;
2735 seplen = 1;
2736 }
2737 else {
2738 separator = PyUnicode_FromObject(separator);
2739 if (separator == NULL)
2740 return NULL;
2741 sep = PyUnicode_AS_UNICODE(separator);
2742 seplen = PyUnicode_GET_SIZE(separator);
2743 }
2744
2745 res = _PyUnicode_New(sz);
2746 if (res == NULL)
2747 goto onError;
2748 p = PyUnicode_AS_UNICODE(res);
2749 reslen = 0;
2750
2751 for (i = 0; i < seqlen; i++) {
2752 int itemlen;
2753 PyObject *item;
2754
2755 item = PySequence_GetItem(seq, i);
2756 if (item == NULL)
2757 goto onError;
2758 if (!PyUnicode_Check(item)) {
2759 PyObject *v;
2760 v = PyUnicode_FromObject(item);
2761 Py_DECREF(item);
2762 item = v;
2763 if (item == NULL)
2764 goto onError;
2765 }
2766 itemlen = PyUnicode_GET_SIZE(item);
2767 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002768 if (_PyUnicode_Resize(&res, sz*2))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002769 goto onError;
2770 sz *= 2;
2771 p = PyUnicode_AS_UNICODE(res) + reslen;
2772 }
2773 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002774 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002775 p += seplen;
2776 reslen += seplen;
2777 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002778 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002779 p += itemlen;
2780 reslen += itemlen;
2781 Py_DECREF(item);
2782 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002783 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 goto onError;
2785
2786 Py_XDECREF(separator);
2787 return (PyObject *)res;
2788
2789 onError:
2790 Py_XDECREF(separator);
2791 Py_DECREF(res);
2792 return NULL;
2793}
2794
2795static
2796PyUnicodeObject *pad(PyUnicodeObject *self,
2797 int left,
2798 int right,
2799 Py_UNICODE fill)
2800{
2801 PyUnicodeObject *u;
2802
2803 if (left < 0)
2804 left = 0;
2805 if (right < 0)
2806 right = 0;
2807
2808 if (left == 0 && right == 0) {
2809 Py_INCREF(self);
2810 return self;
2811 }
2812
2813 u = _PyUnicode_New(left + self->length + right);
2814 if (u) {
2815 if (left)
2816 Py_UNICODE_FILL(u->str, fill, left);
2817 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2818 if (right)
2819 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2820 }
2821
2822 return u;
2823}
2824
/* Append the slice data[left:right] to the result list as a new Unicode
   object.

   The macro relies on names of the expanding function: a PyObject *str
   scratch variable, the PyObject *list being built, and an onError
   label that is jumped to when the slice cannot be created or appended.
   It is wrapped in do { } while (0) so that it behaves like a single
   statement; invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2835
2836static
2837PyObject *split_whitespace(PyUnicodeObject *self,
2838 PyObject *list,
2839 int maxcount)
2840{
2841 register int i;
2842 register int j;
2843 int len = self->length;
2844 PyObject *str;
2845
2846 for (i = j = 0; i < len; ) {
2847 /* find a token */
2848 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2849 i++;
2850 j = i;
2851 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2852 i++;
2853 if (j < i) {
2854 if (maxcount-- <= 0)
2855 break;
2856 SPLIT_APPEND(self->str, j, i);
2857 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2858 i++;
2859 j = i;
2860 }
2861 }
2862 if (j < len) {
2863 SPLIT_APPEND(self->str, j, len);
2864 }
2865 return list;
2866
2867 onError:
2868 Py_DECREF(list);
2869 return NULL;
2870}
2871
2872PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002873 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874{
2875 register int i;
2876 register int j;
2877 int len;
2878 PyObject *list;
2879 PyObject *str;
2880 Py_UNICODE *data;
2881
2882 string = PyUnicode_FromObject(string);
2883 if (string == NULL)
2884 return NULL;
2885 data = PyUnicode_AS_UNICODE(string);
2886 len = PyUnicode_GET_SIZE(string);
2887
Guido van Rossumd57fd912000-03-10 22:53:23 +00002888 list = PyList_New(0);
2889 if (!list)
2890 goto onError;
2891
2892 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002893 int eol;
2894
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895 /* Find a line and append it */
2896 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2897 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002898
2899 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002900 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002901 if (i < len) {
2902 if (data[i] == '\r' && i + 1 < len &&
2903 data[i+1] == '\n')
2904 i += 2;
2905 else
2906 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002907 if (keepends)
2908 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909 }
Guido van Rossum86662912000-04-11 15:38:46 +00002910 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002911 j = i;
2912 }
2913 if (j < len) {
2914 SPLIT_APPEND(data, j, len);
2915 }
2916
2917 Py_DECREF(string);
2918 return list;
2919
2920 onError:
2921 Py_DECREF(list);
2922 Py_DECREF(string);
2923 return NULL;
2924}
2925
2926static
2927PyObject *split_char(PyUnicodeObject *self,
2928 PyObject *list,
2929 Py_UNICODE ch,
2930 int maxcount)
2931{
2932 register int i;
2933 register int j;
2934 int len = self->length;
2935 PyObject *str;
2936
2937 for (i = j = 0; i < len; ) {
2938 if (self->str[i] == ch) {
2939 if (maxcount-- <= 0)
2940 break;
2941 SPLIT_APPEND(self->str, j, i);
2942 i = j = i + 1;
2943 } else
2944 i++;
2945 }
2946 if (j <= len) {
2947 SPLIT_APPEND(self->str, j, len);
2948 }
2949 return list;
2950
2951 onError:
2952 Py_DECREF(list);
2953 return NULL;
2954}
2955
2956static
2957PyObject *split_substring(PyUnicodeObject *self,
2958 PyObject *list,
2959 PyUnicodeObject *substring,
2960 int maxcount)
2961{
2962 register int i;
2963 register int j;
2964 int len = self->length;
2965 int sublen = substring->length;
2966 PyObject *str;
2967
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00002968 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002969 if (Py_UNICODE_MATCH(self, i, substring)) {
2970 if (maxcount-- <= 0)
2971 break;
2972 SPLIT_APPEND(self->str, j, i);
2973 i = j = i + sublen;
2974 } else
2975 i++;
2976 }
2977 if (j <= len) {
2978 SPLIT_APPEND(self->str, j, len);
2979 }
2980 return list;
2981
2982 onError:
2983 Py_DECREF(list);
2984 return NULL;
2985}
2986
2987#undef SPLIT_APPEND
2988
2989static
2990PyObject *split(PyUnicodeObject *self,
2991 PyUnicodeObject *substring,
2992 int maxcount)
2993{
2994 PyObject *list;
2995
2996 if (maxcount < 0)
2997 maxcount = INT_MAX;
2998
2999 list = PyList_New(0);
3000 if (!list)
3001 return NULL;
3002
3003 if (substring == NULL)
3004 return split_whitespace(self,list,maxcount);
3005
3006 else if (substring->length == 1)
3007 return split_char(self,list,substring->str[0],maxcount);
3008
3009 else if (substring->length == 0) {
3010 Py_DECREF(list);
3011 PyErr_SetString(PyExc_ValueError, "empty separator");
3012 return NULL;
3013 }
3014 else
3015 return split_substring(self,list,substring,maxcount);
3016}
3017
3018static
3019PyObject *strip(PyUnicodeObject *self,
3020 int left,
3021 int right)
3022{
3023 Py_UNICODE *p = self->str;
3024 int start = 0;
3025 int end = self->length;
3026
3027 if (left)
3028 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3029 start++;
3030
3031 if (right)
3032 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3033 end--;
3034
3035 if (start == 0 && end == self->length) {
3036 /* couldn't strip anything off, return original string */
3037 Py_INCREF(self);
3038 return (PyObject*) self;
3039 }
3040
3041 return (PyObject*) PyUnicode_FromUnicode(
3042 self->str + start,
3043 end - start
3044 );
3045}
3046
3047static
3048PyObject *replace(PyUnicodeObject *self,
3049 PyUnicodeObject *str1,
3050 PyUnicodeObject *str2,
3051 int maxcount)
3052{
3053 PyUnicodeObject *u;
3054
3055 if (maxcount < 0)
3056 maxcount = INT_MAX;
3057
3058 if (str1->length == 1 && str2->length == 1) {
3059 int i;
3060
3061 /* replace characters */
3062 if (!findchar(self->str, self->length, str1->str[0])) {
3063 /* nothing to replace, return original string */
3064 Py_INCREF(self);
3065 u = self;
3066 } else {
3067 Py_UNICODE u1 = str1->str[0];
3068 Py_UNICODE u2 = str2->str[0];
3069
3070 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003071 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003072 self->length
3073 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003074 if (u != NULL) {
3075 Py_UNICODE_COPY(u->str, self->str,
3076 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003077 for (i = 0; i < u->length; i++)
3078 if (u->str[i] == u1) {
3079 if (--maxcount < 0)
3080 break;
3081 u->str[i] = u2;
3082 }
3083 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003084 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085
3086 } else {
3087 int n, i;
3088 Py_UNICODE *p;
3089
3090 /* replace strings */
3091 n = count(self, 0, self->length, str1);
3092 if (n > maxcount)
3093 n = maxcount;
3094 if (n == 0) {
3095 /* nothing to replace, return original string */
3096 Py_INCREF(self);
3097 u = self;
3098 } else {
3099 u = _PyUnicode_New(
3100 self->length + n * (str2->length - str1->length));
3101 if (u) {
3102 i = 0;
3103 p = u->str;
3104 while (i <= self->length - str1->length)
3105 if (Py_UNICODE_MATCH(self, i, str1)) {
3106 /* replace string segment */
3107 Py_UNICODE_COPY(p, str2->str, str2->length);
3108 p += str2->length;
3109 i += str1->length;
3110 if (--n <= 0) {
3111 /* copy remaining part */
3112 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3113 break;
3114 }
3115 } else
3116 *p++ = self->str[i++];
3117 }
3118 }
3119 }
3120
3121 return (PyObject *) u;
3122}
3123
3124/* --- Unicode Object Methods --------------------------------------------- */
3125
3126static char title__doc__[] =
3127"S.title() -> unicode\n\
3128\n\
3129Return a titlecased version of S, i.e. words start with title case\n\
3130characters, all remaining cased characters have lower case.";
3131
3132static PyObject*
3133unicode_title(PyUnicodeObject *self, PyObject *args)
3134{
3135 if (!PyArg_NoArgs(args))
3136 return NULL;
3137 return fixup(self, fixtitle);
3138}
3139
3140static char capitalize__doc__[] =
3141"S.capitalize() -> unicode\n\
3142\n\
3143Return a capitalized version of S, i.e. make the first character\n\
3144have upper case.";
3145
3146static PyObject*
3147unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3148{
3149 if (!PyArg_NoArgs(args))
3150 return NULL;
3151 return fixup(self, fixcapitalize);
3152}
3153
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords() (currently compiled out): split on whitespace,
   capitalize every word, then join the words back together. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result;
    int pos;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Break S into whitespace-delimited words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap each word for its capitalized form */
    pos = 0;
    while (pos < PyList_GET_SIZE(words)) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                       fixcapitalize);
        if (result == NULL)
            goto done;
        /* release the old word before the slot is overwritten */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, result);
        pos++;
    }

    /* Glue the words back together */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
3194
3195static char center__doc__[] =
3196"S.center(width) -> unicode\n\
3197\n\
3198Return S centered in a Unicode string of length width. Padding is done\n\
3199using spaces.";
3200
3201static PyObject *
3202unicode_center(PyUnicodeObject *self, PyObject *args)
3203{
3204 int marg, left;
3205 int width;
3206
3207 if (!PyArg_ParseTuple(args, "i:center", &width))
3208 return NULL;
3209
3210 if (self->length >= width) {
3211 Py_INCREF(self);
3212 return (PyObject*) self;
3213 }
3214
3215 marg = width - self->length;
3216 left = marg / 2 + (marg & width & 1);
3217
3218 return (PyObject*) pad(self, left, marg - left, ' ');
3219}
3220
Marc-André Lemburge5034372000-08-08 08:04:29 +00003221#if 0
3222
3223/* This code should go into some future Unicode collation support
3224 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003225 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003226
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003227/* speedy UTF-16 code point order comparison */
3228/* gleaned from: */
3229/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3230
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003231static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003232{
3233 0, 0, 0, 0, 0, 0, 0, 0,
3234 0, 0, 0, 0, 0, 0, 0, 0,
3235 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003236 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003237};
3238
Guido van Rossumd57fd912000-03-10 22:53:23 +00003239static int
3240unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3241{
3242 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003243
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 Py_UNICODE *s1 = str1->str;
3245 Py_UNICODE *s2 = str2->str;
3246
3247 len1 = str1->length;
3248 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003249
Guido van Rossumd57fd912000-03-10 22:53:23 +00003250 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003251 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003252 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003253
3254 c1 = *s1++;
3255 c2 = *s2++;
3256 if (c1 > (1<<11) * 26)
3257 c1 += utf16Fixup[c1>>11];
3258 if (c2 > (1<<11) * 26)
3259 c2 += utf16Fixup[c2>>11];
3260
3261 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003262 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003263 if (diff)
3264 return (diff < 0) ? -1 : (diff != 0);
3265 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003266 }
3267
3268 return (len1 < len2) ? -1 : (len1 != len2);
3269}
3270
Marc-André Lemburge5034372000-08-08 08:04:29 +00003271#else
3272
3273static int
3274unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3275{
3276 register int len1, len2;
3277
3278 Py_UNICODE *s1 = str1->str;
3279 Py_UNICODE *s2 = str2->str;
3280
3281 len1 = str1->length;
3282 len2 = str2->length;
3283
3284 while (len1 > 0 && len2 > 0) {
3285 register long diff;
3286
3287 diff = (long)*s1++ - (long)*s2++;
3288 if (diff)
3289 return (diff < 0) ? -1 : (diff != 0);
3290 len1--; len2--;
3291 }
3292
3293 return (len1 < len2) ? -1 : (len1 != len2);
3294}
3295
3296#endif
3297
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298int PyUnicode_Compare(PyObject *left,
3299 PyObject *right)
3300{
3301 PyUnicodeObject *u = NULL, *v = NULL;
3302 int result;
3303
3304 /* Coerce the two arguments */
3305 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3306 if (u == NULL)
3307 goto onError;
3308 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3309 if (v == NULL)
3310 goto onError;
3311
Thomas Wouters7e474022000-07-16 12:04:32 +00003312 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313 if (v == u) {
3314 Py_DECREF(u);
3315 Py_DECREF(v);
3316 return 0;
3317 }
3318
3319 result = unicode_compare(u, v);
3320
3321 Py_DECREF(u);
3322 Py_DECREF(v);
3323 return result;
3324
3325onError:
3326 Py_XDECREF(u);
3327 Py_XDECREF(v);
3328 return -1;
3329}
3330
Guido van Rossum403d68b2000-03-13 15:55:09 +00003331int PyUnicode_Contains(PyObject *container,
3332 PyObject *element)
3333{
3334 PyUnicodeObject *u = NULL, *v = NULL;
3335 int result;
3336 register const Py_UNICODE *p, *e;
3337 register Py_UNICODE ch;
3338
3339 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003340 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003341 if (v == NULL) {
3342 PyErr_SetString(PyExc_TypeError,
3343 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003344 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003345 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003346 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3347 if (u == NULL) {
3348 Py_DECREF(v);
3349 goto onError;
3350 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003351
3352 /* Check v in u */
3353 if (PyUnicode_GET_SIZE(v) != 1) {
3354 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003355 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003356 goto onError;
3357 }
3358 ch = *PyUnicode_AS_UNICODE(v);
3359 p = PyUnicode_AS_UNICODE(u);
3360 e = p + PyUnicode_GET_SIZE(u);
3361 result = 0;
3362 while (p < e) {
3363 if (*p++ == ch) {
3364 result = 1;
3365 break;
3366 }
3367 }
3368
3369 Py_DECREF(u);
3370 Py_DECREF(v);
3371 return result;
3372
3373onError:
3374 Py_XDECREF(u);
3375 Py_XDECREF(v);
3376 return -1;
3377}
3378
Guido van Rossumd57fd912000-03-10 22:53:23 +00003379/* Concat to string or Unicode object giving a new Unicode object. */
3380
3381PyObject *PyUnicode_Concat(PyObject *left,
3382 PyObject *right)
3383{
3384 PyUnicodeObject *u = NULL, *v = NULL, *w;
3385
3386 /* Coerce the two arguments */
3387 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3388 if (u == NULL)
3389 goto onError;
3390 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3391 if (v == NULL)
3392 goto onError;
3393
3394 /* Shortcuts */
3395 if (v == unicode_empty) {
3396 Py_DECREF(v);
3397 return (PyObject *)u;
3398 }
3399 if (u == unicode_empty) {
3400 Py_DECREF(u);
3401 return (PyObject *)v;
3402 }
3403
3404 /* Concat the two Unicode strings */
3405 w = _PyUnicode_New(u->length + v->length);
3406 if (w == NULL)
3407 goto onError;
3408 Py_UNICODE_COPY(w->str, u->str, u->length);
3409 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3410
3411 Py_DECREF(u);
3412 Py_DECREF(v);
3413 return (PyObject *)w;
3414
3415onError:
3416 Py_XDECREF(u);
3417 Py_XDECREF(v);
3418 return NULL;
3419}
3420
3421static char count__doc__[] =
3422"S.count(sub[, start[, end]]) -> int\n\
3423\n\
3424Return the number of occurrences of substring sub in Unicode string\n\
3425S[start:end]. Optional arguments start and end are\n\
3426interpreted as in slice notation.";
3427
3428static PyObject *
3429unicode_count(PyUnicodeObject *self, PyObject *args)
3430{
3431 PyUnicodeObject *substring;
3432 int start = 0;
3433 int end = INT_MAX;
3434 PyObject *result;
3435
Guido van Rossumb8872e62000-05-09 14:14:27 +00003436 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3437 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003438 return NULL;
3439
3440 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3441 (PyObject *)substring);
3442 if (substring == NULL)
3443 return NULL;
3444
Guido van Rossumd57fd912000-03-10 22:53:23 +00003445 if (start < 0)
3446 start += self->length;
3447 if (start < 0)
3448 start = 0;
3449 if (end > self->length)
3450 end = self->length;
3451 if (end < 0)
3452 end += self->length;
3453 if (end < 0)
3454 end = 0;
3455
3456 result = PyInt_FromLong((long) count(self, start, end, substring));
3457
3458 Py_DECREF(substring);
3459 return result;
3460}
3461
3462static char encode__doc__[] =
3463"S.encode([encoding[,errors]]) -> string\n\
3464\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003465Return an encoded string version of S. Default encoding is the current\n\
3466default string encoding. errors may be given to set a different error\n\
3467handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3468a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469
3470static PyObject *
3471unicode_encode(PyUnicodeObject *self, PyObject *args)
3472{
3473 char *encoding = NULL;
3474 char *errors = NULL;
3475 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3476 return NULL;
3477 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3478}
3479
3480static char expandtabs__doc__[] =
3481"S.expandtabs([tabsize]) -> unicode\n\
3482\n\
3483Return a copy of S where all tab characters are expanded using spaces.\n\
3484If tabsize is not given, a tab size of 8 characters is assumed.";
3485
3486static PyObject*
3487unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3488{
3489 Py_UNICODE *e;
3490 Py_UNICODE *p;
3491 Py_UNICODE *q;
3492 int i, j;
3493 PyUnicodeObject *u;
3494 int tabsize = 8;
3495
3496 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3497 return NULL;
3498
Thomas Wouters7e474022000-07-16 12:04:32 +00003499 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003500 i = j = 0;
3501 e = self->str + self->length;
3502 for (p = self->str; p < e; p++)
3503 if (*p == '\t') {
3504 if (tabsize > 0)
3505 j += tabsize - (j % tabsize);
3506 }
3507 else {
3508 j++;
3509 if (*p == '\n' || *p == '\r') {
3510 i += j;
3511 j = 0;
3512 }
3513 }
3514
3515 /* Second pass: create output string and fill it */
3516 u = _PyUnicode_New(i + j);
3517 if (!u)
3518 return NULL;
3519
3520 j = 0;
3521 q = u->str;
3522
3523 for (p = self->str; p < e; p++)
3524 if (*p == '\t') {
3525 if (tabsize > 0) {
3526 i = tabsize - (j % tabsize);
3527 j += i;
3528 while (i--)
3529 *q++ = ' ';
3530 }
3531 }
3532 else {
3533 j++;
3534 *q++ = *p;
3535 if (*p == '\n' || *p == '\r')
3536 j = 0;
3537 }
3538
3539 return (PyObject*) u;
3540}
3541
3542static char find__doc__[] =
3543"S.find(sub [,start [,end]]) -> int\n\
3544\n\
3545Return the lowest index in S where substring sub is found,\n\
3546such that sub is contained within s[start,end]. Optional\n\
3547arguments start and end are interpreted as in slice notation.\n\
3548\n\
3549Return -1 on failure.";
3550
3551static PyObject *
3552unicode_find(PyUnicodeObject *self, PyObject *args)
3553{
3554 PyUnicodeObject *substring;
3555 int start = 0;
3556 int end = INT_MAX;
3557 PyObject *result;
3558
Guido van Rossumb8872e62000-05-09 14:14:27 +00003559 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3560 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003561 return NULL;
3562 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3563 (PyObject *)substring);
3564 if (substring == NULL)
3565 return NULL;
3566
3567 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3568
3569 Py_DECREF(substring);
3570 return result;
3571}
3572
3573static PyObject *
3574unicode_getitem(PyUnicodeObject *self, int index)
3575{
3576 if (index < 0 || index >= self->length) {
3577 PyErr_SetString(PyExc_IndexError, "string index out of range");
3578 return NULL;
3579 }
3580
3581 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3582}
3583
3584static long
3585unicode_hash(PyUnicodeObject *self)
3586{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003587 /* Since Unicode objects compare equal to their ASCII string
3588 counterparts, they should use the individual character values
3589 as basis for their hash value. This is needed to assure that
3590 strings and Unicode objects behave in the same way as
3591 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003592
Fredrik Lundhdde61642000-07-10 18:27:47 +00003593 register int len;
3594 register Py_UNICODE *p;
3595 register long x;
3596
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597 if (self->hash != -1)
3598 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003599 len = PyUnicode_GET_SIZE(self);
3600 p = PyUnicode_AS_UNICODE(self);
3601 x = *p << 7;
3602 while (--len >= 0)
3603 x = (1000003*x) ^ *p++;
3604 x ^= PyUnicode_GET_SIZE(self);
3605 if (x == -1)
3606 x = -2;
3607 self->hash = x;
3608 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003609}
3610
3611static char index__doc__[] =
3612"S.index(sub [,start [,end]]) -> int\n\
3613\n\
3614Like S.find() but raise ValueError when the substring is not found.";
3615
3616static PyObject *
3617unicode_index(PyUnicodeObject *self, PyObject *args)
3618{
3619 int result;
3620 PyUnicodeObject *substring;
3621 int start = 0;
3622 int end = INT_MAX;
3623
Guido van Rossumb8872e62000-05-09 14:14:27 +00003624 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3625 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003626 return NULL;
3627
3628 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3629 (PyObject *)substring);
3630 if (substring == NULL)
3631 return NULL;
3632
3633 result = findstring(self, substring, start, end, 1);
3634
3635 Py_DECREF(substring);
3636 if (result < 0) {
3637 PyErr_SetString(PyExc_ValueError, "substring not found");
3638 return NULL;
3639 }
3640 return PyInt_FromLong(result);
3641}
3642
3643static char islower__doc__[] =
3644"S.islower() -> int\n\
3645\n\
3646Return 1 if all cased characters in S are lowercase and there is\n\
3647at least one cased character in S, 0 otherwise.";
3648
3649static PyObject*
3650unicode_islower(PyUnicodeObject *self, PyObject *args)
3651{
3652 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3653 register const Py_UNICODE *e;
3654 int cased;
3655
3656 if (!PyArg_NoArgs(args))
3657 return NULL;
3658
3659 /* Shortcut for single character strings */
3660 if (PyUnicode_GET_SIZE(self) == 1)
3661 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3662
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003663 /* Special case for empty strings */
3664 if (PyString_GET_SIZE(self) == 0)
3665 return PyInt_FromLong(0);
3666
Guido van Rossumd57fd912000-03-10 22:53:23 +00003667 e = p + PyUnicode_GET_SIZE(self);
3668 cased = 0;
3669 for (; p < e; p++) {
3670 register const Py_UNICODE ch = *p;
3671
3672 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3673 return PyInt_FromLong(0);
3674 else if (!cased && Py_UNICODE_ISLOWER(ch))
3675 cased = 1;
3676 }
3677 return PyInt_FromLong(cased);
3678}
3679
3680static char isupper__doc__[] =
3681"S.isupper() -> int\n\
3682\n\
3683Return 1 if all cased characters in S are uppercase and there is\n\
3684at least one cased character in S, 0 otherwise.";
3685
3686static PyObject*
3687unicode_isupper(PyUnicodeObject *self, PyObject *args)
3688{
3689 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3690 register const Py_UNICODE *e;
3691 int cased;
3692
3693 if (!PyArg_NoArgs(args))
3694 return NULL;
3695
3696 /* Shortcut for single character strings */
3697 if (PyUnicode_GET_SIZE(self) == 1)
3698 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3699
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003700 /* Special case for empty strings */
3701 if (PyString_GET_SIZE(self) == 0)
3702 return PyInt_FromLong(0);
3703
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704 e = p + PyUnicode_GET_SIZE(self);
3705 cased = 0;
3706 for (; p < e; p++) {
3707 register const Py_UNICODE ch = *p;
3708
3709 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3710 return PyInt_FromLong(0);
3711 else if (!cased && Py_UNICODE_ISUPPER(ch))
3712 cased = 1;
3713 }
3714 return PyInt_FromLong(cased);
3715}
3716
3717static char istitle__doc__[] =
3718"S.istitle() -> int\n\
3719\n\
3720Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3721may only follow uncased characters and lowercase characters only cased\n\
3722ones. Return 0 otherwise.";
3723
3724static PyObject*
3725unicode_istitle(PyUnicodeObject *self, PyObject *args)
3726{
3727 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3728 register const Py_UNICODE *e;
3729 int cased, previous_is_cased;
3730
3731 if (!PyArg_NoArgs(args))
3732 return NULL;
3733
3734 /* Shortcut for single character strings */
3735 if (PyUnicode_GET_SIZE(self) == 1)
3736 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3737 (Py_UNICODE_ISUPPER(*p) != 0));
3738
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003739 /* Special case for empty strings */
3740 if (PyString_GET_SIZE(self) == 0)
3741 return PyInt_FromLong(0);
3742
Guido van Rossumd57fd912000-03-10 22:53:23 +00003743 e = p + PyUnicode_GET_SIZE(self);
3744 cased = 0;
3745 previous_is_cased = 0;
3746 for (; p < e; p++) {
3747 register const Py_UNICODE ch = *p;
3748
3749 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3750 if (previous_is_cased)
3751 return PyInt_FromLong(0);
3752 previous_is_cased = 1;
3753 cased = 1;
3754 }
3755 else if (Py_UNICODE_ISLOWER(ch)) {
3756 if (!previous_is_cased)
3757 return PyInt_FromLong(0);
3758 previous_is_cased = 1;
3759 cased = 1;
3760 }
3761 else
3762 previous_is_cased = 0;
3763 }
3764 return PyInt_FromLong(cased);
3765}
3766
3767static char isspace__doc__[] =
3768"S.isspace() -> int\n\
3769\n\
3770Return 1 if there are only whitespace characters in S,\n\
37710 otherwise.";
3772
3773static PyObject*
3774unicode_isspace(PyUnicodeObject *self, PyObject *args)
3775{
3776 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3777 register const Py_UNICODE *e;
3778
3779 if (!PyArg_NoArgs(args))
3780 return NULL;
3781
3782 /* Shortcut for single character strings */
3783 if (PyUnicode_GET_SIZE(self) == 1 &&
3784 Py_UNICODE_ISSPACE(*p))
3785 return PyInt_FromLong(1);
3786
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003787 /* Special case for empty strings */
3788 if (PyString_GET_SIZE(self) == 0)
3789 return PyInt_FromLong(0);
3790
Guido van Rossumd57fd912000-03-10 22:53:23 +00003791 e = p + PyUnicode_GET_SIZE(self);
3792 for (; p < e; p++) {
3793 if (!Py_UNICODE_ISSPACE(*p))
3794 return PyInt_FromLong(0);
3795 }
3796 return PyInt_FromLong(1);
3797}
3798
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003799static char isalpha__doc__[] =
3800"S.isalpha() -> int\n\
3801\n\
3802Return 1 if all characters in S are alphabetic\n\
3803and there is at least one character in S, 0 otherwise.";
3804
3805static PyObject*
3806unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3807{
3808 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3809 register const Py_UNICODE *e;
3810
3811 if (!PyArg_NoArgs(args))
3812 return NULL;
3813
3814 /* Shortcut for single character strings */
3815 if (PyUnicode_GET_SIZE(self) == 1 &&
3816 Py_UNICODE_ISALPHA(*p))
3817 return PyInt_FromLong(1);
3818
3819 /* Special case for empty strings */
3820 if (PyString_GET_SIZE(self) == 0)
3821 return PyInt_FromLong(0);
3822
3823 e = p + PyUnicode_GET_SIZE(self);
3824 for (; p < e; p++) {
3825 if (!Py_UNICODE_ISALPHA(*p))
3826 return PyInt_FromLong(0);
3827 }
3828 return PyInt_FromLong(1);
3829}
3830
3831static char isalnum__doc__[] =
3832"S.isalnum() -> int\n\
3833\n\
3834Return 1 if all characters in S are alphanumeric\n\
3835and there is at least one character in S, 0 otherwise.";
3836
3837static PyObject*
3838unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3839{
3840 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3841 register const Py_UNICODE *e;
3842
3843 if (!PyArg_NoArgs(args))
3844 return NULL;
3845
3846 /* Shortcut for single character strings */
3847 if (PyUnicode_GET_SIZE(self) == 1 &&
3848 Py_UNICODE_ISALNUM(*p))
3849 return PyInt_FromLong(1);
3850
3851 /* Special case for empty strings */
3852 if (PyString_GET_SIZE(self) == 0)
3853 return PyInt_FromLong(0);
3854
3855 e = p + PyUnicode_GET_SIZE(self);
3856 for (; p < e; p++) {
3857 if (!Py_UNICODE_ISALNUM(*p))
3858 return PyInt_FromLong(0);
3859 }
3860 return PyInt_FromLong(1);
3861}
3862
Guido van Rossumd57fd912000-03-10 22:53:23 +00003863static char isdecimal__doc__[] =
3864"S.isdecimal() -> int\n\
3865\n\
3866Return 1 if there are only decimal characters in S,\n\
38670 otherwise.";
3868
3869static PyObject*
3870unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3871{
3872 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3873 register const Py_UNICODE *e;
3874
3875 if (!PyArg_NoArgs(args))
3876 return NULL;
3877
3878 /* Shortcut for single character strings */
3879 if (PyUnicode_GET_SIZE(self) == 1 &&
3880 Py_UNICODE_ISDECIMAL(*p))
3881 return PyInt_FromLong(1);
3882
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003883 /* Special case for empty strings */
3884 if (PyString_GET_SIZE(self) == 0)
3885 return PyInt_FromLong(0);
3886
Guido van Rossumd57fd912000-03-10 22:53:23 +00003887 e = p + PyUnicode_GET_SIZE(self);
3888 for (; p < e; p++) {
3889 if (!Py_UNICODE_ISDECIMAL(*p))
3890 return PyInt_FromLong(0);
3891 }
3892 return PyInt_FromLong(1);
3893}
3894
3895static char isdigit__doc__[] =
3896"S.isdigit() -> int\n\
3897\n\
3898Return 1 if there are only digit characters in S,\n\
38990 otherwise.";
3900
3901static PyObject*
3902unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3903{
3904 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3905 register const Py_UNICODE *e;
3906
3907 if (!PyArg_NoArgs(args))
3908 return NULL;
3909
3910 /* Shortcut for single character strings */
3911 if (PyUnicode_GET_SIZE(self) == 1 &&
3912 Py_UNICODE_ISDIGIT(*p))
3913 return PyInt_FromLong(1);
3914
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003915 /* Special case for empty strings */
3916 if (PyString_GET_SIZE(self) == 0)
3917 return PyInt_FromLong(0);
3918
Guido van Rossumd57fd912000-03-10 22:53:23 +00003919 e = p + PyUnicode_GET_SIZE(self);
3920 for (; p < e; p++) {
3921 if (!Py_UNICODE_ISDIGIT(*p))
3922 return PyInt_FromLong(0);
3923 }
3924 return PyInt_FromLong(1);
3925}
3926
3927static char isnumeric__doc__[] =
3928"S.isnumeric() -> int\n\
3929\n\
3930Return 1 if there are only numeric characters in S,\n\
39310 otherwise.";
3932
3933static PyObject*
3934unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3935{
3936 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3937 register const Py_UNICODE *e;
3938
3939 if (!PyArg_NoArgs(args))
3940 return NULL;
3941
3942 /* Shortcut for single character strings */
3943 if (PyUnicode_GET_SIZE(self) == 1 &&
3944 Py_UNICODE_ISNUMERIC(*p))
3945 return PyInt_FromLong(1);
3946
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003947 /* Special case for empty strings */
3948 if (PyString_GET_SIZE(self) == 0)
3949 return PyInt_FromLong(0);
3950
Guido van Rossumd57fd912000-03-10 22:53:23 +00003951 e = p + PyUnicode_GET_SIZE(self);
3952 for (; p < e; p++) {
3953 if (!Py_UNICODE_ISNUMERIC(*p))
3954 return PyInt_FromLong(0);
3955 }
3956 return PyInt_FromLong(1);
3957}
3958
3959static char join__doc__[] =
3960"S.join(sequence) -> unicode\n\
3961\n\
3962Return a string which is the concatenation of the strings in the\n\
3963sequence. The separator between elements is S.";
3964
3965static PyObject*
3966unicode_join(PyUnicodeObject *self, PyObject *args)
3967{
3968 PyObject *data;
3969 if (!PyArg_ParseTuple(args, "O:join", &data))
3970 return NULL;
3971
3972 return PyUnicode_Join((PyObject *)self, data);
3973}
3974
3975static int
3976unicode_length(PyUnicodeObject *self)
3977{
3978 return self->length;
3979}
3980
3981static char ljust__doc__[] =
3982"S.ljust(width) -> unicode\n\
3983\n\
3984Return S left justified in a Unicode string of length width. Padding is\n\
3985done using spaces.";
3986
3987static PyObject *
3988unicode_ljust(PyUnicodeObject *self, PyObject *args)
3989{
3990 int width;
3991 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3992 return NULL;
3993
3994 if (self->length >= width) {
3995 Py_INCREF(self);
3996 return (PyObject*) self;
3997 }
3998
3999 return (PyObject*) pad(self, 0, width - self->length, ' ');
4000}
4001
4002static char lower__doc__[] =
4003"S.lower() -> unicode\n\
4004\n\
4005Return a copy of the string S converted to lowercase.";
4006
4007static PyObject*
4008unicode_lower(PyUnicodeObject *self, PyObject *args)
4009{
4010 if (!PyArg_NoArgs(args))
4011 return NULL;
4012 return fixup(self, fixlower);
4013}
4014
4015static char lstrip__doc__[] =
4016"S.lstrip() -> unicode\n\
4017\n\
4018Return a copy of the string S with leading whitespace removed.";
4019
4020static PyObject *
4021unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4022{
4023 if (!PyArg_NoArgs(args))
4024 return NULL;
4025 return strip(self, 1, 0);
4026}
4027
4028static PyObject*
4029unicode_repeat(PyUnicodeObject *str, int len)
4030{
4031 PyUnicodeObject *u;
4032 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004033 int nchars;
4034 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004035
4036 if (len < 0)
4037 len = 0;
4038
4039 if (len == 1) {
4040 /* no repeat, return original string */
4041 Py_INCREF(str);
4042 return (PyObject*) str;
4043 }
Tim Peters8f422462000-09-09 06:13:41 +00004044
4045 /* ensure # of chars needed doesn't overflow int and # of bytes
4046 * needed doesn't overflow size_t
4047 */
4048 nchars = len * str->length;
4049 if (len && nchars / len != str->length) {
4050 PyErr_SetString(PyExc_OverflowError,
4051 "repeated string is too long");
4052 return NULL;
4053 }
4054 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4055 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4056 PyErr_SetString(PyExc_OverflowError,
4057 "repeated string is too long");
4058 return NULL;
4059 }
4060 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004061 if (!u)
4062 return NULL;
4063
4064 p = u->str;
4065
4066 while (len-- > 0) {
4067 Py_UNICODE_COPY(p, str->str, str->length);
4068 p += str->length;
4069 }
4070
4071 return (PyObject*) u;
4072}
4073
4074PyObject *PyUnicode_Replace(PyObject *obj,
4075 PyObject *subobj,
4076 PyObject *replobj,
4077 int maxcount)
4078{
4079 PyObject *self;
4080 PyObject *str1;
4081 PyObject *str2;
4082 PyObject *result;
4083
4084 self = PyUnicode_FromObject(obj);
4085 if (self == NULL)
4086 return NULL;
4087 str1 = PyUnicode_FromObject(subobj);
4088 if (str1 == NULL) {
4089 Py_DECREF(self);
4090 return NULL;
4091 }
4092 str2 = PyUnicode_FromObject(replobj);
4093 if (str2 == NULL) {
4094 Py_DECREF(self);
4095 Py_DECREF(str1);
4096 return NULL;
4097 }
4098 result = replace((PyUnicodeObject *)self,
4099 (PyUnicodeObject *)str1,
4100 (PyUnicodeObject *)str2,
4101 maxcount);
4102 Py_DECREF(self);
4103 Py_DECREF(str1);
4104 Py_DECREF(str2);
4105 return result;
4106}
4107
4108static char replace__doc__[] =
4109"S.replace (old, new[, maxsplit]) -> unicode\n\
4110\n\
4111Return a copy of S with all occurrences of substring\n\
4112old replaced by new. If the optional argument maxsplit is\n\
4113given, only the first maxsplit occurrences are replaced.";
4114
4115static PyObject*
4116unicode_replace(PyUnicodeObject *self, PyObject *args)
4117{
4118 PyUnicodeObject *str1;
4119 PyUnicodeObject *str2;
4120 int maxcount = -1;
4121 PyObject *result;
4122
4123 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4124 return NULL;
4125 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4126 if (str1 == NULL)
4127 return NULL;
4128 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4129 if (str2 == NULL)
4130 return NULL;
4131
4132 result = replace(self, str1, str2, maxcount);
4133
4134 Py_DECREF(str1);
4135 Py_DECREF(str2);
4136 return result;
4137}
4138
4139static
4140PyObject *unicode_repr(PyObject *unicode)
4141{
4142 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4143 PyUnicode_GET_SIZE(unicode),
4144 1);
4145}
4146
4147static char rfind__doc__[] =
4148"S.rfind(sub [,start [,end]]) -> int\n\
4149\n\
4150Return the highest index in S where substring sub is found,\n\
4151such that sub is contained within s[start,end]. Optional\n\
4152arguments start and end are interpreted as in slice notation.\n\
4153\n\
4154Return -1 on failure.";
4155
4156static PyObject *
4157unicode_rfind(PyUnicodeObject *self, PyObject *args)
4158{
4159 PyUnicodeObject *substring;
4160 int start = 0;
4161 int end = INT_MAX;
4162 PyObject *result;
4163
Guido van Rossumb8872e62000-05-09 14:14:27 +00004164 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4165 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004166 return NULL;
4167 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4168 (PyObject *)substring);
4169 if (substring == NULL)
4170 return NULL;
4171
4172 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4173
4174 Py_DECREF(substring);
4175 return result;
4176}
4177
4178static char rindex__doc__[] =
4179"S.rindex(sub [,start [,end]]) -> int\n\
4180\n\
4181Like S.rfind() but raise ValueError when the substring is not found.";
4182
4183static PyObject *
4184unicode_rindex(PyUnicodeObject *self, PyObject *args)
4185{
4186 int result;
4187 PyUnicodeObject *substring;
4188 int start = 0;
4189 int end = INT_MAX;
4190
Guido van Rossumb8872e62000-05-09 14:14:27 +00004191 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4192 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004193 return NULL;
4194 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4195 (PyObject *)substring);
4196 if (substring == NULL)
4197 return NULL;
4198
4199 result = findstring(self, substring, start, end, -1);
4200
4201 Py_DECREF(substring);
4202 if (result < 0) {
4203 PyErr_SetString(PyExc_ValueError, "substring not found");
4204 return NULL;
4205 }
4206 return PyInt_FromLong(result);
4207}
4208
4209static char rjust__doc__[] =
4210"S.rjust(width) -> unicode\n\
4211\n\
4212Return S right justified in a Unicode string of length width. Padding is\n\
4213done using spaces.";
4214
4215static PyObject *
4216unicode_rjust(PyUnicodeObject *self, PyObject *args)
4217{
4218 int width;
4219 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4220 return NULL;
4221
4222 if (self->length >= width) {
4223 Py_INCREF(self);
4224 return (PyObject*) self;
4225 }
4226
4227 return (PyObject*) pad(self, width - self->length, 0, ' ');
4228}
4229
4230static char rstrip__doc__[] =
4231"S.rstrip() -> unicode\n\
4232\n\
4233Return a copy of the string S with trailing whitespace removed.";
4234
4235static PyObject *
4236unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4237{
4238 if (!PyArg_NoArgs(args))
4239 return NULL;
4240 return strip(self, 0, 1);
4241}
4242
4243static PyObject*
4244unicode_slice(PyUnicodeObject *self, int start, int end)
4245{
4246 /* standard clamping */
4247 if (start < 0)
4248 start = 0;
4249 if (end < 0)
4250 end = 0;
4251 if (end > self->length)
4252 end = self->length;
4253 if (start == 0 && end == self->length) {
4254 /* full slice, return original string */
4255 Py_INCREF(self);
4256 return (PyObject*) self;
4257 }
4258 if (start > end)
4259 start = end;
4260 /* copy slice */
4261 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4262 end - start);
4263}
4264
4265PyObject *PyUnicode_Split(PyObject *s,
4266 PyObject *sep,
4267 int maxsplit)
4268{
4269 PyObject *result;
4270
4271 s = PyUnicode_FromObject(s);
4272 if (s == NULL)
4273 return NULL;
4274 if (sep != NULL) {
4275 sep = PyUnicode_FromObject(sep);
4276 if (sep == NULL) {
4277 Py_DECREF(s);
4278 return NULL;
4279 }
4280 }
4281
4282 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4283
4284 Py_DECREF(s);
4285 Py_XDECREF(sep);
4286 return result;
4287}
4288
4289static char split__doc__[] =
4290"S.split([sep [,maxsplit]]) -> list of strings\n\
4291\n\
4292Return a list of the words in S, using sep as the\n\
4293delimiter string. If maxsplit is given, at most maxsplit\n\
4294splits are done. If sep is not specified, any whitespace string\n\
4295is a separator.";
4296
4297static PyObject*
4298unicode_split(PyUnicodeObject *self, PyObject *args)
4299{
4300 PyObject *substring = Py_None;
4301 int maxcount = -1;
4302
4303 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4304 return NULL;
4305
4306 if (substring == Py_None)
4307 return split(self, NULL, maxcount);
4308 else if (PyUnicode_Check(substring))
4309 return split(self, (PyUnicodeObject *)substring, maxcount);
4310 else
4311 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4312}
4313
4314static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004315"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004316\n\
4317Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004318Line breaks are not included in the resulting list unless keepends\n\
4319is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004320
4321static PyObject*
4322unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4323{
Guido van Rossum86662912000-04-11 15:38:46 +00004324 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004325
Guido van Rossum86662912000-04-11 15:38:46 +00004326 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004327 return NULL;
4328
Guido van Rossum86662912000-04-11 15:38:46 +00004329 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330}
4331
4332static
4333PyObject *unicode_str(PyUnicodeObject *self)
4334{
Fred Drakee4315f52000-05-09 19:53:39 +00004335 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004336}
4337
4338static char strip__doc__[] =
4339"S.strip() -> unicode\n\
4340\n\
4341Return a copy of S with leading and trailing whitespace removed.";
4342
4343static PyObject *
4344unicode_strip(PyUnicodeObject *self, PyObject *args)
4345{
4346 if (!PyArg_NoArgs(args))
4347 return NULL;
4348 return strip(self, 1, 1);
4349}
4350
4351static char swapcase__doc__[] =
4352"S.swapcase() -> unicode\n\
4353\n\
4354Return a copy of S with uppercase characters converted to lowercase\n\
4355and vice versa.";
4356
4357static PyObject*
4358unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4359{
4360 if (!PyArg_NoArgs(args))
4361 return NULL;
4362 return fixup(self, fixswapcase);
4363}
4364
4365static char translate__doc__[] =
4366"S.translate(table) -> unicode\n\
4367\n\
4368Return a copy of the string S, where all characters have been mapped\n\
4369through the given translation table, which must be a mapping of\n\
4370Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4371are left untouched. Characters mapped to None are deleted.";
4372
4373static PyObject*
4374unicode_translate(PyUnicodeObject *self, PyObject *args)
4375{
4376 PyObject *table;
4377
4378 if (!PyArg_ParseTuple(args, "O:translate", &table))
4379 return NULL;
4380 return PyUnicode_TranslateCharmap(self->str,
4381 self->length,
4382 table,
4383 "ignore");
4384}
4385
4386static char upper__doc__[] =
4387"S.upper() -> unicode\n\
4388\n\
4389Return a copy of S converted to uppercase.";
4390
4391static PyObject*
4392unicode_upper(PyUnicodeObject *self, PyObject *args)
4393{
4394 if (!PyArg_NoArgs(args))
4395 return NULL;
4396 return fixup(self, fixupper);
4397}
4398
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method implementation of S.zfill(width) (currently compiled out).

   Strings that already have at least `width` characters are returned
   unchanged (new reference).  Otherwise the string is left-padded
   with '0' via pad(); if the original text began with '+' or '-', the
   sign is moved in front of the padding.

   Returns NULL with an exception set if argument parsing or pad()
   fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() can fail; u->str must not be touched in that case. */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original text. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4434
#if 0
/* Debugging helper (compiled out): reports unicode_freelist_size as a
   Python int.  Takes no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);

    return NULL;
}
#endif
4444
4445static char startswith__doc__[] =
4446"S.startswith(prefix[, start[, end]]) -> int\n\
4447\n\
4448Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4449optional start, test S beginning at that position. With optional end, stop\n\
4450comparing S at that position.";
4451
4452static PyObject *
4453unicode_startswith(PyUnicodeObject *self,
4454 PyObject *args)
4455{
4456 PyUnicodeObject *substring;
4457 int start = 0;
4458 int end = INT_MAX;
4459 PyObject *result;
4460
Guido van Rossumb8872e62000-05-09 14:14:27 +00004461 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4462 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004463 return NULL;
4464 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4465 (PyObject *)substring);
4466 if (substring == NULL)
4467 return NULL;
4468
4469 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4470
4471 Py_DECREF(substring);
4472 return result;
4473}
4474
4475
4476static char endswith__doc__[] =
4477"S.endswith(suffix[, start[, end]]) -> int\n\
4478\n\
4479Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4480optional start, test S beginning at that position. With optional end, stop\n\
4481comparing S at that position.";
4482
4483static PyObject *
4484unicode_endswith(PyUnicodeObject *self,
4485 PyObject *args)
4486{
4487 PyUnicodeObject *substring;
4488 int start = 0;
4489 int end = INT_MAX;
4490 PyObject *result;
4491
Guido van Rossumb8872e62000-05-09 14:14:27 +00004492 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4493 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494 return NULL;
4495 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4496 (PyObject *)substring);
4497 if (substring == NULL)
4498 return NULL;
4499
4500 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4501
4502 Py_DECREF(substring);
4503 return result;
4504}
4505
4506
/* Method table for Unicode objects, searched by unicode_getattr().

   Each entry is {name, C function, flag, docstring}.  For the
   functions defined in this part of the file, entries registered with
   flag 1 parse their arguments with PyArg_ParseTuple() and entries
   registered with flag 0 use PyArg_NoArgs(); the remaining entries are
   presumably consistent with that -- verify when adding one.  The
   table is terminated by a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
    /* Disabled entries; unicode_zfill is likewise compiled out where it
       is defined. */
#if 0
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    {NULL, NULL}
};
4559
4560static PyObject *
4561unicode_getattr(PyUnicodeObject *self, char *name)
4562{
4563 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4564}
4565
/* Sequence protocol slots for Unicode objects.  The two assignment
   slots are 0: item and slice assignment are not provided. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, /* sq_length */
    (binaryfunc) PyUnicode_Concat, /* sq_concat */
    (intargfunc) unicode_repeat, /* sq_repeat */
    (intargfunc) unicode_getitem, /* sq_item */
    (intintargfunc) unicode_slice, /* sq_slice */
    0, /* sq_ass_item */
    0, /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, /*sq_contains*/
};
4576
4577static int
4578unicode_buffer_getreadbuf(PyUnicodeObject *self,
4579 int index,
4580 const void **ptr)
4581{
4582 if (index != 0) {
4583 PyErr_SetString(PyExc_SystemError,
4584 "accessing non-existent unicode segment");
4585 return -1;
4586 }
4587 *ptr = (void *) self->str;
4588 return PyUnicode_GET_DATA_SIZE(self);
4589}
4590
4591static int
4592unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4593 const void **ptr)
4594{
4595 PyErr_SetString(PyExc_TypeError,
4596 "cannot use unicode as modifyable buffer");
4597 return -1;
4598}
4599
4600static int
4601unicode_buffer_getsegcount(PyUnicodeObject *self,
4602 int *lenp)
4603{
4604 if (lenp)
4605 *lenp = PyUnicode_GET_DATA_SIZE(self);
4606 return 1;
4607}
4608
4609static int
4610unicode_buffer_getcharbuf(PyUnicodeObject *self,
4611 int index,
4612 const void **ptr)
4613{
4614 PyObject *str;
4615
4616 if (index != 0) {
4617 PyErr_SetString(PyExc_SystemError,
4618 "accessing non-existent unicode segment");
4619 return -1;
4620 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004621 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622 if (str == NULL)
4623 return -1;
4624 *ptr = (void *) PyString_AS_STRING(str);
4625 return PyString_GET_SIZE(str);
4626}
4627
4628/* Helpers for PyUnicode_Format() */
4629
4630static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004631getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004632{
4633 int argidx = *p_argidx;
4634 if (argidx < arglen) {
4635 (*p_argidx)++;
4636 if (arglen < 0)
4637 return args;
4638 else
4639 return PyTuple_GetItem(args, argidx);
4640 }
4641 PyErr_SetString(PyExc_TypeError,
4642 "not enough arguments for format string");
4643 return NULL;
4644}
4645
/* Conversion flag bits collected by PyUnicode_Format() while it parses
   a format specifier; the flag character that sets each bit is noted
   alongside. */
#define F_LJUST (1<<0) /* '-' (also set for a negative '*' width) */
#define F_SIGN (1<<1) /* '+' */
#define F_BLANK (1<<2) /* ' ' */
#define F_ALT (1<<3) /* '#' */
#define F_ZERO (1<<4) /* '0' */
4651
4652static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004653int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004654{
4655 register int i;
4656 int len;
4657 va_list va;
4658 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004659 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004660
4661 /* First, format the string as char array, then expand to Py_UNICODE
4662 array. */
4663 charbuffer = (char *)buffer;
4664 len = vsprintf(charbuffer, format, va);
4665 for (i = len - 1; i >= 0; i--)
4666 buffer[i] = (Py_UNICODE) charbuffer[i];
4667
4668 va_end(va);
4669 return len;
4670}
4671
4672static int
4673formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004674 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004675 int flags,
4676 int prec,
4677 int type,
4678 PyObject *v)
4679{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004680 /* fmt = '%#.' + `prec` + `type`
4681 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004682 char fmt[20];
4683 double x;
4684
4685 x = PyFloat_AsDouble(v);
4686 if (x == -1.0 && PyErr_Occurred())
4687 return -1;
4688 if (prec < 0)
4689 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004690 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4691 type = 'g';
4692 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004693 /* worst case length calc to ensure no buffer overrun:
4694 fmt = %#.<prec>g
4695 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4696 for any double rep.)
4697 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4698 If prec=0 the effective precision is 1 (the leading digit is
4699 always given), therefore increase by one to 10+prec. */
4700 if (buflen <= (size_t)10 + (size_t)prec) {
4701 PyErr_SetString(PyExc_OverflowError,
4702 "formatted float is too long (precision too long?)");
4703 return -1;
4704 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705 return usprintf(buf, fmt, x);
4706}
4707
Tim Peters38fd5b62000-09-21 05:43:11 +00004708static PyObject*
4709formatlong(PyObject *val, int flags, int prec, int type)
4710{
4711 char *buf;
4712 int i, len;
4713 PyObject *str; /* temporary string object. */
4714 PyUnicodeObject *result;
4715
4716 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4717 if (!str)
4718 return NULL;
4719 result = _PyUnicode_New(len);
4720 for (i = 0; i < len; i++)
4721 result->str[i] = buf[i];
4722 result->str[len] = 0;
4723 Py_DECREF(str);
4724 return (PyObject*)result;
4725}
4726
Guido van Rossumd57fd912000-03-10 22:53:23 +00004727static int
4728formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004729 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730 int flags,
4731 int prec,
4732 int type,
4733 PyObject *v)
4734{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004735 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004736 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4737 + 1 + 1 = 24*/
4738 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739 long x;
4740
4741 x = PyInt_AsLong(v);
4742 if (x == -1 && PyErr_Occurred())
4743 return -1;
4744 if (prec < 0)
4745 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004746 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4747 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4748 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4749 PyErr_SetString(PyExc_OverflowError,
4750 "formatted integer is too long (precision too long?)");
4751 return -1;
4752 }
Tim Petersfff53252001-04-12 18:38:48 +00004753 /* When converting 0 under %#x or %#X, C leaves off the base marker,
4754 * but we want it (for consistency with other %#x conversions, and
4755 * for consistency with Python's hex() function).
4756 */
4757 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X'))
4758 sprintf(fmt, "0%c%%%s.%dl%c", type, "#", prec, type);
4759 else
4760 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004761 return usprintf(buf, fmt, x);
4762}
4763
4764static int
4765formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004766 size_t buflen,
4767 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004768{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004769 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004770 if (PyUnicode_Check(v)) {
4771 if (PyUnicode_GET_SIZE(v) != 1)
4772 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004773 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004774 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004776 else if (PyString_Check(v)) {
4777 if (PyString_GET_SIZE(v) != 1)
4778 goto onError;
4779 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4780 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781
4782 else {
4783 /* Integer input truncated to a character */
4784 long x;
4785 x = PyInt_AsLong(v);
4786 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004787 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004788 buf[0] = (char) x;
4789 }
4790 buf[1] = '\0';
4791 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004792
4793 onError:
4794 PyErr_SetString(PyExc_TypeError,
4795 "%c requires int or char");
4796 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004797}
4798
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The length is a number of Py_UNICODE elements, not bytes:
   PyUnicode_Format() declares the buffer as
   Py_UNICODE formatbuf[FORMATBUFLEN].
*/
#define FORMATBUFLEN (size_t)120
4808
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809PyObject *PyUnicode_Format(PyObject *format,
4810 PyObject *args)
4811{
4812 Py_UNICODE *fmt, *res;
4813 int fmtcnt, rescnt, reslen, arglen, argidx;
4814 int args_owned = 0;
4815 PyUnicodeObject *result = NULL;
4816 PyObject *dict = NULL;
4817 PyObject *uformat;
4818
4819 if (format == NULL || args == NULL) {
4820 PyErr_BadInternalCall();
4821 return NULL;
4822 }
4823 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004824 if (uformat == NULL)
4825 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004826 fmt = PyUnicode_AS_UNICODE(uformat);
4827 fmtcnt = PyUnicode_GET_SIZE(uformat);
4828
4829 reslen = rescnt = fmtcnt + 100;
4830 result = _PyUnicode_New(reslen);
4831 if (result == NULL)
4832 goto onError;
4833 res = PyUnicode_AS_UNICODE(result);
4834
4835 if (PyTuple_Check(args)) {
4836 arglen = PyTuple_Size(args);
4837 argidx = 0;
4838 }
4839 else {
4840 arglen = -1;
4841 argidx = -2;
4842 }
4843 if (args->ob_type->tp_as_mapping)
4844 dict = args;
4845
4846 while (--fmtcnt >= 0) {
4847 if (*fmt != '%') {
4848 if (--rescnt < 0) {
4849 rescnt = fmtcnt + 100;
4850 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004851 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852 return NULL;
4853 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4854 --rescnt;
4855 }
4856 *res++ = *fmt++;
4857 }
4858 else {
4859 /* Got a format specifier */
4860 int flags = 0;
4861 int width = -1;
4862 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004863 Py_UNICODE c = '\0';
4864 Py_UNICODE fill;
4865 PyObject *v = NULL;
4866 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004867 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004868 Py_UNICODE sign;
4869 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004870 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004871
4872 fmt++;
4873 if (*fmt == '(') {
4874 Py_UNICODE *keystart;
4875 int keylen;
4876 PyObject *key;
4877 int pcount = 1;
4878
4879 if (dict == NULL) {
4880 PyErr_SetString(PyExc_TypeError,
4881 "format requires a mapping");
4882 goto onError;
4883 }
4884 ++fmt;
4885 --fmtcnt;
4886 keystart = fmt;
4887 /* Skip over balanced parentheses */
4888 while (pcount > 0 && --fmtcnt >= 0) {
4889 if (*fmt == ')')
4890 --pcount;
4891 else if (*fmt == '(')
4892 ++pcount;
4893 fmt++;
4894 }
4895 keylen = fmt - keystart - 1;
4896 if (fmtcnt < 0 || pcount > 0) {
4897 PyErr_SetString(PyExc_ValueError,
4898 "incomplete format key");
4899 goto onError;
4900 }
Fred Drakee4315f52000-05-09 19:53:39 +00004901 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004902 then looked up since Python uses strings to hold
4903 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004904 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004905 key = PyUnicode_EncodeUTF8(keystart,
4906 keylen,
4907 NULL);
4908 if (key == NULL)
4909 goto onError;
4910 if (args_owned) {
4911 Py_DECREF(args);
4912 args_owned = 0;
4913 }
4914 args = PyObject_GetItem(dict, key);
4915 Py_DECREF(key);
4916 if (args == NULL) {
4917 goto onError;
4918 }
4919 args_owned = 1;
4920 arglen = -1;
4921 argidx = -2;
4922 }
4923 while (--fmtcnt >= 0) {
4924 switch (c = *fmt++) {
4925 case '-': flags |= F_LJUST; continue;
4926 case '+': flags |= F_SIGN; continue;
4927 case ' ': flags |= F_BLANK; continue;
4928 case '#': flags |= F_ALT; continue;
4929 case '0': flags |= F_ZERO; continue;
4930 }
4931 break;
4932 }
4933 if (c == '*') {
4934 v = getnextarg(args, arglen, &argidx);
4935 if (v == NULL)
4936 goto onError;
4937 if (!PyInt_Check(v)) {
4938 PyErr_SetString(PyExc_TypeError,
4939 "* wants int");
4940 goto onError;
4941 }
4942 width = PyInt_AsLong(v);
4943 if (width < 0) {
4944 flags |= F_LJUST;
4945 width = -width;
4946 }
4947 if (--fmtcnt >= 0)
4948 c = *fmt++;
4949 }
4950 else if (c >= '0' && c <= '9') {
4951 width = c - '0';
4952 while (--fmtcnt >= 0) {
4953 c = *fmt++;
4954 if (c < '0' || c > '9')
4955 break;
4956 if ((width*10) / 10 != width) {
4957 PyErr_SetString(PyExc_ValueError,
4958 "width too big");
4959 goto onError;
4960 }
4961 width = width*10 + (c - '0');
4962 }
4963 }
4964 if (c == '.') {
4965 prec = 0;
4966 if (--fmtcnt >= 0)
4967 c = *fmt++;
4968 if (c == '*') {
4969 v = getnextarg(args, arglen, &argidx);
4970 if (v == NULL)
4971 goto onError;
4972 if (!PyInt_Check(v)) {
4973 PyErr_SetString(PyExc_TypeError,
4974 "* wants int");
4975 goto onError;
4976 }
4977 prec = PyInt_AsLong(v);
4978 if (prec < 0)
4979 prec = 0;
4980 if (--fmtcnt >= 0)
4981 c = *fmt++;
4982 }
4983 else if (c >= '0' && c <= '9') {
4984 prec = c - '0';
4985 while (--fmtcnt >= 0) {
4986 c = Py_CHARMASK(*fmt++);
4987 if (c < '0' || c > '9')
4988 break;
4989 if ((prec*10) / 10 != prec) {
4990 PyErr_SetString(PyExc_ValueError,
4991 "prec too big");
4992 goto onError;
4993 }
4994 prec = prec*10 + (c - '0');
4995 }
4996 }
4997 } /* prec */
4998 if (fmtcnt >= 0) {
4999 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000 if (--fmtcnt >= 0)
5001 c = *fmt++;
5002 }
5003 }
5004 if (fmtcnt < 0) {
5005 PyErr_SetString(PyExc_ValueError,
5006 "incomplete format");
5007 goto onError;
5008 }
5009 if (c != '%') {
5010 v = getnextarg(args, arglen, &argidx);
5011 if (v == NULL)
5012 goto onError;
5013 }
5014 sign = 0;
5015 fill = ' ';
5016 switch (c) {
5017
5018 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005019 pbuf = formatbuf;
5020 /* presume that buffer length is at least 1 */
5021 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005022 len = 1;
5023 break;
5024
5025 case 's':
5026 case 'r':
5027 if (PyUnicode_Check(v) && c == 's') {
5028 temp = v;
5029 Py_INCREF(temp);
5030 }
5031 else {
5032 PyObject *unicode;
5033 if (c == 's')
5034 temp = PyObject_Str(v);
5035 else
5036 temp = PyObject_Repr(v);
5037 if (temp == NULL)
5038 goto onError;
5039 if (!PyString_Check(temp)) {
5040 /* XXX Note: this should never happen, since
5041 PyObject_Repr() and PyObject_Str() assure
5042 this */
5043 Py_DECREF(temp);
5044 PyErr_SetString(PyExc_TypeError,
5045 "%s argument has non-string str()");
5046 goto onError;
5047 }
Fred Drakee4315f52000-05-09 19:53:39 +00005048 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005049 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005050 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005051 "strict");
5052 Py_DECREF(temp);
5053 temp = unicode;
5054 if (temp == NULL)
5055 goto onError;
5056 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005057 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005058 len = PyUnicode_GET_SIZE(temp);
5059 if (prec >= 0 && len > prec)
5060 len = prec;
5061 break;
5062
5063 case 'i':
5064 case 'd':
5065 case 'u':
5066 case 'o':
5067 case 'x':
5068 case 'X':
5069 if (c == 'i')
5070 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005071 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005072 temp = formatlong(v, flags, prec, c);
5073 if (!temp)
5074 goto onError;
5075 pbuf = PyUnicode_AS_UNICODE(temp);
5076 len = PyUnicode_GET_SIZE(temp);
5077 /* unbounded ints can always produce
5078 a sign character! */
5079 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005080 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005081 else {
5082 pbuf = formatbuf;
5083 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5084 flags, prec, c, v);
5085 if (len < 0)
5086 goto onError;
5087 /* only d conversion is signed */
5088 sign = c == 'd';
5089 }
5090 if (flags & F_ZERO)
5091 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005092 break;
5093
5094 case 'e':
5095 case 'E':
5096 case 'f':
5097 case 'g':
5098 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005099 pbuf = formatbuf;
5100 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5101 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005102 if (len < 0)
5103 goto onError;
5104 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005105 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005106 fill = '0';
5107 break;
5108
5109 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005110 pbuf = formatbuf;
5111 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112 if (len < 0)
5113 goto onError;
5114 break;
5115
5116 default:
5117 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005118 "unsupported format character '%c' (0x%x) "
5119 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005120 (31<=c && c<=126) ? c : '?',
5121 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122 goto onError;
5123 }
5124 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005125 if (*pbuf == '-' || *pbuf == '+') {
5126 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127 len--;
5128 }
5129 else if (flags & F_SIGN)
5130 sign = '+';
5131 else if (flags & F_BLANK)
5132 sign = ' ';
5133 else
5134 sign = 0;
5135 }
5136 if (width < len)
5137 width = len;
5138 if (rescnt < width + (sign != 0)) {
5139 reslen -= rescnt;
5140 rescnt = width + fmtcnt + 100;
5141 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005142 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005143 return NULL;
5144 res = PyUnicode_AS_UNICODE(result)
5145 + reslen - rescnt;
5146 }
5147 if (sign) {
5148 if (fill != ' ')
5149 *res++ = sign;
5150 rescnt--;
5151 if (width > len)
5152 width--;
5153 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005154 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5155 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005156 assert(pbuf[1] == c);
5157 if (fill != ' ') {
5158 *res++ = *pbuf++;
5159 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005160 }
Tim Petersfff53252001-04-12 18:38:48 +00005161 rescnt -= 2;
5162 width -= 2;
5163 if (width < 0)
5164 width = 0;
5165 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005166 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005167 if (width > len && !(flags & F_LJUST)) {
5168 do {
5169 --rescnt;
5170 *res++ = fill;
5171 } while (--width > len);
5172 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005173 if (fill == ' ') {
5174 if (sign)
5175 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005176 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005177 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005178 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005179 *res++ = *pbuf++;
5180 *res++ = *pbuf++;
5181 }
5182 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005183 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184 res += len;
5185 rescnt -= len;
5186 while (--width >= len) {
5187 --rescnt;
5188 *res++ = ' ';
5189 }
5190 if (dict && (argidx < arglen) && c != '%') {
5191 PyErr_SetString(PyExc_TypeError,
5192 "not all arguments converted");
5193 goto onError;
5194 }
5195 Py_XDECREF(temp);
5196 } /* '%' */
5197 } /* until end */
5198 if (argidx < arglen && !dict) {
5199 PyErr_SetString(PyExc_TypeError,
5200 "not all arguments converted");
5201 goto onError;
5202 }
5203
5204 if (args_owned) {
5205 Py_DECREF(args);
5206 }
5207 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005208 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005209 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210 return (PyObject *)result;
5211
5212 onError:
5213 Py_XDECREF(result);
5214 Py_DECREF(uformat);
5215 if (args_owned) {
5216 Py_DECREF(args);
5217 }
5218 return NULL;
5219}
5220
5221static PyBufferProcs unicode_as_buffer = {
5222 (getreadbufferproc) unicode_buffer_getreadbuf,
5223 (getwritebufferproc) unicode_buffer_getwritebuf,
5224 (getsegcountproc) unicode_buffer_getsegcount,
5225 (getcharbufferproc) unicode_buffer_getcharbuf,
5226};
5227
5228PyTypeObject PyUnicode_Type = {
5229 PyObject_HEAD_INIT(&PyType_Type)
5230 0, /* ob_size */
5231 "unicode", /* tp_name */
5232 sizeof(PyUnicodeObject), /* tp_size */
5233 0, /* tp_itemsize */
5234 /* Slots */
5235 (destructor)_PyUnicode_Free, /* tp_dealloc */
5236 0, /* tp_print */
5237 (getattrfunc)unicode_getattr, /* tp_getattr */
5238 0, /* tp_setattr */
5239 (cmpfunc) unicode_compare, /* tp_compare */
5240 (reprfunc) unicode_repr, /* tp_repr */
5241 0, /* tp_as_number */
5242 &unicode_as_sequence, /* tp_as_sequence */
5243 0, /* tp_as_mapping */
5244 (hashfunc) unicode_hash, /* tp_hash*/
5245 0, /* tp_call*/
5246 (reprfunc) unicode_str, /* tp_str */
5247 (getattrofunc) NULL, /* tp_getattro */
5248 (setattrofunc) NULL, /* tp_setattro */
5249 &unicode_as_buffer, /* tp_as_buffer */
5250 Py_TPFLAGS_DEFAULT, /* tp_flags */
5251};
5252
5253/* Initialize the Unicode implementation */
5254
Thomas Wouters78890102000-07-22 19:25:51 +00005255void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005256{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005257 int i;
5258
Guido van Rossumd57fd912000-03-10 22:53:23 +00005259 /* Doublecheck the configuration... */
5260 if (sizeof(Py_UNICODE) != 2)
5261 Py_FatalError("Unicode configuration error: "
5262 "sizeof(Py_UNICODE) != 2 bytes");
5263
Fred Drakee4315f52000-05-09 19:53:39 +00005264 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005265 unicode_freelist = NULL;
5266 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005267 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005268 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005269 for (i = 0; i < 256; i++)
5270 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005271}
5272
5273/* Finalize the Unicode implementation */
5274
5275void
Thomas Wouters78890102000-07-22 19:25:51 +00005276_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005277{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005278 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005279 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005280
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005281 Py_XDECREF(unicode_empty);
5282 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005283
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005284 for (i = 0; i < 256; i++) {
5285 if (unicode_latin1[i]) {
5286 Py_DECREF(unicode_latin1[i]);
5287 unicode_latin1[i] = NULL;
5288 }
5289 }
5290
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005291 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 PyUnicodeObject *v = u;
5293 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005294 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005295 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005296 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005297 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005298 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005299 unicode_freelist = NULL;
5300 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005301}