blob: c62f65bcd0e27a6585cd6caad7dd626f4458e708 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
Guido van Rossumd57fd912000-03-10 22:53:23 +000086/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_freelist;
88static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +000090/* The empty Unicode object is shared to improve performance. */
91static PyUnicodeObject *unicode_empty;
92
93/* Single character Unicode strings in the Latin-1 range are being
94 shared as well. */
95static PyUnicodeObject *unicode_latin1[256];
96
Fred Drakee4315f52000-05-09 19:53:39 +000097/* Default encoding to use and assume when NULL is passed as encoding
98 parameter; it is initialized by _PyUnicode_Init().
99
100 Always use the PyUnicode_SetDefaultEncoding() and
101 PyUnicode_GetDefaultEncoding() APIs to access this global.
102
103*/
Fred Drakee4315f52000-05-09 19:53:39 +0000104static char unicode_default_encoding[100];
105
Guido van Rossumd57fd912000-03-10 22:53:23 +0000106/* --- Unicode Object ----------------------------------------------------- */
107
108static
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000109int unicode_resize(register PyUnicodeObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000110 int length)
111{
112 void *oldstr;
113
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000114 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000115 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000116 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000117
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000118 /* Resizing shared object (unicode_empty or single character
119 objects) in-place is not allowed. Use PyUnicode_Resize()
120 instead ! */
121 if (unicode == unicode_empty ||
122 (unicode->length == 1 &&
123 unicode->str[0] < 256 &&
124 unicode_latin1[unicode->str[0]] == unicode)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000125 PyErr_SetString(PyExc_SystemError,
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000126 "can't resize shared unicode objects");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000127 return -1;
128 }
129
130 /* We allocate one more byte to make sure the string is
131 Ux0000 terminated -- XXX is this needed ? */
132 oldstr = unicode->str;
133 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
134 if (!unicode->str) {
135 unicode->str = oldstr;
136 PyErr_NoMemory();
137 return -1;
138 }
139 unicode->str[length] = 0;
140 unicode->length = length;
141
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000142 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000143 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000144 if (unicode->defenc) {
145 Py_DECREF(unicode->defenc);
146 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000147 }
148 unicode->hash = -1;
149
150 return 0;
151}
152
153/* We allocate one more byte to make sure the string is
154 Ux0000 terminated -- XXX is this needed ?
155
156 XXX This allocator could further be enhanced by assuring that the
157 free list never reduces its size below 1.
158
159*/
160
161static
162PyUnicodeObject *_PyUnicode_New(int length)
163{
164 register PyUnicodeObject *unicode;
165
166 /* Optimization for empty strings */
167 if (length == 0 && unicode_empty != NULL) {
168 Py_INCREF(unicode_empty);
169 return unicode_empty;
170 }
171
172 /* Unicode freelist & memory allocation */
173 if (unicode_freelist) {
174 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000175 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000176 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000177 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000178 /* Keep-Alive optimization: we only upsize the buffer,
179 never downsize it. */
180 if ((unicode->length < length) &&
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000181 unicode_resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000182 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000183 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 }
185 }
Guido van Rossumad98db12001-06-14 17:52:02 +0000186 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Guido van Rossumad98db12001-06-14 17:52:02 +0000188 }
189 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 }
191 else {
192 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
193 if (unicode == NULL)
194 return NULL;
195 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
196 }
197
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000198 if (!unicode->str) {
199 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000200 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000201 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000202 unicode->str[length] = 0;
203 unicode->length = length;
204 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000205 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000206 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000207
208 onError:
209 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000210 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212}
213
214static
215void _PyUnicode_Free(register PyUnicodeObject *unicode)
216{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000218 /* Keep-Alive optimization */
219 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000220 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221 unicode->str = NULL;
222 unicode->length = 0;
223 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000224 if (unicode->defenc) {
225 Py_DECREF(unicode->defenc);
226 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000227 }
228 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000229 *(PyUnicodeObject **)unicode = unicode_freelist;
230 unicode_freelist = unicode;
231 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000232 }
233 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000234 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000235 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000236 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 }
238}
239
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000240int PyUnicode_Resize(PyObject **unicode,
241 int length)
242{
243 register PyUnicodeObject *v;
244
245 /* Argument checks */
246 if (unicode == NULL) {
247 PyErr_BadInternalCall();
248 return -1;
249 }
250 v = (PyUnicodeObject *)*unicode;
251 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
252 PyErr_BadInternalCall();
253 return -1;
254 }
255
256 /* Resizing unicode_empty and single character objects is not
257 possible since these are being shared. We simply return a fresh
258 copy with the same Unicode content. */
259 if (v->length != length &&
260 (v == unicode_empty || v->length == 1)) {
261 PyUnicodeObject *w = _PyUnicode_New(length);
262 if (w == NULL)
263 return -1;
264 Py_UNICODE_COPY(w->str, v->str,
265 length < v->length ? length : v->length);
266 *unicode = (PyObject *)w;
267 return 0;
268 }
269
270 /* Note that we don't have to modify *unicode for unshared Unicode
271 objects, since we can modify them in-place. */
272 return unicode_resize(v, length);
273}
274
275/* Internal API for use in unicodeobject.c only ! */
276#define _PyUnicode_Resize(unicodevar, length) \
277 PyUnicode_Resize(((PyObject **)(unicodevar)), length)
278
Guido van Rossumd57fd912000-03-10 22:53:23 +0000279PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
280 int size)
281{
282 PyUnicodeObject *unicode;
283
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000284 /* If the Unicode data is known at construction time, we can apply
285 some optimizations which share commonly used objects. */
286 if (u != NULL) {
287
288 /* Optimization for empty strings */
289 if (size == 0 && unicode_empty != NULL) {
290 Py_INCREF(unicode_empty);
291 return (PyObject *)unicode_empty;
292 }
293
294 /* Single character Unicode objects in the Latin-1 range are
295 shared when using this constructor */
296 if (size == 1 && *u < 256) {
297 unicode = unicode_latin1[*u];
298 if (!unicode) {
299 unicode = _PyUnicode_New(1);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000300 if (!unicode)
301 return NULL;
Marc-André Lemburg8879a332001-06-07 12:26:56 +0000302 unicode->str[0] = *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000303 unicode_latin1[*u] = unicode;
304 }
305 Py_INCREF(unicode);
306 return (PyObject *)unicode;
307 }
308 }
309
Guido van Rossumd57fd912000-03-10 22:53:23 +0000310 unicode = _PyUnicode_New(size);
311 if (!unicode)
312 return NULL;
313
314 /* Copy the Unicode data into the new object */
315 if (u != NULL)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000316 Py_UNICODE_COPY(unicode->str, u, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000317
318 return (PyObject *)unicode;
319}
320
321#ifdef HAVE_WCHAR_H
322
323PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
324 int size)
325{
326 PyUnicodeObject *unicode;
327
328 if (w == NULL) {
329 PyErr_BadInternalCall();
330 return NULL;
331 }
332
333 unicode = _PyUnicode_New(size);
334 if (!unicode)
335 return NULL;
336
337 /* Copy the wchar_t data into the new object */
338#ifdef HAVE_USABLE_WCHAR_T
339 memcpy(unicode->str, w, size * sizeof(wchar_t));
340#else
341 {
342 register Py_UNICODE *u;
343 register int i;
344 u = PyUnicode_AS_UNICODE(unicode);
345 for (i = size; i >= 0; i--)
346 *u++ = *w++;
347 }
348#endif
349
350 return (PyObject *)unicode;
351}
352
353int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
354 register wchar_t *w,
355 int size)
356{
357 if (unicode == NULL) {
358 PyErr_BadInternalCall();
359 return -1;
360 }
361 if (size > PyUnicode_GET_SIZE(unicode))
362 size = PyUnicode_GET_SIZE(unicode);
363#ifdef HAVE_USABLE_WCHAR_T
364 memcpy(w, unicode->str, size * sizeof(wchar_t));
365#else
366 {
367 register Py_UNICODE *u;
368 register int i;
369 u = PyUnicode_AS_UNICODE(unicode);
370 for (i = size; i >= 0; i--)
371 *w++ = *u++;
372 }
373#endif
374
375 return size;
376}
377
378#endif
379
380PyObject *PyUnicode_FromObject(register PyObject *obj)
381{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000382 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
383}
384
385PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
386 const char *encoding,
387 const char *errors)
388{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000389 const char *s;
390 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000391 int owned = 0;
392 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393
394 if (obj == NULL) {
395 PyErr_BadInternalCall();
396 return NULL;
397 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000398
399 /* Coerce object */
400 if (PyInstance_Check(obj)) {
401 PyObject *func;
402 func = PyObject_GetAttrString(obj, "__str__");
403 if (func == NULL) {
404 PyErr_SetString(PyExc_TypeError,
405 "coercing to Unicode: instance doesn't define __str__");
406 return NULL;
407 }
408 obj = PyEval_CallObject(func, NULL);
409 Py_DECREF(func);
410 if (obj == NULL)
411 return NULL;
412 owned = 1;
413 }
414 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000416 v = obj;
417 if (encoding) {
418 PyErr_SetString(PyExc_TypeError,
419 "decoding Unicode is not supported");
420 return NULL;
421 }
422 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000423 }
424 else if (PyString_Check(obj)) {
425 s = PyString_AS_STRING(obj);
426 len = PyString_GET_SIZE(obj);
427 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000428 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
429 /* Overwrite the error message with something more useful in
430 case of a TypeError. */
431 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000432 PyErr_Format(PyExc_TypeError,
433 "coercing to Unicode: need string or buffer, "
434 "%.80s found",
435 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000436 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000437 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000438
439 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000440 if (len == 0) {
441 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000442 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000443 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000444 else
445 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000446
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000447 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000448 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000449 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000450 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000451 return v;
452
453 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000454 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000455 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000456 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000457 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000458}
459
460PyObject *PyUnicode_Decode(const char *s,
461 int size,
462 const char *encoding,
463 const char *errors)
464{
465 PyObject *buffer = NULL, *unicode;
466
Fred Drakee4315f52000-05-09 19:53:39 +0000467 if (encoding == NULL)
468 encoding = PyUnicode_GetDefaultEncoding();
469
470 /* Shortcuts for common default encodings */
471 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000472 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000473 else if (strcmp(encoding, "latin-1") == 0)
474 return PyUnicode_DecodeLatin1(s, size, errors);
475 else if (strcmp(encoding, "ascii") == 0)
476 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000477
478 /* Decode via the codec registry */
479 buffer = PyBuffer_FromMemory((void *)s, size);
480 if (buffer == NULL)
481 goto onError;
482 unicode = PyCodec_Decode(buffer, encoding, errors);
483 if (unicode == NULL)
484 goto onError;
485 if (!PyUnicode_Check(unicode)) {
486 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000487 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000488 unicode->ob_type->tp_name);
489 Py_DECREF(unicode);
490 goto onError;
491 }
492 Py_DECREF(buffer);
493 return unicode;
494
495 onError:
496 Py_XDECREF(buffer);
497 return NULL;
498}
499
500PyObject *PyUnicode_Encode(const Py_UNICODE *s,
501 int size,
502 const char *encoding,
503 const char *errors)
504{
505 PyObject *v, *unicode;
506
507 unicode = PyUnicode_FromUnicode(s, size);
508 if (unicode == NULL)
509 return NULL;
510 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
511 Py_DECREF(unicode);
512 return v;
513}
514
515PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
516 const char *encoding,
517 const char *errors)
518{
519 PyObject *v;
520
521 if (!PyUnicode_Check(unicode)) {
522 PyErr_BadArgument();
523 goto onError;
524 }
Fred Drakee4315f52000-05-09 19:53:39 +0000525
526 if (encoding == NULL)
527 encoding = PyUnicode_GetDefaultEncoding();
528
529 /* Shortcuts for common default encodings */
530 if (errors == NULL) {
531 if (strcmp(encoding, "utf-8") == 0)
Jeremy Hylton9cea41c2001-05-29 17:13:15 +0000532 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000533 else if (strcmp(encoding, "latin-1") == 0)
534 return PyUnicode_AsLatin1String(unicode);
535 else if (strcmp(encoding, "ascii") == 0)
536 return PyUnicode_AsASCIIString(unicode);
537 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000538
539 /* Encode via the codec registry */
540 v = PyCodec_Encode(unicode, encoding, errors);
541 if (v == NULL)
542 goto onError;
543 /* XXX Should we really enforce this ? */
544 if (!PyString_Check(v)) {
545 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000546 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000547 v->ob_type->tp_name);
548 Py_DECREF(v);
549 goto onError;
550 }
551 return v;
552
553 onError:
554 return NULL;
555}
556
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000557/* Return a Python string holding the default encoded value of the
558 Unicode object.
559
560 The resulting string is cached in the Unicode object for subsequent
561 usage by this function. The cached version is needed to implement
562 the character buffer interface and will live (at least) as long as
563 the Unicode object itself.
564
565 The refcount of the string is *not* incremented.
566
567 *** Exported for internal use by the interpreter only !!! ***
568
569*/
570
571PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
572 const char *errors)
573{
574 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
575
576 if (v)
577 return v;
578 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
579 if (v && errors == NULL)
580 ((PyUnicodeObject *)unicode)->defenc = v;
581 return v;
582}
583
Guido van Rossumd57fd912000-03-10 22:53:23 +0000584Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
585{
586 if (!PyUnicode_Check(unicode)) {
587 PyErr_BadArgument();
588 goto onError;
589 }
590 return PyUnicode_AS_UNICODE(unicode);
591
592 onError:
593 return NULL;
594}
595
596int PyUnicode_GetSize(PyObject *unicode)
597{
598 if (!PyUnicode_Check(unicode)) {
599 PyErr_BadArgument();
600 goto onError;
601 }
602 return PyUnicode_GET_SIZE(unicode);
603
604 onError:
605 return -1;
606}
607
Thomas Wouters78890102000-07-22 19:25:51 +0000608const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000609{
610 return unicode_default_encoding;
611}
612
613int PyUnicode_SetDefaultEncoding(const char *encoding)
614{
615 PyObject *v;
616
617 /* Make sure the encoding is valid. As side effect, this also
618 loads the encoding into the codec registry cache. */
619 v = _PyCodec_Lookup(encoding);
620 if (v == NULL)
621 goto onError;
622 Py_DECREF(v);
623 strncpy(unicode_default_encoding,
624 encoding,
625 sizeof(unicode_default_encoding));
626 return 0;
627
628 onError:
629 return -1;
630}
631
Guido van Rossumd57fd912000-03-10 22:53:23 +0000632/* --- UTF-8 Codec -------------------------------------------------------- */
633
/* Length of a UTF-8 sequence, indexed by its first byte. A zero
   entry marks a byte that is not a legal first byte. See RFC 2279
   for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as first byte */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFD: four, five and six byte sequences;
       0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0
};
655
656static
657int utf8_decoding_error(const char **source,
658 Py_UNICODE **dest,
659 const char *errors,
660 const char *details)
661{
662 if ((errors == NULL) ||
663 (strcmp(errors,"strict") == 0)) {
664 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000665 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000666 details);
667 return -1;
668 }
669 else if (strcmp(errors,"ignore") == 0) {
670 (*source)++;
671 return 0;
672 }
673 else if (strcmp(errors,"replace") == 0) {
674 (*source)++;
675 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
676 (*dest)++;
677 return 0;
678 }
679 else {
680 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000681 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000682 errors);
683 return -1;
684 }
685}
686
Guido van Rossumd57fd912000-03-10 22:53:23 +0000687PyObject *PyUnicode_DecodeUTF8(const char *s,
688 int size,
689 const char *errors)
690{
691 int n;
692 const char *e;
693 PyUnicodeObject *unicode;
694 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000695 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000696
697 /* Note: size will always be longer than the resulting Unicode
698 character count */
699 unicode = _PyUnicode_New(size);
700 if (!unicode)
701 return NULL;
702 if (size == 0)
703 return (PyObject *)unicode;
704
705 /* Unpack UTF-8 encoded data */
706 p = unicode->str;
707 e = s + size;
708
709 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000710 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000711
712 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000713 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000714 s++;
715 continue;
716 }
717
718 n = utf8_code_length[ch];
719
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000720 if (s + n > e) {
721 errmsg = "unexpected end of data";
722 goto utf8Error;
723 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000724
725 switch (n) {
726
727 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000728 errmsg = "unexpected code byte";
729 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000730
731 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000732 errmsg = "internal error";
733 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000734
735 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000736 if ((s[1] & 0xc0) != 0x80) {
737 errmsg = "invalid data";
738 goto utf8Error;
739 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000740 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000741 if (ch < 0x80) {
742 errmsg = "illegal encoding";
743 goto utf8Error;
744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000745 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000746 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000747 break;
748
749 case 3:
750 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000751 (s[2] & 0xc0) != 0x80) {
752 errmsg = "invalid data";
753 goto utf8Error;
754 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000755 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000756 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
757 errmsg = "illegal encoding";
758 goto utf8Error;
759 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000760 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000761 *p++ = (Py_UNICODE)ch;
762 break;
763
764 case 4:
765 if ((s[1] & 0xc0) != 0x80 ||
766 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000767 (s[3] & 0xc0) != 0x80) {
768 errmsg = "invalid data";
769 goto utf8Error;
770 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000771 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
772 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
773 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000774 if ((ch < 0x10000) || /* minimum value allowed for 4
775 byte encoding */
776 (ch > 0x10ffff)) { /* maximum value allowed for
777 UTF-16 */
778 errmsg = "illegal encoding";
779 goto utf8Error;
780 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000781 /* compute and append the two surrogates: */
782
783 /* translate from 10000..10FFFF to 0..FFFF */
784 ch -= 0x10000;
785
786 /* high surrogate = top 10 bits added to D800 */
787 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
788
789 /* low surrogate = bottom 10 bits added to DC00 */
Fredrik Lundh45714e92001-06-26 16:39:36 +0000790 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000791 break;
792
793 default:
794 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000795 errmsg = "unsupported Unicode code range";
796 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000797 }
798 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000799 continue;
800
801 utf8Error:
802 if (utf8_decoding_error(&s, &p, errors, errmsg))
803 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000804 }
805
806 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000807 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +0000808 goto onError;
809
810 return (PyObject *)unicode;
811
812onError:
813 Py_DECREF(unicode);
814 return NULL;
815}
816
/* Not used anymore, now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Apply the error handling scheme named by errors to a UTF-8
   encoding problem.

   NULL / "strict":  raise UnicodeError with details as message.
   "ignore":         do nothing.
   "replace":        write '?' to the output.
   anything else:    raise ValueError.

   Returns 0 if encoding may continue, -1 if an exception was set. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors != NULL && strcmp(errors,"strict") != 0) {
        if (strcmp(errors,"ignore") == 0)
            return 0;
        if (strcmp(errors,"replace") == 0) {
            **dest = '?';
            (*dest)++;
            return 0;
        }
        PyErr_Format(PyExc_ValueError,
                     "UTF-8 encoding error; "
                     "unknown error handling code: %.400s",
                     errors);
        return -1;
    }

    /* Strict mode is the default */
    PyErr_Format(PyExc_UnicodeError,
                 "UTF-8 encoding error: %.400s",
                 details);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000850
851PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
852 int size,
853 const char *errors)
854{
855 PyObject *v;
856 char *p;
857 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000858 Py_UCS4 ch2;
859 unsigned int cbAllocated = 3 * size;
860 unsigned int cbWritten = 0;
861 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000862
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000863 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000864 if (v == NULL)
865 return NULL;
866 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000867 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000868
869 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000870 while (i < size) {
871 Py_UCS4 ch = s[i++];
872 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000873 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000874 cbWritten++;
875 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000876 else if (ch < 0x0800) {
877 *p++ = 0xc0 | (ch >> 6);
878 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000879 cbWritten += 2;
880 }
881 else {
882 /* Check for high surrogate */
883 if (0xD800 <= ch && ch <= 0xDBFF) {
884 if (i != size) {
885 ch2 = s[i];
886 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
887
888 if (cbWritten >= (cbAllocated - 4)) {
889 /* Provide enough room for some more
890 surrogates */
891 cbAllocated += 4*10;
892 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000893 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000894 }
895
896 /* combine the two values */
897 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
898
899 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000900 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000901 i++;
902 cbWritten += 4;
903 }
904 }
905 }
906 else {
907 *p++ = (char)(0xe0 | (ch >> 12));
908 cbWritten += 3;
909 }
910 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
911 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000912 }
913 }
914 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000915 if (_PyString_Resize(&v, p - q))
916 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000917 return v;
918
919 onError:
920 Py_DECREF(v);
921 return NULL;
922}
923
Guido van Rossumd57fd912000-03-10 22:53:23 +0000924PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
925{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000926 if (!PyUnicode_Check(unicode)) {
927 PyErr_BadArgument();
928 return NULL;
929 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000930 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
931 PyUnicode_GET_SIZE(unicode),
932 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000933}
934
935/* --- UTF-16 Codec ------------------------------------------------------- */
936
937static
938int utf16_decoding_error(const Py_UNICODE **source,
939 Py_UNICODE **dest,
940 const char *errors,
941 const char *details)
942{
943 if ((errors == NULL) ||
944 (strcmp(errors,"strict") == 0)) {
945 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000946 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000947 details);
948 return -1;
949 }
950 else if (strcmp(errors,"ignore") == 0) {
951 return 0;
952 }
953 else if (strcmp(errors,"replace") == 0) {
954 if (dest) {
955 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
956 (*dest)++;
957 }
958 return 0;
959 }
960 else {
961 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000962 "UTF-16 decoding error; "
963 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000964 errors);
965 return -1;
966 }
967}
968
Guido van Rossumd57fd912000-03-10 22:53:23 +0000969PyObject *PyUnicode_DecodeUTF16(const char *s,
970 int size,
971 const char *errors,
972 int *byteorder)
973{
974 PyUnicodeObject *unicode;
975 Py_UNICODE *p;
976 const Py_UNICODE *q, *e;
977 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000978 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000979
980 /* size should be an even number */
981 if (size % sizeof(Py_UNICODE) != 0) {
982 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
983 return NULL;
984 /* The remaining input chars are ignored if we fall through
985 here... */
986 }
987
988 /* Note: size will always be longer than the resulting Unicode
989 character count */
990 unicode = _PyUnicode_New(size);
991 if (!unicode)
992 return NULL;
993 if (size == 0)
994 return (PyObject *)unicode;
995
996 /* Unpack UTF-16 encoded data */
997 p = unicode->str;
998 q = (Py_UNICODE *)s;
999 e = q + (size / sizeof(Py_UNICODE));
1000
1001 if (byteorder)
1002 bo = *byteorder;
1003
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001004 /* Check for BOM marks (U+FEFF) in the input and adjust current
1005 byte order setting accordingly. In native mode, the leading BOM
1006 mark is skipped, in all other modes, it is copied to the output
1007 stream as-is (giving a ZWNBSP character). */
1008 if (bo == 0) {
1009#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1010 if (*q == 0xFEFF) {
1011 q++;
1012 bo = -1;
1013 } else if (*q == 0xFFFE) {
1014 q++;
1015 bo = 1;
1016 }
1017#else
1018 if (*q == 0xFEFF) {
1019 q++;
1020 bo = 1;
1021 } else if (*q == 0xFFFE) {
1022 q++;
1023 bo = -1;
1024 }
1025#endif
1026 }
1027
Guido van Rossumd57fd912000-03-10 22:53:23 +00001028 while (q < e) {
1029 register Py_UNICODE ch = *q++;
1030
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00001031 /* Swap input bytes if needed. (This assumes
1032 sizeof(Py_UNICODE) == 2 !) */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001033#ifdef BYTEORDER_IS_LITTLE_ENDIAN
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034 if (bo == 1)
1035 ch = (ch >> 8) | (ch << 8);
1036#else
Guido van Rossumd57fd912000-03-10 22:53:23 +00001037 if (bo == -1)
1038 ch = (ch >> 8) | (ch << 8);
1039#endif
1040 if (ch < 0xD800 || ch > 0xDFFF) {
1041 *p++ = ch;
1042 continue;
1043 }
1044
1045 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001046 if (q >= e) {
1047 errmsg = "unexpected end of data";
1048 goto utf16Error;
1049 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001050 if (0xDC00 <= *q && *q <= 0xDFFF) {
1051 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001052 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001053 /* This is valid data (a UTF-16 surrogate pair), but
1054 we are not able to store this information since our
1055 Py_UNICODE type only has 16 bits... this might
1056 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001057 errmsg = "code pairs are not supported";
1058 goto utf16Error;
1059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 else
1061 continue;
1062 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001063 errmsg = "illegal encoding";
1064 /* Fall through to report the error */
1065
1066 utf16Error:
1067 if (utf16_decoding_error(&q, &p, errors, errmsg))
1068 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001069 }
1070
1071 if (byteorder)
1072 *byteorder = bo;
1073
1074 /* Adjust length */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001075 if (_PyUnicode_Resize(&unicode, p - unicode->str))
Guido van Rossumd57fd912000-03-10 22:53:23 +00001076 goto onError;
1077
1078 return (PyObject *)unicode;
1079
1080onError:
1081 Py_DECREF(unicode);
1082 return NULL;
1083}
1084
1085#undef UTF16_ERROR
1086
1087PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1088 int size,
1089 const char *errors,
1090 int byteorder)
1091{
1092 PyObject *v;
1093 Py_UNICODE *p;
1094 char *q;
1095
1096 /* We don't create UTF-16 pairs... */
1097 v = PyString_FromStringAndSize(NULL,
1098 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1099 if (v == NULL)
1100 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101
1102 q = PyString_AS_STRING(v);
1103 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001104 if (byteorder == 0)
1105 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001106 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001107 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108 if (byteorder == 0 ||
1109#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1110 byteorder == -1
1111#else
1112 byteorder == 1
1113#endif
1114 )
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001115 Py_UNICODE_COPY(p, s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116 else
1117 while (size-- > 0) {
1118 Py_UNICODE ch = *s++;
1119 *p++ = (ch >> 8) | (ch << 8);
1120 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 return v;
1122}
1123
1124PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1125{
1126 if (!PyUnicode_Check(unicode)) {
1127 PyErr_BadArgument();
1128 return NULL;
1129 }
1130 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1131 PyUnicode_GET_SIZE(unicode),
1132 NULL,
1133 0);
1134}
1135
1136/* --- Unicode Escape Codec ----------------------------------------------- */
1137
1138static
1139int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001140 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001141 const char *errors,
1142 const char *details)
1143{
1144 if ((errors == NULL) ||
1145 (strcmp(errors,"strict") == 0)) {
1146 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001147 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148 details);
1149 return -1;
1150 }
1151 else if (strcmp(errors,"ignore") == 0) {
1152 return 0;
1153 }
1154 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001155 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001156 return 0;
1157 }
1158 else {
1159 PyErr_Format(PyExc_ValueError,
1160 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001161 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162 errors);
1163 return -1;
1164 }
1165}
1166
Fredrik Lundh06d12682001-01-24 07:59:11 +00001167static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001168
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1170 int size,
1171 const char *errors)
1172{
1173 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001174 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001175 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001176 char* message;
1177 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1178
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179 /* Escaped strings will always be longer than the resulting
1180 Unicode string, so we start with size here and then reduce the
1181 length after conversion to the true value. */
1182 v = _PyUnicode_New(size);
1183 if (v == NULL)
1184 goto onError;
1185 if (size == 0)
1186 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001187
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188 p = buf = PyUnicode_AS_UNICODE(v);
1189 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001190
Guido van Rossumd57fd912000-03-10 22:53:23 +00001191 while (s < end) {
1192 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001193 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001194 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001195
1196 /* Non-escape characters are interpreted as Unicode ordinals */
1197 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001198 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001199 continue;
1200 }
1201
1202 /* \ - Escapes */
1203 s++;
1204 switch (*s++) {
1205
1206 /* \x escapes */
1207 case '\n': break;
1208 case '\\': *p++ = '\\'; break;
1209 case '\'': *p++ = '\''; break;
1210 case '\"': *p++ = '\"'; break;
1211 case 'b': *p++ = '\b'; break;
1212 case 'f': *p++ = '\014'; break; /* FF */
1213 case 't': *p++ = '\t'; break;
1214 case 'n': *p++ = '\n'; break;
1215 case 'r': *p++ = '\r'; break;
1216 case 'v': *p++ = '\013'; break; /* VT */
1217 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1218
1219 /* \OOO (octal) escapes */
1220 case '0': case '1': case '2': case '3':
1221 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001222 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001224 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001225 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001226 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001227 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001228 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001229 break;
1230
Fredrik Lundhccc74732001-02-18 22:13:49 +00001231 /* hex escapes */
1232 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001233 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001234 digits = 2;
1235 message = "truncated \\xXX escape";
1236 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001237
Fredrik Lundhccc74732001-02-18 22:13:49 +00001238 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001239 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001240 digits = 4;
1241 message = "truncated \\uXXXX escape";
1242 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001243
Fredrik Lundhccc74732001-02-18 22:13:49 +00001244 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001245 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001246 digits = 8;
1247 message = "truncated \\UXXXXXXXX escape";
1248 hexescape:
1249 chr = 0;
1250 for (i = 0; i < digits; i++) {
1251 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001252 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001253 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001254 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001255 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001256 i++;
1257 break;
1258 }
1259 chr = (chr<<4) & ~0xF;
1260 if (c >= '0' && c <= '9')
1261 chr += c - '0';
1262 else if (c >= 'a' && c <= 'f')
1263 chr += 10 + c - 'a';
1264 else
1265 chr += 10 + c - 'A';
1266 }
1267 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001268 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001269 /* when we get here, chr is a 32-bit unicode character */
1270 if (chr <= 0xffff)
1271 /* UCS-2 character */
1272 *p++ = (Py_UNICODE) chr;
1273 else if (chr <= 0x10ffff) {
1274 /* UCS-4 character. store as two surrogate characters */
1275 chr -= 0x10000L;
1276 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
Fredrik Lundh45714e92001-06-26 16:39:36 +00001277 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
Fredrik Lundhdf846752000-09-03 11:29:49 +00001278 } else {
1279 if (unicodeescape_decoding_error(
1280 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001281 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001282 )
1283 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001284 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001285 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001286 break;
1287
1288 /* \N{name} */
1289 case 'N':
1290 message = "malformed \\N character escape";
1291 if (ucnhash_CAPI == NULL) {
1292 /* load the unicode data module */
1293 PyObject *m, *v;
1294 m = PyImport_ImportModule("unicodedata");
1295 if (m == NULL)
1296 goto ucnhashError;
1297 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1298 Py_DECREF(m);
1299 if (v == NULL)
1300 goto ucnhashError;
1301 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1302 Py_DECREF(v);
1303 if (ucnhash_CAPI == NULL)
1304 goto ucnhashError;
1305 }
1306 if (*s == '{') {
1307 const char *start = s+1;
1308 /* look for the closing brace */
1309 while (*s != '}' && s < end)
1310 s++;
1311 if (s > start && s < end && *s == '}') {
1312 /* found a name. look it up in the unicode database */
1313 message = "unknown Unicode character name";
1314 s++;
1315 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1316 goto store;
1317 }
1318 }
1319 if (unicodeescape_decoding_error(&s, &x, errors, message))
1320 goto onError;
1321 *p++ = x;
1322 break;
1323
1324 default:
1325 *p++ = '\\';
1326 *p++ = (unsigned char)s[-1];
1327 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001328 }
1329 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001330 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001331 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001332 return (PyObject *)v;
1333
Fredrik Lundhccc74732001-02-18 22:13:49 +00001334ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001335 PyErr_SetString(
1336 PyExc_UnicodeError,
1337 "\\N escapes not supported (can't load unicodedata module)"
1338 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001339 return NULL;
1340
Fredrik Lundhccc74732001-02-18 22:13:49 +00001341onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001342 Py_XDECREF(v);
1343 return NULL;
1344}
1345
1346/* Return a Unicode-Escape string version of the Unicode object.
1347
1348 If quotes is true, the string is enclosed in u"" or u'' quotes as
1349 appropriate.
1350
1351*/
1352
Barry Warsaw51ac5802000-03-20 16:36:48 +00001353static const Py_UNICODE *findchar(const Py_UNICODE *s,
1354 int size,
1355 Py_UNICODE ch);
1356
Guido van Rossumd57fd912000-03-10 22:53:23 +00001357static
1358PyObject *unicodeescape_string(const Py_UNICODE *s,
1359 int size,
1360 int quotes)
1361{
1362 PyObject *repr;
1363 char *p;
1364 char *q;
1365
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001366 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001367
1368 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1369 if (repr == NULL)
1370 return NULL;
1371
1372 p = q = PyString_AS_STRING(repr);
1373
1374 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001375 *p++ = 'u';
1376 *p++ = (findchar(s, size, '\'') &&
1377 !findchar(s, size, '"')) ? '"' : '\'';
1378 }
1379 while (size-- > 0) {
1380 Py_UNICODE ch = *s++;
1381 /* Escape quotes */
Fredrik Lundh30831632001-06-26 15:11:00 +00001382 if (quotes && (ch == (Py_UNICODE) q[1] || ch == '\\')) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001383 *p++ = '\\';
1384 *p++ = (char) ch;
1385 }
1386 /* Map 16-bit characters to '\uxxxx' */
1387 else if (ch >= 256) {
1388 *p++ = '\\';
1389 *p++ = 'u';
1390 *p++ = hexdigit[(ch >> 12) & 0xf];
1391 *p++ = hexdigit[(ch >> 8) & 0xf];
1392 *p++ = hexdigit[(ch >> 4) & 0xf];
1393 *p++ = hexdigit[ch & 15];
1394 }
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001395 /* Map special whitespace to '\t', \n', '\r' */
1396 else if (ch == '\t') {
1397 *p++ = '\\';
1398 *p++ = 't';
1399 }
1400 else if (ch == '\n') {
1401 *p++ = '\\';
1402 *p++ = 'n';
1403 }
1404 else if (ch == '\r') {
1405 *p++ = '\\';
1406 *p++ = 'r';
1407 }
1408 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001409 else if (ch < ' ' || ch >= 128) {
1410 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001411 *p++ = 'x';
1412 *p++ = hexdigit[(ch >> 4) & 0xf];
1413 *p++ = hexdigit[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001414 }
1415 /* Copy everything else as-is */
1416 else
1417 *p++ = (char) ch;
1418 }
1419 if (quotes)
1420 *p++ = q[1];
1421
1422 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001423 if (_PyString_Resize(&repr, p - q))
1424 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001425
1426 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001427
1428 onError:
1429 Py_DECREF(repr);
1430 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001431}
1432
1433PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1434 int size)
1435{
1436 return unicodeescape_string(s, size, 0);
1437}
1438
1439PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1440{
1441 if (!PyUnicode_Check(unicode)) {
1442 PyErr_BadArgument();
1443 return NULL;
1444 }
1445 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1446 PyUnicode_GET_SIZE(unicode));
1447}
1448
1449/* --- Raw Unicode Escape Codec ------------------------------------------- */
1450
1451PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1452 int size,
1453 const char *errors)
1454{
1455 PyUnicodeObject *v;
1456 Py_UNICODE *p, *buf;
1457 const char *end;
1458 const char *bs;
1459
1460 /* Escaped strings will always be longer than the resulting
1461 Unicode string, so we start with size here and then reduce the
1462 length after conversion to the true value. */
1463 v = _PyUnicode_New(size);
1464 if (v == NULL)
1465 goto onError;
1466 if (size == 0)
1467 return (PyObject *)v;
1468 p = buf = PyUnicode_AS_UNICODE(v);
1469 end = s + size;
1470 while (s < end) {
1471 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001472 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001473 int i;
1474
1475 /* Non-escape characters are interpreted as Unicode ordinals */
1476 if (*s != '\\') {
1477 *p++ = (unsigned char)*s++;
1478 continue;
1479 }
1480
1481 /* \u-escapes are only interpreted iff the number of leading
1482 backslashes if odd */
1483 bs = s;
1484 for (;s < end;) {
1485 if (*s != '\\')
1486 break;
1487 *p++ = (unsigned char)*s++;
1488 }
1489 if (((s - bs) & 1) == 0 ||
1490 s >= end ||
1491 *s != 'u') {
1492 continue;
1493 }
1494 p--;
1495 s++;
1496
1497 /* \uXXXX with 4 hex digits */
1498 for (x = 0, i = 0; i < 4; i++) {
1499 c = (unsigned char)s[i];
1500 if (!isxdigit(c)) {
1501 if (unicodeescape_decoding_error(&s, &x, errors,
1502 "truncated \\uXXXX"))
1503 goto onError;
1504 i++;
1505 break;
1506 }
1507 x = (x<<4) & ~0xF;
1508 if (c >= '0' && c <= '9')
1509 x += c - '0';
1510 else if (c >= 'a' && c <= 'f')
1511 x += 10 + c - 'a';
1512 else
1513 x += 10 + c - 'A';
1514 }
1515 s += i;
1516 *p++ = x;
1517 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001518 if (_PyUnicode_Resize(&v, (int)(p - buf)))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001519 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001520 return (PyObject *)v;
1521
1522 onError:
1523 Py_XDECREF(v);
1524 return NULL;
1525}
1526
1527PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1528 int size)
1529{
1530 PyObject *repr;
1531 char *p;
1532 char *q;
1533
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001534 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001535
1536 repr = PyString_FromStringAndSize(NULL, 6 * size);
1537 if (repr == NULL)
1538 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001539 if (size == 0)
1540 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001541
1542 p = q = PyString_AS_STRING(repr);
1543 while (size-- > 0) {
1544 Py_UNICODE ch = *s++;
1545 /* Map 16-bit characters to '\uxxxx' */
1546 if (ch >= 256) {
1547 *p++ = '\\';
1548 *p++ = 'u';
1549 *p++ = hexdigit[(ch >> 12) & 0xf];
1550 *p++ = hexdigit[(ch >> 8) & 0xf];
1551 *p++ = hexdigit[(ch >> 4) & 0xf];
1552 *p++ = hexdigit[ch & 15];
1553 }
1554 /* Copy everything else as-is */
1555 else
1556 *p++ = (char) ch;
1557 }
1558 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001559 if (_PyString_Resize(&repr, p - q))
1560 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561
1562 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001563
1564 onError:
1565 Py_DECREF(repr);
1566 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001567}
1568
1569PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1570{
1571 if (!PyUnicode_Check(unicode)) {
1572 PyErr_BadArgument();
1573 return NULL;
1574 }
1575 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1576 PyUnicode_GET_SIZE(unicode));
1577}
1578
1579/* --- Latin-1 Codec ------------------------------------------------------ */
1580
1581PyObject *PyUnicode_DecodeLatin1(const char *s,
1582 int size,
1583 const char *errors)
1584{
1585 PyUnicodeObject *v;
1586 Py_UNICODE *p;
1587
1588 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001589 if (size == 1 && *(unsigned char*)s < 256) {
1590 Py_UNICODE r = *(unsigned char*)s;
1591 return PyUnicode_FromUnicode(&r, 1);
1592 }
1593
Guido van Rossumd57fd912000-03-10 22:53:23 +00001594 v = _PyUnicode_New(size);
1595 if (v == NULL)
1596 goto onError;
1597 if (size == 0)
1598 return (PyObject *)v;
1599 p = PyUnicode_AS_UNICODE(v);
1600 while (size-- > 0)
1601 *p++ = (unsigned char)*s++;
1602 return (PyObject *)v;
1603
1604 onError:
1605 Py_XDECREF(v);
1606 return NULL;
1607}
1608
1609static
1610int latin1_encoding_error(const Py_UNICODE **source,
1611 char **dest,
1612 const char *errors,
1613 const char *details)
1614{
1615 if ((errors == NULL) ||
1616 (strcmp(errors,"strict") == 0)) {
1617 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001618 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001619 details);
1620 return -1;
1621 }
1622 else if (strcmp(errors,"ignore") == 0) {
1623 return 0;
1624 }
1625 else if (strcmp(errors,"replace") == 0) {
1626 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001627 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001628 return 0;
1629 }
1630 else {
1631 PyErr_Format(PyExc_ValueError,
1632 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001633 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001634 errors);
1635 return -1;
1636 }
1637}
1638
1639PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1640 int size,
1641 const char *errors)
1642{
1643 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001644 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001645
Guido van Rossumd57fd912000-03-10 22:53:23 +00001646 repr = PyString_FromStringAndSize(NULL, size);
1647 if (repr == NULL)
1648 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001649 if (size == 0)
1650 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001651
1652 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001653 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001654 while (size-- > 0) {
1655 Py_UNICODE ch = *p++;
1656 if (ch >= 256) {
1657 if (latin1_encoding_error(&p, &s, errors,
1658 "ordinal not in range(256)"))
1659 goto onError;
1660 }
1661 else
1662 *s++ = (char)ch;
1663 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001664 /* Resize if error handling skipped some characters */
1665 if (s - start < PyString_GET_SIZE(repr))
1666 if (_PyString_Resize(&repr, s - start))
1667 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001668 return repr;
1669
1670 onError:
1671 Py_DECREF(repr);
1672 return NULL;
1673}
1674
1675PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1676{
1677 if (!PyUnicode_Check(unicode)) {
1678 PyErr_BadArgument();
1679 return NULL;
1680 }
1681 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1682 PyUnicode_GET_SIZE(unicode),
1683 NULL);
1684}
1685
1686/* --- 7-bit ASCII Codec -------------------------------------------------- */
1687
1688static
1689int ascii_decoding_error(const char **source,
1690 Py_UNICODE **dest,
1691 const char *errors,
1692 const char *details)
1693{
1694 if ((errors == NULL) ||
1695 (strcmp(errors,"strict") == 0)) {
1696 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001697 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001698 details);
1699 return -1;
1700 }
1701 else if (strcmp(errors,"ignore") == 0) {
1702 return 0;
1703 }
1704 else if (strcmp(errors,"replace") == 0) {
1705 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1706 (*dest)++;
1707 return 0;
1708 }
1709 else {
1710 PyErr_Format(PyExc_ValueError,
1711 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001712 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001713 errors);
1714 return -1;
1715 }
1716}
1717
1718PyObject *PyUnicode_DecodeASCII(const char *s,
1719 int size,
1720 const char *errors)
1721{
1722 PyUnicodeObject *v;
1723 Py_UNICODE *p;
1724
1725 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001726 if (size == 1 && *(unsigned char*)s < 128) {
1727 Py_UNICODE r = *(unsigned char*)s;
1728 return PyUnicode_FromUnicode(&r, 1);
1729 }
1730
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731 v = _PyUnicode_New(size);
1732 if (v == NULL)
1733 goto onError;
1734 if (size == 0)
1735 return (PyObject *)v;
1736 p = PyUnicode_AS_UNICODE(v);
1737 while (size-- > 0) {
1738 register unsigned char c;
1739
1740 c = (unsigned char)*s++;
1741 if (c < 128)
1742 *p++ = c;
1743 else if (ascii_decoding_error(&s, &p, errors,
1744 "ordinal not in range(128)"))
1745 goto onError;
1746 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001747 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001748 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001749 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001750 return (PyObject *)v;
1751
1752 onError:
1753 Py_XDECREF(v);
1754 return NULL;
1755}
1756
1757static
1758int ascii_encoding_error(const Py_UNICODE **source,
1759 char **dest,
1760 const char *errors,
1761 const char *details)
1762{
1763 if ((errors == NULL) ||
1764 (strcmp(errors,"strict") == 0)) {
1765 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001766 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001767 details);
1768 return -1;
1769 }
1770 else if (strcmp(errors,"ignore") == 0) {
1771 return 0;
1772 }
1773 else if (strcmp(errors,"replace") == 0) {
1774 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001775 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001776 return 0;
1777 }
1778 else {
1779 PyErr_Format(PyExc_ValueError,
1780 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001781 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001782 errors);
1783 return -1;
1784 }
1785}
1786
1787PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1788 int size,
1789 const char *errors)
1790{
1791 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001792 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001793
Guido van Rossumd57fd912000-03-10 22:53:23 +00001794 repr = PyString_FromStringAndSize(NULL, size);
1795 if (repr == NULL)
1796 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001797 if (size == 0)
1798 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001799
1800 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001801 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001802 while (size-- > 0) {
1803 Py_UNICODE ch = *p++;
1804 if (ch >= 128) {
1805 if (ascii_encoding_error(&p, &s, errors,
1806 "ordinal not in range(128)"))
1807 goto onError;
1808 }
1809 else
1810 *s++ = (char)ch;
1811 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001812 /* Resize if error handling skipped some characters */
1813 if (s - start < PyString_GET_SIZE(repr))
1814 if (_PyString_Resize(&repr, s - start))
1815 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001816 return repr;
1817
1818 onError:
1819 Py_DECREF(repr);
1820 return NULL;
1821}
1822
1823PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1824{
1825 if (!PyUnicode_Check(unicode)) {
1826 PyErr_BadArgument();
1827 return NULL;
1828 }
1829 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1830 PyUnicode_GET_SIZE(unicode),
1831 NULL);
1832}
1833
Fredrik Lundh30831632001-06-26 15:11:00 +00001834#if defined(MS_WIN32) && defined(HAVE_USABLE_WCHAR_T)
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001835
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001836/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001837
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001838PyObject *PyUnicode_DecodeMBCS(const char *s,
1839 int size,
1840 const char *errors)
1841{
1842 PyUnicodeObject *v;
1843 Py_UNICODE *p;
1844
1845 /* First get the size of the result */
1846 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001847 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001848 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1849
1850 v = _PyUnicode_New(usize);
1851 if (v == NULL)
1852 return NULL;
1853 if (usize == 0)
1854 return (PyObject *)v;
1855 p = PyUnicode_AS_UNICODE(v);
1856 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1857 Py_DECREF(v);
1858 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1859 }
1860
1861 return (PyObject *)v;
1862}
1863
1864PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1865 int size,
1866 const char *errors)
1867{
1868 PyObject *repr;
1869 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001870 DWORD mbcssize;
1871
1872 /* If there are no characters, bail now! */
1873 if (size==0)
1874 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001875
1876 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001877 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001878 if (mbcssize==0)
1879 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1880
1881 repr = PyString_FromStringAndSize(NULL, mbcssize);
1882 if (repr == NULL)
1883 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001884 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001885 return repr;
1886
1887 /* Do the conversion */
1888 s = PyString_AS_STRING(repr);
1889 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1890 Py_DECREF(repr);
1891 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1892 }
1893 return repr;
1894}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001895
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001896#endif /* MS_WIN32 */
1897
Guido van Rossumd57fd912000-03-10 22:53:23 +00001898/* --- Character Mapping Codec -------------------------------------------- */
1899
1900static
1901int charmap_decoding_error(const char **source,
1902 Py_UNICODE **dest,
1903 const char *errors,
1904 const char *details)
1905{
1906 if ((errors == NULL) ||
1907 (strcmp(errors,"strict") == 0)) {
1908 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001909 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001910 details);
1911 return -1;
1912 }
1913 else if (strcmp(errors,"ignore") == 0) {
1914 return 0;
1915 }
1916 else if (strcmp(errors,"replace") == 0) {
1917 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1918 (*dest)++;
1919 return 0;
1920 }
1921 else {
1922 PyErr_Format(PyExc_ValueError,
1923 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001924 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001925 errors);
1926 return -1;
1927 }
1928}
1929
1930PyObject *PyUnicode_DecodeCharmap(const char *s,
1931 int size,
1932 PyObject *mapping,
1933 const char *errors)
1934{
1935 PyUnicodeObject *v;
1936 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001937 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001938
1939 /* Default to Latin-1 */
1940 if (mapping == NULL)
1941 return PyUnicode_DecodeLatin1(s, size, errors);
1942
1943 v = _PyUnicode_New(size);
1944 if (v == NULL)
1945 goto onError;
1946 if (size == 0)
1947 return (PyObject *)v;
1948 p = PyUnicode_AS_UNICODE(v);
1949 while (size-- > 0) {
1950 unsigned char ch = *s++;
1951 PyObject *w, *x;
1952
1953 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1954 w = PyInt_FromLong((long)ch);
1955 if (w == NULL)
1956 goto onError;
1957 x = PyObject_GetItem(mapping, w);
1958 Py_DECREF(w);
1959 if (x == NULL) {
1960 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00001961 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001962 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00001963 x = Py_None;
1964 Py_INCREF(x);
1965 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001966 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001967 }
1968
1969 /* Apply mapping */
1970 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001971 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972 if (value < 0 || value > 65535) {
1973 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001974 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975 Py_DECREF(x);
1976 goto onError;
1977 }
1978 *p++ = (Py_UNICODE)value;
1979 }
1980 else if (x == Py_None) {
1981 /* undefined mapping */
1982 if (charmap_decoding_error(&s, &p, errors,
1983 "character maps to <undefined>")) {
1984 Py_DECREF(x);
1985 goto onError;
1986 }
1987 }
1988 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001989 int targetsize = PyUnicode_GET_SIZE(x);
1990
1991 if (targetsize == 1)
1992 /* 1-1 mapping */
1993 *p++ = *PyUnicode_AS_UNICODE(x);
1994
1995 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001996 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001997 if (targetsize > extrachars) {
1998 /* resize first */
1999 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2000 int needed = (targetsize - extrachars) + \
2001 (targetsize << 2);
2002 extrachars += needed;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002003 if (_PyUnicode_Resize(&v,
2004 PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002005 Py_DECREF(x);
2006 goto onError;
2007 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002008 p = PyUnicode_AS_UNICODE(v) + oldpos;
2009 }
2010 Py_UNICODE_COPY(p,
2011 PyUnicode_AS_UNICODE(x),
2012 targetsize);
2013 p += targetsize;
2014 extrachars -= targetsize;
2015 }
2016 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002017 }
2018 else {
2019 /* wrong return value */
2020 PyErr_SetString(PyExc_TypeError,
2021 "character mapping must return integer, None or unicode");
2022 Py_DECREF(x);
2023 goto onError;
2024 }
2025 Py_DECREF(x);
2026 }
2027 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002028 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002029 goto onError;
2030 return (PyObject *)v;
2031
2032 onError:
2033 Py_XDECREF(v);
2034 return NULL;
2035}
2036
2037static
2038int charmap_encoding_error(const Py_UNICODE **source,
2039 char **dest,
2040 const char *errors,
2041 const char *details)
2042{
2043 if ((errors == NULL) ||
2044 (strcmp(errors,"strict") == 0)) {
2045 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002046 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002047 details);
2048 return -1;
2049 }
2050 else if (strcmp(errors,"ignore") == 0) {
2051 return 0;
2052 }
2053 else if (strcmp(errors,"replace") == 0) {
2054 **dest = '?';
2055 (*dest)++;
2056 return 0;
2057 }
2058 else {
2059 PyErr_Format(PyExc_ValueError,
2060 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002061 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002062 errors);
2063 return -1;
2064 }
2065}
2066
2067PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2068 int size,
2069 PyObject *mapping,
2070 const char *errors)
2071{
2072 PyObject *v;
2073 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002074 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075
2076 /* Default to Latin-1 */
2077 if (mapping == NULL)
2078 return PyUnicode_EncodeLatin1(p, size, errors);
2079
2080 v = PyString_FromStringAndSize(NULL, size);
2081 if (v == NULL)
2082 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002083 if (size == 0)
2084 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002085 s = PyString_AS_STRING(v);
2086 while (size-- > 0) {
2087 Py_UNICODE ch = *p++;
2088 PyObject *w, *x;
2089
2090 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2091 w = PyInt_FromLong((long)ch);
2092 if (w == NULL)
2093 goto onError;
2094 x = PyObject_GetItem(mapping, w);
2095 Py_DECREF(w);
2096 if (x == NULL) {
2097 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002098 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002100 x = Py_None;
2101 Py_INCREF(x);
2102 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002103 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002104 }
2105
2106 /* Apply mapping */
2107 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002108 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109 if (value < 0 || value > 255) {
2110 PyErr_SetString(PyExc_TypeError,
2111 "character mapping must be in range(256)");
2112 Py_DECREF(x);
2113 goto onError;
2114 }
2115 *s++ = (char)value;
2116 }
2117 else if (x == Py_None) {
2118 /* undefined mapping */
2119 if (charmap_encoding_error(&p, &s, errors,
2120 "character maps to <undefined>")) {
2121 Py_DECREF(x);
2122 goto onError;
2123 }
2124 }
2125 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002126 int targetsize = PyString_GET_SIZE(x);
2127
2128 if (targetsize == 1)
2129 /* 1-1 mapping */
2130 *s++ = *PyString_AS_STRING(x);
2131
2132 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002133 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002134 if (targetsize > extrachars) {
2135 /* resize first */
2136 int oldpos = (int)(s - PyString_AS_STRING(v));
2137 int needed = (targetsize - extrachars) + \
2138 (targetsize << 2);
2139 extrachars += needed;
2140 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002141 Py_DECREF(x);
2142 goto onError;
2143 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002144 s = PyString_AS_STRING(v) + oldpos;
2145 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002146 memcpy(s, PyString_AS_STRING(x), targetsize);
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002147 s += targetsize;
2148 extrachars -= targetsize;
2149 }
2150 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002151 }
2152 else {
2153 /* wrong return value */
2154 PyErr_SetString(PyExc_TypeError,
2155 "character mapping must return integer, None or unicode");
2156 Py_DECREF(x);
2157 goto onError;
2158 }
2159 Py_DECREF(x);
2160 }
2161 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2162 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2163 goto onError;
2164 return v;
2165
2166 onError:
2167 Py_DECREF(v);
2168 return NULL;
2169}
2170
2171PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2172 PyObject *mapping)
2173{
2174 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2175 PyErr_BadArgument();
2176 return NULL;
2177 }
2178 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2179 PyUnicode_GET_SIZE(unicode),
2180 mapping,
2181 NULL);
2182}
2183
2184static
2185int translate_error(const Py_UNICODE **source,
2186 Py_UNICODE **dest,
2187 const char *errors,
2188 const char *details)
2189{
2190 if ((errors == NULL) ||
2191 (strcmp(errors,"strict") == 0)) {
2192 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002193 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002194 details);
2195 return -1;
2196 }
2197 else if (strcmp(errors,"ignore") == 0) {
2198 return 0;
2199 }
2200 else if (strcmp(errors,"replace") == 0) {
2201 **dest = '?';
2202 (*dest)++;
2203 return 0;
2204 }
2205 else {
2206 PyErr_Format(PyExc_ValueError,
2207 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002208 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209 errors);
2210 return -1;
2211 }
2212}
2213
2214PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2215 int size,
2216 PyObject *mapping,
2217 const char *errors)
2218{
2219 PyUnicodeObject *v;
2220 Py_UNICODE *p;
2221
2222 if (mapping == NULL) {
2223 PyErr_BadArgument();
2224 return NULL;
2225 }
2226
2227 /* Output will never be longer than input */
2228 v = _PyUnicode_New(size);
2229 if (v == NULL)
2230 goto onError;
2231 if (size == 0)
2232 goto done;
2233 p = PyUnicode_AS_UNICODE(v);
2234 while (size-- > 0) {
2235 Py_UNICODE ch = *s++;
2236 PyObject *w, *x;
2237
2238 /* Get mapping */
2239 w = PyInt_FromLong(ch);
2240 if (w == NULL)
2241 goto onError;
2242 x = PyObject_GetItem(mapping, w);
2243 Py_DECREF(w);
2244 if (x == NULL) {
2245 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2246 /* No mapping found: default to 1-1 mapping */
2247 PyErr_Clear();
2248 *p++ = ch;
2249 continue;
2250 }
2251 goto onError;
2252 }
2253
2254 /* Apply mapping */
2255 if (PyInt_Check(x))
2256 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2257 else if (x == Py_None) {
2258 /* undefined mapping */
2259 if (translate_error(&s, &p, errors,
2260 "character maps to <undefined>")) {
2261 Py_DECREF(x);
2262 goto onError;
2263 }
2264 }
2265 else if (PyUnicode_Check(x)) {
2266 if (PyUnicode_GET_SIZE(x) != 1) {
2267 /* 1-n mapping */
2268 PyErr_SetString(PyExc_NotImplementedError,
2269 "1-n mappings are currently not implemented");
2270 Py_DECREF(x);
2271 goto onError;
2272 }
2273 *p++ = *PyUnicode_AS_UNICODE(x);
2274 }
2275 else {
2276 /* wrong return value */
2277 PyErr_SetString(PyExc_TypeError,
2278 "translate mapping must return integer, None or unicode");
2279 Py_DECREF(x);
2280 goto onError;
2281 }
2282 Py_DECREF(x);
2283 }
2284 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002285 if (_PyUnicode_Resize(&v, (int)(p - PyUnicode_AS_UNICODE(v))))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002286 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002287
2288 done:
2289 return (PyObject *)v;
2290
2291 onError:
2292 Py_XDECREF(v);
2293 return NULL;
2294}
2295
2296PyObject *PyUnicode_Translate(PyObject *str,
2297 PyObject *mapping,
2298 const char *errors)
2299{
2300 PyObject *result;
2301
2302 str = PyUnicode_FromObject(str);
2303 if (str == NULL)
2304 goto onError;
2305 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2306 PyUnicode_GET_SIZE(str),
2307 mapping,
2308 errors);
2309 Py_DECREF(str);
2310 return result;
2311
2312 onError:
2313 Py_XDECREF(str);
2314 return NULL;
2315}
2316
Guido van Rossum9e896b32000-04-05 20:11:21 +00002317/* --- Decimal Encoder ---------------------------------------------------- */
2318
2319int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2320 int length,
2321 char *output,
2322 const char *errors)
2323{
2324 Py_UNICODE *p, *end;
2325
2326 if (output == NULL) {
2327 PyErr_BadArgument();
2328 return -1;
2329 }
2330
2331 p = s;
2332 end = s + length;
2333 while (p < end) {
2334 register Py_UNICODE ch = *p++;
2335 int decimal;
2336
2337 if (Py_UNICODE_ISSPACE(ch)) {
2338 *output++ = ' ';
2339 continue;
2340 }
2341 decimal = Py_UNICODE_TODECIMAL(ch);
2342 if (decimal >= 0) {
2343 *output++ = '0' + decimal;
2344 continue;
2345 }
Guido van Rossumba477042000-04-06 18:18:10 +00002346 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002347 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002348 continue;
2349 }
2350 /* All other characters are considered invalid */
2351 if (errors == NULL || strcmp(errors, "strict") == 0) {
2352 PyErr_SetString(PyExc_ValueError,
2353 "invalid decimal Unicode string");
2354 goto onError;
2355 }
2356 else if (strcmp(errors, "ignore") == 0)
2357 continue;
2358 else if (strcmp(errors, "replace") == 0) {
2359 *output++ = '?';
2360 continue;
2361 }
2362 }
2363 /* 0-terminate the output string */
2364 *output++ = '\0';
2365 return 0;
2366
2367 onError:
2368 return -1;
2369}
2370
Guido van Rossumd57fd912000-03-10 22:53:23 +00002371/* --- Helpers ------------------------------------------------------------ */
2372
2373static
2374int count(PyUnicodeObject *self,
2375 int start,
2376 int end,
2377 PyUnicodeObject *substring)
2378{
2379 int count = 0;
2380
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002381 if (start < 0)
2382 start += self->length;
2383 if (start < 0)
2384 start = 0;
2385 if (end > self->length)
2386 end = self->length;
2387 if (end < 0)
2388 end += self->length;
2389 if (end < 0)
2390 end = 0;
2391
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002392 if (substring->length == 0)
2393 return (end - start + 1);
2394
Guido van Rossumd57fd912000-03-10 22:53:23 +00002395 end -= substring->length;
2396
2397 while (start <= end)
2398 if (Py_UNICODE_MATCH(self, start, substring)) {
2399 count++;
2400 start += substring->length;
2401 } else
2402 start++;
2403
2404 return count;
2405}
2406
2407int PyUnicode_Count(PyObject *str,
2408 PyObject *substr,
2409 int start,
2410 int end)
2411{
2412 int result;
2413
2414 str = PyUnicode_FromObject(str);
2415 if (str == NULL)
2416 return -1;
2417 substr = PyUnicode_FromObject(substr);
2418 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002419 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002420 return -1;
2421 }
2422
2423 result = count((PyUnicodeObject *)str,
2424 start, end,
2425 (PyUnicodeObject *)substr);
2426
2427 Py_DECREF(str);
2428 Py_DECREF(substr);
2429 return result;
2430}
2431
2432static
2433int findstring(PyUnicodeObject *self,
2434 PyUnicodeObject *substring,
2435 int start,
2436 int end,
2437 int direction)
2438{
2439 if (start < 0)
2440 start += self->length;
2441 if (start < 0)
2442 start = 0;
2443
2444 if (substring->length == 0)
2445 return start;
2446
2447 if (end > self->length)
2448 end = self->length;
2449 if (end < 0)
2450 end += self->length;
2451 if (end < 0)
2452 end = 0;
2453
2454 end -= substring->length;
2455
2456 if (direction < 0) {
2457 for (; end >= start; end--)
2458 if (Py_UNICODE_MATCH(self, end, substring))
2459 return end;
2460 } else {
2461 for (; start <= end; start++)
2462 if (Py_UNICODE_MATCH(self, start, substring))
2463 return start;
2464 }
2465
2466 return -1;
2467}
2468
2469int PyUnicode_Find(PyObject *str,
2470 PyObject *substr,
2471 int start,
2472 int end,
2473 int direction)
2474{
2475 int result;
2476
2477 str = PyUnicode_FromObject(str);
2478 if (str == NULL)
2479 return -1;
2480 substr = PyUnicode_FromObject(substr);
2481 if (substr == NULL) {
2482 Py_DECREF(substr);
2483 return -1;
2484 }
2485
2486 result = findstring((PyUnicodeObject *)str,
2487 (PyUnicodeObject *)substr,
2488 start, end, direction);
2489 Py_DECREF(str);
2490 Py_DECREF(substr);
2491 return result;
2492}
2493
2494static
2495int tailmatch(PyUnicodeObject *self,
2496 PyUnicodeObject *substring,
2497 int start,
2498 int end,
2499 int direction)
2500{
2501 if (start < 0)
2502 start += self->length;
2503 if (start < 0)
2504 start = 0;
2505
2506 if (substring->length == 0)
2507 return 1;
2508
2509 if (end > self->length)
2510 end = self->length;
2511 if (end < 0)
2512 end += self->length;
2513 if (end < 0)
2514 end = 0;
2515
2516 end -= substring->length;
2517 if (end < start)
2518 return 0;
2519
2520 if (direction > 0) {
2521 if (Py_UNICODE_MATCH(self, end, substring))
2522 return 1;
2523 } else {
2524 if (Py_UNICODE_MATCH(self, start, substring))
2525 return 1;
2526 }
2527
2528 return 0;
2529}
2530
2531int PyUnicode_Tailmatch(PyObject *str,
2532 PyObject *substr,
2533 int start,
2534 int end,
2535 int direction)
2536{
2537 int result;
2538
2539 str = PyUnicode_FromObject(str);
2540 if (str == NULL)
2541 return -1;
2542 substr = PyUnicode_FromObject(substr);
2543 if (substr == NULL) {
2544 Py_DECREF(substr);
2545 return -1;
2546 }
2547
2548 result = tailmatch((PyUnicodeObject *)str,
2549 (PyUnicodeObject *)substr,
2550 start, end, direction);
2551 Py_DECREF(str);
2552 Py_DECREF(substr);
2553 return result;
2554}
2555
2556static
2557const Py_UNICODE *findchar(const Py_UNICODE *s,
2558 int size,
2559 Py_UNICODE ch)
2560{
2561 /* like wcschr, but doesn't stop at NULL characters */
2562
2563 while (size-- > 0) {
2564 if (*s == ch)
2565 return s;
2566 s++;
2567 }
2568
2569 return NULL;
2570}
2571
2572/* Apply fixfct filter to the Unicode object self and return a
2573 reference to the modified object */
2574
2575static
2576PyObject *fixup(PyUnicodeObject *self,
2577 int (*fixfct)(PyUnicodeObject *s))
2578{
2579
2580 PyUnicodeObject *u;
2581
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002582 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002583 if (u == NULL)
2584 return NULL;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002585
2586 Py_UNICODE_COPY(u->str, self->str, self->length);
2587
Guido van Rossumd57fd912000-03-10 22:53:23 +00002588 if (!fixfct(u)) {
2589 /* fixfct should return TRUE if it modified the buffer. If
2590 FALSE, return a reference to the original buffer instead
2591 (to save space, not time) */
2592 Py_INCREF(self);
2593 Py_DECREF(u);
2594 return (PyObject*) self;
2595 }
2596 return (PyObject*) u;
2597}
2598
2599static
2600int fixupper(PyUnicodeObject *self)
2601{
2602 int len = self->length;
2603 Py_UNICODE *s = self->str;
2604 int status = 0;
2605
2606 while (len-- > 0) {
2607 register Py_UNICODE ch;
2608
2609 ch = Py_UNICODE_TOUPPER(*s);
2610 if (ch != *s) {
2611 status = 1;
2612 *s = ch;
2613 }
2614 s++;
2615 }
2616
2617 return status;
2618}
2619
2620static
2621int fixlower(PyUnicodeObject *self)
2622{
2623 int len = self->length;
2624 Py_UNICODE *s = self->str;
2625 int status = 0;
2626
2627 while (len-- > 0) {
2628 register Py_UNICODE ch;
2629
2630 ch = Py_UNICODE_TOLOWER(*s);
2631 if (ch != *s) {
2632 status = 1;
2633 *s = ch;
2634 }
2635 s++;
2636 }
2637
2638 return status;
2639}
2640
2641static
2642int fixswapcase(PyUnicodeObject *self)
2643{
2644 int len = self->length;
2645 Py_UNICODE *s = self->str;
2646 int status = 0;
2647
2648 while (len-- > 0) {
2649 if (Py_UNICODE_ISUPPER(*s)) {
2650 *s = Py_UNICODE_TOLOWER(*s);
2651 status = 1;
2652 } else if (Py_UNICODE_ISLOWER(*s)) {
2653 *s = Py_UNICODE_TOUPPER(*s);
2654 status = 1;
2655 }
2656 s++;
2657 }
2658
2659 return status;
2660}
2661
2662static
2663int fixcapitalize(PyUnicodeObject *self)
2664{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002665 int len = self->length;
2666 Py_UNICODE *s = self->str;
2667 int status = 0;
2668
2669 if (len == 0)
2670 return 0;
2671 if (Py_UNICODE_ISLOWER(*s)) {
2672 *s = Py_UNICODE_TOUPPER(*s);
2673 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002674 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002675 s++;
2676 while (--len > 0) {
2677 if (Py_UNICODE_ISUPPER(*s)) {
2678 *s = Py_UNICODE_TOLOWER(*s);
2679 status = 1;
2680 }
2681 s++;
2682 }
2683 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002684}
2685
2686static
2687int fixtitle(PyUnicodeObject *self)
2688{
2689 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2690 register Py_UNICODE *e;
2691 int previous_is_cased;
2692
2693 /* Shortcut for single character strings */
2694 if (PyUnicode_GET_SIZE(self) == 1) {
2695 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2696 if (*p != ch) {
2697 *p = ch;
2698 return 1;
2699 }
2700 else
2701 return 0;
2702 }
2703
2704 e = p + PyUnicode_GET_SIZE(self);
2705 previous_is_cased = 0;
2706 for (; p < e; p++) {
2707 register const Py_UNICODE ch = *p;
2708
2709 if (previous_is_cased)
2710 *p = Py_UNICODE_TOLOWER(ch);
2711 else
2712 *p = Py_UNICODE_TOTITLE(ch);
2713
2714 if (Py_UNICODE_ISLOWER(ch) ||
2715 Py_UNICODE_ISUPPER(ch) ||
2716 Py_UNICODE_ISTITLE(ch))
2717 previous_is_cased = 1;
2718 else
2719 previous_is_cased = 0;
2720 }
2721 return 1;
2722}
2723
2724PyObject *PyUnicode_Join(PyObject *separator,
2725 PyObject *seq)
2726{
2727 Py_UNICODE *sep;
2728 int seplen;
2729 PyUnicodeObject *res = NULL;
2730 int reslen = 0;
2731 Py_UNICODE *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002732 int sz = 100;
2733 int i;
Tim Peters2cfe3682001-05-05 05:36:48 +00002734 PyObject *it;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002735
Tim Peters2cfe3682001-05-05 05:36:48 +00002736 it = PyObject_GetIter(seq);
2737 if (it == NULL)
2738 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002739
2740 if (separator == NULL) {
2741 Py_UNICODE blank = ' ';
2742 sep = &blank;
2743 seplen = 1;
2744 }
2745 else {
2746 separator = PyUnicode_FromObject(separator);
2747 if (separator == NULL)
Tim Peters2cfe3682001-05-05 05:36:48 +00002748 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002749 sep = PyUnicode_AS_UNICODE(separator);
2750 seplen = PyUnicode_GET_SIZE(separator);
2751 }
2752
2753 res = _PyUnicode_New(sz);
2754 if (res == NULL)
2755 goto onError;
2756 p = PyUnicode_AS_UNICODE(res);
2757 reslen = 0;
2758
Tim Peters2cfe3682001-05-05 05:36:48 +00002759 for (i = 0; ; ++i) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002760 int itemlen;
Tim Peters2cfe3682001-05-05 05:36:48 +00002761 PyObject *item = PyIter_Next(it);
2762 if (item == NULL) {
2763 if (PyErr_Occurred())
2764 goto onError;
2765 break;
2766 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002767 if (!PyUnicode_Check(item)) {
2768 PyObject *v;
2769 v = PyUnicode_FromObject(item);
2770 Py_DECREF(item);
2771 item = v;
2772 if (item == NULL)
2773 goto onError;
2774 }
2775 itemlen = PyUnicode_GET_SIZE(item);
2776 while (reslen + itemlen + seplen >= sz) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002777 if (_PyUnicode_Resize(&res, sz*2))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002778 goto onError;
2779 sz *= 2;
2780 p = PyUnicode_AS_UNICODE(res) + reslen;
2781 }
2782 if (i > 0) {
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002783 Py_UNICODE_COPY(p, sep, seplen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002784 p += seplen;
2785 reslen += seplen;
2786 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002787 Py_UNICODE_COPY(p, PyUnicode_AS_UNICODE(item), itemlen);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002788 p += itemlen;
2789 reslen += itemlen;
2790 Py_DECREF(item);
2791 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002792 if (_PyUnicode_Resize(&res, reslen))
Guido van Rossumd57fd912000-03-10 22:53:23 +00002793 goto onError;
2794
2795 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002796 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002797 return (PyObject *)res;
2798
2799 onError:
2800 Py_XDECREF(separator);
Tim Peters2cfe3682001-05-05 05:36:48 +00002801 Py_XDECREF(res);
2802 Py_DECREF(it);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002803 return NULL;
2804}
2805
2806static
2807PyUnicodeObject *pad(PyUnicodeObject *self,
2808 int left,
2809 int right,
2810 Py_UNICODE fill)
2811{
2812 PyUnicodeObject *u;
2813
2814 if (left < 0)
2815 left = 0;
2816 if (right < 0)
2817 right = 0;
2818
2819 if (left == 0 && right == 0) {
2820 Py_INCREF(self);
2821 return self;
2822 }
2823
2824 u = _PyUnicode_New(left + self->length + right);
2825 if (u) {
2826 if (left)
2827 Py_UNICODE_FILL(u->str, fill, left);
2828 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2829 if (right)
2830 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2831 }
2832
2833 return u;
2834}
2835
/* Append the slice data[left:right] as a new Unicode object to the
   list being built.  The expansion relies on three names in the
   enclosing function: a PyObject *str scratch variable, the PyObject
   *list under construction, and an onError label that is jumped to
   when the slice cannot be created or appended.  Wrapped in
   do/while(0) so that it behaves as a single statement. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2846
2847static
2848PyObject *split_whitespace(PyUnicodeObject *self,
2849 PyObject *list,
2850 int maxcount)
2851{
2852 register int i;
2853 register int j;
2854 int len = self->length;
2855 PyObject *str;
2856
2857 for (i = j = 0; i < len; ) {
2858 /* find a token */
2859 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2860 i++;
2861 j = i;
2862 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2863 i++;
2864 if (j < i) {
2865 if (maxcount-- <= 0)
2866 break;
2867 SPLIT_APPEND(self->str, j, i);
2868 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2869 i++;
2870 j = i;
2871 }
2872 }
2873 if (j < len) {
2874 SPLIT_APPEND(self->str, j, len);
2875 }
2876 return list;
2877
2878 onError:
2879 Py_DECREF(list);
2880 return NULL;
2881}
2882
2883PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002884 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002885{
2886 register int i;
2887 register int j;
2888 int len;
2889 PyObject *list;
2890 PyObject *str;
2891 Py_UNICODE *data;
2892
2893 string = PyUnicode_FromObject(string);
2894 if (string == NULL)
2895 return NULL;
2896 data = PyUnicode_AS_UNICODE(string);
2897 len = PyUnicode_GET_SIZE(string);
2898
Guido van Rossumd57fd912000-03-10 22:53:23 +00002899 list = PyList_New(0);
2900 if (!list)
2901 goto onError;
2902
2903 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002904 int eol;
2905
Guido van Rossumd57fd912000-03-10 22:53:23 +00002906 /* Find a line and append it */
2907 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2908 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002909
2910 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002911 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002912 if (i < len) {
2913 if (data[i] == '\r' && i + 1 < len &&
2914 data[i+1] == '\n')
2915 i += 2;
2916 else
2917 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002918 if (keepends)
2919 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002920 }
Guido van Rossum86662912000-04-11 15:38:46 +00002921 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002922 j = i;
2923 }
2924 if (j < len) {
2925 SPLIT_APPEND(data, j, len);
2926 }
2927
2928 Py_DECREF(string);
2929 return list;
2930
2931 onError:
2932 Py_DECREF(list);
2933 Py_DECREF(string);
2934 return NULL;
2935}
2936
2937static
2938PyObject *split_char(PyUnicodeObject *self,
2939 PyObject *list,
2940 Py_UNICODE ch,
2941 int maxcount)
2942{
2943 register int i;
2944 register int j;
2945 int len = self->length;
2946 PyObject *str;
2947
2948 for (i = j = 0; i < len; ) {
2949 if (self->str[i] == ch) {
2950 if (maxcount-- <= 0)
2951 break;
2952 SPLIT_APPEND(self->str, j, i);
2953 i = j = i + 1;
2954 } else
2955 i++;
2956 }
2957 if (j <= len) {
2958 SPLIT_APPEND(self->str, j, len);
2959 }
2960 return list;
2961
2962 onError:
2963 Py_DECREF(list);
2964 return NULL;
2965}
2966
2967static
2968PyObject *split_substring(PyUnicodeObject *self,
2969 PyObject *list,
2970 PyUnicodeObject *substring,
2971 int maxcount)
2972{
2973 register int i;
2974 register int j;
2975 int len = self->length;
2976 int sublen = substring->length;
2977 PyObject *str;
2978
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00002979 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002980 if (Py_UNICODE_MATCH(self, i, substring)) {
2981 if (maxcount-- <= 0)
2982 break;
2983 SPLIT_APPEND(self->str, j, i);
2984 i = j = i + sublen;
2985 } else
2986 i++;
2987 }
2988 if (j <= len) {
2989 SPLIT_APPEND(self->str, j, len);
2990 }
2991 return list;
2992
2993 onError:
2994 Py_DECREF(list);
2995 return NULL;
2996}
2997
2998#undef SPLIT_APPEND
2999
3000static
3001PyObject *split(PyUnicodeObject *self,
3002 PyUnicodeObject *substring,
3003 int maxcount)
3004{
3005 PyObject *list;
3006
3007 if (maxcount < 0)
3008 maxcount = INT_MAX;
3009
3010 list = PyList_New(0);
3011 if (!list)
3012 return NULL;
3013
3014 if (substring == NULL)
3015 return split_whitespace(self,list,maxcount);
3016
3017 else if (substring->length == 1)
3018 return split_char(self,list,substring->str[0],maxcount);
3019
3020 else if (substring->length == 0) {
3021 Py_DECREF(list);
3022 PyErr_SetString(PyExc_ValueError, "empty separator");
3023 return NULL;
3024 }
3025 else
3026 return split_substring(self,list,substring,maxcount);
3027}
3028
3029static
3030PyObject *strip(PyUnicodeObject *self,
3031 int left,
3032 int right)
3033{
3034 Py_UNICODE *p = self->str;
3035 int start = 0;
3036 int end = self->length;
3037
3038 if (left)
3039 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3040 start++;
3041
3042 if (right)
3043 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3044 end--;
3045
3046 if (start == 0 && end == self->length) {
3047 /* couldn't strip anything off, return original string */
3048 Py_INCREF(self);
3049 return (PyObject*) self;
3050 }
3051
3052 return (PyObject*) PyUnicode_FromUnicode(
3053 self->str + start,
3054 end - start
3055 );
3056}
3057
3058static
3059PyObject *replace(PyUnicodeObject *self,
3060 PyUnicodeObject *str1,
3061 PyUnicodeObject *str2,
3062 int maxcount)
3063{
3064 PyUnicodeObject *u;
3065
3066 if (maxcount < 0)
3067 maxcount = INT_MAX;
3068
3069 if (str1->length == 1 && str2->length == 1) {
3070 int i;
3071
3072 /* replace characters */
3073 if (!findchar(self->str, self->length, str1->str[0])) {
3074 /* nothing to replace, return original string */
3075 Py_INCREF(self);
3076 u = self;
3077 } else {
3078 Py_UNICODE u1 = str1->str[0];
3079 Py_UNICODE u2 = str2->str[0];
3080
3081 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003082 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00003083 self->length
3084 );
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003085 if (u != NULL) {
3086 Py_UNICODE_COPY(u->str, self->str,
3087 self->length);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003088 for (i = 0; i < u->length; i++)
3089 if (u->str[i] == u1) {
3090 if (--maxcount < 0)
3091 break;
3092 u->str[i] = u2;
3093 }
3094 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00003095 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003096
3097 } else {
3098 int n, i;
3099 Py_UNICODE *p;
3100
3101 /* replace strings */
3102 n = count(self, 0, self->length, str1);
3103 if (n > maxcount)
3104 n = maxcount;
3105 if (n == 0) {
3106 /* nothing to replace, return original string */
3107 Py_INCREF(self);
3108 u = self;
3109 } else {
3110 u = _PyUnicode_New(
3111 self->length + n * (str2->length - str1->length));
3112 if (u) {
3113 i = 0;
3114 p = u->str;
3115 while (i <= self->length - str1->length)
3116 if (Py_UNICODE_MATCH(self, i, str1)) {
3117 /* replace string segment */
3118 Py_UNICODE_COPY(p, str2->str, str2->length);
3119 p += str2->length;
3120 i += str1->length;
3121 if (--n <= 0) {
3122 /* copy remaining part */
3123 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3124 break;
3125 }
3126 } else
3127 *p++ = self->str[i++];
3128 }
3129 }
3130 }
3131
3132 return (PyObject *) u;
3133}
3134
3135/* --- Unicode Object Methods --------------------------------------------- */
3136
3137static char title__doc__[] =
3138"S.title() -> unicode\n\
3139\n\
3140Return a titlecased version of S, i.e. words start with title case\n\
3141characters, all remaining cased characters have lower case.";
3142
3143static PyObject*
3144unicode_title(PyUnicodeObject *self, PyObject *args)
3145{
3146 if (!PyArg_NoArgs(args))
3147 return NULL;
3148 return fixup(self, fixtitle);
3149}
3150
3151static char capitalize__doc__[] =
3152"S.capitalize() -> unicode\n\
3153\n\
3154Return a capitalized version of S, i.e. make the first character\n\
3155have upper case.";
3156
3157static PyObject*
3158unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3159{
3160 if (!PyArg_NoArgs(args))
3161 return NULL;
3162 return fixup(self, fixcapitalize);
3163}
3164
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* Disabled: split on whitespace, capitalize every word in place in the
   list, then join the words again. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result = NULL;
    int pos;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *old = PyList_GET_ITEM(words, pos);
        PyObject *capitalized = fixup((PyUnicodeObject *)old,
                                      fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(old);
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
3205
3206static char center__doc__[] =
3207"S.center(width) -> unicode\n\
3208\n\
3209Return S centered in a Unicode string of length width. Padding is done\n\
3210using spaces.";
3211
3212static PyObject *
3213unicode_center(PyUnicodeObject *self, PyObject *args)
3214{
3215 int marg, left;
3216 int width;
3217
3218 if (!PyArg_ParseTuple(args, "i:center", &width))
3219 return NULL;
3220
3221 if (self->length >= width) {
3222 Py_INCREF(self);
3223 return (PyObject*) self;
3224 }
3225
3226 marg = width - self->length;
3227 left = marg / 2 + (marg & width & 1);
3228
3229 return (PyObject*) pad(self, left, marg - left, ' ');
3230}
3231
Marc-André Lemburge5034372000-08-08 08:04:29 +00003232#if 0
3233
3234/* This code should go into some future Unicode collation support
3235 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003236 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003237
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003238/* speedy UTF-16 code point order comparison */
3239/* gleaned from: */
3240/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3241
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003242static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003243{
3244 0, 0, 0, 0, 0, 0, 0, 0,
3245 0, 0, 0, 0, 0, 0, 0, 0,
3246 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003247 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003248};
3249
Guido van Rossumd57fd912000-03-10 22:53:23 +00003250static int
3251unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3252{
3253 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003254
Guido van Rossumd57fd912000-03-10 22:53:23 +00003255 Py_UNICODE *s1 = str1->str;
3256 Py_UNICODE *s2 = str2->str;
3257
3258 len1 = str1->length;
3259 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003260
Guido van Rossumd57fd912000-03-10 22:53:23 +00003261 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003262 Py_UNICODE c1, c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003263
3264 c1 = *s1++;
3265 c2 = *s2++;
Fredrik Lundh45714e92001-06-26 16:39:36 +00003266
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003267 if (c1 > (1<<11) * 26)
3268 c1 += utf16Fixup[c1>>11];
3269 if (c2 > (1<<11) * 26)
3270 c2 += utf16Fixup[c2>>11];
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003271 /* now c1 and c2 are in UTF-32-compatible order */
Fredrik Lundh45714e92001-06-26 16:39:36 +00003272
3273 if (c1 != c2)
3274 return (c1 < c2) ? -1 : 1;
3275
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003276 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003277 }
3278
3279 return (len1 < len2) ? -1 : (len1 != len2);
3280}
3281
Marc-André Lemburge5034372000-08-08 08:04:29 +00003282#else
3283
3284static int
3285unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3286{
3287 register int len1, len2;
3288
3289 Py_UNICODE *s1 = str1->str;
3290 Py_UNICODE *s2 = str2->str;
3291
3292 len1 = str1->length;
3293 len2 = str2->length;
3294
3295 while (len1 > 0 && len2 > 0) {
Fredrik Lundh45714e92001-06-26 16:39:36 +00003296 Py_UNICODE c1, c2;
Marc-André Lemburge5034372000-08-08 08:04:29 +00003297
Fredrik Lundh45714e92001-06-26 16:39:36 +00003298 c1 = *s1++;
3299 c2 = *s2++;
3300
3301 if (c1 != c2)
3302 return (c1 < c2) ? -1 : 1;
3303
Marc-André Lemburge5034372000-08-08 08:04:29 +00003304 len1--; len2--;
3305 }
3306
3307 return (len1 < len2) ? -1 : (len1 != len2);
3308}
3309
3310#endif
3311
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312int PyUnicode_Compare(PyObject *left,
3313 PyObject *right)
3314{
3315 PyUnicodeObject *u = NULL, *v = NULL;
3316 int result;
3317
3318 /* Coerce the two arguments */
3319 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3320 if (u == NULL)
3321 goto onError;
3322 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3323 if (v == NULL)
3324 goto onError;
3325
Thomas Wouters7e474022000-07-16 12:04:32 +00003326 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003327 if (v == u) {
3328 Py_DECREF(u);
3329 Py_DECREF(v);
3330 return 0;
3331 }
3332
3333 result = unicode_compare(u, v);
3334
3335 Py_DECREF(u);
3336 Py_DECREF(v);
3337 return result;
3338
3339onError:
3340 Py_XDECREF(u);
3341 Py_XDECREF(v);
3342 return -1;
3343}
3344
Guido van Rossum403d68b2000-03-13 15:55:09 +00003345int PyUnicode_Contains(PyObject *container,
3346 PyObject *element)
3347{
3348 PyUnicodeObject *u = NULL, *v = NULL;
3349 int result;
3350 register const Py_UNICODE *p, *e;
3351 register Py_UNICODE ch;
3352
3353 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003354 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003355 if (v == NULL) {
3356 PyErr_SetString(PyExc_TypeError,
3357 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003358 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003359 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003360 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3361 if (u == NULL) {
3362 Py_DECREF(v);
3363 goto onError;
3364 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003365
3366 /* Check v in u */
3367 if (PyUnicode_GET_SIZE(v) != 1) {
3368 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003369 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003370 goto onError;
3371 }
3372 ch = *PyUnicode_AS_UNICODE(v);
3373 p = PyUnicode_AS_UNICODE(u);
3374 e = p + PyUnicode_GET_SIZE(u);
3375 result = 0;
3376 while (p < e) {
3377 if (*p++ == ch) {
3378 result = 1;
3379 break;
3380 }
3381 }
3382
3383 Py_DECREF(u);
3384 Py_DECREF(v);
3385 return result;
3386
3387onError:
3388 Py_XDECREF(u);
3389 Py_XDECREF(v);
3390 return -1;
3391}
3392
Guido van Rossumd57fd912000-03-10 22:53:23 +00003393/* Concat to string or Unicode object giving a new Unicode object. */
3394
3395PyObject *PyUnicode_Concat(PyObject *left,
3396 PyObject *right)
3397{
3398 PyUnicodeObject *u = NULL, *v = NULL, *w;
3399
3400 /* Coerce the two arguments */
3401 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3402 if (u == NULL)
3403 goto onError;
3404 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3405 if (v == NULL)
3406 goto onError;
3407
3408 /* Shortcuts */
3409 if (v == unicode_empty) {
3410 Py_DECREF(v);
3411 return (PyObject *)u;
3412 }
3413 if (u == unicode_empty) {
3414 Py_DECREF(u);
3415 return (PyObject *)v;
3416 }
3417
3418 /* Concat the two Unicode strings */
3419 w = _PyUnicode_New(u->length + v->length);
3420 if (w == NULL)
3421 goto onError;
3422 Py_UNICODE_COPY(w->str, u->str, u->length);
3423 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3424
3425 Py_DECREF(u);
3426 Py_DECREF(v);
3427 return (PyObject *)w;
3428
3429onError:
3430 Py_XDECREF(u);
3431 Py_XDECREF(v);
3432 return NULL;
3433}
3434
3435static char count__doc__[] =
3436"S.count(sub[, start[, end]]) -> int\n\
3437\n\
3438Return the number of occurrences of substring sub in Unicode string\n\
3439S[start:end]. Optional arguments start and end are\n\
3440interpreted as in slice notation.";
3441
3442static PyObject *
3443unicode_count(PyUnicodeObject *self, PyObject *args)
3444{
3445 PyUnicodeObject *substring;
3446 int start = 0;
3447 int end = INT_MAX;
3448 PyObject *result;
3449
Guido van Rossumb8872e62000-05-09 14:14:27 +00003450 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3451 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003452 return NULL;
3453
3454 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3455 (PyObject *)substring);
3456 if (substring == NULL)
3457 return NULL;
3458
Guido van Rossumd57fd912000-03-10 22:53:23 +00003459 if (start < 0)
3460 start += self->length;
3461 if (start < 0)
3462 start = 0;
3463 if (end > self->length)
3464 end = self->length;
3465 if (end < 0)
3466 end += self->length;
3467 if (end < 0)
3468 end = 0;
3469
3470 result = PyInt_FromLong((long) count(self, start, end, substring));
3471
3472 Py_DECREF(substring);
3473 return result;
3474}
3475
3476static char encode__doc__[] =
3477"S.encode([encoding[,errors]]) -> string\n\
3478\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003479Return an encoded string version of S. Default encoding is the current\n\
3480default string encoding. errors may be given to set a different error\n\
3481handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3482a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003483
3484static PyObject *
3485unicode_encode(PyUnicodeObject *self, PyObject *args)
3486{
3487 char *encoding = NULL;
3488 char *errors = NULL;
3489 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3490 return NULL;
3491 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3492}
3493
3494static char expandtabs__doc__[] =
3495"S.expandtabs([tabsize]) -> unicode\n\
3496\n\
3497Return a copy of S where all tab characters are expanded using spaces.\n\
3498If tabsize is not given, a tab size of 8 characters is assumed.";
3499
3500static PyObject*
3501unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3502{
3503 Py_UNICODE *e;
3504 Py_UNICODE *p;
3505 Py_UNICODE *q;
3506 int i, j;
3507 PyUnicodeObject *u;
3508 int tabsize = 8;
3509
3510 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3511 return NULL;
3512
Thomas Wouters7e474022000-07-16 12:04:32 +00003513 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514 i = j = 0;
3515 e = self->str + self->length;
3516 for (p = self->str; p < e; p++)
3517 if (*p == '\t') {
3518 if (tabsize > 0)
3519 j += tabsize - (j % tabsize);
3520 }
3521 else {
3522 j++;
3523 if (*p == '\n' || *p == '\r') {
3524 i += j;
3525 j = 0;
3526 }
3527 }
3528
3529 /* Second pass: create output string and fill it */
3530 u = _PyUnicode_New(i + j);
3531 if (!u)
3532 return NULL;
3533
3534 j = 0;
3535 q = u->str;
3536
3537 for (p = self->str; p < e; p++)
3538 if (*p == '\t') {
3539 if (tabsize > 0) {
3540 i = tabsize - (j % tabsize);
3541 j += i;
3542 while (i--)
3543 *q++ = ' ';
3544 }
3545 }
3546 else {
3547 j++;
3548 *q++ = *p;
3549 if (*p == '\n' || *p == '\r')
3550 j = 0;
3551 }
3552
3553 return (PyObject*) u;
3554}
3555
3556static char find__doc__[] =
3557"S.find(sub [,start [,end]]) -> int\n\
3558\n\
3559Return the lowest index in S where substring sub is found,\n\
3560such that sub is contained within s[start,end]. Optional\n\
3561arguments start and end are interpreted as in slice notation.\n\
3562\n\
3563Return -1 on failure.";
3564
3565static PyObject *
3566unicode_find(PyUnicodeObject *self, PyObject *args)
3567{
3568 PyUnicodeObject *substring;
3569 int start = 0;
3570 int end = INT_MAX;
3571 PyObject *result;
3572
Guido van Rossumb8872e62000-05-09 14:14:27 +00003573 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3574 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003575 return NULL;
3576 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3577 (PyObject *)substring);
3578 if (substring == NULL)
3579 return NULL;
3580
3581 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3582
3583 Py_DECREF(substring);
3584 return result;
3585}
3586
3587static PyObject *
3588unicode_getitem(PyUnicodeObject *self, int index)
3589{
3590 if (index < 0 || index >= self->length) {
3591 PyErr_SetString(PyExc_IndexError, "string index out of range");
3592 return NULL;
3593 }
3594
3595 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3596}
3597
3598static long
3599unicode_hash(PyUnicodeObject *self)
3600{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003601 /* Since Unicode objects compare equal to their ASCII string
3602 counterparts, they should use the individual character values
3603 as basis for their hash value. This is needed to assure that
3604 strings and Unicode objects behave in the same way as
3605 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003606
Fredrik Lundhdde61642000-07-10 18:27:47 +00003607 register int len;
3608 register Py_UNICODE *p;
3609 register long x;
3610
Guido van Rossumd57fd912000-03-10 22:53:23 +00003611 if (self->hash != -1)
3612 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003613 len = PyUnicode_GET_SIZE(self);
3614 p = PyUnicode_AS_UNICODE(self);
3615 x = *p << 7;
3616 while (--len >= 0)
3617 x = (1000003*x) ^ *p++;
3618 x ^= PyUnicode_GET_SIZE(self);
3619 if (x == -1)
3620 x = -2;
3621 self->hash = x;
3622 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003623}
3624
3625static char index__doc__[] =
3626"S.index(sub [,start [,end]]) -> int\n\
3627\n\
3628Like S.find() but raise ValueError when the substring is not found.";
3629
3630static PyObject *
3631unicode_index(PyUnicodeObject *self, PyObject *args)
3632{
3633 int result;
3634 PyUnicodeObject *substring;
3635 int start = 0;
3636 int end = INT_MAX;
3637
Guido van Rossumb8872e62000-05-09 14:14:27 +00003638 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3639 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003640 return NULL;
3641
3642 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3643 (PyObject *)substring);
3644 if (substring == NULL)
3645 return NULL;
3646
3647 result = findstring(self, substring, start, end, 1);
3648
3649 Py_DECREF(substring);
3650 if (result < 0) {
3651 PyErr_SetString(PyExc_ValueError, "substring not found");
3652 return NULL;
3653 }
3654 return PyInt_FromLong(result);
3655}
3656
3657static char islower__doc__[] =
3658"S.islower() -> int\n\
3659\n\
3660Return 1 if all cased characters in S are lowercase and there is\n\
3661at least one cased character in S, 0 otherwise.";
3662
3663static PyObject*
3664unicode_islower(PyUnicodeObject *self, PyObject *args)
3665{
3666 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3667 register const Py_UNICODE *e;
3668 int cased;
3669
3670 if (!PyArg_NoArgs(args))
3671 return NULL;
3672
3673 /* Shortcut for single character strings */
3674 if (PyUnicode_GET_SIZE(self) == 1)
3675 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3676
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003677 /* Special case for empty strings */
3678 if (PyString_GET_SIZE(self) == 0)
3679 return PyInt_FromLong(0);
3680
Guido van Rossumd57fd912000-03-10 22:53:23 +00003681 e = p + PyUnicode_GET_SIZE(self);
3682 cased = 0;
3683 for (; p < e; p++) {
3684 register const Py_UNICODE ch = *p;
3685
3686 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3687 return PyInt_FromLong(0);
3688 else if (!cased && Py_UNICODE_ISLOWER(ch))
3689 cased = 1;
3690 }
3691 return PyInt_FromLong(cased);
3692}
3693
3694static char isupper__doc__[] =
3695"S.isupper() -> int\n\
3696\n\
3697Return 1 if all cased characters in S are uppercase and there is\n\
3698at least one cased character in S, 0 otherwise.";
3699
3700static PyObject*
3701unicode_isupper(PyUnicodeObject *self, PyObject *args)
3702{
3703 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3704 register const Py_UNICODE *e;
3705 int cased;
3706
3707 if (!PyArg_NoArgs(args))
3708 return NULL;
3709
3710 /* Shortcut for single character strings */
3711 if (PyUnicode_GET_SIZE(self) == 1)
3712 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3713
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003714 /* Special case for empty strings */
3715 if (PyString_GET_SIZE(self) == 0)
3716 return PyInt_FromLong(0);
3717
Guido van Rossumd57fd912000-03-10 22:53:23 +00003718 e = p + PyUnicode_GET_SIZE(self);
3719 cased = 0;
3720 for (; p < e; p++) {
3721 register const Py_UNICODE ch = *p;
3722
3723 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3724 return PyInt_FromLong(0);
3725 else if (!cased && Py_UNICODE_ISUPPER(ch))
3726 cased = 1;
3727 }
3728 return PyInt_FromLong(cased);
3729}
3730
3731static char istitle__doc__[] =
3732"S.istitle() -> int\n\
3733\n\
3734Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3735may only follow uncased characters and lowercase characters only cased\n\
3736ones. Return 0 otherwise.";
3737
3738static PyObject*
3739unicode_istitle(PyUnicodeObject *self, PyObject *args)
3740{
3741 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3742 register const Py_UNICODE *e;
3743 int cased, previous_is_cased;
3744
3745 if (!PyArg_NoArgs(args))
3746 return NULL;
3747
3748 /* Shortcut for single character strings */
3749 if (PyUnicode_GET_SIZE(self) == 1)
3750 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3751 (Py_UNICODE_ISUPPER(*p) != 0));
3752
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003753 /* Special case for empty strings */
3754 if (PyString_GET_SIZE(self) == 0)
3755 return PyInt_FromLong(0);
3756
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757 e = p + PyUnicode_GET_SIZE(self);
3758 cased = 0;
3759 previous_is_cased = 0;
3760 for (; p < e; p++) {
3761 register const Py_UNICODE ch = *p;
3762
3763 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3764 if (previous_is_cased)
3765 return PyInt_FromLong(0);
3766 previous_is_cased = 1;
3767 cased = 1;
3768 }
3769 else if (Py_UNICODE_ISLOWER(ch)) {
3770 if (!previous_is_cased)
3771 return PyInt_FromLong(0);
3772 previous_is_cased = 1;
3773 cased = 1;
3774 }
3775 else
3776 previous_is_cased = 0;
3777 }
3778 return PyInt_FromLong(cased);
3779}
3780
3781static char isspace__doc__[] =
3782"S.isspace() -> int\n\
3783\n\
3784Return 1 if there are only whitespace characters in S,\n\
37850 otherwise.";
3786
3787static PyObject*
3788unicode_isspace(PyUnicodeObject *self, PyObject *args)
3789{
3790 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3791 register const Py_UNICODE *e;
3792
3793 if (!PyArg_NoArgs(args))
3794 return NULL;
3795
3796 /* Shortcut for single character strings */
3797 if (PyUnicode_GET_SIZE(self) == 1 &&
3798 Py_UNICODE_ISSPACE(*p))
3799 return PyInt_FromLong(1);
3800
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003801 /* Special case for empty strings */
3802 if (PyString_GET_SIZE(self) == 0)
3803 return PyInt_FromLong(0);
3804
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805 e = p + PyUnicode_GET_SIZE(self);
3806 for (; p < e; p++) {
3807 if (!Py_UNICODE_ISSPACE(*p))
3808 return PyInt_FromLong(0);
3809 }
3810 return PyInt_FromLong(1);
3811}
3812
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003813static char isalpha__doc__[] =
3814"S.isalpha() -> int\n\
3815\n\
3816Return 1 if all characters in S are alphabetic\n\
3817and there is at least one character in S, 0 otherwise.";
3818
3819static PyObject*
3820unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3821{
3822 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3823 register const Py_UNICODE *e;
3824
3825 if (!PyArg_NoArgs(args))
3826 return NULL;
3827
3828 /* Shortcut for single character strings */
3829 if (PyUnicode_GET_SIZE(self) == 1 &&
3830 Py_UNICODE_ISALPHA(*p))
3831 return PyInt_FromLong(1);
3832
3833 /* Special case for empty strings */
3834 if (PyString_GET_SIZE(self) == 0)
3835 return PyInt_FromLong(0);
3836
3837 e = p + PyUnicode_GET_SIZE(self);
3838 for (; p < e; p++) {
3839 if (!Py_UNICODE_ISALPHA(*p))
3840 return PyInt_FromLong(0);
3841 }
3842 return PyInt_FromLong(1);
3843}
3844
3845static char isalnum__doc__[] =
3846"S.isalnum() -> int\n\
3847\n\
3848Return 1 if all characters in S are alphanumeric\n\
3849and there is at least one character in S, 0 otherwise.";
3850
3851static PyObject*
3852unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3853{
3854 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3855 register const Py_UNICODE *e;
3856
3857 if (!PyArg_NoArgs(args))
3858 return NULL;
3859
3860 /* Shortcut for single character strings */
3861 if (PyUnicode_GET_SIZE(self) == 1 &&
3862 Py_UNICODE_ISALNUM(*p))
3863 return PyInt_FromLong(1);
3864
3865 /* Special case for empty strings */
3866 if (PyString_GET_SIZE(self) == 0)
3867 return PyInt_FromLong(0);
3868
3869 e = p + PyUnicode_GET_SIZE(self);
3870 for (; p < e; p++) {
3871 if (!Py_UNICODE_ISALNUM(*p))
3872 return PyInt_FromLong(0);
3873 }
3874 return PyInt_FromLong(1);
3875}
3876
Guido van Rossumd57fd912000-03-10 22:53:23 +00003877static char isdecimal__doc__[] =
3878"S.isdecimal() -> int\n\
3879\n\
3880Return 1 if there are only decimal characters in S,\n\
38810 otherwise.";
3882
3883static PyObject*
3884unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3885{
3886 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3887 register const Py_UNICODE *e;
3888
3889 if (!PyArg_NoArgs(args))
3890 return NULL;
3891
3892 /* Shortcut for single character strings */
3893 if (PyUnicode_GET_SIZE(self) == 1 &&
3894 Py_UNICODE_ISDECIMAL(*p))
3895 return PyInt_FromLong(1);
3896
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003897 /* Special case for empty strings */
3898 if (PyString_GET_SIZE(self) == 0)
3899 return PyInt_FromLong(0);
3900
Guido van Rossumd57fd912000-03-10 22:53:23 +00003901 e = p + PyUnicode_GET_SIZE(self);
3902 for (; p < e; p++) {
3903 if (!Py_UNICODE_ISDECIMAL(*p))
3904 return PyInt_FromLong(0);
3905 }
3906 return PyInt_FromLong(1);
3907}
3908
3909static char isdigit__doc__[] =
3910"S.isdigit() -> int\n\
3911\n\
3912Return 1 if there are only digit characters in S,\n\
39130 otherwise.";
3914
3915static PyObject*
3916unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3917{
3918 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3919 register const Py_UNICODE *e;
3920
3921 if (!PyArg_NoArgs(args))
3922 return NULL;
3923
3924 /* Shortcut for single character strings */
3925 if (PyUnicode_GET_SIZE(self) == 1 &&
3926 Py_UNICODE_ISDIGIT(*p))
3927 return PyInt_FromLong(1);
3928
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003929 /* Special case for empty strings */
3930 if (PyString_GET_SIZE(self) == 0)
3931 return PyInt_FromLong(0);
3932
Guido van Rossumd57fd912000-03-10 22:53:23 +00003933 e = p + PyUnicode_GET_SIZE(self);
3934 for (; p < e; p++) {
3935 if (!Py_UNICODE_ISDIGIT(*p))
3936 return PyInt_FromLong(0);
3937 }
3938 return PyInt_FromLong(1);
3939}
3940
3941static char isnumeric__doc__[] =
3942"S.isnumeric() -> int\n\
3943\n\
3944Return 1 if there are only numeric characters in S,\n\
39450 otherwise.";
3946
3947static PyObject*
3948unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3949{
3950 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3951 register const Py_UNICODE *e;
3952
3953 if (!PyArg_NoArgs(args))
3954 return NULL;
3955
3956 /* Shortcut for single character strings */
3957 if (PyUnicode_GET_SIZE(self) == 1 &&
3958 Py_UNICODE_ISNUMERIC(*p))
3959 return PyInt_FromLong(1);
3960
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003961 /* Special case for empty strings */
3962 if (PyString_GET_SIZE(self) == 0)
3963 return PyInt_FromLong(0);
3964
Guido van Rossumd57fd912000-03-10 22:53:23 +00003965 e = p + PyUnicode_GET_SIZE(self);
3966 for (; p < e; p++) {
3967 if (!Py_UNICODE_ISNUMERIC(*p))
3968 return PyInt_FromLong(0);
3969 }
3970 return PyInt_FromLong(1);
3971}
3972
3973static char join__doc__[] =
3974"S.join(sequence) -> unicode\n\
3975\n\
3976Return a string which is the concatenation of the strings in the\n\
3977sequence. The separator between elements is S.";
3978
3979static PyObject*
3980unicode_join(PyUnicodeObject *self, PyObject *args)
3981{
3982 PyObject *data;
3983 if (!PyArg_ParseTuple(args, "O:join", &data))
3984 return NULL;
3985
3986 return PyUnicode_Join((PyObject *)self, data);
3987}
3988
3989static int
3990unicode_length(PyUnicodeObject *self)
3991{
3992 return self->length;
3993}
3994
3995static char ljust__doc__[] =
3996"S.ljust(width) -> unicode\n\
3997\n\
3998Return S left justified in a Unicode string of length width. Padding is\n\
3999done using spaces.";
4000
4001static PyObject *
4002unicode_ljust(PyUnicodeObject *self, PyObject *args)
4003{
4004 int width;
4005 if (!PyArg_ParseTuple(args, "i:ljust", &width))
4006 return NULL;
4007
4008 if (self->length >= width) {
4009 Py_INCREF(self);
4010 return (PyObject*) self;
4011 }
4012
4013 return (PyObject*) pad(self, 0, width - self->length, ' ');
4014}
4015
4016static char lower__doc__[] =
4017"S.lower() -> unicode\n\
4018\n\
4019Return a copy of the string S converted to lowercase.";
4020
4021static PyObject*
4022unicode_lower(PyUnicodeObject *self, PyObject *args)
4023{
4024 if (!PyArg_NoArgs(args))
4025 return NULL;
4026 return fixup(self, fixlower);
4027}
4028
4029static char lstrip__doc__[] =
4030"S.lstrip() -> unicode\n\
4031\n\
4032Return a copy of the string S with leading whitespace removed.";
4033
4034static PyObject *
4035unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4036{
4037 if (!PyArg_NoArgs(args))
4038 return NULL;
4039 return strip(self, 1, 0);
4040}
4041
4042static PyObject*
4043unicode_repeat(PyUnicodeObject *str, int len)
4044{
4045 PyUnicodeObject *u;
4046 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004047 int nchars;
4048 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004049
4050 if (len < 0)
4051 len = 0;
4052
4053 if (len == 1) {
4054 /* no repeat, return original string */
4055 Py_INCREF(str);
4056 return (PyObject*) str;
4057 }
Tim Peters8f422462000-09-09 06:13:41 +00004058
4059 /* ensure # of chars needed doesn't overflow int and # of bytes
4060 * needed doesn't overflow size_t
4061 */
4062 nchars = len * str->length;
4063 if (len && nchars / len != str->length) {
4064 PyErr_SetString(PyExc_OverflowError,
4065 "repeated string is too long");
4066 return NULL;
4067 }
4068 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4069 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4070 PyErr_SetString(PyExc_OverflowError,
4071 "repeated string is too long");
4072 return NULL;
4073 }
4074 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004075 if (!u)
4076 return NULL;
4077
4078 p = u->str;
4079
4080 while (len-- > 0) {
4081 Py_UNICODE_COPY(p, str->str, str->length);
4082 p += str->length;
4083 }
4084
4085 return (PyObject*) u;
4086}
4087
4088PyObject *PyUnicode_Replace(PyObject *obj,
4089 PyObject *subobj,
4090 PyObject *replobj,
4091 int maxcount)
4092{
4093 PyObject *self;
4094 PyObject *str1;
4095 PyObject *str2;
4096 PyObject *result;
4097
4098 self = PyUnicode_FromObject(obj);
4099 if (self == NULL)
4100 return NULL;
4101 str1 = PyUnicode_FromObject(subobj);
4102 if (str1 == NULL) {
4103 Py_DECREF(self);
4104 return NULL;
4105 }
4106 str2 = PyUnicode_FromObject(replobj);
4107 if (str2 == NULL) {
4108 Py_DECREF(self);
4109 Py_DECREF(str1);
4110 return NULL;
4111 }
4112 result = replace((PyUnicodeObject *)self,
4113 (PyUnicodeObject *)str1,
4114 (PyUnicodeObject *)str2,
4115 maxcount);
4116 Py_DECREF(self);
4117 Py_DECREF(str1);
4118 Py_DECREF(str2);
4119 return result;
4120}
4121
4122static char replace__doc__[] =
4123"S.replace (old, new[, maxsplit]) -> unicode\n\
4124\n\
4125Return a copy of S with all occurrences of substring\n\
4126old replaced by new. If the optional argument maxsplit is\n\
4127given, only the first maxsplit occurrences are replaced.";
4128
4129static PyObject*
4130unicode_replace(PyUnicodeObject *self, PyObject *args)
4131{
4132 PyUnicodeObject *str1;
4133 PyUnicodeObject *str2;
4134 int maxcount = -1;
4135 PyObject *result;
4136
4137 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4138 return NULL;
4139 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4140 if (str1 == NULL)
4141 return NULL;
4142 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4143 if (str2 == NULL)
4144 return NULL;
4145
4146 result = replace(self, str1, str2, maxcount);
4147
4148 Py_DECREF(str1);
4149 Py_DECREF(str2);
4150 return result;
4151}
4152
4153static
4154PyObject *unicode_repr(PyObject *unicode)
4155{
4156 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4157 PyUnicode_GET_SIZE(unicode),
4158 1);
4159}
4160
4161static char rfind__doc__[] =
4162"S.rfind(sub [,start [,end]]) -> int\n\
4163\n\
4164Return the highest index in S where substring sub is found,\n\
4165such that sub is contained within s[start,end]. Optional\n\
4166arguments start and end are interpreted as in slice notation.\n\
4167\n\
4168Return -1 on failure.";
4169
4170static PyObject *
4171unicode_rfind(PyUnicodeObject *self, PyObject *args)
4172{
4173 PyUnicodeObject *substring;
4174 int start = 0;
4175 int end = INT_MAX;
4176 PyObject *result;
4177
Guido van Rossumb8872e62000-05-09 14:14:27 +00004178 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4179 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004180 return NULL;
4181 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4182 (PyObject *)substring);
4183 if (substring == NULL)
4184 return NULL;
4185
4186 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4187
4188 Py_DECREF(substring);
4189 return result;
4190}
4191
4192static char rindex__doc__[] =
4193"S.rindex(sub [,start [,end]]) -> int\n\
4194\n\
4195Like S.rfind() but raise ValueError when the substring is not found.";
4196
4197static PyObject *
4198unicode_rindex(PyUnicodeObject *self, PyObject *args)
4199{
4200 int result;
4201 PyUnicodeObject *substring;
4202 int start = 0;
4203 int end = INT_MAX;
4204
Guido van Rossumb8872e62000-05-09 14:14:27 +00004205 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4206 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004207 return NULL;
4208 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4209 (PyObject *)substring);
4210 if (substring == NULL)
4211 return NULL;
4212
4213 result = findstring(self, substring, start, end, -1);
4214
4215 Py_DECREF(substring);
4216 if (result < 0) {
4217 PyErr_SetString(PyExc_ValueError, "substring not found");
4218 return NULL;
4219 }
4220 return PyInt_FromLong(result);
4221}
4222
4223static char rjust__doc__[] =
4224"S.rjust(width) -> unicode\n\
4225\n\
4226Return S right justified in a Unicode string of length width. Padding is\n\
4227done using spaces.";
4228
4229static PyObject *
4230unicode_rjust(PyUnicodeObject *self, PyObject *args)
4231{
4232 int width;
4233 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4234 return NULL;
4235
4236 if (self->length >= width) {
4237 Py_INCREF(self);
4238 return (PyObject*) self;
4239 }
4240
4241 return (PyObject*) pad(self, width - self->length, 0, ' ');
4242}
4243
4244static char rstrip__doc__[] =
4245"S.rstrip() -> unicode\n\
4246\n\
4247Return a copy of the string S with trailing whitespace removed.";
4248
4249static PyObject *
4250unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4251{
4252 if (!PyArg_NoArgs(args))
4253 return NULL;
4254 return strip(self, 0, 1);
4255}
4256
4257static PyObject*
4258unicode_slice(PyUnicodeObject *self, int start, int end)
4259{
4260 /* standard clamping */
4261 if (start < 0)
4262 start = 0;
4263 if (end < 0)
4264 end = 0;
4265 if (end > self->length)
4266 end = self->length;
4267 if (start == 0 && end == self->length) {
4268 /* full slice, return original string */
4269 Py_INCREF(self);
4270 return (PyObject*) self;
4271 }
4272 if (start > end)
4273 start = end;
4274 /* copy slice */
4275 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4276 end - start);
4277}
4278
4279PyObject *PyUnicode_Split(PyObject *s,
4280 PyObject *sep,
4281 int maxsplit)
4282{
4283 PyObject *result;
4284
4285 s = PyUnicode_FromObject(s);
4286 if (s == NULL)
4287 return NULL;
4288 if (sep != NULL) {
4289 sep = PyUnicode_FromObject(sep);
4290 if (sep == NULL) {
4291 Py_DECREF(s);
4292 return NULL;
4293 }
4294 }
4295
4296 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4297
4298 Py_DECREF(s);
4299 Py_XDECREF(sep);
4300 return result;
4301}
4302
4303static char split__doc__[] =
4304"S.split([sep [,maxsplit]]) -> list of strings\n\
4305\n\
4306Return a list of the words in S, using sep as the\n\
4307delimiter string. If maxsplit is given, at most maxsplit\n\
4308splits are done. If sep is not specified, any whitespace string\n\
4309is a separator.";
4310
4311static PyObject*
4312unicode_split(PyUnicodeObject *self, PyObject *args)
4313{
4314 PyObject *substring = Py_None;
4315 int maxcount = -1;
4316
4317 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4318 return NULL;
4319
4320 if (substring == Py_None)
4321 return split(self, NULL, maxcount);
4322 else if (PyUnicode_Check(substring))
4323 return split(self, (PyUnicodeObject *)substring, maxcount);
4324 else
4325 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4326}
4327
4328static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004329"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004330\n\
4331Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004332Line breaks are not included in the resulting list unless keepends\n\
4333is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334
4335static PyObject*
4336unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4337{
Guido van Rossum86662912000-04-11 15:38:46 +00004338 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004339
Guido van Rossum86662912000-04-11 15:38:46 +00004340 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004341 return NULL;
4342
Guido van Rossum86662912000-04-11 15:38:46 +00004343 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004344}
4345
4346static
4347PyObject *unicode_str(PyUnicodeObject *self)
4348{
Fred Drakee4315f52000-05-09 19:53:39 +00004349 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004350}
4351
4352static char strip__doc__[] =
4353"S.strip() -> unicode\n\
4354\n\
4355Return a copy of S with leading and trailing whitespace removed.";
4356
4357static PyObject *
4358unicode_strip(PyUnicodeObject *self, PyObject *args)
4359{
4360 if (!PyArg_NoArgs(args))
4361 return NULL;
4362 return strip(self, 1, 1);
4363}
4364
4365static char swapcase__doc__[] =
4366"S.swapcase() -> unicode\n\
4367\n\
4368Return a copy of S with uppercase characters converted to lowercase\n\
4369and vice versa.";
4370
4371static PyObject*
4372unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4373{
4374 if (!PyArg_NoArgs(args))
4375 return NULL;
4376 return fixup(self, fixswapcase);
4377}
4378
4379static char translate__doc__[] =
4380"S.translate(table) -> unicode\n\
4381\n\
4382Return a copy of the string S, where all characters have been mapped\n\
4383through the given translation table, which must be a mapping of\n\
4384Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4385are left untouched. Characters mapped to None are deleted.";
4386
4387static PyObject*
4388unicode_translate(PyUnicodeObject *self, PyObject *args)
4389{
4390 PyObject *table;
4391
4392 if (!PyArg_ParseTuple(args, "O:translate", &table))
4393 return NULL;
4394 return PyUnicode_TranslateCharmap(self->str,
4395 self->length,
4396 table,
4397 "ignore");
4398}
4399
4400static char upper__doc__[] =
4401"S.upper() -> unicode\n\
4402\n\
4403Return a copy of S converted to uppercase.";
4404
4405static PyObject*
4406unicode_upper(PyUnicodeObject *self, PyObject *args)
4407{
4408 if (!PyArg_NoArgs(args))
4409 return NULL;
4410 return fixup(self, fixupper);
4411}
4412
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method implementation of S.zfill(width) (currently compiled out).

   When S already has at least width characters it is returned
   unchanged with an extra reference.  Otherwise pad() is asked for
   width - len(S) fill characters '0' on the first side; if the first
   original character (now at index `fill`) is a sign, it is swapped
   to the front.  Returns NULL if argument parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    if (u == NULL)
        /* pad() failed; u->str must not be touched */
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4448
#if 0
/* Debugging aid (compiled out): report unicode_freelist_size as a
   Python int.  Takes no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args)) {
        return PyInt_FromLong(unicode_freelist_size);
    }
    return NULL;
}
#endif
4458
4459static char startswith__doc__[] =
4460"S.startswith(prefix[, start[, end]]) -> int\n\
4461\n\
4462Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4463optional start, test S beginning at that position. With optional end, stop\n\
4464comparing S at that position.";
4465
4466static PyObject *
4467unicode_startswith(PyUnicodeObject *self,
4468 PyObject *args)
4469{
4470 PyUnicodeObject *substring;
4471 int start = 0;
4472 int end = INT_MAX;
4473 PyObject *result;
4474
Guido van Rossumb8872e62000-05-09 14:14:27 +00004475 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4476 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477 return NULL;
4478 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4479 (PyObject *)substring);
4480 if (substring == NULL)
4481 return NULL;
4482
4483 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4484
4485 Py_DECREF(substring);
4486 return result;
4487}
4488
4489
4490static char endswith__doc__[] =
4491"S.endswith(suffix[, start[, end]]) -> int\n\
4492\n\
4493Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4494optional start, test S beginning at that position. With optional end, stop\n\
4495comparing S at that position.";
4496
4497static PyObject *
4498unicode_endswith(PyUnicodeObject *self,
4499 PyObject *args)
4500{
4501 PyUnicodeObject *substring;
4502 int start = 0;
4503 int end = INT_MAX;
4504 PyObject *result;
4505
Guido van Rossumb8872e62000-05-09 14:14:27 +00004506 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4507 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004508 return NULL;
4509 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4510 (PyObject *)substring);
4511 if (substring == NULL)
4512 return NULL;
4513
4514 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4515
4516 Py_DECREF(substring);
4517 return result;
4518}
4519
4520
4521static PyMethodDef unicode_methods[] = {
4522
4523 /* Order is according to common usage: often used methods should
4524 appear first, since lookup is done sequentially. */
4525
4526 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4527 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4528 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4529 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4530 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4531 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4532 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4533 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4534 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4535 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4536 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4537 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4538 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4539 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4540/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4541 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4542 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4543 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4544 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4545 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4546 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4547 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4548 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4549 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4550 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4551 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4552 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4553 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4554 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4555 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4556 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4557 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4558 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004559 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4560 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004561#if 0
4562 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4563 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4564#endif
4565
4566#if 0
4567 /* This one is just used for debugging the implementation. */
4568 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4569#endif
4570
4571 {NULL, NULL}
4572};
4573
4574static PyObject *
4575unicode_getattr(PyUnicodeObject *self, char *name)
4576{
4577 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4578}
4579
4580static PySequenceMethods unicode_as_sequence = {
4581 (inquiry) unicode_length, /* sq_length */
4582 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4583 (intargfunc) unicode_repeat, /* sq_repeat */
4584 (intargfunc) unicode_getitem, /* sq_item */
4585 (intintargfunc) unicode_slice, /* sq_slice */
4586 0, /* sq_ass_item */
4587 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004588 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004589};
4590
4591static int
4592unicode_buffer_getreadbuf(PyUnicodeObject *self,
4593 int index,
4594 const void **ptr)
4595{
4596 if (index != 0) {
4597 PyErr_SetString(PyExc_SystemError,
4598 "accessing non-existent unicode segment");
4599 return -1;
4600 }
4601 *ptr = (void *) self->str;
4602 return PyUnicode_GET_DATA_SIZE(self);
4603}
4604
4605static int
4606unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4607 const void **ptr)
4608{
4609 PyErr_SetString(PyExc_TypeError,
4610 "cannot use unicode as modifyable buffer");
4611 return -1;
4612}
4613
4614static int
4615unicode_buffer_getsegcount(PyUnicodeObject *self,
4616 int *lenp)
4617{
4618 if (lenp)
4619 *lenp = PyUnicode_GET_DATA_SIZE(self);
4620 return 1;
4621}
4622
4623static int
4624unicode_buffer_getcharbuf(PyUnicodeObject *self,
4625 int index,
4626 const void **ptr)
4627{
4628 PyObject *str;
4629
4630 if (index != 0) {
4631 PyErr_SetString(PyExc_SystemError,
4632 "accessing non-existent unicode segment");
4633 return -1;
4634 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004635 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004636 if (str == NULL)
4637 return -1;
4638 *ptr = (void *) PyString_AS_STRING(str);
4639 return PyString_GET_SIZE(str);
4640}
4641
4642/* Helpers for PyUnicode_Format() */
4643
4644static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004645getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004646{
4647 int argidx = *p_argidx;
4648 if (argidx < arglen) {
4649 (*p_argidx)++;
4650 if (arglen < 0)
4651 return args;
4652 else
4653 return PyTuple_GetItem(args, argidx);
4654 }
4655 PyErr_SetString(PyExc_TypeError,
4656 "not enough arguments for format string");
4657 return NULL;
4658}
4659
/* Bit flags collected from the flag characters of a format specifier
   in PyUnicode_Format(). */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
4665
4666static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004668{
4669 register int i;
4670 int len;
4671 va_list va;
4672 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004673 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674
4675 /* First, format the string as char array, then expand to Py_UNICODE
4676 array. */
4677 charbuffer = (char *)buffer;
4678 len = vsprintf(charbuffer, format, va);
4679 for (i = len - 1; i >= 0; i--)
4680 buffer[i] = (Py_UNICODE) charbuffer[i];
4681
4682 va_end(va);
4683 return len;
4684}
4685
4686static int
4687formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004688 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004689 int flags,
4690 int prec,
4691 int type,
4692 PyObject *v)
4693{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004694 /* fmt = '%#.' + `prec` + `type`
4695 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004696 char fmt[20];
4697 double x;
4698
4699 x = PyFloat_AsDouble(v);
4700 if (x == -1.0 && PyErr_Occurred())
4701 return -1;
4702 if (prec < 0)
4703 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004704 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4705 type = 'g';
4706 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004707 /* worst case length calc to ensure no buffer overrun:
4708 fmt = %#.<prec>g
4709 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4710 for any double rep.)
4711 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4712 If prec=0 the effective precision is 1 (the leading digit is
4713 always given), therefore increase by one to 10+prec. */
4714 if (buflen <= (size_t)10 + (size_t)prec) {
4715 PyErr_SetString(PyExc_OverflowError,
4716 "formatted float is too long (precision too long?)");
4717 return -1;
4718 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719 return usprintf(buf, fmt, x);
4720}
4721
Tim Peters38fd5b62000-09-21 05:43:11 +00004722static PyObject*
4723formatlong(PyObject *val, int flags, int prec, int type)
4724{
4725 char *buf;
4726 int i, len;
4727 PyObject *str; /* temporary string object. */
4728 PyUnicodeObject *result;
4729
4730 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4731 if (!str)
4732 return NULL;
4733 result = _PyUnicode_New(len);
4734 for (i = 0; i < len; i++)
4735 result->str[i] = buf[i];
4736 result->str[len] = 0;
4737 Py_DECREF(str);
4738 return (PyObject*)result;
4739}
4740
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741static int
4742formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004743 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004744 int flags,
4745 int prec,
4746 int type,
4747 PyObject *v)
4748{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004749 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004750 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4751 + 1 + 1 = 24*/
4752 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753 long x;
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004754 int use_native_c_format = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004755
4756 x = PyInt_AsLong(v);
4757 if (x == -1 && PyErr_Occurred())
4758 return -1;
4759 if (prec < 0)
4760 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004761 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4762 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4763 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4764 PyErr_SetString(PyExc_OverflowError,
4765 "formatted integer is too long (precision too long?)");
4766 return -1;
4767 }
Tim Petersfff53252001-04-12 18:38:48 +00004768 /* When converting 0 under %#x or %#X, C leaves off the base marker,
4769 * but we want it (for consistency with other %#x conversions, and
4770 * for consistency with Python's hex() function).
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004771 * BUG 28-Apr-2001 tim: At least two platform Cs (Metrowerks &
4772 * Compaq Tru64) violate the std by converting 0 w/ leading 0x anyway.
4773 * So add it only if the platform doesn't already.
Tim Petersfff53252001-04-12 18:38:48 +00004774 */
Tim Petersb3d8d1f2001-04-28 05:38:26 +00004775 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X')) {
4776 /* Only way to know what the platform does is to try it. */
4777 sprintf(fmt, type == 'x' ? "%#x" : "%#X", 0);
4778 if (fmt[1] != (char)type) {
4779 /* Supply our own leading 0x/0X -- needed under std C */
4780 use_native_c_format = 0;
4781 sprintf(fmt, "0%c%%#.%dl%c", type, prec, type);
4782 }
4783 }
4784 if (use_native_c_format)
4785 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004786 return usprintf(buf, fmt, x);
4787}
4788
4789static int
4790formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004791 size_t buflen,
4792 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004794 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004795 if (PyUnicode_Check(v)) {
4796 if (PyUnicode_GET_SIZE(v) != 1)
4797 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004799 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004800
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004801 else if (PyString_Check(v)) {
4802 if (PyString_GET_SIZE(v) != 1)
4803 goto onError;
4804 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4805 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806
4807 else {
4808 /* Integer input truncated to a character */
4809 long x;
4810 x = PyInt_AsLong(v);
4811 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004812 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004813 buf[0] = (char) x;
4814 }
4815 buf[1] = '\0';
4816 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004817
4818 onError:
4819 PyErr_SetString(PyExc_TypeError,
4820 "%c requires int or char");
4821 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822}
4823
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the cast cannot bind
   to neighbouring operators at the point of use.
*/
#define FORMATBUFLEN ((size_t)120)
4833
Guido van Rossumd57fd912000-03-10 22:53:23 +00004834PyObject *PyUnicode_Format(PyObject *format,
4835 PyObject *args)
4836{
4837 Py_UNICODE *fmt, *res;
4838 int fmtcnt, rescnt, reslen, arglen, argidx;
4839 int args_owned = 0;
4840 PyUnicodeObject *result = NULL;
4841 PyObject *dict = NULL;
4842 PyObject *uformat;
4843
4844 if (format == NULL || args == NULL) {
4845 PyErr_BadInternalCall();
4846 return NULL;
4847 }
4848 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004849 if (uformat == NULL)
4850 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004851 fmt = PyUnicode_AS_UNICODE(uformat);
4852 fmtcnt = PyUnicode_GET_SIZE(uformat);
4853
4854 reslen = rescnt = fmtcnt + 100;
4855 result = _PyUnicode_New(reslen);
4856 if (result == NULL)
4857 goto onError;
4858 res = PyUnicode_AS_UNICODE(result);
4859
4860 if (PyTuple_Check(args)) {
4861 arglen = PyTuple_Size(args);
4862 argidx = 0;
4863 }
4864 else {
4865 arglen = -1;
4866 argidx = -2;
4867 }
4868 if (args->ob_type->tp_as_mapping)
4869 dict = args;
4870
4871 while (--fmtcnt >= 0) {
4872 if (*fmt != '%') {
4873 if (--rescnt < 0) {
4874 rescnt = fmtcnt + 100;
4875 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00004876 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004877 return NULL;
4878 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4879 --rescnt;
4880 }
4881 *res++ = *fmt++;
4882 }
4883 else {
4884 /* Got a format specifier */
4885 int flags = 0;
4886 int width = -1;
4887 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004888 Py_UNICODE c = '\0';
4889 Py_UNICODE fill;
4890 PyObject *v = NULL;
4891 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004892 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004893 Py_UNICODE sign;
4894 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004895 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004896
4897 fmt++;
4898 if (*fmt == '(') {
4899 Py_UNICODE *keystart;
4900 int keylen;
4901 PyObject *key;
4902 int pcount = 1;
4903
4904 if (dict == NULL) {
4905 PyErr_SetString(PyExc_TypeError,
4906 "format requires a mapping");
4907 goto onError;
4908 }
4909 ++fmt;
4910 --fmtcnt;
4911 keystart = fmt;
4912 /* Skip over balanced parentheses */
4913 while (pcount > 0 && --fmtcnt >= 0) {
4914 if (*fmt == ')')
4915 --pcount;
4916 else if (*fmt == '(')
4917 ++pcount;
4918 fmt++;
4919 }
4920 keylen = fmt - keystart - 1;
4921 if (fmtcnt < 0 || pcount > 0) {
4922 PyErr_SetString(PyExc_ValueError,
4923 "incomplete format key");
4924 goto onError;
4925 }
Fred Drakee4315f52000-05-09 19:53:39 +00004926 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927 then looked up since Python uses strings to hold
4928 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004929 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004930 key = PyUnicode_EncodeUTF8(keystart,
4931 keylen,
4932 NULL);
4933 if (key == NULL)
4934 goto onError;
4935 if (args_owned) {
4936 Py_DECREF(args);
4937 args_owned = 0;
4938 }
4939 args = PyObject_GetItem(dict, key);
4940 Py_DECREF(key);
4941 if (args == NULL) {
4942 goto onError;
4943 }
4944 args_owned = 1;
4945 arglen = -1;
4946 argidx = -2;
4947 }
4948 while (--fmtcnt >= 0) {
4949 switch (c = *fmt++) {
4950 case '-': flags |= F_LJUST; continue;
4951 case '+': flags |= F_SIGN; continue;
4952 case ' ': flags |= F_BLANK; continue;
4953 case '#': flags |= F_ALT; continue;
4954 case '0': flags |= F_ZERO; continue;
4955 }
4956 break;
4957 }
4958 if (c == '*') {
4959 v = getnextarg(args, arglen, &argidx);
4960 if (v == NULL)
4961 goto onError;
4962 if (!PyInt_Check(v)) {
4963 PyErr_SetString(PyExc_TypeError,
4964 "* wants int");
4965 goto onError;
4966 }
4967 width = PyInt_AsLong(v);
4968 if (width < 0) {
4969 flags |= F_LJUST;
4970 width = -width;
4971 }
4972 if (--fmtcnt >= 0)
4973 c = *fmt++;
4974 }
4975 else if (c >= '0' && c <= '9') {
4976 width = c - '0';
4977 while (--fmtcnt >= 0) {
4978 c = *fmt++;
4979 if (c < '0' || c > '9')
4980 break;
4981 if ((width*10) / 10 != width) {
4982 PyErr_SetString(PyExc_ValueError,
4983 "width too big");
4984 goto onError;
4985 }
4986 width = width*10 + (c - '0');
4987 }
4988 }
4989 if (c == '.') {
4990 prec = 0;
4991 if (--fmtcnt >= 0)
4992 c = *fmt++;
4993 if (c == '*') {
4994 v = getnextarg(args, arglen, &argidx);
4995 if (v == NULL)
4996 goto onError;
4997 if (!PyInt_Check(v)) {
4998 PyErr_SetString(PyExc_TypeError,
4999 "* wants int");
5000 goto onError;
5001 }
5002 prec = PyInt_AsLong(v);
5003 if (prec < 0)
5004 prec = 0;
5005 if (--fmtcnt >= 0)
5006 c = *fmt++;
5007 }
5008 else if (c >= '0' && c <= '9') {
5009 prec = c - '0';
5010 while (--fmtcnt >= 0) {
5011 c = Py_CHARMASK(*fmt++);
5012 if (c < '0' || c > '9')
5013 break;
5014 if ((prec*10) / 10 != prec) {
5015 PyErr_SetString(PyExc_ValueError,
5016 "prec too big");
5017 goto onError;
5018 }
5019 prec = prec*10 + (c - '0');
5020 }
5021 }
5022 } /* prec */
5023 if (fmtcnt >= 0) {
5024 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005025 if (--fmtcnt >= 0)
5026 c = *fmt++;
5027 }
5028 }
5029 if (fmtcnt < 0) {
5030 PyErr_SetString(PyExc_ValueError,
5031 "incomplete format");
5032 goto onError;
5033 }
5034 if (c != '%') {
5035 v = getnextarg(args, arglen, &argidx);
5036 if (v == NULL)
5037 goto onError;
5038 }
5039 sign = 0;
5040 fill = ' ';
5041 switch (c) {
5042
5043 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005044 pbuf = formatbuf;
5045 /* presume that buffer length is at least 1 */
5046 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047 len = 1;
5048 break;
5049
5050 case 's':
5051 case 'r':
5052 if (PyUnicode_Check(v) && c == 's') {
5053 temp = v;
5054 Py_INCREF(temp);
5055 }
5056 else {
5057 PyObject *unicode;
5058 if (c == 's')
5059 temp = PyObject_Str(v);
5060 else
5061 temp = PyObject_Repr(v);
5062 if (temp == NULL)
5063 goto onError;
5064 if (!PyString_Check(temp)) {
5065 /* XXX Note: this should never happen, since
5066 PyObject_Repr() and PyObject_Str() assure
5067 this */
5068 Py_DECREF(temp);
5069 PyErr_SetString(PyExc_TypeError,
5070 "%s argument has non-string str()");
5071 goto onError;
5072 }
Fred Drakee4315f52000-05-09 19:53:39 +00005073 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005074 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005075 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005076 "strict");
5077 Py_DECREF(temp);
5078 temp = unicode;
5079 if (temp == NULL)
5080 goto onError;
5081 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005082 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005083 len = PyUnicode_GET_SIZE(temp);
5084 if (prec >= 0 && len > prec)
5085 len = prec;
5086 break;
5087
5088 case 'i':
5089 case 'd':
5090 case 'u':
5091 case 'o':
5092 case 'x':
5093 case 'X':
5094 if (c == 'i')
5095 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005096 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005097 temp = formatlong(v, flags, prec, c);
5098 if (!temp)
5099 goto onError;
5100 pbuf = PyUnicode_AS_UNICODE(temp);
5101 len = PyUnicode_GET_SIZE(temp);
5102 /* unbounded ints can always produce
5103 a sign character! */
5104 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005105 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005106 else {
5107 pbuf = formatbuf;
5108 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5109 flags, prec, c, v);
5110 if (len < 0)
5111 goto onError;
5112 /* only d conversion is signed */
5113 sign = c == 'd';
5114 }
5115 if (flags & F_ZERO)
5116 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005117 break;
5118
5119 case 'e':
5120 case 'E':
5121 case 'f':
5122 case 'g':
5123 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005124 pbuf = formatbuf;
5125 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5126 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005127 if (len < 0)
5128 goto onError;
5129 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005130 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005131 fill = '0';
5132 break;
5133
5134 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005135 pbuf = formatbuf;
5136 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005137 if (len < 0)
5138 goto onError;
5139 break;
5140
5141 default:
5142 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005143 "unsupported format character '%c' (0x%x) "
5144 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005145 (31<=c && c<=126) ? c : '?',
5146 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005147 goto onError;
5148 }
5149 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005150 if (*pbuf == '-' || *pbuf == '+') {
5151 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 len--;
5153 }
5154 else if (flags & F_SIGN)
5155 sign = '+';
5156 else if (flags & F_BLANK)
5157 sign = ' ';
5158 else
5159 sign = 0;
5160 }
5161 if (width < len)
5162 width = len;
5163 if (rescnt < width + (sign != 0)) {
5164 reslen -= rescnt;
5165 rescnt = width + fmtcnt + 100;
5166 reslen += rescnt;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005167 if (_PyUnicode_Resize(&result, reslen) < 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005168 return NULL;
5169 res = PyUnicode_AS_UNICODE(result)
5170 + reslen - rescnt;
5171 }
5172 if (sign) {
5173 if (fill != ' ')
5174 *res++ = sign;
5175 rescnt--;
5176 if (width > len)
5177 width--;
5178 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005179 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5180 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005181 assert(pbuf[1] == c);
5182 if (fill != ' ') {
5183 *res++ = *pbuf++;
5184 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005185 }
Tim Petersfff53252001-04-12 18:38:48 +00005186 rescnt -= 2;
5187 width -= 2;
5188 if (width < 0)
5189 width = 0;
5190 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005191 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005192 if (width > len && !(flags & F_LJUST)) {
5193 do {
5194 --rescnt;
5195 *res++ = fill;
5196 } while (--width > len);
5197 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005198 if (fill == ' ') {
5199 if (sign)
5200 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005201 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005202 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005203 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005204 *res++ = *pbuf++;
5205 *res++ = *pbuf++;
5206 }
5207 }
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005208 Py_UNICODE_COPY(res, pbuf, len);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209 res += len;
5210 rescnt -= len;
5211 while (--width >= len) {
5212 --rescnt;
5213 *res++ = ' ';
5214 }
5215 if (dict && (argidx < arglen) && c != '%') {
5216 PyErr_SetString(PyExc_TypeError,
5217 "not all arguments converted");
5218 goto onError;
5219 }
5220 Py_XDECREF(temp);
5221 } /* '%' */
5222 } /* until end */
5223 if (argidx < arglen && !dict) {
5224 PyErr_SetString(PyExc_TypeError,
5225 "not all arguments converted");
5226 goto onError;
5227 }
5228
5229 if (args_owned) {
5230 Py_DECREF(args);
5231 }
5232 Py_DECREF(uformat);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005233 if (_PyUnicode_Resize(&result, reslen - rescnt))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005234 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235 return (PyObject *)result;
5236
5237 onError:
5238 Py_XDECREF(result);
5239 Py_DECREF(uformat);
5240 if (args_owned) {
5241 Py_DECREF(args);
5242 }
5243 return NULL;
5244}
5245
5246static PyBufferProcs unicode_as_buffer = {
5247 (getreadbufferproc) unicode_buffer_getreadbuf,
5248 (getwritebufferproc) unicode_buffer_getwritebuf,
5249 (getsegcountproc) unicode_buffer_getsegcount,
5250 (getcharbufferproc) unicode_buffer_getcharbuf,
5251};
5252
5253PyTypeObject PyUnicode_Type = {
5254 PyObject_HEAD_INIT(&PyType_Type)
5255 0, /* ob_size */
5256 "unicode", /* tp_name */
5257 sizeof(PyUnicodeObject), /* tp_size */
5258 0, /* tp_itemsize */
5259 /* Slots */
5260 (destructor)_PyUnicode_Free, /* tp_dealloc */
5261 0, /* tp_print */
5262 (getattrfunc)unicode_getattr, /* tp_getattr */
5263 0, /* tp_setattr */
5264 (cmpfunc) unicode_compare, /* tp_compare */
5265 (reprfunc) unicode_repr, /* tp_repr */
5266 0, /* tp_as_number */
5267 &unicode_as_sequence, /* tp_as_sequence */
5268 0, /* tp_as_mapping */
5269 (hashfunc) unicode_hash, /* tp_hash*/
5270 0, /* tp_call*/
5271 (reprfunc) unicode_str, /* tp_str */
5272 (getattrofunc) NULL, /* tp_getattro */
5273 (setattrofunc) NULL, /* tp_setattro */
5274 &unicode_as_buffer, /* tp_as_buffer */
5275 Py_TPFLAGS_DEFAULT, /* tp_flags */
5276};
5277
5278/* Initialize the Unicode implementation */
5279
Thomas Wouters78890102000-07-22 19:25:51 +00005280void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005281{
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005282 int i;
5283
Guido van Rossumd57fd912000-03-10 22:53:23 +00005284 /* Doublecheck the configuration... */
5285 if (sizeof(Py_UNICODE) != 2)
5286 Py_FatalError("Unicode configuration error: "
5287 "sizeof(Py_UNICODE) != 2 bytes");
5288
Fred Drakee4315f52000-05-09 19:53:39 +00005289 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005290 unicode_freelist = NULL;
5291 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005292 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005293 strcpy(unicode_default_encoding, "ascii");
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005294 for (i = 0; i < 256; i++)
5295 unicode_latin1[i] = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005296}
5297
5298/* Finalize the Unicode implementation */
5299
5300void
Thomas Wouters78890102000-07-22 19:25:51 +00005301_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005302{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005303 PyUnicodeObject *u;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005304 int i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005305
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005306 Py_XDECREF(unicode_empty);
5307 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005308
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00005309 for (i = 0; i < 256; i++) {
5310 if (unicode_latin1[i]) {
5311 Py_DECREF(unicode_latin1[i]);
5312 unicode_latin1[i] = NULL;
5313 }
5314 }
5315
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005316 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317 PyUnicodeObject *v = u;
5318 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005319 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005320 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005321 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005322 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005323 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005324 unicode_freelist = NULL;
5325 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005326}