blob: 5c193dda4ce837e7fc2cc25eebb9cb726ba80a0d [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

    Copyright (c) 1999 by Secret Labs AB
    Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Upper bound on the number of Unicode objects parked on the free list */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Size limit for the free list "keep alive" optimization.

   An object that goes onto the free list keeps its character buffer
   allocated as long as its length is below this limit; longer buffers
   are released right away.  This saves malloc() calls for small
   Unicode objects.

   The worst case cost is MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of memory that is held but unused.

   A limit of 0 disables the optimization.

   Note: This is an experimental feature !  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is set */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
86/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +000088
89/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000090static PyUnicodeObject *unicode_freelist;
91static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
Fred Drakee4315f52000-05-09 19:53:39 +000093/* Default encoding to use and assume when NULL is passed as encoding
94 parameter; it is initialized by _PyUnicode_Init().
95
96 Always use the PyUnicode_SetDefaultEncoding() and
97 PyUnicode_GetDefaultEncoding() APIs to access this global.
98
99*/
100
101static char unicode_default_encoding[100];
102
/* --- Unicode Object ----------------------------------------------------- */
104
105static
106int _PyUnicode_Resize(register PyUnicodeObject *unicode,
107 int length)
108{
109 void *oldstr;
110
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000111 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000112 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000113 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000114
115 /* Resizing unicode_empty is not allowed. */
116 if (unicode == unicode_empty) {
117 PyErr_SetString(PyExc_SystemError,
118 "can't resize empty unicode object");
119 return -1;
120 }
121
122 /* We allocate one more byte to make sure the string is
123 Ux0000 terminated -- XXX is this needed ? */
124 oldstr = unicode->str;
125 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
126 if (!unicode->str) {
127 unicode->str = oldstr;
128 PyErr_NoMemory();
129 return -1;
130 }
131 unicode->str[length] = 0;
132 unicode->length = length;
133
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000134 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000135 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000136 if (unicode->defenc) {
137 Py_DECREF(unicode->defenc);
138 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 }
140 unicode->hash = -1;
141
142 return 0;
143}
144
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000145int PyUnicode_Resize(PyObject **unicode,
146 int length)
147{
148 PyUnicodeObject *v;
149
150 if (unicode == NULL) {
151 PyErr_BadInternalCall();
152 return -1;
153 }
154 v = (PyUnicodeObject *)*unicode;
155 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
156 PyErr_BadInternalCall();
157 return -1;
158 }
159 return _PyUnicode_Resize(v, length);
160}
161
Guido van Rossumd57fd912000-03-10 22:53:23 +0000162/* We allocate one more byte to make sure the string is
163 Ux0000 terminated -- XXX is this needed ?
164
165 XXX This allocator could further be enhanced by assuring that the
166 free list never reduces its size below 1.
167
168*/
169
170static
171PyUnicodeObject *_PyUnicode_New(int length)
172{
173 register PyUnicodeObject *unicode;
174
175 /* Optimization for empty strings */
176 if (length == 0 && unicode_empty != NULL) {
177 Py_INCREF(unicode_empty);
178 return unicode_empty;
179 }
180
181 /* Unicode freelist & memory allocation */
182 if (unicode_freelist) {
183 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000184 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000186 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000187 /* Keep-Alive optimization: we only upsize the buffer,
188 never downsize it. */
189 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000191 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000192 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 }
194 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000195 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000197 }
198 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 }
200 else {
201 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
202 if (unicode == NULL)
203 return NULL;
204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
205 }
206
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000207 if (!unicode->str) {
208 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000209 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 unicode->str[length] = 0;
212 unicode->length = length;
213 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000215 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000216
217 onError:
218 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000219 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000220 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221}
222
223static
224void _PyUnicode_Free(register PyUnicodeObject *unicode)
225{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000227 /* Keep-Alive optimization */
228 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000229 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000230 unicode->str = NULL;
231 unicode->length = 0;
232 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000233 if (unicode->defenc) {
234 Py_DECREF(unicode->defenc);
235 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000236 }
237 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238 *(PyUnicodeObject **)unicode = unicode_freelist;
239 unicode_freelist = unicode;
240 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 }
242 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000243 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000244 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000245 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 }
247}
248
249PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
250 int size)
251{
252 PyUnicodeObject *unicode;
253
254 unicode = _PyUnicode_New(size);
255 if (!unicode)
256 return NULL;
257
258 /* Copy the Unicode data into the new object */
259 if (u != NULL)
260 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
261
262 return (PyObject *)unicode;
263}
264
265#ifdef HAVE_WCHAR_H
266
267PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
268 int size)
269{
270 PyUnicodeObject *unicode;
271
272 if (w == NULL) {
273 PyErr_BadInternalCall();
274 return NULL;
275 }
276
277 unicode = _PyUnicode_New(size);
278 if (!unicode)
279 return NULL;
280
281 /* Copy the wchar_t data into the new object */
282#ifdef HAVE_USABLE_WCHAR_T
283 memcpy(unicode->str, w, size * sizeof(wchar_t));
284#else
285 {
286 register Py_UNICODE *u;
287 register int i;
288 u = PyUnicode_AS_UNICODE(unicode);
289 for (i = size; i >= 0; i--)
290 *u++ = *w++;
291 }
292#endif
293
294 return (PyObject *)unicode;
295}
296
297int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
298 register wchar_t *w,
299 int size)
300{
301 if (unicode == NULL) {
302 PyErr_BadInternalCall();
303 return -1;
304 }
305 if (size > PyUnicode_GET_SIZE(unicode))
306 size = PyUnicode_GET_SIZE(unicode);
307#ifdef HAVE_USABLE_WCHAR_T
308 memcpy(w, unicode->str, size * sizeof(wchar_t));
309#else
310 {
311 register Py_UNICODE *u;
312 register int i;
313 u = PyUnicode_AS_UNICODE(unicode);
314 for (i = size; i >= 0; i--)
315 *w++ = *u++;
316 }
317#endif
318
319 return size;
320}
321
322#endif
323
324PyObject *PyUnicode_FromObject(register PyObject *obj)
325{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000326 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
327}
328
329PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
330 const char *encoding,
331 const char *errors)
332{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 const char *s;
334 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000335 int owned = 0;
336 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337
338 if (obj == NULL) {
339 PyErr_BadInternalCall();
340 return NULL;
341 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000342
343 /* Coerce object */
344 if (PyInstance_Check(obj)) {
345 PyObject *func;
346 func = PyObject_GetAttrString(obj, "__str__");
347 if (func == NULL) {
348 PyErr_SetString(PyExc_TypeError,
349 "coercing to Unicode: instance doesn't define __str__");
350 return NULL;
351 }
352 obj = PyEval_CallObject(func, NULL);
353 Py_DECREF(func);
354 if (obj == NULL)
355 return NULL;
356 owned = 1;
357 }
358 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000360 v = obj;
361 if (encoding) {
362 PyErr_SetString(PyExc_TypeError,
363 "decoding Unicode is not supported");
364 return NULL;
365 }
366 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 }
368 else if (PyString_Check(obj)) {
369 s = PyString_AS_STRING(obj);
370 len = PyString_GET_SIZE(obj);
371 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000372 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
373 /* Overwrite the error message with something more useful in
374 case of a TypeError. */
375 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000376 PyErr_Format(PyExc_TypeError,
377 "coercing to Unicode: need string or buffer, "
378 "%.80s found",
379 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000380 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000381 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000382
383 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 if (len == 0) {
385 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000386 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000388 else
389 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000390
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000391 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000392 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000393 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000394 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000395 return v;
396
397 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000398 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000399 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000400 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000401 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402}
403
404PyObject *PyUnicode_Decode(const char *s,
405 int size,
406 const char *encoding,
407 const char *errors)
408{
409 PyObject *buffer = NULL, *unicode;
410
Fred Drakee4315f52000-05-09 19:53:39 +0000411 if (encoding == NULL)
412 encoding = PyUnicode_GetDefaultEncoding();
413
414 /* Shortcuts for common default encodings */
415 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000416 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000417 else if (strcmp(encoding, "latin-1") == 0)
418 return PyUnicode_DecodeLatin1(s, size, errors);
419 else if (strcmp(encoding, "ascii") == 0)
420 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000421
422 /* Decode via the codec registry */
423 buffer = PyBuffer_FromMemory((void *)s, size);
424 if (buffer == NULL)
425 goto onError;
426 unicode = PyCodec_Decode(buffer, encoding, errors);
427 if (unicode == NULL)
428 goto onError;
429 if (!PyUnicode_Check(unicode)) {
430 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000431 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 unicode->ob_type->tp_name);
433 Py_DECREF(unicode);
434 goto onError;
435 }
436 Py_DECREF(buffer);
437 return unicode;
438
439 onError:
440 Py_XDECREF(buffer);
441 return NULL;
442}
443
444PyObject *PyUnicode_Encode(const Py_UNICODE *s,
445 int size,
446 const char *encoding,
447 const char *errors)
448{
449 PyObject *v, *unicode;
450
451 unicode = PyUnicode_FromUnicode(s, size);
452 if (unicode == NULL)
453 return NULL;
454 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
455 Py_DECREF(unicode);
456 return v;
457}
458
459PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
460 const char *encoding,
461 const char *errors)
462{
463 PyObject *v;
464
465 if (!PyUnicode_Check(unicode)) {
466 PyErr_BadArgument();
467 goto onError;
468 }
Fred Drakee4315f52000-05-09 19:53:39 +0000469
470 if (encoding == NULL)
471 encoding = PyUnicode_GetDefaultEncoding();
472
473 /* Shortcuts for common default encodings */
474 if (errors == NULL) {
475 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000476 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000477 else if (strcmp(encoding, "latin-1") == 0)
478 return PyUnicode_AsLatin1String(unicode);
479 else if (strcmp(encoding, "ascii") == 0)
480 return PyUnicode_AsASCIIString(unicode);
481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000482
483 /* Encode via the codec registry */
484 v = PyCodec_Encode(unicode, encoding, errors);
485 if (v == NULL)
486 goto onError;
487 /* XXX Should we really enforce this ? */
488 if (!PyString_Check(v)) {
489 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000490 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000491 v->ob_type->tp_name);
492 Py_DECREF(v);
493 goto onError;
494 }
495 return v;
496
497 onError:
498 return NULL;
499}
500
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000501/* Return a Python string holding the default encoded value of the
502 Unicode object.
503
504 The resulting string is cached in the Unicode object for subsequent
505 usage by this function. The cached version is needed to implement
506 the character buffer interface and will live (at least) as long as
507 the Unicode object itself.
508
509 The refcount of the string is *not* incremented.
510
511 *** Exported for internal use by the interpreter only !!! ***
512
513*/
514
515PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
516 const char *errors)
517{
518 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
519
520 if (v)
521 return v;
522 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
523 if (v && errors == NULL)
524 ((PyUnicodeObject *)unicode)->defenc = v;
525 return v;
526}
527
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
529{
530 if (!PyUnicode_Check(unicode)) {
531 PyErr_BadArgument();
532 goto onError;
533 }
534 return PyUnicode_AS_UNICODE(unicode);
535
536 onError:
537 return NULL;
538}
539
540int PyUnicode_GetSize(PyObject *unicode)
541{
542 if (!PyUnicode_Check(unicode)) {
543 PyErr_BadArgument();
544 goto onError;
545 }
546 return PyUnicode_GET_SIZE(unicode);
547
548 onError:
549 return -1;
550}
551
Thomas Wouters78890102000-07-22 19:25:51 +0000552const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000553{
554 return unicode_default_encoding;
555}
556
557int PyUnicode_SetDefaultEncoding(const char *encoding)
558{
559 PyObject *v;
560
561 /* Make sure the encoding is valid. As side effect, this also
562 loads the encoding into the codec registry cache. */
563 v = _PyCodec_Lookup(encoding);
564 if (v == NULL)
565 goto onError;
566 Py_DECREF(v);
567 strncpy(unicode_default_encoding,
568 encoding,
569 sizeof(unicode_default_encoding));
570 return 0;
571
572 onError:
573 return -1;
574}
575
/* --- UTF-8 Codec -------------------------------------------------------- */
577
/* Map a UTF-8 encoded prefix byte to the length of the sequence it
   starts.  Zero means illegal prefix.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
599
600static
601int utf8_decoding_error(const char **source,
602 Py_UNICODE **dest,
603 const char *errors,
604 const char *details)
605{
606 if ((errors == NULL) ||
607 (strcmp(errors,"strict") == 0)) {
608 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000609 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000610 details);
611 return -1;
612 }
613 else if (strcmp(errors,"ignore") == 0) {
614 (*source)++;
615 return 0;
616 }
617 else if (strcmp(errors,"replace") == 0) {
618 (*source)++;
619 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
620 (*dest)++;
621 return 0;
622 }
623 else {
624 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000625 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 errors);
627 return -1;
628 }
629}
630
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631PyObject *PyUnicode_DecodeUTF8(const char *s,
632 int size,
633 const char *errors)
634{
635 int n;
636 const char *e;
637 PyUnicodeObject *unicode;
638 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000639 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000640
641 /* Note: size will always be longer than the resulting Unicode
642 character count */
643 unicode = _PyUnicode_New(size);
644 if (!unicode)
645 return NULL;
646 if (size == 0)
647 return (PyObject *)unicode;
648
649 /* Unpack UTF-8 encoded data */
650 p = unicode->str;
651 e = s + size;
652
653 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000654 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655
656 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000657 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 s++;
659 continue;
660 }
661
662 n = utf8_code_length[ch];
663
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000664 if (s + n > e) {
665 errmsg = "unexpected end of data";
666 goto utf8Error;
667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000668
669 switch (n) {
670
671 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000672 errmsg = "unexpected code byte";
673 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000674 break;
675
676 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000677 errmsg = "internal error";
678 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000679 break;
680
681 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000682 if ((s[1] & 0xc0) != 0x80) {
683 errmsg = "invalid data";
684 goto utf8Error;
685 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000686 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000687 if (ch < 0x80) {
688 errmsg = "illegal encoding";
689 goto utf8Error;
690 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000691 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000692 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693 break;
694
695 case 3:
696 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000697 (s[2] & 0xc0) != 0x80) {
698 errmsg = "invalid data";
699 goto utf8Error;
700 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000701 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000702 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
703 errmsg = "illegal encoding";
704 goto utf8Error;
705 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000706 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000707 *p++ = (Py_UNICODE)ch;
708 break;
709
710 case 4:
711 if ((s[1] & 0xc0) != 0x80 ||
712 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000713 (s[3] & 0xc0) != 0x80) {
714 errmsg = "invalid data";
715 goto utf8Error;
716 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000717 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
718 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
719 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000720 if ((ch < 0x10000) || /* minimum value allowed for 4
721 byte encoding */
722 (ch > 0x10ffff)) { /* maximum value allowed for
723 UTF-16 */
724 errmsg = "illegal encoding";
725 goto utf8Error;
726 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000727 /* compute and append the two surrogates: */
728
729 /* translate from 10000..10FFFF to 0..FFFF */
730 ch -= 0x10000;
731
732 /* high surrogate = top 10 bits added to D800 */
733 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
734
735 /* low surrogate = bottom 10 bits added to DC00 */
736 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000737 break;
738
739 default:
740 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000741 errmsg = "unsupported Unicode code range";
742 goto utf8Error;
743 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000744 }
745 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000746 continue;
747
748 utf8Error:
749 if (utf8_decoding_error(&s, &p, errors, errmsg))
750 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000751 }
752
753 /* Adjust length */
754 if (_PyUnicode_Resize(unicode, p - unicode->str))
755 goto onError;
756
757 return (PyObject *)unicode;
758
759onError:
760 Py_DECREF(unicode);
761 return NULL;
762}
763
/* Not used anymore, now that the encoder supports UTF-16
   surrogates.  Kept disabled for reference. */
#if 0
/* Apply the requested error handling to a UTF-8 encoding problem.

   errors == NULL or "strict": raise UnicodeError carrying `details`
                               and return -1.
   "ignore":                   return 0.
   "replace":                  store '?' at *dest, advance *dest,
                               return 0.
   anything else:              raise ValueError and return -1. */
static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || strcmp(errors, "strict") == 0) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }

    if (strcmp(errors, "ignore") == 0)
        return 0;

    if (strcmp(errors, "replace") == 0) {
        **dest = '?';
        (*dest)++;
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000797
798PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
799 int size,
800 const char *errors)
801{
802 PyObject *v;
803 char *p;
804 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000805 Py_UCS4 ch2;
806 unsigned int cbAllocated = 3 * size;
807 unsigned int cbWritten = 0;
808 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000809
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000810 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811 if (v == NULL)
812 return NULL;
813 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000814 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815
816 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000817 while (i < size) {
818 Py_UCS4 ch = s[i++];
819 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000820 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000821 cbWritten++;
822 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000823 else if (ch < 0x0800) {
824 *p++ = 0xc0 | (ch >> 6);
825 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000826 cbWritten += 2;
827 }
828 else {
829 /* Check for high surrogate */
830 if (0xD800 <= ch && ch <= 0xDBFF) {
831 if (i != size) {
832 ch2 = s[i];
833 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
834
835 if (cbWritten >= (cbAllocated - 4)) {
836 /* Provide enough room for some more
837 surrogates */
838 cbAllocated += 4*10;
839 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000840 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000841 }
842
843 /* combine the two values */
844 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
845
846 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000847 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000848 i++;
849 cbWritten += 4;
850 }
851 }
852 }
853 else {
854 *p++ = (char)(0xe0 | (ch >> 12));
855 cbWritten += 3;
856 }
857 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
858 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000859 }
860 }
861 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000862 if (_PyString_Resize(&v, p - q))
863 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000864 return v;
865
866 onError:
867 Py_DECREF(v);
868 return NULL;
869}
870
Guido van Rossumd57fd912000-03-10 22:53:23 +0000871PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
872{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000873 if (!PyUnicode_Check(unicode)) {
874 PyErr_BadArgument();
875 return NULL;
876 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000877 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
878 PyUnicode_GET_SIZE(unicode),
879 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880}
881
/* --- UTF-16 Codec ------------------------------------------------------- */
883
884static
885int utf16_decoding_error(const Py_UNICODE **source,
886 Py_UNICODE **dest,
887 const char *errors,
888 const char *details)
889{
890 if ((errors == NULL) ||
891 (strcmp(errors,"strict") == 0)) {
892 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000893 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000894 details);
895 return -1;
896 }
897 else if (strcmp(errors,"ignore") == 0) {
898 return 0;
899 }
900 else if (strcmp(errors,"replace") == 0) {
901 if (dest) {
902 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
903 (*dest)++;
904 }
905 return 0;
906 }
907 else {
908 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000909 "UTF-16 decoding error; "
910 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000911 errors);
912 return -1;
913 }
914}
915
Guido van Rossumd57fd912000-03-10 22:53:23 +0000916PyObject *PyUnicode_DecodeUTF16(const char *s,
917 int size,
918 const char *errors,
919 int *byteorder)
920{
921 PyUnicodeObject *unicode;
922 Py_UNICODE *p;
923 const Py_UNICODE *q, *e;
924 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000925 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000926
927 /* size should be an even number */
928 if (size % sizeof(Py_UNICODE) != 0) {
929 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
930 return NULL;
931 /* The remaining input chars are ignored if we fall through
932 here... */
933 }
934
935 /* Note: size will always be longer than the resulting Unicode
936 character count */
937 unicode = _PyUnicode_New(size);
938 if (!unicode)
939 return NULL;
940 if (size == 0)
941 return (PyObject *)unicode;
942
943 /* Unpack UTF-16 encoded data */
944 p = unicode->str;
945 q = (Py_UNICODE *)s;
946 e = q + (size / sizeof(Py_UNICODE));
947
948 if (byteorder)
949 bo = *byteorder;
950
951 while (q < e) {
952 register Py_UNICODE ch = *q++;
953
954 /* Check for BOM marks (U+FEFF) in the input and adjust
955 current byte order setting accordingly. Swap input
956 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
957 !) */
958#ifdef BYTEORDER_IS_LITTLE_ENDIAN
959 if (ch == 0xFEFF) {
960 bo = -1;
961 continue;
962 } else if (ch == 0xFFFE) {
963 bo = 1;
964 continue;
965 }
966 if (bo == 1)
967 ch = (ch >> 8) | (ch << 8);
968#else
969 if (ch == 0xFEFF) {
970 bo = 1;
971 continue;
972 } else if (ch == 0xFFFE) {
973 bo = -1;
974 continue;
975 }
976 if (bo == -1)
977 ch = (ch >> 8) | (ch << 8);
978#endif
979 if (ch < 0xD800 || ch > 0xDFFF) {
980 *p++ = ch;
981 continue;
982 }
983
984 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000985 if (q >= e) {
986 errmsg = "unexpected end of data";
987 goto utf16Error;
988 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000989 if (0xDC00 <= *q && *q <= 0xDFFF) {
990 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000991 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000992 /* This is valid data (a UTF-16 surrogate pair), but
993 we are not able to store this information since our
994 Py_UNICODE type only has 16 bits... this might
995 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000996 errmsg = "code pairs are not supported";
997 goto utf16Error;
998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000999 else
1000 continue;
1001 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001002 errmsg = "illegal encoding";
1003 /* Fall through to report the error */
1004
1005 utf16Error:
1006 if (utf16_decoding_error(&q, &p, errors, errmsg))
1007 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008 }
1009
1010 if (byteorder)
1011 *byteorder = bo;
1012
1013 /* Adjust length */
1014 if (_PyUnicode_Resize(unicode, p - unicode->str))
1015 goto onError;
1016
1017 return (PyObject *)unicode;
1018
1019onError:
1020 Py_DECREF(unicode);
1021 return NULL;
1022}
1023
1024#undef UTF16_ERROR
1025
1026PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1027 int size,
1028 const char *errors,
1029 int byteorder)
1030{
1031 PyObject *v;
1032 Py_UNICODE *p;
1033 char *q;
1034
1035 /* We don't create UTF-16 pairs... */
1036 v = PyString_FromStringAndSize(NULL,
1037 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1038 if (v == NULL)
1039 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001040
1041 q = PyString_AS_STRING(v);
1042 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043 if (byteorder == 0)
1044 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001045 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001046 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047 if (byteorder == 0 ||
1048#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1049 byteorder == -1
1050#else
1051 byteorder == 1
1052#endif
1053 )
1054 memcpy(p, s, size * sizeof(Py_UNICODE));
1055 else
1056 while (size-- > 0) {
1057 Py_UNICODE ch = *s++;
1058 *p++ = (ch >> 8) | (ch << 8);
1059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 return v;
1061}
1062
1063PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1064{
1065 if (!PyUnicode_Check(unicode)) {
1066 PyErr_BadArgument();
1067 return NULL;
1068 }
1069 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1070 PyUnicode_GET_SIZE(unicode),
1071 NULL,
1072 0);
1073}
1074
1075/* --- Unicode Escape Codec ----------------------------------------------- */
1076
1077static
1078int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001079 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001080 const char *errors,
1081 const char *details)
1082{
1083 if ((errors == NULL) ||
1084 (strcmp(errors,"strict") == 0)) {
1085 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001086 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087 details);
1088 return -1;
1089 }
1090 else if (strcmp(errors,"ignore") == 0) {
1091 return 0;
1092 }
1093 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001094 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095 return 0;
1096 }
1097 else {
1098 PyErr_Format(PyExc_ValueError,
1099 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001100 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 errors);
1102 return -1;
1103 }
1104}
1105
Fredrik Lundh06d12682001-01-24 07:59:11 +00001106static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001107
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1109 int size,
1110 const char *errors)
1111{
1112 PyUnicodeObject *v;
1113 Py_UNICODE *p = NULL, *buf = NULL;
1114 const char *end;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001115 Py_UCS4 chr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116
1117 /* Escaped strings will always be longer than the resulting
1118 Unicode string, so we start with size here and then reduce the
1119 length after conversion to the true value. */
1120 v = _PyUnicode_New(size);
1121 if (v == NULL)
1122 goto onError;
1123 if (size == 0)
1124 return (PyObject *)v;
1125 p = buf = PyUnicode_AS_UNICODE(v);
1126 end = s + size;
1127 while (s < end) {
1128 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001129 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001130 int i;
1131
1132 /* Non-escape characters are interpreted as Unicode ordinals */
1133 if (*s != '\\') {
1134 *p++ = (unsigned char)*s++;
1135 continue;
1136 }
1137
1138 /* \ - Escapes */
1139 s++;
1140 switch (*s++) {
1141
1142 /* \x escapes */
1143 case '\n': break;
1144 case '\\': *p++ = '\\'; break;
1145 case '\'': *p++ = '\''; break;
1146 case '\"': *p++ = '\"'; break;
1147 case 'b': *p++ = '\b'; break;
1148 case 'f': *p++ = '\014'; break; /* FF */
1149 case 't': *p++ = '\t'; break;
1150 case 'n': *p++ = '\n'; break;
1151 case 'r': *p++ = '\r'; break;
1152 case 'v': *p++ = '\013'; break; /* VT */
1153 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1154
1155 /* \OOO (octal) escapes */
1156 case '0': case '1': case '2': case '3':
1157 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001158 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001160 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001162 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001163 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001164 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 break;
1166
Fredrik Lundhdf846752000-09-03 11:29:49 +00001167 /* \xXX with two hex digits */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 case 'x':
Fredrik Lundhdf846752000-09-03 11:29:49 +00001169 for (x = 0, i = 0; i < 2; i++) {
1170 c = (unsigned char)s[i];
1171 if (!isxdigit(c)) {
1172 if (unicodeescape_decoding_error(&s, &x, errors,
1173 "truncated \\xXX"))
1174 goto onError;
1175 i++;
1176 break;
1177 }
1178 x = (x<<4) & ~0xF;
1179 if (c >= '0' && c <= '9')
1180 x += c - '0';
1181 else if (c >= 'a' && c <= 'f')
1182 x += 10 + c - 'a';
1183 else
1184 x += 10 + c - 'A';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001186 s += i;
1187 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188 break;
1189
1190 /* \uXXXX with 4 hex digits */
1191 case 'u':
1192 for (x = 0, i = 0; i < 4; i++) {
1193 c = (unsigned char)s[i];
1194 if (!isxdigit(c)) {
1195 if (unicodeescape_decoding_error(&s, &x, errors,
1196 "truncated \\uXXXX"))
1197 goto onError;
1198 i++;
1199 break;
1200 }
1201 x = (x<<4) & ~0xF;
1202 if (c >= '0' && c <= '9')
1203 x += c - '0';
1204 else if (c >= 'a' && c <= 'f')
1205 x += 10 + c - 'a';
1206 else
1207 x += 10 + c - 'A';
1208 }
1209 s += i;
1210 *p++ = x;
1211 break;
1212
Fredrik Lundhdf846752000-09-03 11:29:49 +00001213 /* \UXXXXXXXX with 8 hex digits */
1214 case 'U':
1215 for (chr = 0, i = 0; i < 8; i++) {
1216 c = (unsigned char)s[i];
1217 if (!isxdigit(c)) {
1218 if (unicodeescape_decoding_error(&s, &x, errors,
1219 "truncated \\uXXXX"))
1220 goto onError;
1221 i++;
1222 break;
1223 }
1224 chr = (chr<<4) & ~0xF;
1225 if (c >= '0' && c <= '9')
1226 chr += c - '0';
1227 else if (c >= 'a' && c <= 'f')
1228 chr += 10 + c - 'a';
1229 else
1230 chr += 10 + c - 'A';
1231 }
1232 s += i;
1233 goto store;
1234
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001235 case 'N':
1236 /* Ok, we need to deal with Unicode Character Names now,
1237 * make sure we've imported the hash table data...
1238 */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001239 if (ucnhash_CAPI == NULL) {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001240 PyObject *mod = 0, *v = 0;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001241 mod = PyImport_ImportModule("unicodedata");
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001242 if (mod == NULL)
Fredrik Lundhf6056062001-01-20 11:15:25 +00001243 goto ucnhashError;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001244 v = PyObject_GetAttrString(mod,"ucnhash_CAPI");
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001245 Py_DECREF(mod);
1246 if (v == NULL)
Fredrik Lundhf6056062001-01-20 11:15:25 +00001247 goto ucnhashError;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001248 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001249 Py_DECREF(v);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001250 if (ucnhash_CAPI == NULL)
Fredrik Lundhf6056062001-01-20 11:15:25 +00001251 goto ucnhashError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001252 }
1253
Fredrik Lundhdf846752000-09-03 11:29:49 +00001254 if (*s == '{') {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001255 const char *start = s + 1;
1256 const char *endBrace = start;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001257
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001258 /* look for the closing brace */
1259 while (*endBrace != '}' && endBrace < end)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001260 endBrace++;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001261 if (endBrace != end && *endBrace == '}') {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001262 if (!ucnhash_CAPI->getcode(start, endBrace-start, &chr)) {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001263 if (unicodeescape_decoding_error(
1264 &s, &x, errors,
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001265 "Invalid Unicode Character Name")
1266 )
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001267 goto onError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001268 goto ucnFallthrough;
1269 }
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001270 s = endBrace + 1;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001271 goto store;
1272 } else {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001273 if (unicodeescape_decoding_error(
1274 &s, &x, errors,
1275 "Unicode name missing closing brace"))
1276 goto onError;
1277 goto ucnFallthrough;
1278 }
1279 break;
1280 }
1281 if (unicodeescape_decoding_error(
1282 &s, &x, errors,
1283 "Missing opening brace for Unicode Character Name escape"))
1284 goto onError;
1285ucnFallthrough:
1286 /* fall through on purpose */
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001287 default:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288 *p++ = '\\';
1289 *p++ = (unsigned char)s[-1];
1290 break;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001291store:
1292 /* when we get here, chr is a 32-bit unicode character */
1293 if (chr <= 0xffff)
1294 /* UCS-2 character */
1295 *p++ = (Py_UNICODE) chr;
1296 else if (chr <= 0x10ffff) {
1297 /* UCS-4 character. store as two surrogate characters */
1298 chr -= 0x10000L;
1299 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1300 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1301 } else {
1302 if (unicodeescape_decoding_error(
1303 &s, &x, errors,
1304 "Illegal Unicode character")
1305 )
1306 goto onError;
1307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 }
1309 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001310 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001311 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001312 return (PyObject *)v;
1313
Fredrik Lundhf6056062001-01-20 11:15:25 +00001314 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001315 PyErr_SetString(
1316 PyExc_UnicodeError,
1317 "\\N escapes not supported (can't load unicodedata module)"
1318 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001319 return NULL;
1320
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321 onError:
1322 Py_XDECREF(v);
1323 return NULL;
1324}
1325
1326/* Return a Unicode-Escape string version of the Unicode object.
1327
1328 If quotes is true, the string is enclosed in u"" or u'' quotes as
1329 appropriate.
1330
1331*/
1332
Barry Warsaw51ac5802000-03-20 16:36:48 +00001333static const Py_UNICODE *findchar(const Py_UNICODE *s,
1334 int size,
1335 Py_UNICODE ch);
1336
Guido van Rossumd57fd912000-03-10 22:53:23 +00001337static
1338PyObject *unicodeescape_string(const Py_UNICODE *s,
1339 int size,
1340 int quotes)
1341{
1342 PyObject *repr;
1343 char *p;
1344 char *q;
1345
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001346 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001347
1348 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1349 if (repr == NULL)
1350 return NULL;
1351
1352 p = q = PyString_AS_STRING(repr);
1353
1354 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001355 *p++ = 'u';
1356 *p++ = (findchar(s, size, '\'') &&
1357 !findchar(s, size, '"')) ? '"' : '\'';
1358 }
1359 while (size-- > 0) {
1360 Py_UNICODE ch = *s++;
1361 /* Escape quotes */
1362 if (quotes && (ch == q[1] || ch == '\\')) {
1363 *p++ = '\\';
1364 *p++ = (char) ch;
1365 }
1366 /* Map 16-bit characters to '\uxxxx' */
1367 else if (ch >= 256) {
1368 *p++ = '\\';
1369 *p++ = 'u';
1370 *p++ = hexdigit[(ch >> 12) & 0xf];
1371 *p++ = hexdigit[(ch >> 8) & 0xf];
1372 *p++ = hexdigit[(ch >> 4) & 0xf];
1373 *p++ = hexdigit[ch & 15];
1374 }
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001375 /* Map special whitespace to '\t', \n', '\r' */
1376 else if (ch == '\t') {
1377 *p++ = '\\';
1378 *p++ = 't';
1379 }
1380 else if (ch == '\n') {
1381 *p++ = '\\';
1382 *p++ = 'n';
1383 }
1384 else if (ch == '\r') {
1385 *p++ = '\\';
1386 *p++ = 'r';
1387 }
1388 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389 else if (ch < ' ' || ch >= 128) {
1390 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001391 *p++ = 'x';
1392 *p++ = hexdigit[(ch >> 4) & 0xf];
1393 *p++ = hexdigit[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001394 }
1395 /* Copy everything else as-is */
1396 else
1397 *p++ = (char) ch;
1398 }
1399 if (quotes)
1400 *p++ = q[1];
1401
1402 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001403 if (_PyString_Resize(&repr, p - q))
1404 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001405
1406 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001407
1408 onError:
1409 Py_DECREF(repr);
1410 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001411}
1412
1413PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1414 int size)
1415{
1416 return unicodeescape_string(s, size, 0);
1417}
1418
1419PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1420{
1421 if (!PyUnicode_Check(unicode)) {
1422 PyErr_BadArgument();
1423 return NULL;
1424 }
1425 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1426 PyUnicode_GET_SIZE(unicode));
1427}
1428
1429/* --- Raw Unicode Escape Codec ------------------------------------------- */
1430
1431PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1432 int size,
1433 const char *errors)
1434{
1435 PyUnicodeObject *v;
1436 Py_UNICODE *p, *buf;
1437 const char *end;
1438 const char *bs;
1439
1440 /* Escaped strings will always be longer than the resulting
1441 Unicode string, so we start with size here and then reduce the
1442 length after conversion to the true value. */
1443 v = _PyUnicode_New(size);
1444 if (v == NULL)
1445 goto onError;
1446 if (size == 0)
1447 return (PyObject *)v;
1448 p = buf = PyUnicode_AS_UNICODE(v);
1449 end = s + size;
1450 while (s < end) {
1451 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001452 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001453 int i;
1454
1455 /* Non-escape characters are interpreted as Unicode ordinals */
1456 if (*s != '\\') {
1457 *p++ = (unsigned char)*s++;
1458 continue;
1459 }
1460
1461 /* \u-escapes are only interpreted iff the number of leading
1462 backslashes if odd */
1463 bs = s;
1464 for (;s < end;) {
1465 if (*s != '\\')
1466 break;
1467 *p++ = (unsigned char)*s++;
1468 }
1469 if (((s - bs) & 1) == 0 ||
1470 s >= end ||
1471 *s != 'u') {
1472 continue;
1473 }
1474 p--;
1475 s++;
1476
1477 /* \uXXXX with 4 hex digits */
1478 for (x = 0, i = 0; i < 4; i++) {
1479 c = (unsigned char)s[i];
1480 if (!isxdigit(c)) {
1481 if (unicodeescape_decoding_error(&s, &x, errors,
1482 "truncated \\uXXXX"))
1483 goto onError;
1484 i++;
1485 break;
1486 }
1487 x = (x<<4) & ~0xF;
1488 if (c >= '0' && c <= '9')
1489 x += c - '0';
1490 else if (c >= 'a' && c <= 'f')
1491 x += 10 + c - 'a';
1492 else
1493 x += 10 + c - 'A';
1494 }
1495 s += i;
1496 *p++ = x;
1497 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001498 if (_PyUnicode_Resize(v, (int)(p - buf)))
1499 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500 return (PyObject *)v;
1501
1502 onError:
1503 Py_XDECREF(v);
1504 return NULL;
1505}
1506
1507PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1508 int size)
1509{
1510 PyObject *repr;
1511 char *p;
1512 char *q;
1513
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001514 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001515
1516 repr = PyString_FromStringAndSize(NULL, 6 * size);
1517 if (repr == NULL)
1518 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001519 if (size == 0)
1520 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001521
1522 p = q = PyString_AS_STRING(repr);
1523 while (size-- > 0) {
1524 Py_UNICODE ch = *s++;
1525 /* Map 16-bit characters to '\uxxxx' */
1526 if (ch >= 256) {
1527 *p++ = '\\';
1528 *p++ = 'u';
1529 *p++ = hexdigit[(ch >> 12) & 0xf];
1530 *p++ = hexdigit[(ch >> 8) & 0xf];
1531 *p++ = hexdigit[(ch >> 4) & 0xf];
1532 *p++ = hexdigit[ch & 15];
1533 }
1534 /* Copy everything else as-is */
1535 else
1536 *p++ = (char) ch;
1537 }
1538 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001539 if (_PyString_Resize(&repr, p - q))
1540 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001541
1542 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001543
1544 onError:
1545 Py_DECREF(repr);
1546 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001547}
1548
1549PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1550{
1551 if (!PyUnicode_Check(unicode)) {
1552 PyErr_BadArgument();
1553 return NULL;
1554 }
1555 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1556 PyUnicode_GET_SIZE(unicode));
1557}
1558
1559/* --- Latin-1 Codec ------------------------------------------------------ */
1560
1561PyObject *PyUnicode_DecodeLatin1(const char *s,
1562 int size,
1563 const char *errors)
1564{
1565 PyUnicodeObject *v;
1566 Py_UNICODE *p;
1567
1568 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1569 v = _PyUnicode_New(size);
1570 if (v == NULL)
1571 goto onError;
1572 if (size == 0)
1573 return (PyObject *)v;
1574 p = PyUnicode_AS_UNICODE(v);
1575 while (size-- > 0)
1576 *p++ = (unsigned char)*s++;
1577 return (PyObject *)v;
1578
1579 onError:
1580 Py_XDECREF(v);
1581 return NULL;
1582}
1583
1584static
1585int latin1_encoding_error(const Py_UNICODE **source,
1586 char **dest,
1587 const char *errors,
1588 const char *details)
1589{
1590 if ((errors == NULL) ||
1591 (strcmp(errors,"strict") == 0)) {
1592 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001593 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001594 details);
1595 return -1;
1596 }
1597 else if (strcmp(errors,"ignore") == 0) {
1598 return 0;
1599 }
1600 else if (strcmp(errors,"replace") == 0) {
1601 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001602 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001603 return 0;
1604 }
1605 else {
1606 PyErr_Format(PyExc_ValueError,
1607 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001608 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001609 errors);
1610 return -1;
1611 }
1612}
1613
1614PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1615 int size,
1616 const char *errors)
1617{
1618 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001619 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001620
Guido van Rossumd57fd912000-03-10 22:53:23 +00001621 repr = PyString_FromStringAndSize(NULL, size);
1622 if (repr == NULL)
1623 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001624 if (size == 0)
1625 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001626
1627 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001628 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001629 while (size-- > 0) {
1630 Py_UNICODE ch = *p++;
1631 if (ch >= 256) {
1632 if (latin1_encoding_error(&p, &s, errors,
1633 "ordinal not in range(256)"))
1634 goto onError;
1635 }
1636 else
1637 *s++ = (char)ch;
1638 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001639 /* Resize if error handling skipped some characters */
1640 if (s - start < PyString_GET_SIZE(repr))
1641 if (_PyString_Resize(&repr, s - start))
1642 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001643 return repr;
1644
1645 onError:
1646 Py_DECREF(repr);
1647 return NULL;
1648}
1649
1650PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1651{
1652 if (!PyUnicode_Check(unicode)) {
1653 PyErr_BadArgument();
1654 return NULL;
1655 }
1656 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1657 PyUnicode_GET_SIZE(unicode),
1658 NULL);
1659}
1660
1661/* --- 7-bit ASCII Codec -------------------------------------------------- */
1662
1663static
1664int ascii_decoding_error(const char **source,
1665 Py_UNICODE **dest,
1666 const char *errors,
1667 const char *details)
1668{
1669 if ((errors == NULL) ||
1670 (strcmp(errors,"strict") == 0)) {
1671 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001672 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 details);
1674 return -1;
1675 }
1676 else if (strcmp(errors,"ignore") == 0) {
1677 return 0;
1678 }
1679 else if (strcmp(errors,"replace") == 0) {
1680 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1681 (*dest)++;
1682 return 0;
1683 }
1684 else {
1685 PyErr_Format(PyExc_ValueError,
1686 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001687 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001688 errors);
1689 return -1;
1690 }
1691}
1692
1693PyObject *PyUnicode_DecodeASCII(const char *s,
1694 int size,
1695 const char *errors)
1696{
1697 PyUnicodeObject *v;
1698 Py_UNICODE *p;
1699
1700 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1701 v = _PyUnicode_New(size);
1702 if (v == NULL)
1703 goto onError;
1704 if (size == 0)
1705 return (PyObject *)v;
1706 p = PyUnicode_AS_UNICODE(v);
1707 while (size-- > 0) {
1708 register unsigned char c;
1709
1710 c = (unsigned char)*s++;
1711 if (c < 128)
1712 *p++ = c;
1713 else if (ascii_decoding_error(&s, &p, errors,
1714 "ordinal not in range(128)"))
1715 goto onError;
1716 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001717 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1718 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1719 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720 return (PyObject *)v;
1721
1722 onError:
1723 Py_XDECREF(v);
1724 return NULL;
1725}
1726
1727static
1728int ascii_encoding_error(const Py_UNICODE **source,
1729 char **dest,
1730 const char *errors,
1731 const char *details)
1732{
1733 if ((errors == NULL) ||
1734 (strcmp(errors,"strict") == 0)) {
1735 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001736 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001737 details);
1738 return -1;
1739 }
1740 else if (strcmp(errors,"ignore") == 0) {
1741 return 0;
1742 }
1743 else if (strcmp(errors,"replace") == 0) {
1744 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001745 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001746 return 0;
1747 }
1748 else {
1749 PyErr_Format(PyExc_ValueError,
1750 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001751 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001752 errors);
1753 return -1;
1754 }
1755}
1756
1757PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1758 int size,
1759 const char *errors)
1760{
1761 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001762 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001763
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764 repr = PyString_FromStringAndSize(NULL, size);
1765 if (repr == NULL)
1766 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001767 if (size == 0)
1768 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001769
1770 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001771 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001772 while (size-- > 0) {
1773 Py_UNICODE ch = *p++;
1774 if (ch >= 128) {
1775 if (ascii_encoding_error(&p, &s, errors,
1776 "ordinal not in range(128)"))
1777 goto onError;
1778 }
1779 else
1780 *s++ = (char)ch;
1781 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001782 /* Resize if error handling skipped some characters */
1783 if (s - start < PyString_GET_SIZE(repr))
1784 if (_PyString_Resize(&repr, s - start))
1785 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001786 return repr;
1787
1788 onError:
1789 Py_DECREF(repr);
1790 return NULL;
1791}
1792
1793PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1794{
1795 if (!PyUnicode_Check(unicode)) {
1796 PyErr_BadArgument();
1797 return NULL;
1798 }
1799 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1800 PyUnicode_GET_SIZE(unicode),
1801 NULL);
1802}
1803
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001804#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001805
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001806/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001807
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001808PyObject *PyUnicode_DecodeMBCS(const char *s,
1809 int size,
1810 const char *errors)
1811{
1812 PyUnicodeObject *v;
1813 Py_UNICODE *p;
1814
1815 /* First get the size of the result */
1816 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001817 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001818 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1819
1820 v = _PyUnicode_New(usize);
1821 if (v == NULL)
1822 return NULL;
1823 if (usize == 0)
1824 return (PyObject *)v;
1825 p = PyUnicode_AS_UNICODE(v);
1826 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1827 Py_DECREF(v);
1828 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1829 }
1830
1831 return (PyObject *)v;
1832}
1833
1834PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1835 int size,
1836 const char *errors)
1837{
1838 PyObject *repr;
1839 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001840 DWORD mbcssize;
1841
1842 /* If there are no characters, bail now! */
1843 if (size==0)
1844 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001845
1846 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001847 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001848 if (mbcssize==0)
1849 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1850
1851 repr = PyString_FromStringAndSize(NULL, mbcssize);
1852 if (repr == NULL)
1853 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001854 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001855 return repr;
1856
1857 /* Do the conversion */
1858 s = PyString_AS_STRING(repr);
1859 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1860 Py_DECREF(repr);
1861 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1862 }
1863 return repr;
1864}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001865
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001866#endif /* MS_WIN32 */
1867
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868/* --- Character Mapping Codec -------------------------------------------- */
1869
1870static
1871int charmap_decoding_error(const char **source,
1872 Py_UNICODE **dest,
1873 const char *errors,
1874 const char *details)
1875{
1876 if ((errors == NULL) ||
1877 (strcmp(errors,"strict") == 0)) {
1878 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001879 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880 details);
1881 return -1;
1882 }
1883 else if (strcmp(errors,"ignore") == 0) {
1884 return 0;
1885 }
1886 else if (strcmp(errors,"replace") == 0) {
1887 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1888 (*dest)++;
1889 return 0;
1890 }
1891 else {
1892 PyErr_Format(PyExc_ValueError,
1893 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001894 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895 errors);
1896 return -1;
1897 }
1898}
1899
1900PyObject *PyUnicode_DecodeCharmap(const char *s,
1901 int size,
1902 PyObject *mapping,
1903 const char *errors)
1904{
1905 PyUnicodeObject *v;
1906 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001907 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001908
1909 /* Default to Latin-1 */
1910 if (mapping == NULL)
1911 return PyUnicode_DecodeLatin1(s, size, errors);
1912
1913 v = _PyUnicode_New(size);
1914 if (v == NULL)
1915 goto onError;
1916 if (size == 0)
1917 return (PyObject *)v;
1918 p = PyUnicode_AS_UNICODE(v);
1919 while (size-- > 0) {
1920 unsigned char ch = *s++;
1921 PyObject *w, *x;
1922
1923 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1924 w = PyInt_FromLong((long)ch);
1925 if (w == NULL)
1926 goto onError;
1927 x = PyObject_GetItem(mapping, w);
1928 Py_DECREF(w);
1929 if (x == NULL) {
1930 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00001931 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00001933 x = Py_None;
1934 Py_INCREF(x);
1935 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001936 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 }
1938
1939 /* Apply mapping */
1940 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001941 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001942 if (value < 0 || value > 65535) {
1943 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001944 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945 Py_DECREF(x);
1946 goto onError;
1947 }
1948 *p++ = (Py_UNICODE)value;
1949 }
1950 else if (x == Py_None) {
1951 /* undefined mapping */
1952 if (charmap_decoding_error(&s, &p, errors,
1953 "character maps to <undefined>")) {
1954 Py_DECREF(x);
1955 goto onError;
1956 }
1957 }
1958 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001959 int targetsize = PyUnicode_GET_SIZE(x);
1960
1961 if (targetsize == 1)
1962 /* 1-1 mapping */
1963 *p++ = *PyUnicode_AS_UNICODE(x);
1964
1965 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001966 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001967 if (targetsize > extrachars) {
1968 /* resize first */
1969 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
1970 int needed = (targetsize - extrachars) + \
1971 (targetsize << 2);
1972 extrachars += needed;
1973 if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001974 Py_DECREF(x);
1975 goto onError;
1976 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001977 p = PyUnicode_AS_UNICODE(v) + oldpos;
1978 }
1979 Py_UNICODE_COPY(p,
1980 PyUnicode_AS_UNICODE(x),
1981 targetsize);
1982 p += targetsize;
1983 extrachars -= targetsize;
1984 }
1985 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986 }
1987 else {
1988 /* wrong return value */
1989 PyErr_SetString(PyExc_TypeError,
1990 "character mapping must return integer, None or unicode");
1991 Py_DECREF(x);
1992 goto onError;
1993 }
1994 Py_DECREF(x);
1995 }
1996 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1997 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1998 goto onError;
1999 return (PyObject *)v;
2000
2001 onError:
2002 Py_XDECREF(v);
2003 return NULL;
2004}
2005
2006static
2007int charmap_encoding_error(const Py_UNICODE **source,
2008 char **dest,
2009 const char *errors,
2010 const char *details)
2011{
2012 if ((errors == NULL) ||
2013 (strcmp(errors,"strict") == 0)) {
2014 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002015 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002016 details);
2017 return -1;
2018 }
2019 else if (strcmp(errors,"ignore") == 0) {
2020 return 0;
2021 }
2022 else if (strcmp(errors,"replace") == 0) {
2023 **dest = '?';
2024 (*dest)++;
2025 return 0;
2026 }
2027 else {
2028 PyErr_Format(PyExc_ValueError,
2029 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002030 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002031 errors);
2032 return -1;
2033 }
2034}
2035
2036PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2037 int size,
2038 PyObject *mapping,
2039 const char *errors)
2040{
2041 PyObject *v;
2042 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002043 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002044
2045 /* Default to Latin-1 */
2046 if (mapping == NULL)
2047 return PyUnicode_EncodeLatin1(p, size, errors);
2048
2049 v = PyString_FromStringAndSize(NULL, size);
2050 if (v == NULL)
2051 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002052 if (size == 0)
2053 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002054 s = PyString_AS_STRING(v);
2055 while (size-- > 0) {
2056 Py_UNICODE ch = *p++;
2057 PyObject *w, *x;
2058
2059 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2060 w = PyInt_FromLong((long)ch);
2061 if (w == NULL)
2062 goto onError;
2063 x = PyObject_GetItem(mapping, w);
2064 Py_DECREF(w);
2065 if (x == NULL) {
2066 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002067 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002068 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002069 x = Py_None;
2070 Py_INCREF(x);
2071 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002072 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002073 }
2074
2075 /* Apply mapping */
2076 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002077 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 if (value < 0 || value > 255) {
2079 PyErr_SetString(PyExc_TypeError,
2080 "character mapping must be in range(256)");
2081 Py_DECREF(x);
2082 goto onError;
2083 }
2084 *s++ = (char)value;
2085 }
2086 else if (x == Py_None) {
2087 /* undefined mapping */
2088 if (charmap_encoding_error(&p, &s, errors,
2089 "character maps to <undefined>")) {
2090 Py_DECREF(x);
2091 goto onError;
2092 }
2093 }
2094 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002095 int targetsize = PyString_GET_SIZE(x);
2096
2097 if (targetsize == 1)
2098 /* 1-1 mapping */
2099 *s++ = *PyString_AS_STRING(x);
2100
2101 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002102 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002103 if (targetsize > extrachars) {
2104 /* resize first */
2105 int oldpos = (int)(s - PyString_AS_STRING(v));
2106 int needed = (targetsize - extrachars) + \
2107 (targetsize << 2);
2108 extrachars += needed;
2109 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002110 Py_DECREF(x);
2111 goto onError;
2112 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002113 s = PyString_AS_STRING(v) + oldpos;
2114 }
2115 memcpy(s,
2116 PyString_AS_STRING(x),
2117 targetsize);
2118 s += targetsize;
2119 extrachars -= targetsize;
2120 }
2121 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002122 }
2123 else {
2124 /* wrong return value */
2125 PyErr_SetString(PyExc_TypeError,
2126 "character mapping must return integer, None or unicode");
2127 Py_DECREF(x);
2128 goto onError;
2129 }
2130 Py_DECREF(x);
2131 }
2132 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2133 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2134 goto onError;
2135 return v;
2136
2137 onError:
2138 Py_DECREF(v);
2139 return NULL;
2140}
2141
2142PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2143 PyObject *mapping)
2144{
2145 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2146 PyErr_BadArgument();
2147 return NULL;
2148 }
2149 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2150 PyUnicode_GET_SIZE(unicode),
2151 mapping,
2152 NULL);
2153}
2154
2155static
2156int translate_error(const Py_UNICODE **source,
2157 Py_UNICODE **dest,
2158 const char *errors,
2159 const char *details)
2160{
2161 if ((errors == NULL) ||
2162 (strcmp(errors,"strict") == 0)) {
2163 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002164 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 details);
2166 return -1;
2167 }
2168 else if (strcmp(errors,"ignore") == 0) {
2169 return 0;
2170 }
2171 else if (strcmp(errors,"replace") == 0) {
2172 **dest = '?';
2173 (*dest)++;
2174 return 0;
2175 }
2176 else {
2177 PyErr_Format(PyExc_ValueError,
2178 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002179 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002180 errors);
2181 return -1;
2182 }
2183}
2184
2185PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2186 int size,
2187 PyObject *mapping,
2188 const char *errors)
2189{
2190 PyUnicodeObject *v;
2191 Py_UNICODE *p;
2192
2193 if (mapping == NULL) {
2194 PyErr_BadArgument();
2195 return NULL;
2196 }
2197
2198 /* Output will never be longer than input */
2199 v = _PyUnicode_New(size);
2200 if (v == NULL)
2201 goto onError;
2202 if (size == 0)
2203 goto done;
2204 p = PyUnicode_AS_UNICODE(v);
2205 while (size-- > 0) {
2206 Py_UNICODE ch = *s++;
2207 PyObject *w, *x;
2208
2209 /* Get mapping */
2210 w = PyInt_FromLong(ch);
2211 if (w == NULL)
2212 goto onError;
2213 x = PyObject_GetItem(mapping, w);
2214 Py_DECREF(w);
2215 if (x == NULL) {
2216 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2217 /* No mapping found: default to 1-1 mapping */
2218 PyErr_Clear();
2219 *p++ = ch;
2220 continue;
2221 }
2222 goto onError;
2223 }
2224
2225 /* Apply mapping */
2226 if (PyInt_Check(x))
2227 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2228 else if (x == Py_None) {
2229 /* undefined mapping */
2230 if (translate_error(&s, &p, errors,
2231 "character maps to <undefined>")) {
2232 Py_DECREF(x);
2233 goto onError;
2234 }
2235 }
2236 else if (PyUnicode_Check(x)) {
2237 if (PyUnicode_GET_SIZE(x) != 1) {
2238 /* 1-n mapping */
2239 PyErr_SetString(PyExc_NotImplementedError,
2240 "1-n mappings are currently not implemented");
2241 Py_DECREF(x);
2242 goto onError;
2243 }
2244 *p++ = *PyUnicode_AS_UNICODE(x);
2245 }
2246 else {
2247 /* wrong return value */
2248 PyErr_SetString(PyExc_TypeError,
2249 "translate mapping must return integer, None or unicode");
2250 Py_DECREF(x);
2251 goto onError;
2252 }
2253 Py_DECREF(x);
2254 }
2255 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002256 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2257 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002258
2259 done:
2260 return (PyObject *)v;
2261
2262 onError:
2263 Py_XDECREF(v);
2264 return NULL;
2265}
2266
2267PyObject *PyUnicode_Translate(PyObject *str,
2268 PyObject *mapping,
2269 const char *errors)
2270{
2271 PyObject *result;
2272
2273 str = PyUnicode_FromObject(str);
2274 if (str == NULL)
2275 goto onError;
2276 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2277 PyUnicode_GET_SIZE(str),
2278 mapping,
2279 errors);
2280 Py_DECREF(str);
2281 return result;
2282
2283 onError:
2284 Py_XDECREF(str);
2285 return NULL;
2286}
2287
Guido van Rossum9e896b32000-04-05 20:11:21 +00002288/* --- Decimal Encoder ---------------------------------------------------- */
2289
2290int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2291 int length,
2292 char *output,
2293 const char *errors)
2294{
2295 Py_UNICODE *p, *end;
2296
2297 if (output == NULL) {
2298 PyErr_BadArgument();
2299 return -1;
2300 }
2301
2302 p = s;
2303 end = s + length;
2304 while (p < end) {
2305 register Py_UNICODE ch = *p++;
2306 int decimal;
2307
2308 if (Py_UNICODE_ISSPACE(ch)) {
2309 *output++ = ' ';
2310 continue;
2311 }
2312 decimal = Py_UNICODE_TODECIMAL(ch);
2313 if (decimal >= 0) {
2314 *output++ = '0' + decimal;
2315 continue;
2316 }
Guido van Rossumba477042000-04-06 18:18:10 +00002317 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002318 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002319 continue;
2320 }
2321 /* All other characters are considered invalid */
2322 if (errors == NULL || strcmp(errors, "strict") == 0) {
2323 PyErr_SetString(PyExc_ValueError,
2324 "invalid decimal Unicode string");
2325 goto onError;
2326 }
2327 else if (strcmp(errors, "ignore") == 0)
2328 continue;
2329 else if (strcmp(errors, "replace") == 0) {
2330 *output++ = '?';
2331 continue;
2332 }
2333 }
2334 /* 0-terminate the output string */
2335 *output++ = '\0';
2336 return 0;
2337
2338 onError:
2339 return -1;
2340}
2341
Guido van Rossumd57fd912000-03-10 22:53:23 +00002342/* --- Helpers ------------------------------------------------------------ */
2343
2344static
2345int count(PyUnicodeObject *self,
2346 int start,
2347 int end,
2348 PyUnicodeObject *substring)
2349{
2350 int count = 0;
2351
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002352 if (start < 0)
2353 start += self->length;
2354 if (start < 0)
2355 start = 0;
2356 if (end > self->length)
2357 end = self->length;
2358 if (end < 0)
2359 end += self->length;
2360 if (end < 0)
2361 end = 0;
2362
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002363 if (substring->length == 0)
2364 return (end - start + 1);
2365
Guido van Rossumd57fd912000-03-10 22:53:23 +00002366 end -= substring->length;
2367
2368 while (start <= end)
2369 if (Py_UNICODE_MATCH(self, start, substring)) {
2370 count++;
2371 start += substring->length;
2372 } else
2373 start++;
2374
2375 return count;
2376}
2377
2378int PyUnicode_Count(PyObject *str,
2379 PyObject *substr,
2380 int start,
2381 int end)
2382{
2383 int result;
2384
2385 str = PyUnicode_FromObject(str);
2386 if (str == NULL)
2387 return -1;
2388 substr = PyUnicode_FromObject(substr);
2389 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002390 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002391 return -1;
2392 }
2393
2394 result = count((PyUnicodeObject *)str,
2395 start, end,
2396 (PyUnicodeObject *)substr);
2397
2398 Py_DECREF(str);
2399 Py_DECREF(substr);
2400 return result;
2401}
2402
2403static
2404int findstring(PyUnicodeObject *self,
2405 PyUnicodeObject *substring,
2406 int start,
2407 int end,
2408 int direction)
2409{
2410 if (start < 0)
2411 start += self->length;
2412 if (start < 0)
2413 start = 0;
2414
2415 if (substring->length == 0)
2416 return start;
2417
2418 if (end > self->length)
2419 end = self->length;
2420 if (end < 0)
2421 end += self->length;
2422 if (end < 0)
2423 end = 0;
2424
2425 end -= substring->length;
2426
2427 if (direction < 0) {
2428 for (; end >= start; end--)
2429 if (Py_UNICODE_MATCH(self, end, substring))
2430 return end;
2431 } else {
2432 for (; start <= end; start++)
2433 if (Py_UNICODE_MATCH(self, start, substring))
2434 return start;
2435 }
2436
2437 return -1;
2438}
2439
2440int PyUnicode_Find(PyObject *str,
2441 PyObject *substr,
2442 int start,
2443 int end,
2444 int direction)
2445{
2446 int result;
2447
2448 str = PyUnicode_FromObject(str);
2449 if (str == NULL)
2450 return -1;
2451 substr = PyUnicode_FromObject(substr);
2452 if (substr == NULL) {
2453 Py_DECREF(substr);
2454 return -1;
2455 }
2456
2457 result = findstring((PyUnicodeObject *)str,
2458 (PyUnicodeObject *)substr,
2459 start, end, direction);
2460 Py_DECREF(str);
2461 Py_DECREF(substr);
2462 return result;
2463}
2464
2465static
2466int tailmatch(PyUnicodeObject *self,
2467 PyUnicodeObject *substring,
2468 int start,
2469 int end,
2470 int direction)
2471{
2472 if (start < 0)
2473 start += self->length;
2474 if (start < 0)
2475 start = 0;
2476
2477 if (substring->length == 0)
2478 return 1;
2479
2480 if (end > self->length)
2481 end = self->length;
2482 if (end < 0)
2483 end += self->length;
2484 if (end < 0)
2485 end = 0;
2486
2487 end -= substring->length;
2488 if (end < start)
2489 return 0;
2490
2491 if (direction > 0) {
2492 if (Py_UNICODE_MATCH(self, end, substring))
2493 return 1;
2494 } else {
2495 if (Py_UNICODE_MATCH(self, start, substring))
2496 return 1;
2497 }
2498
2499 return 0;
2500}
2501
2502int PyUnicode_Tailmatch(PyObject *str,
2503 PyObject *substr,
2504 int start,
2505 int end,
2506 int direction)
2507{
2508 int result;
2509
2510 str = PyUnicode_FromObject(str);
2511 if (str == NULL)
2512 return -1;
2513 substr = PyUnicode_FromObject(substr);
2514 if (substr == NULL) {
2515 Py_DECREF(substr);
2516 return -1;
2517 }
2518
2519 result = tailmatch((PyUnicodeObject *)str,
2520 (PyUnicodeObject *)substr,
2521 start, end, direction);
2522 Py_DECREF(str);
2523 Py_DECREF(substr);
2524 return result;
2525}
2526
2527static
2528const Py_UNICODE *findchar(const Py_UNICODE *s,
2529 int size,
2530 Py_UNICODE ch)
2531{
2532 /* like wcschr, but doesn't stop at NULL characters */
2533
2534 while (size-- > 0) {
2535 if (*s == ch)
2536 return s;
2537 s++;
2538 }
2539
2540 return NULL;
2541}
2542
2543/* Apply fixfct filter to the Unicode object self and return a
2544 reference to the modified object */
2545
2546static
2547PyObject *fixup(PyUnicodeObject *self,
2548 int (*fixfct)(PyUnicodeObject *s))
2549{
2550
2551 PyUnicodeObject *u;
2552
2553 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2554 self->length);
2555 if (u == NULL)
2556 return NULL;
2557 if (!fixfct(u)) {
2558 /* fixfct should return TRUE if it modified the buffer. If
2559 FALSE, return a reference to the original buffer instead
2560 (to save space, not time) */
2561 Py_INCREF(self);
2562 Py_DECREF(u);
2563 return (PyObject*) self;
2564 }
2565 return (PyObject*) u;
2566}
2567
2568static
2569int fixupper(PyUnicodeObject *self)
2570{
2571 int len = self->length;
2572 Py_UNICODE *s = self->str;
2573 int status = 0;
2574
2575 while (len-- > 0) {
2576 register Py_UNICODE ch;
2577
2578 ch = Py_UNICODE_TOUPPER(*s);
2579 if (ch != *s) {
2580 status = 1;
2581 *s = ch;
2582 }
2583 s++;
2584 }
2585
2586 return status;
2587}
2588
2589static
2590int fixlower(PyUnicodeObject *self)
2591{
2592 int len = self->length;
2593 Py_UNICODE *s = self->str;
2594 int status = 0;
2595
2596 while (len-- > 0) {
2597 register Py_UNICODE ch;
2598
2599 ch = Py_UNICODE_TOLOWER(*s);
2600 if (ch != *s) {
2601 status = 1;
2602 *s = ch;
2603 }
2604 s++;
2605 }
2606
2607 return status;
2608}
2609
2610static
2611int fixswapcase(PyUnicodeObject *self)
2612{
2613 int len = self->length;
2614 Py_UNICODE *s = self->str;
2615 int status = 0;
2616
2617 while (len-- > 0) {
2618 if (Py_UNICODE_ISUPPER(*s)) {
2619 *s = Py_UNICODE_TOLOWER(*s);
2620 status = 1;
2621 } else if (Py_UNICODE_ISLOWER(*s)) {
2622 *s = Py_UNICODE_TOUPPER(*s);
2623 status = 1;
2624 }
2625 s++;
2626 }
2627
2628 return status;
2629}
2630
2631static
2632int fixcapitalize(PyUnicodeObject *self)
2633{
2634 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2635 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2636 return 1;
2637 }
2638 return 0;
2639}
2640
2641static
2642int fixtitle(PyUnicodeObject *self)
2643{
2644 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2645 register Py_UNICODE *e;
2646 int previous_is_cased;
2647
2648 /* Shortcut for single character strings */
2649 if (PyUnicode_GET_SIZE(self) == 1) {
2650 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2651 if (*p != ch) {
2652 *p = ch;
2653 return 1;
2654 }
2655 else
2656 return 0;
2657 }
2658
2659 e = p + PyUnicode_GET_SIZE(self);
2660 previous_is_cased = 0;
2661 for (; p < e; p++) {
2662 register const Py_UNICODE ch = *p;
2663
2664 if (previous_is_cased)
2665 *p = Py_UNICODE_TOLOWER(ch);
2666 else
2667 *p = Py_UNICODE_TOTITLE(ch);
2668
2669 if (Py_UNICODE_ISLOWER(ch) ||
2670 Py_UNICODE_ISUPPER(ch) ||
2671 Py_UNICODE_ISTITLE(ch))
2672 previous_is_cased = 1;
2673 else
2674 previous_is_cased = 0;
2675 }
2676 return 1;
2677}
2678
2679PyObject *PyUnicode_Join(PyObject *separator,
2680 PyObject *seq)
2681{
2682 Py_UNICODE *sep;
2683 int seplen;
2684 PyUnicodeObject *res = NULL;
2685 int reslen = 0;
2686 Py_UNICODE *p;
2687 int seqlen = 0;
2688 int sz = 100;
2689 int i;
2690
Jeremy Hylton03657cf2000-07-12 13:05:33 +00002691 seqlen = PySequence_Size(seq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002692 if (seqlen < 0 && PyErr_Occurred())
2693 return NULL;
2694
2695 if (separator == NULL) {
2696 Py_UNICODE blank = ' ';
2697 sep = &blank;
2698 seplen = 1;
2699 }
2700 else {
2701 separator = PyUnicode_FromObject(separator);
2702 if (separator == NULL)
2703 return NULL;
2704 sep = PyUnicode_AS_UNICODE(separator);
2705 seplen = PyUnicode_GET_SIZE(separator);
2706 }
2707
2708 res = _PyUnicode_New(sz);
2709 if (res == NULL)
2710 goto onError;
2711 p = PyUnicode_AS_UNICODE(res);
2712 reslen = 0;
2713
2714 for (i = 0; i < seqlen; i++) {
2715 int itemlen;
2716 PyObject *item;
2717
2718 item = PySequence_GetItem(seq, i);
2719 if (item == NULL)
2720 goto onError;
2721 if (!PyUnicode_Check(item)) {
2722 PyObject *v;
2723 v = PyUnicode_FromObject(item);
2724 Py_DECREF(item);
2725 item = v;
2726 if (item == NULL)
2727 goto onError;
2728 }
2729 itemlen = PyUnicode_GET_SIZE(item);
2730 while (reslen + itemlen + seplen >= sz) {
2731 if (_PyUnicode_Resize(res, sz*2))
2732 goto onError;
2733 sz *= 2;
2734 p = PyUnicode_AS_UNICODE(res) + reslen;
2735 }
2736 if (i > 0) {
2737 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2738 p += seplen;
2739 reslen += seplen;
2740 }
2741 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2742 p += itemlen;
2743 reslen += itemlen;
2744 Py_DECREF(item);
2745 }
2746 if (_PyUnicode_Resize(res, reslen))
2747 goto onError;
2748
2749 Py_XDECREF(separator);
2750 return (PyObject *)res;
2751
2752 onError:
2753 Py_XDECREF(separator);
2754 Py_DECREF(res);
2755 return NULL;
2756}
2757
2758static
2759PyUnicodeObject *pad(PyUnicodeObject *self,
2760 int left,
2761 int right,
2762 Py_UNICODE fill)
2763{
2764 PyUnicodeObject *u;
2765
2766 if (left < 0)
2767 left = 0;
2768 if (right < 0)
2769 right = 0;
2770
2771 if (left == 0 && right == 0) {
2772 Py_INCREF(self);
2773 return self;
2774 }
2775
2776 u = _PyUnicode_New(left + self->length + right);
2777 if (u) {
2778 if (left)
2779 Py_UNICODE_FILL(u->str, fill, left);
2780 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2781 if (right)
2782 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2783 }
2784
2785 return u;
2786}
2787
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Relies on three names in the expanding scope: a `PyObject *str`
   scratch variable, the target `list`, and an `onError` label that is
   jumped to when the slice cannot be created or appended.  Wrapped in
   do { } while (0) so that it behaves as a single statement. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2798
2799static
2800PyObject *split_whitespace(PyUnicodeObject *self,
2801 PyObject *list,
2802 int maxcount)
2803{
2804 register int i;
2805 register int j;
2806 int len = self->length;
2807 PyObject *str;
2808
2809 for (i = j = 0; i < len; ) {
2810 /* find a token */
2811 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2812 i++;
2813 j = i;
2814 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2815 i++;
2816 if (j < i) {
2817 if (maxcount-- <= 0)
2818 break;
2819 SPLIT_APPEND(self->str, j, i);
2820 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2821 i++;
2822 j = i;
2823 }
2824 }
2825 if (j < len) {
2826 SPLIT_APPEND(self->str, j, len);
2827 }
2828 return list;
2829
2830 onError:
2831 Py_DECREF(list);
2832 return NULL;
2833}
2834
2835PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002836 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837{
2838 register int i;
2839 register int j;
2840 int len;
2841 PyObject *list;
2842 PyObject *str;
2843 Py_UNICODE *data;
2844
2845 string = PyUnicode_FromObject(string);
2846 if (string == NULL)
2847 return NULL;
2848 data = PyUnicode_AS_UNICODE(string);
2849 len = PyUnicode_GET_SIZE(string);
2850
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851 list = PyList_New(0);
2852 if (!list)
2853 goto onError;
2854
2855 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002856 int eol;
2857
Guido van Rossumd57fd912000-03-10 22:53:23 +00002858 /* Find a line and append it */
2859 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2860 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861
2862 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002863 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002864 if (i < len) {
2865 if (data[i] == '\r' && i + 1 < len &&
2866 data[i+1] == '\n')
2867 i += 2;
2868 else
2869 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002870 if (keepends)
2871 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002872 }
Guido van Rossum86662912000-04-11 15:38:46 +00002873 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002874 j = i;
2875 }
2876 if (j < len) {
2877 SPLIT_APPEND(data, j, len);
2878 }
2879
2880 Py_DECREF(string);
2881 return list;
2882
2883 onError:
2884 Py_DECREF(list);
2885 Py_DECREF(string);
2886 return NULL;
2887}
2888
2889static
2890PyObject *split_char(PyUnicodeObject *self,
2891 PyObject *list,
2892 Py_UNICODE ch,
2893 int maxcount)
2894{
2895 register int i;
2896 register int j;
2897 int len = self->length;
2898 PyObject *str;
2899
2900 for (i = j = 0; i < len; ) {
2901 if (self->str[i] == ch) {
2902 if (maxcount-- <= 0)
2903 break;
2904 SPLIT_APPEND(self->str, j, i);
2905 i = j = i + 1;
2906 } else
2907 i++;
2908 }
2909 if (j <= len) {
2910 SPLIT_APPEND(self->str, j, len);
2911 }
2912 return list;
2913
2914 onError:
2915 Py_DECREF(list);
2916 return NULL;
2917}
2918
2919static
2920PyObject *split_substring(PyUnicodeObject *self,
2921 PyObject *list,
2922 PyUnicodeObject *substring,
2923 int maxcount)
2924{
2925 register int i;
2926 register int j;
2927 int len = self->length;
2928 int sublen = substring->length;
2929 PyObject *str;
2930
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00002931 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002932 if (Py_UNICODE_MATCH(self, i, substring)) {
2933 if (maxcount-- <= 0)
2934 break;
2935 SPLIT_APPEND(self->str, j, i);
2936 i = j = i + sublen;
2937 } else
2938 i++;
2939 }
2940 if (j <= len) {
2941 SPLIT_APPEND(self->str, j, len);
2942 }
2943 return list;
2944
2945 onError:
2946 Py_DECREF(list);
2947 return NULL;
2948}
2949
2950#undef SPLIT_APPEND
2951
2952static
2953PyObject *split(PyUnicodeObject *self,
2954 PyUnicodeObject *substring,
2955 int maxcount)
2956{
2957 PyObject *list;
2958
2959 if (maxcount < 0)
2960 maxcount = INT_MAX;
2961
2962 list = PyList_New(0);
2963 if (!list)
2964 return NULL;
2965
2966 if (substring == NULL)
2967 return split_whitespace(self,list,maxcount);
2968
2969 else if (substring->length == 1)
2970 return split_char(self,list,substring->str[0],maxcount);
2971
2972 else if (substring->length == 0) {
2973 Py_DECREF(list);
2974 PyErr_SetString(PyExc_ValueError, "empty separator");
2975 return NULL;
2976 }
2977 else
2978 return split_substring(self,list,substring,maxcount);
2979}
2980
2981static
2982PyObject *strip(PyUnicodeObject *self,
2983 int left,
2984 int right)
2985{
2986 Py_UNICODE *p = self->str;
2987 int start = 0;
2988 int end = self->length;
2989
2990 if (left)
2991 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2992 start++;
2993
2994 if (right)
2995 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2996 end--;
2997
2998 if (start == 0 && end == self->length) {
2999 /* couldn't strip anything off, return original string */
3000 Py_INCREF(self);
3001 return (PyObject*) self;
3002 }
3003
3004 return (PyObject*) PyUnicode_FromUnicode(
3005 self->str + start,
3006 end - start
3007 );
3008}
3009
3010static
3011PyObject *replace(PyUnicodeObject *self,
3012 PyUnicodeObject *str1,
3013 PyUnicodeObject *str2,
3014 int maxcount)
3015{
3016 PyUnicodeObject *u;
3017
3018 if (maxcount < 0)
3019 maxcount = INT_MAX;
3020
3021 if (str1->length == 1 && str2->length == 1) {
3022 int i;
3023
3024 /* replace characters */
3025 if (!findchar(self->str, self->length, str1->str[0])) {
3026 /* nothing to replace, return original string */
3027 Py_INCREF(self);
3028 u = self;
3029 } else {
3030 Py_UNICODE u1 = str1->str[0];
3031 Py_UNICODE u2 = str2->str[0];
3032
3033 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3034 self->str,
3035 self->length
3036 );
3037 if (u)
3038 for (i = 0; i < u->length; i++)
3039 if (u->str[i] == u1) {
3040 if (--maxcount < 0)
3041 break;
3042 u->str[i] = u2;
3043 }
3044 }
3045
3046 } else {
3047 int n, i;
3048 Py_UNICODE *p;
3049
3050 /* replace strings */
3051 n = count(self, 0, self->length, str1);
3052 if (n > maxcount)
3053 n = maxcount;
3054 if (n == 0) {
3055 /* nothing to replace, return original string */
3056 Py_INCREF(self);
3057 u = self;
3058 } else {
3059 u = _PyUnicode_New(
3060 self->length + n * (str2->length - str1->length));
3061 if (u) {
3062 i = 0;
3063 p = u->str;
3064 while (i <= self->length - str1->length)
3065 if (Py_UNICODE_MATCH(self, i, str1)) {
3066 /* replace string segment */
3067 Py_UNICODE_COPY(p, str2->str, str2->length);
3068 p += str2->length;
3069 i += str1->length;
3070 if (--n <= 0) {
3071 /* copy remaining part */
3072 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3073 break;
3074 }
3075 } else
3076 *p++ = self->str[i++];
3077 }
3078 }
3079 }
3080
3081 return (PyObject *) u;
3082}
3083
3084/* --- Unicode Object Methods --------------------------------------------- */
3085
3086static char title__doc__[] =
3087"S.title() -> unicode\n\
3088\n\
3089Return a titlecased version of S, i.e. words start with title case\n\
3090characters, all remaining cased characters have lower case.";
3091
3092static PyObject*
3093unicode_title(PyUnicodeObject *self, PyObject *args)
3094{
3095 if (!PyArg_NoArgs(args))
3096 return NULL;
3097 return fixup(self, fixtitle);
3098}
3099
3100static char capitalize__doc__[] =
3101"S.capitalize() -> unicode\n\
3102\n\
3103Return a capitalized version of S, i.e. make the first character\n\
3104have upper case.";
3105
3106static PyObject*
3107unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3108{
3109 if (!PyArg_NoArgs(args))
3110 return NULL;
3111 return fixup(self, fixcapitalize);
3112}
3113
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Disabled method wrapper: split on whitespace, capitalize each word via
   fixup(), then join the words with PyUnicode_Join(NULL, ...). */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *joined;
    int idx;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the slot's old item. */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
3154
3155static char center__doc__[] =
3156"S.center(width) -> unicode\n\
3157\n\
3158Return S centered in a Unicode string of length width. Padding is done\n\
3159using spaces.";
3160
3161static PyObject *
3162unicode_center(PyUnicodeObject *self, PyObject *args)
3163{
3164 int marg, left;
3165 int width;
3166
3167 if (!PyArg_ParseTuple(args, "i:center", &width))
3168 return NULL;
3169
3170 if (self->length >= width) {
3171 Py_INCREF(self);
3172 return (PyObject*) self;
3173 }
3174
3175 marg = width - self->length;
3176 left = marg / 2 + (marg & width & 1);
3177
3178 return (PyObject*) pad(self, left, marg - left, ' ');
3179}
3180
Marc-André Lemburge5034372000-08-08 08:04:29 +00003181#if 0
3182
3183/* This code should go into some future Unicode collation support
3184 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003185 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003186
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003187/* speedy UTF-16 code point order comparison */
3188/* gleaned from: */
3189/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3190
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003191static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003192{
3193 0, 0, 0, 0, 0, 0, 0, 0,
3194 0, 0, 0, 0, 0, 0, 0, 0,
3195 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003196 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003197};
3198
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199static int
3200unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3201{
3202 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003203
Guido van Rossumd57fd912000-03-10 22:53:23 +00003204 Py_UNICODE *s1 = str1->str;
3205 Py_UNICODE *s2 = str2->str;
3206
3207 len1 = str1->length;
3208 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003209
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003211 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003212 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003213
3214 c1 = *s1++;
3215 c2 = *s2++;
3216 if (c1 > (1<<11) * 26)
3217 c1 += utf16Fixup[c1>>11];
3218 if (c2 > (1<<11) * 26)
3219 c2 += utf16Fixup[c2>>11];
3220
3221 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003222 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003223 if (diff)
3224 return (diff < 0) ? -1 : (diff != 0);
3225 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003226 }
3227
3228 return (len1 < len2) ? -1 : (len1 != len2);
3229}
3230
Marc-André Lemburge5034372000-08-08 08:04:29 +00003231#else
3232
3233static int
3234unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3235{
3236 register int len1, len2;
3237
3238 Py_UNICODE *s1 = str1->str;
3239 Py_UNICODE *s2 = str2->str;
3240
3241 len1 = str1->length;
3242 len2 = str2->length;
3243
3244 while (len1 > 0 && len2 > 0) {
3245 register long diff;
3246
3247 diff = (long)*s1++ - (long)*s2++;
3248 if (diff)
3249 return (diff < 0) ? -1 : (diff != 0);
3250 len1--; len2--;
3251 }
3252
3253 return (len1 < len2) ? -1 : (len1 != len2);
3254}
3255
3256#endif
3257
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258int PyUnicode_Compare(PyObject *left,
3259 PyObject *right)
3260{
3261 PyUnicodeObject *u = NULL, *v = NULL;
3262 int result;
3263
3264 /* Coerce the two arguments */
3265 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3266 if (u == NULL)
3267 goto onError;
3268 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3269 if (v == NULL)
3270 goto onError;
3271
Thomas Wouters7e474022000-07-16 12:04:32 +00003272 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273 if (v == u) {
3274 Py_DECREF(u);
3275 Py_DECREF(v);
3276 return 0;
3277 }
3278
3279 result = unicode_compare(u, v);
3280
3281 Py_DECREF(u);
3282 Py_DECREF(v);
3283 return result;
3284
3285onError:
3286 Py_XDECREF(u);
3287 Py_XDECREF(v);
3288 return -1;
3289}
3290
Guido van Rossum403d68b2000-03-13 15:55:09 +00003291int PyUnicode_Contains(PyObject *container,
3292 PyObject *element)
3293{
3294 PyUnicodeObject *u = NULL, *v = NULL;
3295 int result;
3296 register const Py_UNICODE *p, *e;
3297 register Py_UNICODE ch;
3298
3299 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003300 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003301 if (v == NULL) {
3302 PyErr_SetString(PyExc_TypeError,
3303 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003304 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003305 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003306 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3307 if (u == NULL) {
3308 Py_DECREF(v);
3309 goto onError;
3310 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003311
3312 /* Check v in u */
3313 if (PyUnicode_GET_SIZE(v) != 1) {
3314 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003315 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003316 goto onError;
3317 }
3318 ch = *PyUnicode_AS_UNICODE(v);
3319 p = PyUnicode_AS_UNICODE(u);
3320 e = p + PyUnicode_GET_SIZE(u);
3321 result = 0;
3322 while (p < e) {
3323 if (*p++ == ch) {
3324 result = 1;
3325 break;
3326 }
3327 }
3328
3329 Py_DECREF(u);
3330 Py_DECREF(v);
3331 return result;
3332
3333onError:
3334 Py_XDECREF(u);
3335 Py_XDECREF(v);
3336 return -1;
3337}
3338
Guido van Rossumd57fd912000-03-10 22:53:23 +00003339/* Concat to string or Unicode object giving a new Unicode object. */
3340
3341PyObject *PyUnicode_Concat(PyObject *left,
3342 PyObject *right)
3343{
3344 PyUnicodeObject *u = NULL, *v = NULL, *w;
3345
3346 /* Coerce the two arguments */
3347 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3348 if (u == NULL)
3349 goto onError;
3350 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3351 if (v == NULL)
3352 goto onError;
3353
3354 /* Shortcuts */
3355 if (v == unicode_empty) {
3356 Py_DECREF(v);
3357 return (PyObject *)u;
3358 }
3359 if (u == unicode_empty) {
3360 Py_DECREF(u);
3361 return (PyObject *)v;
3362 }
3363
3364 /* Concat the two Unicode strings */
3365 w = _PyUnicode_New(u->length + v->length);
3366 if (w == NULL)
3367 goto onError;
3368 Py_UNICODE_COPY(w->str, u->str, u->length);
3369 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3370
3371 Py_DECREF(u);
3372 Py_DECREF(v);
3373 return (PyObject *)w;
3374
3375onError:
3376 Py_XDECREF(u);
3377 Py_XDECREF(v);
3378 return NULL;
3379}
3380
3381static char count__doc__[] =
3382"S.count(sub[, start[, end]]) -> int\n\
3383\n\
3384Return the number of occurrences of substring sub in Unicode string\n\
3385S[start:end]. Optional arguments start and end are\n\
3386interpreted as in slice notation.";
3387
3388static PyObject *
3389unicode_count(PyUnicodeObject *self, PyObject *args)
3390{
3391 PyUnicodeObject *substring;
3392 int start = 0;
3393 int end = INT_MAX;
3394 PyObject *result;
3395
Guido van Rossumb8872e62000-05-09 14:14:27 +00003396 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3397 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003398 return NULL;
3399
3400 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3401 (PyObject *)substring);
3402 if (substring == NULL)
3403 return NULL;
3404
Guido van Rossumd57fd912000-03-10 22:53:23 +00003405 if (start < 0)
3406 start += self->length;
3407 if (start < 0)
3408 start = 0;
3409 if (end > self->length)
3410 end = self->length;
3411 if (end < 0)
3412 end += self->length;
3413 if (end < 0)
3414 end = 0;
3415
3416 result = PyInt_FromLong((long) count(self, start, end, substring));
3417
3418 Py_DECREF(substring);
3419 return result;
3420}
3421
3422static char encode__doc__[] =
3423"S.encode([encoding[,errors]]) -> string\n\
3424\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003425Return an encoded string version of S. Default encoding is the current\n\
3426default string encoding. errors may be given to set a different error\n\
3427handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3428a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003429
3430static PyObject *
3431unicode_encode(PyUnicodeObject *self, PyObject *args)
3432{
3433 char *encoding = NULL;
3434 char *errors = NULL;
3435 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3436 return NULL;
3437 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3438}
3439
3440static char expandtabs__doc__[] =
3441"S.expandtabs([tabsize]) -> unicode\n\
3442\n\
3443Return a copy of S where all tab characters are expanded using spaces.\n\
3444If tabsize is not given, a tab size of 8 characters is assumed.";
3445
3446static PyObject*
3447unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3448{
3449 Py_UNICODE *e;
3450 Py_UNICODE *p;
3451 Py_UNICODE *q;
3452 int i, j;
3453 PyUnicodeObject *u;
3454 int tabsize = 8;
3455
3456 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3457 return NULL;
3458
Thomas Wouters7e474022000-07-16 12:04:32 +00003459 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003460 i = j = 0;
3461 e = self->str + self->length;
3462 for (p = self->str; p < e; p++)
3463 if (*p == '\t') {
3464 if (tabsize > 0)
3465 j += tabsize - (j % tabsize);
3466 }
3467 else {
3468 j++;
3469 if (*p == '\n' || *p == '\r') {
3470 i += j;
3471 j = 0;
3472 }
3473 }
3474
3475 /* Second pass: create output string and fill it */
3476 u = _PyUnicode_New(i + j);
3477 if (!u)
3478 return NULL;
3479
3480 j = 0;
3481 q = u->str;
3482
3483 for (p = self->str; p < e; p++)
3484 if (*p == '\t') {
3485 if (tabsize > 0) {
3486 i = tabsize - (j % tabsize);
3487 j += i;
3488 while (i--)
3489 *q++ = ' ';
3490 }
3491 }
3492 else {
3493 j++;
3494 *q++ = *p;
3495 if (*p == '\n' || *p == '\r')
3496 j = 0;
3497 }
3498
3499 return (PyObject*) u;
3500}
3501
3502static char find__doc__[] =
3503"S.find(sub [,start [,end]]) -> int\n\
3504\n\
3505Return the lowest index in S where substring sub is found,\n\
3506such that sub is contained within s[start,end]. Optional\n\
3507arguments start and end are interpreted as in slice notation.\n\
3508\n\
3509Return -1 on failure.";
3510
3511static PyObject *
3512unicode_find(PyUnicodeObject *self, PyObject *args)
3513{
3514 PyUnicodeObject *substring;
3515 int start = 0;
3516 int end = INT_MAX;
3517 PyObject *result;
3518
Guido van Rossumb8872e62000-05-09 14:14:27 +00003519 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3520 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003521 return NULL;
3522 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3523 (PyObject *)substring);
3524 if (substring == NULL)
3525 return NULL;
3526
3527 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3528
3529 Py_DECREF(substring);
3530 return result;
3531}
3532
3533static PyObject *
3534unicode_getitem(PyUnicodeObject *self, int index)
3535{
3536 if (index < 0 || index >= self->length) {
3537 PyErr_SetString(PyExc_IndexError, "string index out of range");
3538 return NULL;
3539 }
3540
3541 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3542}
3543
3544static long
3545unicode_hash(PyUnicodeObject *self)
3546{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003547 /* Since Unicode objects compare equal to their ASCII string
3548 counterparts, they should use the individual character values
3549 as basis for their hash value. This is needed to assure that
3550 strings and Unicode objects behave in the same way as
3551 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003552
Fredrik Lundhdde61642000-07-10 18:27:47 +00003553 register int len;
3554 register Py_UNICODE *p;
3555 register long x;
3556
Guido van Rossumd57fd912000-03-10 22:53:23 +00003557 if (self->hash != -1)
3558 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003559 len = PyUnicode_GET_SIZE(self);
3560 p = PyUnicode_AS_UNICODE(self);
3561 x = *p << 7;
3562 while (--len >= 0)
3563 x = (1000003*x) ^ *p++;
3564 x ^= PyUnicode_GET_SIZE(self);
3565 if (x == -1)
3566 x = -2;
3567 self->hash = x;
3568 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003569}
3570
3571static char index__doc__[] =
3572"S.index(sub [,start [,end]]) -> int\n\
3573\n\
3574Like S.find() but raise ValueError when the substring is not found.";
3575
3576static PyObject *
3577unicode_index(PyUnicodeObject *self, PyObject *args)
3578{
3579 int result;
3580 PyUnicodeObject *substring;
3581 int start = 0;
3582 int end = INT_MAX;
3583
Guido van Rossumb8872e62000-05-09 14:14:27 +00003584 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3585 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003586 return NULL;
3587
3588 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3589 (PyObject *)substring);
3590 if (substring == NULL)
3591 return NULL;
3592
3593 result = findstring(self, substring, start, end, 1);
3594
3595 Py_DECREF(substring);
3596 if (result < 0) {
3597 PyErr_SetString(PyExc_ValueError, "substring not found");
3598 return NULL;
3599 }
3600 return PyInt_FromLong(result);
3601}
3602
3603static char islower__doc__[] =
3604"S.islower() -> int\n\
3605\n\
3606Return 1 if all cased characters in S are lowercase and there is\n\
3607at least one cased character in S, 0 otherwise.";
3608
3609static PyObject*
3610unicode_islower(PyUnicodeObject *self, PyObject *args)
3611{
3612 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3613 register const Py_UNICODE *e;
3614 int cased;
3615
3616 if (!PyArg_NoArgs(args))
3617 return NULL;
3618
3619 /* Shortcut for single character strings */
3620 if (PyUnicode_GET_SIZE(self) == 1)
3621 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3622
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003623 /* Special case for empty strings */
3624 if (PyString_GET_SIZE(self) == 0)
3625 return PyInt_FromLong(0);
3626
Guido van Rossumd57fd912000-03-10 22:53:23 +00003627 e = p + PyUnicode_GET_SIZE(self);
3628 cased = 0;
3629 for (; p < e; p++) {
3630 register const Py_UNICODE ch = *p;
3631
3632 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3633 return PyInt_FromLong(0);
3634 else if (!cased && Py_UNICODE_ISLOWER(ch))
3635 cased = 1;
3636 }
3637 return PyInt_FromLong(cased);
3638}
3639
3640static char isupper__doc__[] =
3641"S.isupper() -> int\n\
3642\n\
3643Return 1 if all cased characters in S are uppercase and there is\n\
3644at least one cased character in S, 0 otherwise.";
3645
3646static PyObject*
3647unicode_isupper(PyUnicodeObject *self, PyObject *args)
3648{
3649 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3650 register const Py_UNICODE *e;
3651 int cased;
3652
3653 if (!PyArg_NoArgs(args))
3654 return NULL;
3655
3656 /* Shortcut for single character strings */
3657 if (PyUnicode_GET_SIZE(self) == 1)
3658 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3659
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003660 /* Special case for empty strings */
3661 if (PyString_GET_SIZE(self) == 0)
3662 return PyInt_FromLong(0);
3663
Guido van Rossumd57fd912000-03-10 22:53:23 +00003664 e = p + PyUnicode_GET_SIZE(self);
3665 cased = 0;
3666 for (; p < e; p++) {
3667 register const Py_UNICODE ch = *p;
3668
3669 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3670 return PyInt_FromLong(0);
3671 else if (!cased && Py_UNICODE_ISUPPER(ch))
3672 cased = 1;
3673 }
3674 return PyInt_FromLong(cased);
3675}
3676
3677static char istitle__doc__[] =
3678"S.istitle() -> int\n\
3679\n\
3680Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3681may only follow uncased characters and lowercase characters only cased\n\
3682ones. Return 0 otherwise.";
3683
3684static PyObject*
3685unicode_istitle(PyUnicodeObject *self, PyObject *args)
3686{
3687 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3688 register const Py_UNICODE *e;
3689 int cased, previous_is_cased;
3690
3691 if (!PyArg_NoArgs(args))
3692 return NULL;
3693
3694 /* Shortcut for single character strings */
3695 if (PyUnicode_GET_SIZE(self) == 1)
3696 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3697 (Py_UNICODE_ISUPPER(*p) != 0));
3698
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003699 /* Special case for empty strings */
3700 if (PyString_GET_SIZE(self) == 0)
3701 return PyInt_FromLong(0);
3702
Guido van Rossumd57fd912000-03-10 22:53:23 +00003703 e = p + PyUnicode_GET_SIZE(self);
3704 cased = 0;
3705 previous_is_cased = 0;
3706 for (; p < e; p++) {
3707 register const Py_UNICODE ch = *p;
3708
3709 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3710 if (previous_is_cased)
3711 return PyInt_FromLong(0);
3712 previous_is_cased = 1;
3713 cased = 1;
3714 }
3715 else if (Py_UNICODE_ISLOWER(ch)) {
3716 if (!previous_is_cased)
3717 return PyInt_FromLong(0);
3718 previous_is_cased = 1;
3719 cased = 1;
3720 }
3721 else
3722 previous_is_cased = 0;
3723 }
3724 return PyInt_FromLong(cased);
3725}
3726
3727static char isspace__doc__[] =
3728"S.isspace() -> int\n\
3729\n\
3730Return 1 if there are only whitespace characters in S,\n\
37310 otherwise.";
3732
3733static PyObject*
3734unicode_isspace(PyUnicodeObject *self, PyObject *args)
3735{
3736 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3737 register const Py_UNICODE *e;
3738
3739 if (!PyArg_NoArgs(args))
3740 return NULL;
3741
3742 /* Shortcut for single character strings */
3743 if (PyUnicode_GET_SIZE(self) == 1 &&
3744 Py_UNICODE_ISSPACE(*p))
3745 return PyInt_FromLong(1);
3746
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003747 /* Special case for empty strings */
3748 if (PyString_GET_SIZE(self) == 0)
3749 return PyInt_FromLong(0);
3750
Guido van Rossumd57fd912000-03-10 22:53:23 +00003751 e = p + PyUnicode_GET_SIZE(self);
3752 for (; p < e; p++) {
3753 if (!Py_UNICODE_ISSPACE(*p))
3754 return PyInt_FromLong(0);
3755 }
3756 return PyInt_FromLong(1);
3757}
3758
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003759static char isalpha__doc__[] =
3760"S.isalpha() -> int\n\
3761\n\
3762Return 1 if all characters in S are alphabetic\n\
3763and there is at least one character in S, 0 otherwise.";
3764
3765static PyObject*
3766unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3767{
3768 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3769 register const Py_UNICODE *e;
3770
3771 if (!PyArg_NoArgs(args))
3772 return NULL;
3773
3774 /* Shortcut for single character strings */
3775 if (PyUnicode_GET_SIZE(self) == 1 &&
3776 Py_UNICODE_ISALPHA(*p))
3777 return PyInt_FromLong(1);
3778
3779 /* Special case for empty strings */
3780 if (PyString_GET_SIZE(self) == 0)
3781 return PyInt_FromLong(0);
3782
3783 e = p + PyUnicode_GET_SIZE(self);
3784 for (; p < e; p++) {
3785 if (!Py_UNICODE_ISALPHA(*p))
3786 return PyInt_FromLong(0);
3787 }
3788 return PyInt_FromLong(1);
3789}
3790
3791static char isalnum__doc__[] =
3792"S.isalnum() -> int\n\
3793\n\
3794Return 1 if all characters in S are alphanumeric\n\
3795and there is at least one character in S, 0 otherwise.";
3796
3797static PyObject*
3798unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3799{
3800 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3801 register const Py_UNICODE *e;
3802
3803 if (!PyArg_NoArgs(args))
3804 return NULL;
3805
3806 /* Shortcut for single character strings */
3807 if (PyUnicode_GET_SIZE(self) == 1 &&
3808 Py_UNICODE_ISALNUM(*p))
3809 return PyInt_FromLong(1);
3810
3811 /* Special case for empty strings */
3812 if (PyString_GET_SIZE(self) == 0)
3813 return PyInt_FromLong(0);
3814
3815 e = p + PyUnicode_GET_SIZE(self);
3816 for (; p < e; p++) {
3817 if (!Py_UNICODE_ISALNUM(*p))
3818 return PyInt_FromLong(0);
3819 }
3820 return PyInt_FromLong(1);
3821}
3822
Guido van Rossumd57fd912000-03-10 22:53:23 +00003823static char isdecimal__doc__[] =
3824"S.isdecimal() -> int\n\
3825\n\
3826Return 1 if there are only decimal characters in S,\n\
38270 otherwise.";
3828
3829static PyObject*
3830unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3831{
3832 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3833 register const Py_UNICODE *e;
3834
3835 if (!PyArg_NoArgs(args))
3836 return NULL;
3837
3838 /* Shortcut for single character strings */
3839 if (PyUnicode_GET_SIZE(self) == 1 &&
3840 Py_UNICODE_ISDECIMAL(*p))
3841 return PyInt_FromLong(1);
3842
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003843 /* Special case for empty strings */
3844 if (PyString_GET_SIZE(self) == 0)
3845 return PyInt_FromLong(0);
3846
Guido van Rossumd57fd912000-03-10 22:53:23 +00003847 e = p + PyUnicode_GET_SIZE(self);
3848 for (; p < e; p++) {
3849 if (!Py_UNICODE_ISDECIMAL(*p))
3850 return PyInt_FromLong(0);
3851 }
3852 return PyInt_FromLong(1);
3853}
3854
3855static char isdigit__doc__[] =
3856"S.isdigit() -> int\n\
3857\n\
3858Return 1 if there are only digit characters in S,\n\
38590 otherwise.";
3860
3861static PyObject*
3862unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3863{
3864 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3865 register const Py_UNICODE *e;
3866
3867 if (!PyArg_NoArgs(args))
3868 return NULL;
3869
3870 /* Shortcut for single character strings */
3871 if (PyUnicode_GET_SIZE(self) == 1 &&
3872 Py_UNICODE_ISDIGIT(*p))
3873 return PyInt_FromLong(1);
3874
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003875 /* Special case for empty strings */
3876 if (PyString_GET_SIZE(self) == 0)
3877 return PyInt_FromLong(0);
3878
Guido van Rossumd57fd912000-03-10 22:53:23 +00003879 e = p + PyUnicode_GET_SIZE(self);
3880 for (; p < e; p++) {
3881 if (!Py_UNICODE_ISDIGIT(*p))
3882 return PyInt_FromLong(0);
3883 }
3884 return PyInt_FromLong(1);
3885}
3886
3887static char isnumeric__doc__[] =
3888"S.isnumeric() -> int\n\
3889\n\
3890Return 1 if there are only numeric characters in S,\n\
38910 otherwise.";
3892
3893static PyObject*
3894unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3895{
3896 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3897 register const Py_UNICODE *e;
3898
3899 if (!PyArg_NoArgs(args))
3900 return NULL;
3901
3902 /* Shortcut for single character strings */
3903 if (PyUnicode_GET_SIZE(self) == 1 &&
3904 Py_UNICODE_ISNUMERIC(*p))
3905 return PyInt_FromLong(1);
3906
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003907 /* Special case for empty strings */
3908 if (PyString_GET_SIZE(self) == 0)
3909 return PyInt_FromLong(0);
3910
Guido van Rossumd57fd912000-03-10 22:53:23 +00003911 e = p + PyUnicode_GET_SIZE(self);
3912 for (; p < e; p++) {
3913 if (!Py_UNICODE_ISNUMERIC(*p))
3914 return PyInt_FromLong(0);
3915 }
3916 return PyInt_FromLong(1);
3917}
3918
3919static char join__doc__[] =
3920"S.join(sequence) -> unicode\n\
3921\n\
3922Return a string which is the concatenation of the strings in the\n\
3923sequence. The separator between elements is S.";
3924
3925static PyObject*
3926unicode_join(PyUnicodeObject *self, PyObject *args)
3927{
3928 PyObject *data;
3929 if (!PyArg_ParseTuple(args, "O:join", &data))
3930 return NULL;
3931
3932 return PyUnicode_Join((PyObject *)self, data);
3933}
3934
3935static int
3936unicode_length(PyUnicodeObject *self)
3937{
3938 return self->length;
3939}
3940
3941static char ljust__doc__[] =
3942"S.ljust(width) -> unicode\n\
3943\n\
3944Return S left justified in a Unicode string of length width. Padding is\n\
3945done using spaces.";
3946
3947static PyObject *
3948unicode_ljust(PyUnicodeObject *self, PyObject *args)
3949{
3950 int width;
3951 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3952 return NULL;
3953
3954 if (self->length >= width) {
3955 Py_INCREF(self);
3956 return (PyObject*) self;
3957 }
3958
3959 return (PyObject*) pad(self, 0, width - self->length, ' ');
3960}
3961
3962static char lower__doc__[] =
3963"S.lower() -> unicode\n\
3964\n\
3965Return a copy of the string S converted to lowercase.";
3966
3967static PyObject*
3968unicode_lower(PyUnicodeObject *self, PyObject *args)
3969{
3970 if (!PyArg_NoArgs(args))
3971 return NULL;
3972 return fixup(self, fixlower);
3973}
3974
3975static char lstrip__doc__[] =
3976"S.lstrip() -> unicode\n\
3977\n\
3978Return a copy of the string S with leading whitespace removed.";
3979
3980static PyObject *
3981unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3982{
3983 if (!PyArg_NoArgs(args))
3984 return NULL;
3985 return strip(self, 1, 0);
3986}
3987
3988static PyObject*
3989unicode_repeat(PyUnicodeObject *str, int len)
3990{
3991 PyUnicodeObject *u;
3992 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00003993 int nchars;
3994 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003995
3996 if (len < 0)
3997 len = 0;
3998
3999 if (len == 1) {
4000 /* no repeat, return original string */
4001 Py_INCREF(str);
4002 return (PyObject*) str;
4003 }
Tim Peters8f422462000-09-09 06:13:41 +00004004
4005 /* ensure # of chars needed doesn't overflow int and # of bytes
4006 * needed doesn't overflow size_t
4007 */
4008 nchars = len * str->length;
4009 if (len && nchars / len != str->length) {
4010 PyErr_SetString(PyExc_OverflowError,
4011 "repeated string is too long");
4012 return NULL;
4013 }
4014 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4015 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4016 PyErr_SetString(PyExc_OverflowError,
4017 "repeated string is too long");
4018 return NULL;
4019 }
4020 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004021 if (!u)
4022 return NULL;
4023
4024 p = u->str;
4025
4026 while (len-- > 0) {
4027 Py_UNICODE_COPY(p, str->str, str->length);
4028 p += str->length;
4029 }
4030
4031 return (PyObject*) u;
4032}
4033
4034PyObject *PyUnicode_Replace(PyObject *obj,
4035 PyObject *subobj,
4036 PyObject *replobj,
4037 int maxcount)
4038{
4039 PyObject *self;
4040 PyObject *str1;
4041 PyObject *str2;
4042 PyObject *result;
4043
4044 self = PyUnicode_FromObject(obj);
4045 if (self == NULL)
4046 return NULL;
4047 str1 = PyUnicode_FromObject(subobj);
4048 if (str1 == NULL) {
4049 Py_DECREF(self);
4050 return NULL;
4051 }
4052 str2 = PyUnicode_FromObject(replobj);
4053 if (str2 == NULL) {
4054 Py_DECREF(self);
4055 Py_DECREF(str1);
4056 return NULL;
4057 }
4058 result = replace((PyUnicodeObject *)self,
4059 (PyUnicodeObject *)str1,
4060 (PyUnicodeObject *)str2,
4061 maxcount);
4062 Py_DECREF(self);
4063 Py_DECREF(str1);
4064 Py_DECREF(str2);
4065 return result;
4066}
4067
4068static char replace__doc__[] =
4069"S.replace (old, new[, maxsplit]) -> unicode\n\
4070\n\
4071Return a copy of S with all occurrences of substring\n\
4072old replaced by new. If the optional argument maxsplit is\n\
4073given, only the first maxsplit occurrences are replaced.";
4074
4075static PyObject*
4076unicode_replace(PyUnicodeObject *self, PyObject *args)
4077{
4078 PyUnicodeObject *str1;
4079 PyUnicodeObject *str2;
4080 int maxcount = -1;
4081 PyObject *result;
4082
4083 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4084 return NULL;
4085 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4086 if (str1 == NULL)
4087 return NULL;
4088 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4089 if (str2 == NULL)
4090 return NULL;
4091
4092 result = replace(self, str1, str2, maxcount);
4093
4094 Py_DECREF(str1);
4095 Py_DECREF(str2);
4096 return result;
4097}
4098
4099static
4100PyObject *unicode_repr(PyObject *unicode)
4101{
4102 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4103 PyUnicode_GET_SIZE(unicode),
4104 1);
4105}
4106
4107static char rfind__doc__[] =
4108"S.rfind(sub [,start [,end]]) -> int\n\
4109\n\
4110Return the highest index in S where substring sub is found,\n\
4111such that sub is contained within s[start,end]. Optional\n\
4112arguments start and end are interpreted as in slice notation.\n\
4113\n\
4114Return -1 on failure.";
4115
4116static PyObject *
4117unicode_rfind(PyUnicodeObject *self, PyObject *args)
4118{
4119 PyUnicodeObject *substring;
4120 int start = 0;
4121 int end = INT_MAX;
4122 PyObject *result;
4123
Guido van Rossumb8872e62000-05-09 14:14:27 +00004124 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4125 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126 return NULL;
4127 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4128 (PyObject *)substring);
4129 if (substring == NULL)
4130 return NULL;
4131
4132 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4133
4134 Py_DECREF(substring);
4135 return result;
4136}
4137
4138static char rindex__doc__[] =
4139"S.rindex(sub [,start [,end]]) -> int\n\
4140\n\
4141Like S.rfind() but raise ValueError when the substring is not found.";
4142
4143static PyObject *
4144unicode_rindex(PyUnicodeObject *self, PyObject *args)
4145{
4146 int result;
4147 PyUnicodeObject *substring;
4148 int start = 0;
4149 int end = INT_MAX;
4150
Guido van Rossumb8872e62000-05-09 14:14:27 +00004151 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4152 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153 return NULL;
4154 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4155 (PyObject *)substring);
4156 if (substring == NULL)
4157 return NULL;
4158
4159 result = findstring(self, substring, start, end, -1);
4160
4161 Py_DECREF(substring);
4162 if (result < 0) {
4163 PyErr_SetString(PyExc_ValueError, "substring not found");
4164 return NULL;
4165 }
4166 return PyInt_FromLong(result);
4167}
4168
4169static char rjust__doc__[] =
4170"S.rjust(width) -> unicode\n\
4171\n\
4172Return S right justified in a Unicode string of length width. Padding is\n\
4173done using spaces.";
4174
4175static PyObject *
4176unicode_rjust(PyUnicodeObject *self, PyObject *args)
4177{
4178 int width;
4179 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4180 return NULL;
4181
4182 if (self->length >= width) {
4183 Py_INCREF(self);
4184 return (PyObject*) self;
4185 }
4186
4187 return (PyObject*) pad(self, width - self->length, 0, ' ');
4188}
4189
4190static char rstrip__doc__[] =
4191"S.rstrip() -> unicode\n\
4192\n\
4193Return a copy of the string S with trailing whitespace removed.";
4194
4195static PyObject *
4196unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4197{
4198 if (!PyArg_NoArgs(args))
4199 return NULL;
4200 return strip(self, 0, 1);
4201}
4202
4203static PyObject*
4204unicode_slice(PyUnicodeObject *self, int start, int end)
4205{
4206 /* standard clamping */
4207 if (start < 0)
4208 start = 0;
4209 if (end < 0)
4210 end = 0;
4211 if (end > self->length)
4212 end = self->length;
4213 if (start == 0 && end == self->length) {
4214 /* full slice, return original string */
4215 Py_INCREF(self);
4216 return (PyObject*) self;
4217 }
4218 if (start > end)
4219 start = end;
4220 /* copy slice */
4221 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4222 end - start);
4223}
4224
4225PyObject *PyUnicode_Split(PyObject *s,
4226 PyObject *sep,
4227 int maxsplit)
4228{
4229 PyObject *result;
4230
4231 s = PyUnicode_FromObject(s);
4232 if (s == NULL)
4233 return NULL;
4234 if (sep != NULL) {
4235 sep = PyUnicode_FromObject(sep);
4236 if (sep == NULL) {
4237 Py_DECREF(s);
4238 return NULL;
4239 }
4240 }
4241
4242 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4243
4244 Py_DECREF(s);
4245 Py_XDECREF(sep);
4246 return result;
4247}
4248
4249static char split__doc__[] =
4250"S.split([sep [,maxsplit]]) -> list of strings\n\
4251\n\
4252Return a list of the words in S, using sep as the\n\
4253delimiter string. If maxsplit is given, at most maxsplit\n\
4254splits are done. If sep is not specified, any whitespace string\n\
4255is a separator.";
4256
4257static PyObject*
4258unicode_split(PyUnicodeObject *self, PyObject *args)
4259{
4260 PyObject *substring = Py_None;
4261 int maxcount = -1;
4262
4263 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4264 return NULL;
4265
4266 if (substring == Py_None)
4267 return split(self, NULL, maxcount);
4268 else if (PyUnicode_Check(substring))
4269 return split(self, (PyUnicodeObject *)substring, maxcount);
4270 else
4271 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4272}
4273
/* Docstring for the "splitlines" entry of unicode_methods.  The
   signature line uses balanced brackets: keepends is the single
   optional argument. */
static char splitlines__doc__[] =
    "S.splitlines([keepends]) -> list of strings\n"
    "\n"
    "Return a list of the lines in S, breaking at line boundaries.\n"
    "Line breaks are not included in the resulting list unless keepends\n"
    "is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004280
4281static PyObject*
4282unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4283{
Guido van Rossum86662912000-04-11 15:38:46 +00004284 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004285
Guido van Rossum86662912000-04-11 15:38:46 +00004286 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004287 return NULL;
4288
Guido van Rossum86662912000-04-11 15:38:46 +00004289 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004290}
4291
4292static
4293PyObject *unicode_str(PyUnicodeObject *self)
4294{
Fred Drakee4315f52000-05-09 19:53:39 +00004295 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004296}
4297
4298static char strip__doc__[] =
4299"S.strip() -> unicode\n\
4300\n\
4301Return a copy of S with leading and trailing whitespace removed.";
4302
4303static PyObject *
4304unicode_strip(PyUnicodeObject *self, PyObject *args)
4305{
4306 if (!PyArg_NoArgs(args))
4307 return NULL;
4308 return strip(self, 1, 1);
4309}
4310
4311static char swapcase__doc__[] =
4312"S.swapcase() -> unicode\n\
4313\n\
4314Return a copy of S with uppercase characters converted to lowercase\n\
4315and vice versa.";
4316
4317static PyObject*
4318unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4319{
4320 if (!PyArg_NoArgs(args))
4321 return NULL;
4322 return fixup(self, fixswapcase);
4323}
4324
4325static char translate__doc__[] =
4326"S.translate(table) -> unicode\n\
4327\n\
4328Return a copy of the string S, where all characters have been mapped\n\
4329through the given translation table, which must be a mapping of\n\
4330Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4331are left untouched. Characters mapped to None are deleted.";
4332
4333static PyObject*
4334unicode_translate(PyUnicodeObject *self, PyObject *args)
4335{
4336 PyObject *table;
4337
4338 if (!PyArg_ParseTuple(args, "O:translate", &table))
4339 return NULL;
4340 return PyUnicode_TranslateCharmap(self->str,
4341 self->length,
4342 table,
4343 "ignore");
4344}
4345
4346static char upper__doc__[] =
4347"S.upper() -> unicode\n\
4348\n\
4349Return a copy of S converted to uppercase.";
4350
4351static PyObject*
4352unicode_upper(PyUnicodeObject *self, PyObject *args)
4353{
4354 if (!PyArg_NoArgs(args))
4355 return NULL;
4356 return fixup(self, fixupper);
4357}
4358
#if 0
/* Docstring for the (currently disabled) "zfill" method. */
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* S.zfill(width): left-pad with '0' up to width characters.

   If the string is already at least width long, self is returned
   (new reference).  Otherwise pad() builds the padded copy; if the
   original text began with '+' or '-', the sign is moved in front of
   the padding.  Returns NULL on argument errors or when pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() result must be checked before it is dereferenced below. */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original text. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4394
#if 0
/* Debugging aid (disabled): report unicode_freelist_size as a Python
   int.  Accepts no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    return PyArg_NoArgs(args) ? PyInt_FromLong(unicode_freelist_size)
                              : NULL;
}
#endif
4404
4405static char startswith__doc__[] =
4406"S.startswith(prefix[, start[, end]]) -> int\n\
4407\n\
4408Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4409optional start, test S beginning at that position. With optional end, stop\n\
4410comparing S at that position.";
4411
4412static PyObject *
4413unicode_startswith(PyUnicodeObject *self,
4414 PyObject *args)
4415{
4416 PyUnicodeObject *substring;
4417 int start = 0;
4418 int end = INT_MAX;
4419 PyObject *result;
4420
Guido van Rossumb8872e62000-05-09 14:14:27 +00004421 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4422 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423 return NULL;
4424 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4425 (PyObject *)substring);
4426 if (substring == NULL)
4427 return NULL;
4428
4429 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4430
4431 Py_DECREF(substring);
4432 return result;
4433}
4434
4435
4436static char endswith__doc__[] =
4437"S.endswith(suffix[, start[, end]]) -> int\n\
4438\n\
4439Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4440optional start, test S beginning at that position. With optional end, stop\n\
4441comparing S at that position.";
4442
4443static PyObject *
4444unicode_endswith(PyUnicodeObject *self,
4445 PyObject *args)
4446{
4447 PyUnicodeObject *substring;
4448 int start = 0;
4449 int end = INT_MAX;
4450 PyObject *result;
4451
Guido van Rossumb8872e62000-05-09 14:14:27 +00004452 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4453 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004454 return NULL;
4455 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4456 (PyObject *)substring);
4457 if (substring == NULL)
4458 return NULL;
4459
4460 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4461
4462 Py_DECREF(substring);
4463 return result;
4464}
4465
4466
/* Method table for unicode objects, consulted by unicode_getattr().

   Each entry is {name, C function, flags, docstring}.  In this table
   the flag value 1 is used for functions that parse their arguments
   with PyArg_ParseTuple() and 0 for those that use PyArg_NoArgs().
   The table is terminated by a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
    /* Methods below are compiled out together with their
       implementations. */
#if 0
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    {NULL, NULL}
};
4519
4520static PyObject *
4521unicode_getattr(PyUnicodeObject *self, char *name)
4522{
4523 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4524}
4525
/* Sequence protocol slots for unicode objects (installed as
   tp_as_sequence in PyUnicode_Type).  The two assignment slots are 0:
   no item or slice assignment handler is provided. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length,           /* sq_length */
    (binaryfunc) PyUnicode_Concat,      /* sq_concat */
    (intargfunc) unicode_repeat,        /* sq_repeat */
    (intargfunc) unicode_getitem,       /* sq_item */
    (intintargfunc) unicode_slice,      /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains,     /*sq_contains*/
};
4536
4537static int
4538unicode_buffer_getreadbuf(PyUnicodeObject *self,
4539 int index,
4540 const void **ptr)
4541{
4542 if (index != 0) {
4543 PyErr_SetString(PyExc_SystemError,
4544 "accessing non-existent unicode segment");
4545 return -1;
4546 }
4547 *ptr = (void *) self->str;
4548 return PyUnicode_GET_DATA_SIZE(self);
4549}
4550
4551static int
4552unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4553 const void **ptr)
4554{
4555 PyErr_SetString(PyExc_TypeError,
4556 "cannot use unicode as modifyable buffer");
4557 return -1;
4558}
4559
4560static int
4561unicode_buffer_getsegcount(PyUnicodeObject *self,
4562 int *lenp)
4563{
4564 if (lenp)
4565 *lenp = PyUnicode_GET_DATA_SIZE(self);
4566 return 1;
4567}
4568
4569static int
4570unicode_buffer_getcharbuf(PyUnicodeObject *self,
4571 int index,
4572 const void **ptr)
4573{
4574 PyObject *str;
4575
4576 if (index != 0) {
4577 PyErr_SetString(PyExc_SystemError,
4578 "accessing non-existent unicode segment");
4579 return -1;
4580 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004581 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004582 if (str == NULL)
4583 return -1;
4584 *ptr = (void *) PyString_AS_STRING(str);
4585 return PyString_GET_SIZE(str);
4586}
4587
4588/* Helpers for PyUnicode_Format() */
4589
4590static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004591getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592{
4593 int argidx = *p_argidx;
4594 if (argidx < arglen) {
4595 (*p_argidx)++;
4596 if (arglen < 0)
4597 return args;
4598 else
4599 return PyTuple_GetItem(args, argidx);
4600 }
4601 PyErr_SetString(PyExc_TypeError,
4602 "not enough arguments for format string");
4603 return NULL;
4604}
4605
/* Conversion flag bits collected by PyUnicode_Format() while parsing
   a format specifier; each corresponds to one flag character. */
#define F_LJUST (1<<0)  /* '-' (also set for a negative '*' width) */
#define F_SIGN  (1<<1)  /* '+' */
#define F_BLANK (1<<2)  /* ' ' */
#define F_ALT   (1<<3)  /* '#' */
#define F_ZERO  (1<<4)  /* '0' */
4611
4612static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004613int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004614{
4615 register int i;
4616 int len;
4617 va_list va;
4618 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004619 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004620
4621 /* First, format the string as char array, then expand to Py_UNICODE
4622 array. */
4623 charbuffer = (char *)buffer;
4624 len = vsprintf(charbuffer, format, va);
4625 for (i = len - 1; i >= 0; i--)
4626 buffer[i] = (Py_UNICODE) charbuffer[i];
4627
4628 va_end(va);
4629 return len;
4630}
4631
4632static int
4633formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004634 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004635 int flags,
4636 int prec,
4637 int type,
4638 PyObject *v)
4639{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004640 /* fmt = '%#.' + `prec` + `type`
4641 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004642 char fmt[20];
4643 double x;
4644
4645 x = PyFloat_AsDouble(v);
4646 if (x == -1.0 && PyErr_Occurred())
4647 return -1;
4648 if (prec < 0)
4649 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004650 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4651 type = 'g';
4652 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004653 /* worst case length calc to ensure no buffer overrun:
4654 fmt = %#.<prec>g
4655 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4656 for any double rep.)
4657 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4658 If prec=0 the effective precision is 1 (the leading digit is
4659 always given), therefore increase by one to 10+prec. */
4660 if (buflen <= (size_t)10 + (size_t)prec) {
4661 PyErr_SetString(PyExc_OverflowError,
4662 "formatted float is too long (precision too long?)");
4663 return -1;
4664 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004665 return usprintf(buf, fmt, x);
4666}
4667
Tim Peters38fd5b62000-09-21 05:43:11 +00004668static PyObject*
4669formatlong(PyObject *val, int flags, int prec, int type)
4670{
4671 char *buf;
4672 int i, len;
4673 PyObject *str; /* temporary string object. */
4674 PyUnicodeObject *result;
4675
4676 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4677 if (!str)
4678 return NULL;
4679 result = _PyUnicode_New(len);
4680 for (i = 0; i < len; i++)
4681 result->str[i] = buf[i];
4682 result->str[len] = 0;
4683 Py_DECREF(str);
4684 return (PyObject*)result;
4685}
4686
Guido van Rossumd57fd912000-03-10 22:53:23 +00004687static int
4688formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004689 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004690 int flags,
4691 int prec,
4692 int type,
4693 PyObject *v)
4694{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004695 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004696 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4697 + 1 + 1 = 24*/
4698 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004699 long x;
4700
4701 x = PyInt_AsLong(v);
4702 if (x == -1 && PyErr_Occurred())
4703 return -1;
4704 if (prec < 0)
4705 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004706 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4707 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4708 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4709 PyErr_SetString(PyExc_OverflowError,
4710 "formatted integer is too long (precision too long?)");
4711 return -1;
4712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4714 return usprintf(buf, fmt, x);
4715}
4716
4717static int
4718formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004719 size_t buflen,
4720 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004722 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004723 if (PyUnicode_Check(v)) {
4724 if (PyUnicode_GET_SIZE(v) != 1)
4725 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004727 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004729 else if (PyString_Check(v)) {
4730 if (PyString_GET_SIZE(v) != 1)
4731 goto onError;
4732 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4733 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734
4735 else {
4736 /* Integer input truncated to a character */
4737 long x;
4738 x = PyInt_AsLong(v);
4739 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004740 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004741 buf[0] = (char) x;
4742 }
4743 buf[1] = '\0';
4744 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004745
4746 onError:
4747 PyErr_SetString(PyExc_TypeError,
4748 "%c requires int or char");
4749 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004750}
4751
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in Py_UNICODE elements, of the scratch
   buffer in which floats, ints, and chars are formatted by
   PyUnicode_Format().  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.
*/
#define FORMATBUFLEN (size_t)120
4761
Guido van Rossumd57fd912000-03-10 22:53:23 +00004762PyObject *PyUnicode_Format(PyObject *format,
4763 PyObject *args)
4764{
4765 Py_UNICODE *fmt, *res;
4766 int fmtcnt, rescnt, reslen, arglen, argidx;
4767 int args_owned = 0;
4768 PyUnicodeObject *result = NULL;
4769 PyObject *dict = NULL;
4770 PyObject *uformat;
4771
4772 if (format == NULL || args == NULL) {
4773 PyErr_BadInternalCall();
4774 return NULL;
4775 }
4776 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004777 if (uformat == NULL)
4778 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779 fmt = PyUnicode_AS_UNICODE(uformat);
4780 fmtcnt = PyUnicode_GET_SIZE(uformat);
4781
4782 reslen = rescnt = fmtcnt + 100;
4783 result = _PyUnicode_New(reslen);
4784 if (result == NULL)
4785 goto onError;
4786 res = PyUnicode_AS_UNICODE(result);
4787
4788 if (PyTuple_Check(args)) {
4789 arglen = PyTuple_Size(args);
4790 argidx = 0;
4791 }
4792 else {
4793 arglen = -1;
4794 argidx = -2;
4795 }
4796 if (args->ob_type->tp_as_mapping)
4797 dict = args;
4798
4799 while (--fmtcnt >= 0) {
4800 if (*fmt != '%') {
4801 if (--rescnt < 0) {
4802 rescnt = fmtcnt + 100;
4803 reslen += rescnt;
4804 if (_PyUnicode_Resize(result, reslen) < 0)
4805 return NULL;
4806 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4807 --rescnt;
4808 }
4809 *res++ = *fmt++;
4810 }
4811 else {
4812 /* Got a format specifier */
4813 int flags = 0;
4814 int width = -1;
4815 int prec = -1;
4816 int size = 0;
4817 Py_UNICODE c = '\0';
4818 Py_UNICODE fill;
4819 PyObject *v = NULL;
4820 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004821 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004822 Py_UNICODE sign;
4823 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004824 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004825
4826 fmt++;
4827 if (*fmt == '(') {
4828 Py_UNICODE *keystart;
4829 int keylen;
4830 PyObject *key;
4831 int pcount = 1;
4832
4833 if (dict == NULL) {
4834 PyErr_SetString(PyExc_TypeError,
4835 "format requires a mapping");
4836 goto onError;
4837 }
4838 ++fmt;
4839 --fmtcnt;
4840 keystart = fmt;
4841 /* Skip over balanced parentheses */
4842 while (pcount > 0 && --fmtcnt >= 0) {
4843 if (*fmt == ')')
4844 --pcount;
4845 else if (*fmt == '(')
4846 ++pcount;
4847 fmt++;
4848 }
4849 keylen = fmt - keystart - 1;
4850 if (fmtcnt < 0 || pcount > 0) {
4851 PyErr_SetString(PyExc_ValueError,
4852 "incomplete format key");
4853 goto onError;
4854 }
Fred Drakee4315f52000-05-09 19:53:39 +00004855 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856 then looked up since Python uses strings to hold
4857 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004858 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004859 key = PyUnicode_EncodeUTF8(keystart,
4860 keylen,
4861 NULL);
4862 if (key == NULL)
4863 goto onError;
4864 if (args_owned) {
4865 Py_DECREF(args);
4866 args_owned = 0;
4867 }
4868 args = PyObject_GetItem(dict, key);
4869 Py_DECREF(key);
4870 if (args == NULL) {
4871 goto onError;
4872 }
4873 args_owned = 1;
4874 arglen = -1;
4875 argidx = -2;
4876 }
4877 while (--fmtcnt >= 0) {
4878 switch (c = *fmt++) {
4879 case '-': flags |= F_LJUST; continue;
4880 case '+': flags |= F_SIGN; continue;
4881 case ' ': flags |= F_BLANK; continue;
4882 case '#': flags |= F_ALT; continue;
4883 case '0': flags |= F_ZERO; continue;
4884 }
4885 break;
4886 }
4887 if (c == '*') {
4888 v = getnextarg(args, arglen, &argidx);
4889 if (v == NULL)
4890 goto onError;
4891 if (!PyInt_Check(v)) {
4892 PyErr_SetString(PyExc_TypeError,
4893 "* wants int");
4894 goto onError;
4895 }
4896 width = PyInt_AsLong(v);
4897 if (width < 0) {
4898 flags |= F_LJUST;
4899 width = -width;
4900 }
4901 if (--fmtcnt >= 0)
4902 c = *fmt++;
4903 }
4904 else if (c >= '0' && c <= '9') {
4905 width = c - '0';
4906 while (--fmtcnt >= 0) {
4907 c = *fmt++;
4908 if (c < '0' || c > '9')
4909 break;
4910 if ((width*10) / 10 != width) {
4911 PyErr_SetString(PyExc_ValueError,
4912 "width too big");
4913 goto onError;
4914 }
4915 width = width*10 + (c - '0');
4916 }
4917 }
4918 if (c == '.') {
4919 prec = 0;
4920 if (--fmtcnt >= 0)
4921 c = *fmt++;
4922 if (c == '*') {
4923 v = getnextarg(args, arglen, &argidx);
4924 if (v == NULL)
4925 goto onError;
4926 if (!PyInt_Check(v)) {
4927 PyErr_SetString(PyExc_TypeError,
4928 "* wants int");
4929 goto onError;
4930 }
4931 prec = PyInt_AsLong(v);
4932 if (prec < 0)
4933 prec = 0;
4934 if (--fmtcnt >= 0)
4935 c = *fmt++;
4936 }
4937 else if (c >= '0' && c <= '9') {
4938 prec = c - '0';
4939 while (--fmtcnt >= 0) {
4940 c = Py_CHARMASK(*fmt++);
4941 if (c < '0' || c > '9')
4942 break;
4943 if ((prec*10) / 10 != prec) {
4944 PyErr_SetString(PyExc_ValueError,
4945 "prec too big");
4946 goto onError;
4947 }
4948 prec = prec*10 + (c - '0');
4949 }
4950 }
4951 } /* prec */
4952 if (fmtcnt >= 0) {
4953 if (c == 'h' || c == 'l' || c == 'L') {
4954 size = c;
4955 if (--fmtcnt >= 0)
4956 c = *fmt++;
4957 }
4958 }
4959 if (fmtcnt < 0) {
4960 PyErr_SetString(PyExc_ValueError,
4961 "incomplete format");
4962 goto onError;
4963 }
4964 if (c != '%') {
4965 v = getnextarg(args, arglen, &argidx);
4966 if (v == NULL)
4967 goto onError;
4968 }
4969 sign = 0;
4970 fill = ' ';
4971 switch (c) {
4972
4973 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004974 pbuf = formatbuf;
4975 /* presume that buffer length is at least 1 */
4976 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977 len = 1;
4978 break;
4979
4980 case 's':
4981 case 'r':
4982 if (PyUnicode_Check(v) && c == 's') {
4983 temp = v;
4984 Py_INCREF(temp);
4985 }
4986 else {
4987 PyObject *unicode;
4988 if (c == 's')
4989 temp = PyObject_Str(v);
4990 else
4991 temp = PyObject_Repr(v);
4992 if (temp == NULL)
4993 goto onError;
4994 if (!PyString_Check(temp)) {
4995 /* XXX Note: this should never happen, since
4996 PyObject_Repr() and PyObject_Str() assure
4997 this */
4998 Py_DECREF(temp);
4999 PyErr_SetString(PyExc_TypeError,
5000 "%s argument has non-string str()");
5001 goto onError;
5002 }
Fred Drakee4315f52000-05-09 19:53:39 +00005003 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005004 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005005 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005006 "strict");
5007 Py_DECREF(temp);
5008 temp = unicode;
5009 if (temp == NULL)
5010 goto onError;
5011 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005012 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005013 len = PyUnicode_GET_SIZE(temp);
5014 if (prec >= 0 && len > prec)
5015 len = prec;
5016 break;
5017
5018 case 'i':
5019 case 'd':
5020 case 'u':
5021 case 'o':
5022 case 'x':
5023 case 'X':
5024 if (c == 'i')
5025 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005026 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005027 temp = formatlong(v, flags, prec, c);
5028 if (!temp)
5029 goto onError;
5030 pbuf = PyUnicode_AS_UNICODE(temp);
5031 len = PyUnicode_GET_SIZE(temp);
5032 /* unbounded ints can always produce
5033 a sign character! */
5034 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005035 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005036 else {
5037 pbuf = formatbuf;
5038 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5039 flags, prec, c, v);
5040 if (len < 0)
5041 goto onError;
5042 /* only d conversion is signed */
5043 sign = c == 'd';
5044 }
5045 if (flags & F_ZERO)
5046 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005047 break;
5048
5049 case 'e':
5050 case 'E':
5051 case 'f':
5052 case 'g':
5053 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005054 pbuf = formatbuf;
5055 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5056 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057 if (len < 0)
5058 goto onError;
5059 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005060 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005061 fill = '0';
5062 break;
5063
5064 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005065 pbuf = formatbuf;
5066 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067 if (len < 0)
5068 goto onError;
5069 break;
5070
5071 default:
5072 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005073 "unsupported format character '%c' (0x%x) "
5074 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005075 (31<=c && c<=126) ? c : '?',
5076 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005077 goto onError;
5078 }
5079 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005080 if (*pbuf == '-' || *pbuf == '+') {
5081 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005082 len--;
5083 }
5084 else if (flags & F_SIGN)
5085 sign = '+';
5086 else if (flags & F_BLANK)
5087 sign = ' ';
5088 else
5089 sign = 0;
5090 }
5091 if (width < len)
5092 width = len;
5093 if (rescnt < width + (sign != 0)) {
5094 reslen -= rescnt;
5095 rescnt = width + fmtcnt + 100;
5096 reslen += rescnt;
5097 if (_PyUnicode_Resize(result, reslen) < 0)
5098 return NULL;
5099 res = PyUnicode_AS_UNICODE(result)
5100 + reslen - rescnt;
5101 }
5102 if (sign) {
5103 if (fill != ' ')
5104 *res++ = sign;
5105 rescnt--;
5106 if (width > len)
5107 width--;
5108 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005109 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5110 assert(pbuf[0] == '0');
5111 assert(pbuf[1] == c);
5112 if (fill != ' ') {
5113 *res++ = *pbuf++;
5114 *res++ = *pbuf++;
5115 }
5116 rescnt -= 2;
5117 width -= 2;
5118 if (width < 0)
5119 width = 0;
5120 len -= 2;
5121 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005122 if (width > len && !(flags & F_LJUST)) {
5123 do {
5124 --rescnt;
5125 *res++ = fill;
5126 } while (--width > len);
5127 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005128 if (fill == ' ') {
5129 if (sign)
5130 *res++ = sign;
5131 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5132 assert(pbuf[0] == '0');
5133 assert(pbuf[1] == c);
5134 *res++ = *pbuf++;
5135 *res++ = *pbuf++;
5136 }
5137 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005138 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005139 res += len;
5140 rescnt -= len;
5141 while (--width >= len) {
5142 --rescnt;
5143 *res++ = ' ';
5144 }
5145 if (dict && (argidx < arglen) && c != '%') {
5146 PyErr_SetString(PyExc_TypeError,
5147 "not all arguments converted");
5148 goto onError;
5149 }
5150 Py_XDECREF(temp);
5151 } /* '%' */
5152 } /* until end */
5153 if (argidx < arglen && !dict) {
5154 PyErr_SetString(PyExc_TypeError,
5155 "not all arguments converted");
5156 goto onError;
5157 }
5158
5159 if (args_owned) {
5160 Py_DECREF(args);
5161 }
5162 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005163 if (_PyUnicode_Resize(result, reslen - rescnt))
5164 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005165 return (PyObject *)result;
5166
5167 onError:
5168 Py_XDECREF(result);
5169 Py_DECREF(uformat);
5170 if (args_owned) {
5171 Py_DECREF(args);
5172 }
5173 return NULL;
5174}
5175
/* Buffer protocol slots for unicode objects (installed as
   tp_as_buffer in PyUnicode_Type).  The entries are, in order, the
   read-buffer, write-buffer, segment-count and char-buffer
   handlers. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
5182
/* Type object for unicode strings.  Slots set to 0/NULL are left
   unimplemented; behaviour is provided through the sequence and
   buffer method tables plus the individual handlers named below. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_size */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)_PyUnicode_Free,        /* tp_dealloc */
    0,                                  /* tp_print */
    (getattrfunc)unicode_getattr,       /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    0,                                  /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    (getattrofunc) NULL,                /* tp_getattro */
    (setattrofunc) NULL,                /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
};
5207
5208/* Initialize the Unicode implementation */
5209
Thomas Wouters78890102000-07-22 19:25:51 +00005210void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005211{
5212 /* Doublecheck the configuration... */
5213 if (sizeof(Py_UNICODE) != 2)
5214 Py_FatalError("Unicode configuration error: "
5215 "sizeof(Py_UNICODE) != 2 bytes");
5216
Fred Drakee4315f52000-05-09 19:53:39 +00005217 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005218 unicode_freelist = NULL;
5219 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005221 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222}
5223
5224/* Finalize the Unicode implementation */
5225
5226void
Thomas Wouters78890102000-07-22 19:25:51 +00005227_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005229 PyUnicodeObject *u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005230
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005231 Py_XDECREF(unicode_empty);
5232 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005233
5234 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005235 PyUnicodeObject *v = u;
5236 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005237 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005238 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005239 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005240 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005241 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005242 unicode_freelist = NULL;
5243 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005244}