blob: 39ea071f2005ec99808fe1e09994b23aa17f70eb [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
86/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +000088
89/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000090static PyUnicodeObject *unicode_freelist;
91static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
Fred Drakee4315f52000-05-09 19:53:39 +000093/* Default encoding to use and assume when NULL is passed as encoding
94 parameter; it is initialized by _PyUnicode_Init().
95
96 Always use the PyUnicode_SetDefaultEncoding() and
97 PyUnicode_GetDefaultEncoding() APIs to access this global.
98
99*/
100
101static char unicode_default_encoding[100];
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* --- Unicode Object ----------------------------------------------------- */
104
105static
106int _PyUnicode_Resize(register PyUnicodeObject *unicode,
107 int length)
108{
109 void *oldstr;
110
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000111 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000112 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000113 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000114
115 /* Resizing unicode_empty is not allowed. */
116 if (unicode == unicode_empty) {
117 PyErr_SetString(PyExc_SystemError,
118 "can't resize empty unicode object");
119 return -1;
120 }
121
122 /* We allocate one more byte to make sure the string is
123 Ux0000 terminated -- XXX is this needed ? */
124 oldstr = unicode->str;
125 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
126 if (!unicode->str) {
127 unicode->str = oldstr;
128 PyErr_NoMemory();
129 return -1;
130 }
131 unicode->str[length] = 0;
132 unicode->length = length;
133
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000134 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000135 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000136 if (unicode->defenc) {
137 Py_DECREF(unicode->defenc);
138 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 }
140 unicode->hash = -1;
141
142 return 0;
143}
144
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000145int PyUnicode_Resize(PyObject **unicode,
146 int length)
147{
148 PyUnicodeObject *v;
149
150 if (unicode == NULL) {
151 PyErr_BadInternalCall();
152 return -1;
153 }
154 v = (PyUnicodeObject *)*unicode;
155 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
156 PyErr_BadInternalCall();
157 return -1;
158 }
159 return _PyUnicode_Resize(v, length);
160}
161
/* We allocate one extra Py_UNICODE element (not just one byte) to make
   sure the string is U+0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
169
170static
171PyUnicodeObject *_PyUnicode_New(int length)
172{
173 register PyUnicodeObject *unicode;
174
175 /* Optimization for empty strings */
176 if (length == 0 && unicode_empty != NULL) {
177 Py_INCREF(unicode_empty);
178 return unicode_empty;
179 }
180
181 /* Unicode freelist & memory allocation */
182 if (unicode_freelist) {
183 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000184 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000186 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000187 /* Keep-Alive optimization: we only upsize the buffer,
188 never downsize it. */
189 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000191 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000192 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 }
194 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000195 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000197 }
198 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 }
200 else {
201 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
202 if (unicode == NULL)
203 return NULL;
204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
205 }
206
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000207 if (!unicode->str) {
208 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000209 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 unicode->str[length] = 0;
212 unicode->length = length;
213 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000215 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000216
217 onError:
218 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000219 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000220 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221}
222
223static
224void _PyUnicode_Free(register PyUnicodeObject *unicode)
225{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000227 /* Keep-Alive optimization */
228 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000229 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000230 unicode->str = NULL;
231 unicode->length = 0;
232 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000233 if (unicode->defenc) {
234 Py_DECREF(unicode->defenc);
235 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000236 }
237 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238 *(PyUnicodeObject **)unicode = unicode_freelist;
239 unicode_freelist = unicode;
240 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 }
242 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000243 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000244 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000245 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 }
247}
248
249PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
250 int size)
251{
252 PyUnicodeObject *unicode;
253
254 unicode = _PyUnicode_New(size);
255 if (!unicode)
256 return NULL;
257
258 /* Copy the Unicode data into the new object */
259 if (u != NULL)
260 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
261
262 return (PyObject *)unicode;
263}
264
265#ifdef HAVE_WCHAR_H
266
267PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
268 int size)
269{
270 PyUnicodeObject *unicode;
271
272 if (w == NULL) {
273 PyErr_BadInternalCall();
274 return NULL;
275 }
276
277 unicode = _PyUnicode_New(size);
278 if (!unicode)
279 return NULL;
280
281 /* Copy the wchar_t data into the new object */
282#ifdef HAVE_USABLE_WCHAR_T
283 memcpy(unicode->str, w, size * sizeof(wchar_t));
284#else
285 {
286 register Py_UNICODE *u;
287 register int i;
288 u = PyUnicode_AS_UNICODE(unicode);
289 for (i = size; i >= 0; i--)
290 *u++ = *w++;
291 }
292#endif
293
294 return (PyObject *)unicode;
295}
296
297int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
298 register wchar_t *w,
299 int size)
300{
301 if (unicode == NULL) {
302 PyErr_BadInternalCall();
303 return -1;
304 }
305 if (size > PyUnicode_GET_SIZE(unicode))
306 size = PyUnicode_GET_SIZE(unicode);
307#ifdef HAVE_USABLE_WCHAR_T
308 memcpy(w, unicode->str, size * sizeof(wchar_t));
309#else
310 {
311 register Py_UNICODE *u;
312 register int i;
313 u = PyUnicode_AS_UNICODE(unicode);
314 for (i = size; i >= 0; i--)
315 *w++ = *u++;
316 }
317#endif
318
319 return size;
320}
321
322#endif
323
324PyObject *PyUnicode_FromObject(register PyObject *obj)
325{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000326 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
327}
328
329PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
330 const char *encoding,
331 const char *errors)
332{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 const char *s;
334 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000335 int owned = 0;
336 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337
338 if (obj == NULL) {
339 PyErr_BadInternalCall();
340 return NULL;
341 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000342
343 /* Coerce object */
344 if (PyInstance_Check(obj)) {
345 PyObject *func;
346 func = PyObject_GetAttrString(obj, "__str__");
347 if (func == NULL) {
348 PyErr_SetString(PyExc_TypeError,
349 "coercing to Unicode: instance doesn't define __str__");
350 return NULL;
351 }
352 obj = PyEval_CallObject(func, NULL);
353 Py_DECREF(func);
354 if (obj == NULL)
355 return NULL;
356 owned = 1;
357 }
358 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000360 v = obj;
361 if (encoding) {
362 PyErr_SetString(PyExc_TypeError,
363 "decoding Unicode is not supported");
364 return NULL;
365 }
366 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 }
368 else if (PyString_Check(obj)) {
369 s = PyString_AS_STRING(obj);
370 len = PyString_GET_SIZE(obj);
371 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000372 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
373 /* Overwrite the error message with something more useful in
374 case of a TypeError. */
375 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000376 PyErr_Format(PyExc_TypeError,
377 "coercing to Unicode: need string or buffer, "
378 "%.80s found",
379 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000380 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000381 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000382
383 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 if (len == 0) {
385 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000386 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000388 else
389 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000390
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000391 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000392 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000393 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000394 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000395 return v;
396
397 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000398 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000399 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000400 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000401 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402}
403
404PyObject *PyUnicode_Decode(const char *s,
405 int size,
406 const char *encoding,
407 const char *errors)
408{
409 PyObject *buffer = NULL, *unicode;
410
Fred Drakee4315f52000-05-09 19:53:39 +0000411 if (encoding == NULL)
412 encoding = PyUnicode_GetDefaultEncoding();
413
414 /* Shortcuts for common default encodings */
415 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000416 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000417 else if (strcmp(encoding, "latin-1") == 0)
418 return PyUnicode_DecodeLatin1(s, size, errors);
419 else if (strcmp(encoding, "ascii") == 0)
420 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000421
422 /* Decode via the codec registry */
423 buffer = PyBuffer_FromMemory((void *)s, size);
424 if (buffer == NULL)
425 goto onError;
426 unicode = PyCodec_Decode(buffer, encoding, errors);
427 if (unicode == NULL)
428 goto onError;
429 if (!PyUnicode_Check(unicode)) {
430 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000431 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 unicode->ob_type->tp_name);
433 Py_DECREF(unicode);
434 goto onError;
435 }
436 Py_DECREF(buffer);
437 return unicode;
438
439 onError:
440 Py_XDECREF(buffer);
441 return NULL;
442}
443
444PyObject *PyUnicode_Encode(const Py_UNICODE *s,
445 int size,
446 const char *encoding,
447 const char *errors)
448{
449 PyObject *v, *unicode;
450
451 unicode = PyUnicode_FromUnicode(s, size);
452 if (unicode == NULL)
453 return NULL;
454 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
455 Py_DECREF(unicode);
456 return v;
457}
458
459PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
460 const char *encoding,
461 const char *errors)
462{
463 PyObject *v;
464
465 if (!PyUnicode_Check(unicode)) {
466 PyErr_BadArgument();
467 goto onError;
468 }
Fred Drakee4315f52000-05-09 19:53:39 +0000469
470 if (encoding == NULL)
471 encoding = PyUnicode_GetDefaultEncoding();
472
473 /* Shortcuts for common default encodings */
474 if (errors == NULL) {
475 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000476 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000477 else if (strcmp(encoding, "latin-1") == 0)
478 return PyUnicode_AsLatin1String(unicode);
479 else if (strcmp(encoding, "ascii") == 0)
480 return PyUnicode_AsASCIIString(unicode);
481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000482
483 /* Encode via the codec registry */
484 v = PyCodec_Encode(unicode, encoding, errors);
485 if (v == NULL)
486 goto onError;
487 /* XXX Should we really enforce this ? */
488 if (!PyString_Check(v)) {
489 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000490 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000491 v->ob_type->tp_name);
492 Py_DECREF(v);
493 goto onError;
494 }
495 return v;
496
497 onError:
498 return NULL;
499}
500
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000501/* Return a Python string holding the default encoded value of the
502 Unicode object.
503
504 The resulting string is cached in the Unicode object for subsequent
505 usage by this function. The cached version is needed to implement
506 the character buffer interface and will live (at least) as long as
507 the Unicode object itself.
508
509 The refcount of the string is *not* incremented.
510
511 *** Exported for internal use by the interpreter only !!! ***
512
513*/
514
515PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
516 const char *errors)
517{
518 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
519
520 if (v)
521 return v;
522 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
523 if (v && errors == NULL)
524 ((PyUnicodeObject *)unicode)->defenc = v;
525 return v;
526}
527
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
529{
530 if (!PyUnicode_Check(unicode)) {
531 PyErr_BadArgument();
532 goto onError;
533 }
534 return PyUnicode_AS_UNICODE(unicode);
535
536 onError:
537 return NULL;
538}
539
540int PyUnicode_GetSize(PyObject *unicode)
541{
542 if (!PyUnicode_Check(unicode)) {
543 PyErr_BadArgument();
544 goto onError;
545 }
546 return PyUnicode_GET_SIZE(unicode);
547
548 onError:
549 return -1;
550}
551
Thomas Wouters78890102000-07-22 19:25:51 +0000552const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000553{
554 return unicode_default_encoding;
555}
556
557int PyUnicode_SetDefaultEncoding(const char *encoding)
558{
559 PyObject *v;
560
561 /* Make sure the encoding is valid. As side effect, this also
562 loads the encoding into the codec registry cache. */
563 v = _PyCodec_Lookup(encoding);
564 if (v == NULL)
565 goto onError;
566 Py_DECREF(v);
567 strncpy(unicode_default_encoding,
568 encoding,
569 sizeof(unicode_default_encoding));
570 return 0;
571
572 onError:
573 return -1;
574}
575
Guido van Rossumd57fd912000-03-10 22:53:23 +0000576/* --- UTF-8 Codec -------------------------------------------------------- */
577
/* Lookup table indexed by the first byte of a UTF-8 sequence.  It is
   never modified, hence const. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  zero means
       illegal prefix.  see RFC 2279 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
599
600static
601int utf8_decoding_error(const char **source,
602 Py_UNICODE **dest,
603 const char *errors,
604 const char *details)
605{
606 if ((errors == NULL) ||
607 (strcmp(errors,"strict") == 0)) {
608 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000609 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000610 details);
611 return -1;
612 }
613 else if (strcmp(errors,"ignore") == 0) {
614 (*source)++;
615 return 0;
616 }
617 else if (strcmp(errors,"replace") == 0) {
618 (*source)++;
619 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
620 (*dest)++;
621 return 0;
622 }
623 else {
624 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000625 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 errors);
627 return -1;
628 }
629}
630
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631PyObject *PyUnicode_DecodeUTF8(const char *s,
632 int size,
633 const char *errors)
634{
635 int n;
636 const char *e;
637 PyUnicodeObject *unicode;
638 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000639 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000640
641 /* Note: size will always be longer than the resulting Unicode
642 character count */
643 unicode = _PyUnicode_New(size);
644 if (!unicode)
645 return NULL;
646 if (size == 0)
647 return (PyObject *)unicode;
648
649 /* Unpack UTF-8 encoded data */
650 p = unicode->str;
651 e = s + size;
652
653 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000654 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655
656 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000657 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 s++;
659 continue;
660 }
661
662 n = utf8_code_length[ch];
663
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000664 if (s + n > e) {
665 errmsg = "unexpected end of data";
666 goto utf8Error;
667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000668
669 switch (n) {
670
671 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000672 errmsg = "unexpected code byte";
673 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000674 break;
675
676 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000677 errmsg = "internal error";
678 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000679 break;
680
681 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000682 if ((s[1] & 0xc0) != 0x80) {
683 errmsg = "invalid data";
684 goto utf8Error;
685 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000686 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000687 if (ch < 0x80) {
688 errmsg = "illegal encoding";
689 goto utf8Error;
690 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000691 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000692 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693 break;
694
695 case 3:
696 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000697 (s[2] & 0xc0) != 0x80) {
698 errmsg = "invalid data";
699 goto utf8Error;
700 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000701 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000702 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
703 errmsg = "illegal encoding";
704 goto utf8Error;
705 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000706 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000707 *p++ = (Py_UNICODE)ch;
708 break;
709
710 case 4:
711 if ((s[1] & 0xc0) != 0x80 ||
712 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000713 (s[3] & 0xc0) != 0x80) {
714 errmsg = "invalid data";
715 goto utf8Error;
716 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000717 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
718 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
719 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000720 if ((ch < 0x10000) || /* minimum value allowed for 4
721 byte encoding */
722 (ch > 0x10ffff)) { /* maximum value allowed for
723 UTF-16 */
724 errmsg = "illegal encoding";
725 goto utf8Error;
726 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000727 /* compute and append the two surrogates: */
728
729 /* translate from 10000..10FFFF to 0..FFFF */
730 ch -= 0x10000;
731
732 /* high surrogate = top 10 bits added to D800 */
733 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
734
735 /* low surrogate = bottom 10 bits added to DC00 */
736 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000737 break;
738
739 default:
740 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000741 errmsg = "unsupported Unicode code range";
742 goto utf8Error;
743 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000744 }
745 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000746 continue;
747
748 utf8Error:
749 if (utf8_decoding_error(&s, &p, errors, errmsg))
750 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000751 }
752
753 /* Adjust length */
754 if (_PyUnicode_Resize(unicode, p - unicode->str))
755 goto onError;
756
757 return (PyObject *)unicode;
758
759onError:
760 Py_DECREF(unicode);
761 return NULL;
762}
763
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000764/* Not used anymore, now that the encoder supports UTF-16
765 surrogates. */
Greg Steinaf36a3a2000-07-17 09:04:43 +0000766#if 0
Guido van Rossumd57fd912000-03-10 22:53:23 +0000767static
768int utf8_encoding_error(const Py_UNICODE **source,
769 char **dest,
770 const char *errors,
771 const char *details)
772{
773 if ((errors == NULL) ||
774 (strcmp(errors,"strict") == 0)) {
775 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000776 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000777 details);
778 return -1;
779 }
780 else if (strcmp(errors,"ignore") == 0) {
781 return 0;
782 }
783 else if (strcmp(errors,"replace") == 0) {
784 **dest = '?';
785 (*dest)++;
786 return 0;
787 }
788 else {
789 PyErr_Format(PyExc_ValueError,
790 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000791 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000792 errors);
793 return -1;
794 }
795}
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000796#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000797
798PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
799 int size,
800 const char *errors)
801{
802 PyObject *v;
803 char *p;
804 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000805 Py_UCS4 ch2;
806 unsigned int cbAllocated = 3 * size;
807 unsigned int cbWritten = 0;
808 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000809
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000810 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811 if (v == NULL)
812 return NULL;
813 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000814 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815
816 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000817 while (i < size) {
818 Py_UCS4 ch = s[i++];
819 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000820 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000821 cbWritten++;
822 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000823 else if (ch < 0x0800) {
824 *p++ = 0xc0 | (ch >> 6);
825 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000826 cbWritten += 2;
827 }
828 else {
829 /* Check for high surrogate */
830 if (0xD800 <= ch && ch <= 0xDBFF) {
831 if (i != size) {
832 ch2 = s[i];
833 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
834
835 if (cbWritten >= (cbAllocated - 4)) {
836 /* Provide enough room for some more
837 surrogates */
838 cbAllocated += 4*10;
839 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000840 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000841 }
842
843 /* combine the two values */
844 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
845
846 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000847 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000848 i++;
849 cbWritten += 4;
850 }
851 }
852 }
853 else {
854 *p++ = (char)(0xe0 | (ch >> 12));
855 cbWritten += 3;
856 }
857 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
858 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000859 }
860 }
861 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000862 if (_PyString_Resize(&v, p - q))
863 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000864 return v;
865
866 onError:
867 Py_DECREF(v);
868 return NULL;
869}
870
Guido van Rossumd57fd912000-03-10 22:53:23 +0000871PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
872{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000873 if (!PyUnicode_Check(unicode)) {
874 PyErr_BadArgument();
875 return NULL;
876 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000877 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
878 PyUnicode_GET_SIZE(unicode),
879 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880}
881
882/* --- UTF-16 Codec ------------------------------------------------------- */
883
884static
885int utf16_decoding_error(const Py_UNICODE **source,
886 Py_UNICODE **dest,
887 const char *errors,
888 const char *details)
889{
890 if ((errors == NULL) ||
891 (strcmp(errors,"strict") == 0)) {
892 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000893 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000894 details);
895 return -1;
896 }
897 else if (strcmp(errors,"ignore") == 0) {
898 return 0;
899 }
900 else if (strcmp(errors,"replace") == 0) {
901 if (dest) {
902 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
903 (*dest)++;
904 }
905 return 0;
906 }
907 else {
908 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000909 "UTF-16 decoding error; "
910 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000911 errors);
912 return -1;
913 }
914}
915
Guido van Rossumd57fd912000-03-10 22:53:23 +0000916PyObject *PyUnicode_DecodeUTF16(const char *s,
917 int size,
918 const char *errors,
919 int *byteorder)
920{
921 PyUnicodeObject *unicode;
922 Py_UNICODE *p;
923 const Py_UNICODE *q, *e;
924 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000925 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000926
927 /* size should be an even number */
928 if (size % sizeof(Py_UNICODE) != 0) {
929 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
930 return NULL;
931 /* The remaining input chars are ignored if we fall through
932 here... */
933 }
934
935 /* Note: size will always be longer than the resulting Unicode
936 character count */
937 unicode = _PyUnicode_New(size);
938 if (!unicode)
939 return NULL;
940 if (size == 0)
941 return (PyObject *)unicode;
942
943 /* Unpack UTF-16 encoded data */
944 p = unicode->str;
945 q = (Py_UNICODE *)s;
946 e = q + (size / sizeof(Py_UNICODE));
947
948 if (byteorder)
949 bo = *byteorder;
950
951 while (q < e) {
952 register Py_UNICODE ch = *q++;
953
954 /* Check for BOM marks (U+FEFF) in the input and adjust
955 current byte order setting accordingly. Swap input
956 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
957 !) */
958#ifdef BYTEORDER_IS_LITTLE_ENDIAN
959 if (ch == 0xFEFF) {
960 bo = -1;
961 continue;
962 } else if (ch == 0xFFFE) {
963 bo = 1;
964 continue;
965 }
966 if (bo == 1)
967 ch = (ch >> 8) | (ch << 8);
968#else
969 if (ch == 0xFEFF) {
970 bo = 1;
971 continue;
972 } else if (ch == 0xFFFE) {
973 bo = -1;
974 continue;
975 }
976 if (bo == -1)
977 ch = (ch >> 8) | (ch << 8);
978#endif
979 if (ch < 0xD800 || ch > 0xDFFF) {
980 *p++ = ch;
981 continue;
982 }
983
984 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000985 if (q >= e) {
986 errmsg = "unexpected end of data";
987 goto utf16Error;
988 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000989 if (0xDC00 <= *q && *q <= 0xDFFF) {
990 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000991 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000992 /* This is valid data (a UTF-16 surrogate pair), but
993 we are not able to store this information since our
994 Py_UNICODE type only has 16 bits... this might
995 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000996 errmsg = "code pairs are not supported";
997 goto utf16Error;
998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000999 else
1000 continue;
1001 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001002 errmsg = "illegal encoding";
1003 /* Fall through to report the error */
1004
1005 utf16Error:
1006 if (utf16_decoding_error(&q, &p, errors, errmsg))
1007 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008 }
1009
1010 if (byteorder)
1011 *byteorder = bo;
1012
1013 /* Adjust length */
1014 if (_PyUnicode_Resize(unicode, p - unicode->str))
1015 goto onError;
1016
1017 return (PyObject *)unicode;
1018
1019onError:
1020 Py_DECREF(unicode);
1021 return NULL;
1022}
1023
1024#undef UTF16_ERROR
1025
1026PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1027 int size,
1028 const char *errors,
1029 int byteorder)
1030{
1031 PyObject *v;
1032 Py_UNICODE *p;
1033 char *q;
1034
1035 /* We don't create UTF-16 pairs... */
1036 v = PyString_FromStringAndSize(NULL,
1037 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1038 if (v == NULL)
1039 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001040
1041 q = PyString_AS_STRING(v);
1042 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043 if (byteorder == 0)
1044 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001045 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001046 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047 if (byteorder == 0 ||
1048#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1049 byteorder == -1
1050#else
1051 byteorder == 1
1052#endif
1053 )
1054 memcpy(p, s, size * sizeof(Py_UNICODE));
1055 else
1056 while (size-- > 0) {
1057 Py_UNICODE ch = *s++;
1058 *p++ = (ch >> 8) | (ch << 8);
1059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 return v;
1061}
1062
1063PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1064{
1065 if (!PyUnicode_Check(unicode)) {
1066 PyErr_BadArgument();
1067 return NULL;
1068 }
1069 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1070 PyUnicode_GET_SIZE(unicode),
1071 NULL,
1072 0);
1073}
1074
1075/* --- Unicode Escape Codec ----------------------------------------------- */
1076
1077static
1078int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001079 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001080 const char *errors,
1081 const char *details)
1082{
1083 if ((errors == NULL) ||
1084 (strcmp(errors,"strict") == 0)) {
1085 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001086 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087 details);
1088 return -1;
1089 }
1090 else if (strcmp(errors,"ignore") == 0) {
1091 return 0;
1092 }
1093 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001094 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095 return 0;
1096 }
1097 else {
1098 PyErr_Format(PyExc_ValueError,
1099 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001100 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 errors);
1102 return -1;
1103 }
1104}
1105
Fredrik Lundh06d12682001-01-24 07:59:11 +00001106static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001107
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1109 int size,
1110 const char *errors)
1111{
1112 PyUnicodeObject *v;
1113 Py_UNICODE *p = NULL, *buf = NULL;
1114 const char *end;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001115 Py_UCS4 chr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116
1117 /* Escaped strings will always be longer than the resulting
1118 Unicode string, so we start with size here and then reduce the
1119 length after conversion to the true value. */
1120 v = _PyUnicode_New(size);
1121 if (v == NULL)
1122 goto onError;
1123 if (size == 0)
1124 return (PyObject *)v;
1125 p = buf = PyUnicode_AS_UNICODE(v);
1126 end = s + size;
1127 while (s < end) {
1128 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001129 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001130 int i;
1131
1132 /* Non-escape characters are interpreted as Unicode ordinals */
1133 if (*s != '\\') {
1134 *p++ = (unsigned char)*s++;
1135 continue;
1136 }
1137
1138 /* \ - Escapes */
1139 s++;
1140 switch (*s++) {
1141
1142 /* \x escapes */
1143 case '\n': break;
1144 case '\\': *p++ = '\\'; break;
1145 case '\'': *p++ = '\''; break;
1146 case '\"': *p++ = '\"'; break;
1147 case 'b': *p++ = '\b'; break;
1148 case 'f': *p++ = '\014'; break; /* FF */
1149 case 't': *p++ = '\t'; break;
1150 case 'n': *p++ = '\n'; break;
1151 case 'r': *p++ = '\r'; break;
1152 case 'v': *p++ = '\013'; break; /* VT */
1153 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1154
1155 /* \OOO (octal) escapes */
1156 case '0': case '1': case '2': case '3':
1157 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001158 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001160 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001162 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001163 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001164 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 break;
1166
Fredrik Lundhdf846752000-09-03 11:29:49 +00001167 /* \xXX with two hex digits */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 case 'x':
Fredrik Lundhdf846752000-09-03 11:29:49 +00001169 for (x = 0, i = 0; i < 2; i++) {
1170 c = (unsigned char)s[i];
1171 if (!isxdigit(c)) {
1172 if (unicodeescape_decoding_error(&s, &x, errors,
1173 "truncated \\xXX"))
1174 goto onError;
1175 i++;
1176 break;
1177 }
1178 x = (x<<4) & ~0xF;
1179 if (c >= '0' && c <= '9')
1180 x += c - '0';
1181 else if (c >= 'a' && c <= 'f')
1182 x += 10 + c - 'a';
1183 else
1184 x += 10 + c - 'A';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001186 s += i;
1187 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188 break;
1189
1190 /* \uXXXX with 4 hex digits */
1191 case 'u':
1192 for (x = 0, i = 0; i < 4; i++) {
1193 c = (unsigned char)s[i];
1194 if (!isxdigit(c)) {
1195 if (unicodeescape_decoding_error(&s, &x, errors,
1196 "truncated \\uXXXX"))
1197 goto onError;
1198 i++;
1199 break;
1200 }
1201 x = (x<<4) & ~0xF;
1202 if (c >= '0' && c <= '9')
1203 x += c - '0';
1204 else if (c >= 'a' && c <= 'f')
1205 x += 10 + c - 'a';
1206 else
1207 x += 10 + c - 'A';
1208 }
1209 s += i;
1210 *p++ = x;
1211 break;
1212
Fredrik Lundhdf846752000-09-03 11:29:49 +00001213 /* \UXXXXXXXX with 8 hex digits */
1214 case 'U':
1215 for (chr = 0, i = 0; i < 8; i++) {
1216 c = (unsigned char)s[i];
1217 if (!isxdigit(c)) {
1218 if (unicodeescape_decoding_error(&s, &x, errors,
1219 "truncated \\uXXXX"))
1220 goto onError;
1221 i++;
1222 break;
1223 }
1224 chr = (chr<<4) & ~0xF;
1225 if (c >= '0' && c <= '9')
1226 chr += c - '0';
1227 else if (c >= 'a' && c <= 'f')
1228 chr += 10 + c - 'a';
1229 else
1230 chr += 10 + c - 'A';
1231 }
1232 s += i;
1233 goto store;
1234
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001235 case 'N':
1236 /* Ok, we need to deal with Unicode Character Names now,
1237 * make sure we've imported the hash table data...
1238 */
Fredrik Lundh06d12682001-01-24 07:59:11 +00001239 if (ucnhash_CAPI == NULL) {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001240 PyObject *mod = 0, *v = 0;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001241 mod = PyImport_ImportModule("unicodedata");
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001242 if (mod == NULL)
Fredrik Lundhf6056062001-01-20 11:15:25 +00001243 goto ucnhashError;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001244 v = PyObject_GetAttrString(mod,"ucnhash_CAPI");
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001245 Py_DECREF(mod);
1246 if (v == NULL)
Fredrik Lundhf6056062001-01-20 11:15:25 +00001247 goto ucnhashError;
Fredrik Lundh06d12682001-01-24 07:59:11 +00001248 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001249 Py_DECREF(v);
Fredrik Lundh06d12682001-01-24 07:59:11 +00001250 if (ucnhash_CAPI == NULL)
Fredrik Lundhf6056062001-01-20 11:15:25 +00001251 goto ucnhashError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001252 }
1253
Fredrik Lundhdf846752000-09-03 11:29:49 +00001254 if (*s == '{') {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001255 const char *start = s + 1;
1256 const char *endBrace = start;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001257
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001258 /* look for the closing brace */
1259 while (*endBrace != '}' && endBrace < end)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001260 endBrace++;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001261 if (endBrace != end && *endBrace == '}') {
Fredrik Lundh06d12682001-01-24 07:59:11 +00001262 if (!ucnhash_CAPI->getcode(start, endBrace-start, &chr)) {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001263 if (unicodeescape_decoding_error(
1264 &s, &x, errors,
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001265 "Invalid Unicode Character Name")
1266 )
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001267 goto onError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001268 goto ucnFallthrough;
1269 }
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001270 s = endBrace + 1;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001271 goto store;
1272 } else {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001273 if (unicodeescape_decoding_error(
1274 &s, &x, errors,
1275 "Unicode name missing closing brace"))
1276 goto onError;
1277 goto ucnFallthrough;
1278 }
1279 break;
1280 }
1281 if (unicodeescape_decoding_error(
1282 &s, &x, errors,
1283 "Missing opening brace for Unicode Character Name escape"))
1284 goto onError;
1285ucnFallthrough:
1286 /* fall through on purpose */
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001287 default:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288 *p++ = '\\';
1289 *p++ = (unsigned char)s[-1];
1290 break;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001291store:
1292 /* when we get here, chr is a 32-bit unicode character */
1293 if (chr <= 0xffff)
1294 /* UCS-2 character */
1295 *p++ = (Py_UNICODE) chr;
1296 else if (chr <= 0x10ffff) {
1297 /* UCS-4 character. store as two surrogate characters */
1298 chr -= 0x10000L;
1299 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1300 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1301 } else {
1302 if (unicodeescape_decoding_error(
1303 &s, &x, errors,
1304 "Illegal Unicode character")
1305 )
1306 goto onError;
1307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 }
1309 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001310 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001311 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001312 return (PyObject *)v;
1313
Fredrik Lundhf6056062001-01-20 11:15:25 +00001314 ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001315 PyErr_SetString(
1316 PyExc_UnicodeError,
1317 "\\N escapes not supported (can't load unicodedata module)"
1318 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001319 return NULL;
1320
Guido van Rossumd57fd912000-03-10 22:53:23 +00001321 onError:
1322 Py_XDECREF(v);
1323 return NULL;
1324}
1325
1326/* Return a Unicode-Escape string version of the Unicode object.
1327
1328 If quotes is true, the string is enclosed in u"" or u'' quotes as
1329 appropriate.
1330
1331*/
1332
Barry Warsaw51ac5802000-03-20 16:36:48 +00001333static const Py_UNICODE *findchar(const Py_UNICODE *s,
1334 int size,
1335 Py_UNICODE ch);
1336
Guido van Rossumd57fd912000-03-10 22:53:23 +00001337static
1338PyObject *unicodeescape_string(const Py_UNICODE *s,
1339 int size,
1340 int quotes)
1341{
1342 PyObject *repr;
1343 char *p;
1344 char *q;
1345
1346 static const char *hexdigit = "0123456789ABCDEF";
1347
1348 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1349 if (repr == NULL)
1350 return NULL;
1351
1352 p = q = PyString_AS_STRING(repr);
1353
1354 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001355 *p++ = 'u';
1356 *p++ = (findchar(s, size, '\'') &&
1357 !findchar(s, size, '"')) ? '"' : '\'';
1358 }
1359 while (size-- > 0) {
1360 Py_UNICODE ch = *s++;
1361 /* Escape quotes */
1362 if (quotes && (ch == q[1] || ch == '\\')) {
1363 *p++ = '\\';
1364 *p++ = (char) ch;
1365 }
1366 /* Map 16-bit characters to '\uxxxx' */
1367 else if (ch >= 256) {
1368 *p++ = '\\';
1369 *p++ = 'u';
1370 *p++ = hexdigit[(ch >> 12) & 0xf];
1371 *p++ = hexdigit[(ch >> 8) & 0xf];
1372 *p++ = hexdigit[(ch >> 4) & 0xf];
1373 *p++ = hexdigit[ch & 15];
1374 }
1375 /* Map non-printable US ASCII to '\ooo' */
1376 else if (ch < ' ' || ch >= 128) {
1377 *p++ = '\\';
1378 *p++ = hexdigit[(ch >> 6) & 7];
1379 *p++ = hexdigit[(ch >> 3) & 7];
1380 *p++ = hexdigit[ch & 7];
1381 }
1382 /* Copy everything else as-is */
1383 else
1384 *p++ = (char) ch;
1385 }
1386 if (quotes)
1387 *p++ = q[1];
1388
1389 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001390 if (_PyString_Resize(&repr, p - q))
1391 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001392
1393 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001394
1395 onError:
1396 Py_DECREF(repr);
1397 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398}
1399
1400PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1401 int size)
1402{
1403 return unicodeescape_string(s, size, 0);
1404}
1405
1406PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1407{
1408 if (!PyUnicode_Check(unicode)) {
1409 PyErr_BadArgument();
1410 return NULL;
1411 }
1412 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1413 PyUnicode_GET_SIZE(unicode));
1414}
1415
1416/* --- Raw Unicode Escape Codec ------------------------------------------- */
1417
1418PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1419 int size,
1420 const char *errors)
1421{
1422 PyUnicodeObject *v;
1423 Py_UNICODE *p, *buf;
1424 const char *end;
1425 const char *bs;
1426
1427 /* Escaped strings will always be longer than the resulting
1428 Unicode string, so we start with size here and then reduce the
1429 length after conversion to the true value. */
1430 v = _PyUnicode_New(size);
1431 if (v == NULL)
1432 goto onError;
1433 if (size == 0)
1434 return (PyObject *)v;
1435 p = buf = PyUnicode_AS_UNICODE(v);
1436 end = s + size;
1437 while (s < end) {
1438 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001439 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001440 int i;
1441
1442 /* Non-escape characters are interpreted as Unicode ordinals */
1443 if (*s != '\\') {
1444 *p++ = (unsigned char)*s++;
1445 continue;
1446 }
1447
1448 /* \u-escapes are only interpreted iff the number of leading
1449 backslashes if odd */
1450 bs = s;
1451 for (;s < end;) {
1452 if (*s != '\\')
1453 break;
1454 *p++ = (unsigned char)*s++;
1455 }
1456 if (((s - bs) & 1) == 0 ||
1457 s >= end ||
1458 *s != 'u') {
1459 continue;
1460 }
1461 p--;
1462 s++;
1463
1464 /* \uXXXX with 4 hex digits */
1465 for (x = 0, i = 0; i < 4; i++) {
1466 c = (unsigned char)s[i];
1467 if (!isxdigit(c)) {
1468 if (unicodeescape_decoding_error(&s, &x, errors,
1469 "truncated \\uXXXX"))
1470 goto onError;
1471 i++;
1472 break;
1473 }
1474 x = (x<<4) & ~0xF;
1475 if (c >= '0' && c <= '9')
1476 x += c - '0';
1477 else if (c >= 'a' && c <= 'f')
1478 x += 10 + c - 'a';
1479 else
1480 x += 10 + c - 'A';
1481 }
1482 s += i;
1483 *p++ = x;
1484 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001485 if (_PyUnicode_Resize(v, (int)(p - buf)))
1486 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001487 return (PyObject *)v;
1488
1489 onError:
1490 Py_XDECREF(v);
1491 return NULL;
1492}
1493
1494PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1495 int size)
1496{
1497 PyObject *repr;
1498 char *p;
1499 char *q;
1500
1501 static const char *hexdigit = "0123456789ABCDEF";
1502
1503 repr = PyString_FromStringAndSize(NULL, 6 * size);
1504 if (repr == NULL)
1505 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001506 if (size == 0)
1507 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001508
1509 p = q = PyString_AS_STRING(repr);
1510 while (size-- > 0) {
1511 Py_UNICODE ch = *s++;
1512 /* Map 16-bit characters to '\uxxxx' */
1513 if (ch >= 256) {
1514 *p++ = '\\';
1515 *p++ = 'u';
1516 *p++ = hexdigit[(ch >> 12) & 0xf];
1517 *p++ = hexdigit[(ch >> 8) & 0xf];
1518 *p++ = hexdigit[(ch >> 4) & 0xf];
1519 *p++ = hexdigit[ch & 15];
1520 }
1521 /* Copy everything else as-is */
1522 else
1523 *p++ = (char) ch;
1524 }
1525 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001526 if (_PyString_Resize(&repr, p - q))
1527 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001528
1529 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001530
1531 onError:
1532 Py_DECREF(repr);
1533 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001534}
1535
1536PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1537{
1538 if (!PyUnicode_Check(unicode)) {
1539 PyErr_BadArgument();
1540 return NULL;
1541 }
1542 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1543 PyUnicode_GET_SIZE(unicode));
1544}
1545
1546/* --- Latin-1 Codec ------------------------------------------------------ */
1547
1548PyObject *PyUnicode_DecodeLatin1(const char *s,
1549 int size,
1550 const char *errors)
1551{
1552 PyUnicodeObject *v;
1553 Py_UNICODE *p;
1554
1555 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1556 v = _PyUnicode_New(size);
1557 if (v == NULL)
1558 goto onError;
1559 if (size == 0)
1560 return (PyObject *)v;
1561 p = PyUnicode_AS_UNICODE(v);
1562 while (size-- > 0)
1563 *p++ = (unsigned char)*s++;
1564 return (PyObject *)v;
1565
1566 onError:
1567 Py_XDECREF(v);
1568 return NULL;
1569}
1570
1571static
1572int latin1_encoding_error(const Py_UNICODE **source,
1573 char **dest,
1574 const char *errors,
1575 const char *details)
1576{
1577 if ((errors == NULL) ||
1578 (strcmp(errors,"strict") == 0)) {
1579 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001580 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001581 details);
1582 return -1;
1583 }
1584 else if (strcmp(errors,"ignore") == 0) {
1585 return 0;
1586 }
1587 else if (strcmp(errors,"replace") == 0) {
1588 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001589 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001590 return 0;
1591 }
1592 else {
1593 PyErr_Format(PyExc_ValueError,
1594 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001595 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001596 errors);
1597 return -1;
1598 }
1599}
1600
1601PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1602 int size,
1603 const char *errors)
1604{
1605 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001606 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001607
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608 repr = PyString_FromStringAndSize(NULL, size);
1609 if (repr == NULL)
1610 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001611 if (size == 0)
1612 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001613
1614 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001615 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 while (size-- > 0) {
1617 Py_UNICODE ch = *p++;
1618 if (ch >= 256) {
1619 if (latin1_encoding_error(&p, &s, errors,
1620 "ordinal not in range(256)"))
1621 goto onError;
1622 }
1623 else
1624 *s++ = (char)ch;
1625 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001626 /* Resize if error handling skipped some characters */
1627 if (s - start < PyString_GET_SIZE(repr))
1628 if (_PyString_Resize(&repr, s - start))
1629 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630 return repr;
1631
1632 onError:
1633 Py_DECREF(repr);
1634 return NULL;
1635}
1636
1637PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1638{
1639 if (!PyUnicode_Check(unicode)) {
1640 PyErr_BadArgument();
1641 return NULL;
1642 }
1643 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1644 PyUnicode_GET_SIZE(unicode),
1645 NULL);
1646}
1647
1648/* --- 7-bit ASCII Codec -------------------------------------------------- */
1649
1650static
1651int ascii_decoding_error(const char **source,
1652 Py_UNICODE **dest,
1653 const char *errors,
1654 const char *details)
1655{
1656 if ((errors == NULL) ||
1657 (strcmp(errors,"strict") == 0)) {
1658 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001659 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001660 details);
1661 return -1;
1662 }
1663 else if (strcmp(errors,"ignore") == 0) {
1664 return 0;
1665 }
1666 else if (strcmp(errors,"replace") == 0) {
1667 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1668 (*dest)++;
1669 return 0;
1670 }
1671 else {
1672 PyErr_Format(PyExc_ValueError,
1673 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001674 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001675 errors);
1676 return -1;
1677 }
1678}
1679
1680PyObject *PyUnicode_DecodeASCII(const char *s,
1681 int size,
1682 const char *errors)
1683{
1684 PyUnicodeObject *v;
1685 Py_UNICODE *p;
1686
1687 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1688 v = _PyUnicode_New(size);
1689 if (v == NULL)
1690 goto onError;
1691 if (size == 0)
1692 return (PyObject *)v;
1693 p = PyUnicode_AS_UNICODE(v);
1694 while (size-- > 0) {
1695 register unsigned char c;
1696
1697 c = (unsigned char)*s++;
1698 if (c < 128)
1699 *p++ = c;
1700 else if (ascii_decoding_error(&s, &p, errors,
1701 "ordinal not in range(128)"))
1702 goto onError;
1703 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001704 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1705 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1706 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001707 return (PyObject *)v;
1708
1709 onError:
1710 Py_XDECREF(v);
1711 return NULL;
1712}
1713
1714static
1715int ascii_encoding_error(const Py_UNICODE **source,
1716 char **dest,
1717 const char *errors,
1718 const char *details)
1719{
1720 if ((errors == NULL) ||
1721 (strcmp(errors,"strict") == 0)) {
1722 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001723 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001724 details);
1725 return -1;
1726 }
1727 else if (strcmp(errors,"ignore") == 0) {
1728 return 0;
1729 }
1730 else if (strcmp(errors,"replace") == 0) {
1731 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001732 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001733 return 0;
1734 }
1735 else {
1736 PyErr_Format(PyExc_ValueError,
1737 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001738 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001739 errors);
1740 return -1;
1741 }
1742}
1743
1744PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1745 int size,
1746 const char *errors)
1747{
1748 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001749 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001750
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 repr = PyString_FromStringAndSize(NULL, size);
1752 if (repr == NULL)
1753 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001754 if (size == 0)
1755 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001756
1757 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001758 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001759 while (size-- > 0) {
1760 Py_UNICODE ch = *p++;
1761 if (ch >= 128) {
1762 if (ascii_encoding_error(&p, &s, errors,
1763 "ordinal not in range(128)"))
1764 goto onError;
1765 }
1766 else
1767 *s++ = (char)ch;
1768 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001769 /* Resize if error handling skipped some characters */
1770 if (s - start < PyString_GET_SIZE(repr))
1771 if (_PyString_Resize(&repr, s - start))
1772 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001773 return repr;
1774
1775 onError:
1776 Py_DECREF(repr);
1777 return NULL;
1778}
1779
1780PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1781{
1782 if (!PyUnicode_Check(unicode)) {
1783 PyErr_BadArgument();
1784 return NULL;
1785 }
1786 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1787 PyUnicode_GET_SIZE(unicode),
1788 NULL);
1789}
1790
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001791#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001792
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001793/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001794
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001795PyObject *PyUnicode_DecodeMBCS(const char *s,
1796 int size,
1797 const char *errors)
1798{
1799 PyUnicodeObject *v;
1800 Py_UNICODE *p;
1801
1802 /* First get the size of the result */
1803 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001804 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001805 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1806
1807 v = _PyUnicode_New(usize);
1808 if (v == NULL)
1809 return NULL;
1810 if (usize == 0)
1811 return (PyObject *)v;
1812 p = PyUnicode_AS_UNICODE(v);
1813 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1814 Py_DECREF(v);
1815 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1816 }
1817
1818 return (PyObject *)v;
1819}
1820
1821PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1822 int size,
1823 const char *errors)
1824{
1825 PyObject *repr;
1826 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001827 DWORD mbcssize;
1828
1829 /* If there are no characters, bail now! */
1830 if (size==0)
1831 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001832
1833 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001834 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001835 if (mbcssize==0)
1836 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1837
1838 repr = PyString_FromStringAndSize(NULL, mbcssize);
1839 if (repr == NULL)
1840 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001841 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001842 return repr;
1843
1844 /* Do the conversion */
1845 s = PyString_AS_STRING(repr);
1846 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1847 Py_DECREF(repr);
1848 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1849 }
1850 return repr;
1851}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001852
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001853#endif /* MS_WIN32 */
1854
Guido van Rossumd57fd912000-03-10 22:53:23 +00001855/* --- Character Mapping Codec -------------------------------------------- */
1856
1857static
1858int charmap_decoding_error(const char **source,
1859 Py_UNICODE **dest,
1860 const char *errors,
1861 const char *details)
1862{
1863 if ((errors == NULL) ||
1864 (strcmp(errors,"strict") == 0)) {
1865 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001866 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001867 details);
1868 return -1;
1869 }
1870 else if (strcmp(errors,"ignore") == 0) {
1871 return 0;
1872 }
1873 else if (strcmp(errors,"replace") == 0) {
1874 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1875 (*dest)++;
1876 return 0;
1877 }
1878 else {
1879 PyErr_Format(PyExc_ValueError,
1880 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001881 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001882 errors);
1883 return -1;
1884 }
1885}
1886
1887PyObject *PyUnicode_DecodeCharmap(const char *s,
1888 int size,
1889 PyObject *mapping,
1890 const char *errors)
1891{
1892 PyUnicodeObject *v;
1893 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001894 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895
1896 /* Default to Latin-1 */
1897 if (mapping == NULL)
1898 return PyUnicode_DecodeLatin1(s, size, errors);
1899
1900 v = _PyUnicode_New(size);
1901 if (v == NULL)
1902 goto onError;
1903 if (size == 0)
1904 return (PyObject *)v;
1905 p = PyUnicode_AS_UNICODE(v);
1906 while (size-- > 0) {
1907 unsigned char ch = *s++;
1908 PyObject *w, *x;
1909
1910 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1911 w = PyInt_FromLong((long)ch);
1912 if (w == NULL)
1913 goto onError;
1914 x = PyObject_GetItem(mapping, w);
1915 Py_DECREF(w);
1916 if (x == NULL) {
1917 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00001918 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001919 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00001920 x = Py_None;
1921 Py_INCREF(x);
1922 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001923 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001924 }
1925
1926 /* Apply mapping */
1927 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001928 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001929 if (value < 0 || value > 65535) {
1930 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001931 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932 Py_DECREF(x);
1933 goto onError;
1934 }
1935 *p++ = (Py_UNICODE)value;
1936 }
1937 else if (x == Py_None) {
1938 /* undefined mapping */
1939 if (charmap_decoding_error(&s, &p, errors,
1940 "character maps to <undefined>")) {
1941 Py_DECREF(x);
1942 goto onError;
1943 }
1944 }
1945 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001946 int targetsize = PyUnicode_GET_SIZE(x);
1947
1948 if (targetsize == 1)
1949 /* 1-1 mapping */
1950 *p++ = *PyUnicode_AS_UNICODE(x);
1951
1952 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001953 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001954 if (targetsize > extrachars) {
1955 /* resize first */
1956 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
1957 int needed = (targetsize - extrachars) + \
1958 (targetsize << 2);
1959 extrachars += needed;
1960 if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001961 Py_DECREF(x);
1962 goto onError;
1963 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001964 p = PyUnicode_AS_UNICODE(v) + oldpos;
1965 }
1966 Py_UNICODE_COPY(p,
1967 PyUnicode_AS_UNICODE(x),
1968 targetsize);
1969 p += targetsize;
1970 extrachars -= targetsize;
1971 }
1972 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001973 }
1974 else {
1975 /* wrong return value */
1976 PyErr_SetString(PyExc_TypeError,
1977 "character mapping must return integer, None or unicode");
1978 Py_DECREF(x);
1979 goto onError;
1980 }
1981 Py_DECREF(x);
1982 }
1983 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1984 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1985 goto onError;
1986 return (PyObject *)v;
1987
1988 onError:
1989 Py_XDECREF(v);
1990 return NULL;
1991}
1992
1993static
1994int charmap_encoding_error(const Py_UNICODE **source,
1995 char **dest,
1996 const char *errors,
1997 const char *details)
1998{
1999 if ((errors == NULL) ||
2000 (strcmp(errors,"strict") == 0)) {
2001 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002002 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002003 details);
2004 return -1;
2005 }
2006 else if (strcmp(errors,"ignore") == 0) {
2007 return 0;
2008 }
2009 else if (strcmp(errors,"replace") == 0) {
2010 **dest = '?';
2011 (*dest)++;
2012 return 0;
2013 }
2014 else {
2015 PyErr_Format(PyExc_ValueError,
2016 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002017 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002018 errors);
2019 return -1;
2020 }
2021}
2022
2023PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2024 int size,
2025 PyObject *mapping,
2026 const char *errors)
2027{
2028 PyObject *v;
2029 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002030 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002031
2032 /* Default to Latin-1 */
2033 if (mapping == NULL)
2034 return PyUnicode_EncodeLatin1(p, size, errors);
2035
2036 v = PyString_FromStringAndSize(NULL, size);
2037 if (v == NULL)
2038 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002039 if (size == 0)
2040 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041 s = PyString_AS_STRING(v);
2042 while (size-- > 0) {
2043 Py_UNICODE ch = *p++;
2044 PyObject *w, *x;
2045
2046 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2047 w = PyInt_FromLong((long)ch);
2048 if (w == NULL)
2049 goto onError;
2050 x = PyObject_GetItem(mapping, w);
2051 Py_DECREF(w);
2052 if (x == NULL) {
2053 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002054 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002055 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002056 x = Py_None;
2057 Py_INCREF(x);
2058 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002059 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 }
2061
2062 /* Apply mapping */
2063 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002064 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002065 if (value < 0 || value > 255) {
2066 PyErr_SetString(PyExc_TypeError,
2067 "character mapping must be in range(256)");
2068 Py_DECREF(x);
2069 goto onError;
2070 }
2071 *s++ = (char)value;
2072 }
2073 else if (x == Py_None) {
2074 /* undefined mapping */
2075 if (charmap_encoding_error(&p, &s, errors,
2076 "character maps to <undefined>")) {
2077 Py_DECREF(x);
2078 goto onError;
2079 }
2080 }
2081 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002082 int targetsize = PyString_GET_SIZE(x);
2083
2084 if (targetsize == 1)
2085 /* 1-1 mapping */
2086 *s++ = *PyString_AS_STRING(x);
2087
2088 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002089 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002090 if (targetsize > extrachars) {
2091 /* resize first */
2092 int oldpos = (int)(s - PyString_AS_STRING(v));
2093 int needed = (targetsize - extrachars) + \
2094 (targetsize << 2);
2095 extrachars += needed;
2096 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002097 Py_DECREF(x);
2098 goto onError;
2099 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002100 s = PyString_AS_STRING(v) + oldpos;
2101 }
2102 memcpy(s,
2103 PyString_AS_STRING(x),
2104 targetsize);
2105 s += targetsize;
2106 extrachars -= targetsize;
2107 }
2108 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002109 }
2110 else {
2111 /* wrong return value */
2112 PyErr_SetString(PyExc_TypeError,
2113 "character mapping must return integer, None or unicode");
2114 Py_DECREF(x);
2115 goto onError;
2116 }
2117 Py_DECREF(x);
2118 }
2119 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2120 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2121 goto onError;
2122 return v;
2123
2124 onError:
2125 Py_DECREF(v);
2126 return NULL;
2127}
2128
2129PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2130 PyObject *mapping)
2131{
2132 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2133 PyErr_BadArgument();
2134 return NULL;
2135 }
2136 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2137 PyUnicode_GET_SIZE(unicode),
2138 mapping,
2139 NULL);
2140}
2141
2142static
2143int translate_error(const Py_UNICODE **source,
2144 Py_UNICODE **dest,
2145 const char *errors,
2146 const char *details)
2147{
2148 if ((errors == NULL) ||
2149 (strcmp(errors,"strict") == 0)) {
2150 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002151 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002152 details);
2153 return -1;
2154 }
2155 else if (strcmp(errors,"ignore") == 0) {
2156 return 0;
2157 }
2158 else if (strcmp(errors,"replace") == 0) {
2159 **dest = '?';
2160 (*dest)++;
2161 return 0;
2162 }
2163 else {
2164 PyErr_Format(PyExc_ValueError,
2165 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002166 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002167 errors);
2168 return -1;
2169 }
2170}
2171
2172PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2173 int size,
2174 PyObject *mapping,
2175 const char *errors)
2176{
2177 PyUnicodeObject *v;
2178 Py_UNICODE *p;
2179
2180 if (mapping == NULL) {
2181 PyErr_BadArgument();
2182 return NULL;
2183 }
2184
2185 /* Output will never be longer than input */
2186 v = _PyUnicode_New(size);
2187 if (v == NULL)
2188 goto onError;
2189 if (size == 0)
2190 goto done;
2191 p = PyUnicode_AS_UNICODE(v);
2192 while (size-- > 0) {
2193 Py_UNICODE ch = *s++;
2194 PyObject *w, *x;
2195
2196 /* Get mapping */
2197 w = PyInt_FromLong(ch);
2198 if (w == NULL)
2199 goto onError;
2200 x = PyObject_GetItem(mapping, w);
2201 Py_DECREF(w);
2202 if (x == NULL) {
2203 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2204 /* No mapping found: default to 1-1 mapping */
2205 PyErr_Clear();
2206 *p++ = ch;
2207 continue;
2208 }
2209 goto onError;
2210 }
2211
2212 /* Apply mapping */
2213 if (PyInt_Check(x))
2214 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2215 else if (x == Py_None) {
2216 /* undefined mapping */
2217 if (translate_error(&s, &p, errors,
2218 "character maps to <undefined>")) {
2219 Py_DECREF(x);
2220 goto onError;
2221 }
2222 }
2223 else if (PyUnicode_Check(x)) {
2224 if (PyUnicode_GET_SIZE(x) != 1) {
2225 /* 1-n mapping */
2226 PyErr_SetString(PyExc_NotImplementedError,
2227 "1-n mappings are currently not implemented");
2228 Py_DECREF(x);
2229 goto onError;
2230 }
2231 *p++ = *PyUnicode_AS_UNICODE(x);
2232 }
2233 else {
2234 /* wrong return value */
2235 PyErr_SetString(PyExc_TypeError,
2236 "translate mapping must return integer, None or unicode");
2237 Py_DECREF(x);
2238 goto onError;
2239 }
2240 Py_DECREF(x);
2241 }
2242 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002243 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2244 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002245
2246 done:
2247 return (PyObject *)v;
2248
2249 onError:
2250 Py_XDECREF(v);
2251 return NULL;
2252}
2253
2254PyObject *PyUnicode_Translate(PyObject *str,
2255 PyObject *mapping,
2256 const char *errors)
2257{
2258 PyObject *result;
2259
2260 str = PyUnicode_FromObject(str);
2261 if (str == NULL)
2262 goto onError;
2263 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2264 PyUnicode_GET_SIZE(str),
2265 mapping,
2266 errors);
2267 Py_DECREF(str);
2268 return result;
2269
2270 onError:
2271 Py_XDECREF(str);
2272 return NULL;
2273}
2274
Guido van Rossum9e896b32000-04-05 20:11:21 +00002275/* --- Decimal Encoder ---------------------------------------------------- */
2276
2277int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2278 int length,
2279 char *output,
2280 const char *errors)
2281{
2282 Py_UNICODE *p, *end;
2283
2284 if (output == NULL) {
2285 PyErr_BadArgument();
2286 return -1;
2287 }
2288
2289 p = s;
2290 end = s + length;
2291 while (p < end) {
2292 register Py_UNICODE ch = *p++;
2293 int decimal;
2294
2295 if (Py_UNICODE_ISSPACE(ch)) {
2296 *output++ = ' ';
2297 continue;
2298 }
2299 decimal = Py_UNICODE_TODECIMAL(ch);
2300 if (decimal >= 0) {
2301 *output++ = '0' + decimal;
2302 continue;
2303 }
Guido van Rossumba477042000-04-06 18:18:10 +00002304 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002305 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002306 continue;
2307 }
2308 /* All other characters are considered invalid */
2309 if (errors == NULL || strcmp(errors, "strict") == 0) {
2310 PyErr_SetString(PyExc_ValueError,
2311 "invalid decimal Unicode string");
2312 goto onError;
2313 }
2314 else if (strcmp(errors, "ignore") == 0)
2315 continue;
2316 else if (strcmp(errors, "replace") == 0) {
2317 *output++ = '?';
2318 continue;
2319 }
2320 }
2321 /* 0-terminate the output string */
2322 *output++ = '\0';
2323 return 0;
2324
2325 onError:
2326 return -1;
2327}
2328
Guido van Rossumd57fd912000-03-10 22:53:23 +00002329/* --- Helpers ------------------------------------------------------------ */
2330
2331static
2332int count(PyUnicodeObject *self,
2333 int start,
2334 int end,
2335 PyUnicodeObject *substring)
2336{
2337 int count = 0;
2338
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002339 if (start < 0)
2340 start += self->length;
2341 if (start < 0)
2342 start = 0;
2343 if (end > self->length)
2344 end = self->length;
2345 if (end < 0)
2346 end += self->length;
2347 if (end < 0)
2348 end = 0;
2349
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002350 if (substring->length == 0)
2351 return (end - start + 1);
2352
Guido van Rossumd57fd912000-03-10 22:53:23 +00002353 end -= substring->length;
2354
2355 while (start <= end)
2356 if (Py_UNICODE_MATCH(self, start, substring)) {
2357 count++;
2358 start += substring->length;
2359 } else
2360 start++;
2361
2362 return count;
2363}
2364
2365int PyUnicode_Count(PyObject *str,
2366 PyObject *substr,
2367 int start,
2368 int end)
2369{
2370 int result;
2371
2372 str = PyUnicode_FromObject(str);
2373 if (str == NULL)
2374 return -1;
2375 substr = PyUnicode_FromObject(substr);
2376 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002377 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002378 return -1;
2379 }
2380
2381 result = count((PyUnicodeObject *)str,
2382 start, end,
2383 (PyUnicodeObject *)substr);
2384
2385 Py_DECREF(str);
2386 Py_DECREF(substr);
2387 return result;
2388}
2389
2390static
2391int findstring(PyUnicodeObject *self,
2392 PyUnicodeObject *substring,
2393 int start,
2394 int end,
2395 int direction)
2396{
2397 if (start < 0)
2398 start += self->length;
2399 if (start < 0)
2400 start = 0;
2401
2402 if (substring->length == 0)
2403 return start;
2404
2405 if (end > self->length)
2406 end = self->length;
2407 if (end < 0)
2408 end += self->length;
2409 if (end < 0)
2410 end = 0;
2411
2412 end -= substring->length;
2413
2414 if (direction < 0) {
2415 for (; end >= start; end--)
2416 if (Py_UNICODE_MATCH(self, end, substring))
2417 return end;
2418 } else {
2419 for (; start <= end; start++)
2420 if (Py_UNICODE_MATCH(self, start, substring))
2421 return start;
2422 }
2423
2424 return -1;
2425}
2426
2427int PyUnicode_Find(PyObject *str,
2428 PyObject *substr,
2429 int start,
2430 int end,
2431 int direction)
2432{
2433 int result;
2434
2435 str = PyUnicode_FromObject(str);
2436 if (str == NULL)
2437 return -1;
2438 substr = PyUnicode_FromObject(substr);
2439 if (substr == NULL) {
2440 Py_DECREF(substr);
2441 return -1;
2442 }
2443
2444 result = findstring((PyUnicodeObject *)str,
2445 (PyUnicodeObject *)substr,
2446 start, end, direction);
2447 Py_DECREF(str);
2448 Py_DECREF(substr);
2449 return result;
2450}
2451
2452static
2453int tailmatch(PyUnicodeObject *self,
2454 PyUnicodeObject *substring,
2455 int start,
2456 int end,
2457 int direction)
2458{
2459 if (start < 0)
2460 start += self->length;
2461 if (start < 0)
2462 start = 0;
2463
2464 if (substring->length == 0)
2465 return 1;
2466
2467 if (end > self->length)
2468 end = self->length;
2469 if (end < 0)
2470 end += self->length;
2471 if (end < 0)
2472 end = 0;
2473
2474 end -= substring->length;
2475 if (end < start)
2476 return 0;
2477
2478 if (direction > 0) {
2479 if (Py_UNICODE_MATCH(self, end, substring))
2480 return 1;
2481 } else {
2482 if (Py_UNICODE_MATCH(self, start, substring))
2483 return 1;
2484 }
2485
2486 return 0;
2487}
2488
2489int PyUnicode_Tailmatch(PyObject *str,
2490 PyObject *substr,
2491 int start,
2492 int end,
2493 int direction)
2494{
2495 int result;
2496
2497 str = PyUnicode_FromObject(str);
2498 if (str == NULL)
2499 return -1;
2500 substr = PyUnicode_FromObject(substr);
2501 if (substr == NULL) {
2502 Py_DECREF(substr);
2503 return -1;
2504 }
2505
2506 result = tailmatch((PyUnicodeObject *)str,
2507 (PyUnicodeObject *)substr,
2508 start, end, direction);
2509 Py_DECREF(str);
2510 Py_DECREF(substr);
2511 return result;
2512}
2513
2514static
2515const Py_UNICODE *findchar(const Py_UNICODE *s,
2516 int size,
2517 Py_UNICODE ch)
2518{
2519 /* like wcschr, but doesn't stop at NULL characters */
2520
2521 while (size-- > 0) {
2522 if (*s == ch)
2523 return s;
2524 s++;
2525 }
2526
2527 return NULL;
2528}
2529
2530/* Apply fixfct filter to the Unicode object self and return a
2531 reference to the modified object */
2532
2533static
2534PyObject *fixup(PyUnicodeObject *self,
2535 int (*fixfct)(PyUnicodeObject *s))
2536{
2537
2538 PyUnicodeObject *u;
2539
2540 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2541 self->length);
2542 if (u == NULL)
2543 return NULL;
2544 if (!fixfct(u)) {
2545 /* fixfct should return TRUE if it modified the buffer. If
2546 FALSE, return a reference to the original buffer instead
2547 (to save space, not time) */
2548 Py_INCREF(self);
2549 Py_DECREF(u);
2550 return (PyObject*) self;
2551 }
2552 return (PyObject*) u;
2553}
2554
2555static
2556int fixupper(PyUnicodeObject *self)
2557{
2558 int len = self->length;
2559 Py_UNICODE *s = self->str;
2560 int status = 0;
2561
2562 while (len-- > 0) {
2563 register Py_UNICODE ch;
2564
2565 ch = Py_UNICODE_TOUPPER(*s);
2566 if (ch != *s) {
2567 status = 1;
2568 *s = ch;
2569 }
2570 s++;
2571 }
2572
2573 return status;
2574}
2575
2576static
2577int fixlower(PyUnicodeObject *self)
2578{
2579 int len = self->length;
2580 Py_UNICODE *s = self->str;
2581 int status = 0;
2582
2583 while (len-- > 0) {
2584 register Py_UNICODE ch;
2585
2586 ch = Py_UNICODE_TOLOWER(*s);
2587 if (ch != *s) {
2588 status = 1;
2589 *s = ch;
2590 }
2591 s++;
2592 }
2593
2594 return status;
2595}
2596
2597static
2598int fixswapcase(PyUnicodeObject *self)
2599{
2600 int len = self->length;
2601 Py_UNICODE *s = self->str;
2602 int status = 0;
2603
2604 while (len-- > 0) {
2605 if (Py_UNICODE_ISUPPER(*s)) {
2606 *s = Py_UNICODE_TOLOWER(*s);
2607 status = 1;
2608 } else if (Py_UNICODE_ISLOWER(*s)) {
2609 *s = Py_UNICODE_TOUPPER(*s);
2610 status = 1;
2611 }
2612 s++;
2613 }
2614
2615 return status;
2616}
2617
2618static
2619int fixcapitalize(PyUnicodeObject *self)
2620{
2621 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2622 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2623 return 1;
2624 }
2625 return 0;
2626}
2627
2628static
2629int fixtitle(PyUnicodeObject *self)
2630{
2631 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2632 register Py_UNICODE *e;
2633 int previous_is_cased;
2634
2635 /* Shortcut for single character strings */
2636 if (PyUnicode_GET_SIZE(self) == 1) {
2637 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2638 if (*p != ch) {
2639 *p = ch;
2640 return 1;
2641 }
2642 else
2643 return 0;
2644 }
2645
2646 e = p + PyUnicode_GET_SIZE(self);
2647 previous_is_cased = 0;
2648 for (; p < e; p++) {
2649 register const Py_UNICODE ch = *p;
2650
2651 if (previous_is_cased)
2652 *p = Py_UNICODE_TOLOWER(ch);
2653 else
2654 *p = Py_UNICODE_TOTITLE(ch);
2655
2656 if (Py_UNICODE_ISLOWER(ch) ||
2657 Py_UNICODE_ISUPPER(ch) ||
2658 Py_UNICODE_ISTITLE(ch))
2659 previous_is_cased = 1;
2660 else
2661 previous_is_cased = 0;
2662 }
2663 return 1;
2664}
2665
2666PyObject *PyUnicode_Join(PyObject *separator,
2667 PyObject *seq)
2668{
2669 Py_UNICODE *sep;
2670 int seplen;
2671 PyUnicodeObject *res = NULL;
2672 int reslen = 0;
2673 Py_UNICODE *p;
2674 int seqlen = 0;
2675 int sz = 100;
2676 int i;
2677
Jeremy Hylton03657cf2000-07-12 13:05:33 +00002678 seqlen = PySequence_Size(seq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002679 if (seqlen < 0 && PyErr_Occurred())
2680 return NULL;
2681
2682 if (separator == NULL) {
2683 Py_UNICODE blank = ' ';
2684 sep = &blank;
2685 seplen = 1;
2686 }
2687 else {
2688 separator = PyUnicode_FromObject(separator);
2689 if (separator == NULL)
2690 return NULL;
2691 sep = PyUnicode_AS_UNICODE(separator);
2692 seplen = PyUnicode_GET_SIZE(separator);
2693 }
2694
2695 res = _PyUnicode_New(sz);
2696 if (res == NULL)
2697 goto onError;
2698 p = PyUnicode_AS_UNICODE(res);
2699 reslen = 0;
2700
2701 for (i = 0; i < seqlen; i++) {
2702 int itemlen;
2703 PyObject *item;
2704
2705 item = PySequence_GetItem(seq, i);
2706 if (item == NULL)
2707 goto onError;
2708 if (!PyUnicode_Check(item)) {
2709 PyObject *v;
2710 v = PyUnicode_FromObject(item);
2711 Py_DECREF(item);
2712 item = v;
2713 if (item == NULL)
2714 goto onError;
2715 }
2716 itemlen = PyUnicode_GET_SIZE(item);
2717 while (reslen + itemlen + seplen >= sz) {
2718 if (_PyUnicode_Resize(res, sz*2))
2719 goto onError;
2720 sz *= 2;
2721 p = PyUnicode_AS_UNICODE(res) + reslen;
2722 }
2723 if (i > 0) {
2724 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2725 p += seplen;
2726 reslen += seplen;
2727 }
2728 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2729 p += itemlen;
2730 reslen += itemlen;
2731 Py_DECREF(item);
2732 }
2733 if (_PyUnicode_Resize(res, reslen))
2734 goto onError;
2735
2736 Py_XDECREF(separator);
2737 return (PyObject *)res;
2738
2739 onError:
2740 Py_XDECREF(separator);
2741 Py_DECREF(res);
2742 return NULL;
2743}
2744
2745static
2746PyUnicodeObject *pad(PyUnicodeObject *self,
2747 int left,
2748 int right,
2749 Py_UNICODE fill)
2750{
2751 PyUnicodeObject *u;
2752
2753 if (left < 0)
2754 left = 0;
2755 if (right < 0)
2756 right = 0;
2757
2758 if (left == 0 && right == 0) {
2759 Py_INCREF(self);
2760 return self;
2761 }
2762
2763 u = _PyUnicode_New(left + self->length + right);
2764 if (u) {
2765 if (left)
2766 Py_UNICODE_FILL(u->str, fill, left);
2767 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2768 if (right)
2769 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2770 }
2771
2772 return u;
2773}
2774
/* Append the slice data[left:right] to the list being built.

   The macro relies on names provided by the function using it: the
   PyObject* variables `str` (scratch) and `list` (the result list) and
   a label `onError`, which is jumped to if the slice cannot be created
   or appended.  `left` is evaluated twice, so it must not have side
   effects.  The do/while wrapper makes the expansion a single
   statement; invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2785
2786static
2787PyObject *split_whitespace(PyUnicodeObject *self,
2788 PyObject *list,
2789 int maxcount)
2790{
2791 register int i;
2792 register int j;
2793 int len = self->length;
2794 PyObject *str;
2795
2796 for (i = j = 0; i < len; ) {
2797 /* find a token */
2798 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2799 i++;
2800 j = i;
2801 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2802 i++;
2803 if (j < i) {
2804 if (maxcount-- <= 0)
2805 break;
2806 SPLIT_APPEND(self->str, j, i);
2807 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2808 i++;
2809 j = i;
2810 }
2811 }
2812 if (j < len) {
2813 SPLIT_APPEND(self->str, j, len);
2814 }
2815 return list;
2816
2817 onError:
2818 Py_DECREF(list);
2819 return NULL;
2820}
2821
2822PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002823 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824{
2825 register int i;
2826 register int j;
2827 int len;
2828 PyObject *list;
2829 PyObject *str;
2830 Py_UNICODE *data;
2831
2832 string = PyUnicode_FromObject(string);
2833 if (string == NULL)
2834 return NULL;
2835 data = PyUnicode_AS_UNICODE(string);
2836 len = PyUnicode_GET_SIZE(string);
2837
Guido van Rossumd57fd912000-03-10 22:53:23 +00002838 list = PyList_New(0);
2839 if (!list)
2840 goto onError;
2841
2842 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002843 int eol;
2844
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 /* Find a line and append it */
2846 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2847 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002848
2849 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002850 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002851 if (i < len) {
2852 if (data[i] == '\r' && i + 1 < len &&
2853 data[i+1] == '\n')
2854 i += 2;
2855 else
2856 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002857 if (keepends)
2858 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002859 }
Guido van Rossum86662912000-04-11 15:38:46 +00002860 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002861 j = i;
2862 }
2863 if (j < len) {
2864 SPLIT_APPEND(data, j, len);
2865 }
2866
2867 Py_DECREF(string);
2868 return list;
2869
2870 onError:
2871 Py_DECREF(list);
2872 Py_DECREF(string);
2873 return NULL;
2874}
2875
2876static
2877PyObject *split_char(PyUnicodeObject *self,
2878 PyObject *list,
2879 Py_UNICODE ch,
2880 int maxcount)
2881{
2882 register int i;
2883 register int j;
2884 int len = self->length;
2885 PyObject *str;
2886
2887 for (i = j = 0; i < len; ) {
2888 if (self->str[i] == ch) {
2889 if (maxcount-- <= 0)
2890 break;
2891 SPLIT_APPEND(self->str, j, i);
2892 i = j = i + 1;
2893 } else
2894 i++;
2895 }
2896 if (j <= len) {
2897 SPLIT_APPEND(self->str, j, len);
2898 }
2899 return list;
2900
2901 onError:
2902 Py_DECREF(list);
2903 return NULL;
2904}
2905
2906static
2907PyObject *split_substring(PyUnicodeObject *self,
2908 PyObject *list,
2909 PyUnicodeObject *substring,
2910 int maxcount)
2911{
2912 register int i;
2913 register int j;
2914 int len = self->length;
2915 int sublen = substring->length;
2916 PyObject *str;
2917
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00002918 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002919 if (Py_UNICODE_MATCH(self, i, substring)) {
2920 if (maxcount-- <= 0)
2921 break;
2922 SPLIT_APPEND(self->str, j, i);
2923 i = j = i + sublen;
2924 } else
2925 i++;
2926 }
2927 if (j <= len) {
2928 SPLIT_APPEND(self->str, j, len);
2929 }
2930 return list;
2931
2932 onError:
2933 Py_DECREF(list);
2934 return NULL;
2935}
2936
2937#undef SPLIT_APPEND
2938
2939static
2940PyObject *split(PyUnicodeObject *self,
2941 PyUnicodeObject *substring,
2942 int maxcount)
2943{
2944 PyObject *list;
2945
2946 if (maxcount < 0)
2947 maxcount = INT_MAX;
2948
2949 list = PyList_New(0);
2950 if (!list)
2951 return NULL;
2952
2953 if (substring == NULL)
2954 return split_whitespace(self,list,maxcount);
2955
2956 else if (substring->length == 1)
2957 return split_char(self,list,substring->str[0],maxcount);
2958
2959 else if (substring->length == 0) {
2960 Py_DECREF(list);
2961 PyErr_SetString(PyExc_ValueError, "empty separator");
2962 return NULL;
2963 }
2964 else
2965 return split_substring(self,list,substring,maxcount);
2966}
2967
2968static
2969PyObject *strip(PyUnicodeObject *self,
2970 int left,
2971 int right)
2972{
2973 Py_UNICODE *p = self->str;
2974 int start = 0;
2975 int end = self->length;
2976
2977 if (left)
2978 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2979 start++;
2980
2981 if (right)
2982 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2983 end--;
2984
2985 if (start == 0 && end == self->length) {
2986 /* couldn't strip anything off, return original string */
2987 Py_INCREF(self);
2988 return (PyObject*) self;
2989 }
2990
2991 return (PyObject*) PyUnicode_FromUnicode(
2992 self->str + start,
2993 end - start
2994 );
2995}
2996
2997static
2998PyObject *replace(PyUnicodeObject *self,
2999 PyUnicodeObject *str1,
3000 PyUnicodeObject *str2,
3001 int maxcount)
3002{
3003 PyUnicodeObject *u;
3004
3005 if (maxcount < 0)
3006 maxcount = INT_MAX;
3007
3008 if (str1->length == 1 && str2->length == 1) {
3009 int i;
3010
3011 /* replace characters */
3012 if (!findchar(self->str, self->length, str1->str[0])) {
3013 /* nothing to replace, return original string */
3014 Py_INCREF(self);
3015 u = self;
3016 } else {
3017 Py_UNICODE u1 = str1->str[0];
3018 Py_UNICODE u2 = str2->str[0];
3019
3020 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3021 self->str,
3022 self->length
3023 );
3024 if (u)
3025 for (i = 0; i < u->length; i++)
3026 if (u->str[i] == u1) {
3027 if (--maxcount < 0)
3028 break;
3029 u->str[i] = u2;
3030 }
3031 }
3032
3033 } else {
3034 int n, i;
3035 Py_UNICODE *p;
3036
3037 /* replace strings */
3038 n = count(self, 0, self->length, str1);
3039 if (n > maxcount)
3040 n = maxcount;
3041 if (n == 0) {
3042 /* nothing to replace, return original string */
3043 Py_INCREF(self);
3044 u = self;
3045 } else {
3046 u = _PyUnicode_New(
3047 self->length + n * (str2->length - str1->length));
3048 if (u) {
3049 i = 0;
3050 p = u->str;
3051 while (i <= self->length - str1->length)
3052 if (Py_UNICODE_MATCH(self, i, str1)) {
3053 /* replace string segment */
3054 Py_UNICODE_COPY(p, str2->str, str2->length);
3055 p += str2->length;
3056 i += str1->length;
3057 if (--n <= 0) {
3058 /* copy remaining part */
3059 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3060 break;
3061 }
3062 } else
3063 *p++ = self->str[i++];
3064 }
3065 }
3066 }
3067
3068 return (PyObject *) u;
3069}
3070
3071/* --- Unicode Object Methods --------------------------------------------- */
3072
3073static char title__doc__[] =
3074"S.title() -> unicode\n\
3075\n\
3076Return a titlecased version of S, i.e. words start with title case\n\
3077characters, all remaining cased characters have lower case.";
3078
3079static PyObject*
3080unicode_title(PyUnicodeObject *self, PyObject *args)
3081{
3082 if (!PyArg_NoArgs(args))
3083 return NULL;
3084 return fixup(self, fixtitle);
3085}
3086
3087static char capitalize__doc__[] =
3088"S.capitalize() -> unicode\n\
3089\n\
3090Return a capitalized version of S, i.e. make the first character\n\
3091have upper case.";
3092
3093static PyObject*
3094unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3095{
3096 if (!PyArg_NoArgs(args))
3097 return NULL;
3098 return fixup(self, fixcapitalize);
3099}
3100
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method S.capwords() (currently compiled out): split S at whitespace,
   capitalize each word and join the words again with single spaces. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result = NULL;
    int k;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                               fixcapitalize);
        if (word == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, word);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

 done:
    Py_DECREF(words);
    return result;
}
#endif
3141
3142static char center__doc__[] =
3143"S.center(width) -> unicode\n\
3144\n\
3145Return S centered in a Unicode string of length width. Padding is done\n\
3146using spaces.";
3147
3148static PyObject *
3149unicode_center(PyUnicodeObject *self, PyObject *args)
3150{
3151 int marg, left;
3152 int width;
3153
3154 if (!PyArg_ParseTuple(args, "i:center", &width))
3155 return NULL;
3156
3157 if (self->length >= width) {
3158 Py_INCREF(self);
3159 return (PyObject*) self;
3160 }
3161
3162 marg = width - self->length;
3163 left = marg / 2 + (marg & width & 1);
3164
3165 return (PyObject*) pad(self, left, marg - left, ' ');
3166}
3167
Marc-André Lemburge5034372000-08-08 08:04:29 +00003168#if 0
3169
3170/* This code should go into some future Unicode collation support
3171 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003172 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003173
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003174/* speedy UTF-16 code point order comparison */
3175/* gleaned from: */
3176/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3177
/* Adjustment table indexed by the top five bits (c >> 11) of a 16-bit
   code unit.  Entry 27 (units 0xD800-0xDFFF) adds 0x2000, entries
   28-31 (units 0xE000-0xFFFF) subtract 0x800, every other entry is 0.
   After the adjustment, units from 0xD800-0xDFFF compare greater than
   units from 0xE000-0xFFFF. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Three-way comparison of two Unicode objects, code unit by code unit,
   after remapping each unit through utf16Fixup.  Returns -1, 0 or 1.
   This variant sits in the "#if 0" branch and is therefore not
   compiled. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    int len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;
        long diff;

        c1 = *s1++;
        c2 = *s2++;
        /* (1<<11) * 26 is 0xD000; table entries below index 27 are
           zero, so smaller units need no lookup */
        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];

        /* now c1 and c2 are in UTF-32-compatible order */
        diff = (long)c1 - (long)c2;
        if (diff)
            return (diff < 0) ? -1 : (diff != 0);
        len1--; len2--;
    }

    /* One string is a prefix of the other: the shorter one sorts first */
    return (len1 < len2) ? -1 : (len1 != len2);
}
3217
Marc-André Lemburge5034372000-08-08 08:04:29 +00003218#else
3219
3220static int
3221unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3222{
3223 register int len1, len2;
3224
3225 Py_UNICODE *s1 = str1->str;
3226 Py_UNICODE *s2 = str2->str;
3227
3228 len1 = str1->length;
3229 len2 = str2->length;
3230
3231 while (len1 > 0 && len2 > 0) {
3232 register long diff;
3233
3234 diff = (long)*s1++ - (long)*s2++;
3235 if (diff)
3236 return (diff < 0) ? -1 : (diff != 0);
3237 len1--; len2--;
3238 }
3239
3240 return (len1 < len2) ? -1 : (len1 != len2);
3241}
3242
3243#endif
3244
Guido van Rossumd57fd912000-03-10 22:53:23 +00003245int PyUnicode_Compare(PyObject *left,
3246 PyObject *right)
3247{
3248 PyUnicodeObject *u = NULL, *v = NULL;
3249 int result;
3250
3251 /* Coerce the two arguments */
3252 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3253 if (u == NULL)
3254 goto onError;
3255 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3256 if (v == NULL)
3257 goto onError;
3258
Thomas Wouters7e474022000-07-16 12:04:32 +00003259 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 if (v == u) {
3261 Py_DECREF(u);
3262 Py_DECREF(v);
3263 return 0;
3264 }
3265
3266 result = unicode_compare(u, v);
3267
3268 Py_DECREF(u);
3269 Py_DECREF(v);
3270 return result;
3271
3272onError:
3273 Py_XDECREF(u);
3274 Py_XDECREF(v);
3275 return -1;
3276}
3277
Guido van Rossum403d68b2000-03-13 15:55:09 +00003278int PyUnicode_Contains(PyObject *container,
3279 PyObject *element)
3280{
3281 PyUnicodeObject *u = NULL, *v = NULL;
3282 int result;
3283 register const Py_UNICODE *p, *e;
3284 register Py_UNICODE ch;
3285
3286 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003287 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003288 if (v == NULL) {
3289 PyErr_SetString(PyExc_TypeError,
3290 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003291 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003292 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003293 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3294 if (u == NULL) {
3295 Py_DECREF(v);
3296 goto onError;
3297 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003298
3299 /* Check v in u */
3300 if (PyUnicode_GET_SIZE(v) != 1) {
3301 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003302 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003303 goto onError;
3304 }
3305 ch = *PyUnicode_AS_UNICODE(v);
3306 p = PyUnicode_AS_UNICODE(u);
3307 e = p + PyUnicode_GET_SIZE(u);
3308 result = 0;
3309 while (p < e) {
3310 if (*p++ == ch) {
3311 result = 1;
3312 break;
3313 }
3314 }
3315
3316 Py_DECREF(u);
3317 Py_DECREF(v);
3318 return result;
3319
3320onError:
3321 Py_XDECREF(u);
3322 Py_XDECREF(v);
3323 return -1;
3324}
3325
Guido van Rossumd57fd912000-03-10 22:53:23 +00003326/* Concat to string or Unicode object giving a new Unicode object. */
3327
3328PyObject *PyUnicode_Concat(PyObject *left,
3329 PyObject *right)
3330{
3331 PyUnicodeObject *u = NULL, *v = NULL, *w;
3332
3333 /* Coerce the two arguments */
3334 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3335 if (u == NULL)
3336 goto onError;
3337 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3338 if (v == NULL)
3339 goto onError;
3340
3341 /* Shortcuts */
3342 if (v == unicode_empty) {
3343 Py_DECREF(v);
3344 return (PyObject *)u;
3345 }
3346 if (u == unicode_empty) {
3347 Py_DECREF(u);
3348 return (PyObject *)v;
3349 }
3350
3351 /* Concat the two Unicode strings */
3352 w = _PyUnicode_New(u->length + v->length);
3353 if (w == NULL)
3354 goto onError;
3355 Py_UNICODE_COPY(w->str, u->str, u->length);
3356 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3357
3358 Py_DECREF(u);
3359 Py_DECREF(v);
3360 return (PyObject *)w;
3361
3362onError:
3363 Py_XDECREF(u);
3364 Py_XDECREF(v);
3365 return NULL;
3366}
3367
3368static char count__doc__[] =
3369"S.count(sub[, start[, end]]) -> int\n\
3370\n\
3371Return the number of occurrences of substring sub in Unicode string\n\
3372S[start:end]. Optional arguments start and end are\n\
3373interpreted as in slice notation.";
3374
3375static PyObject *
3376unicode_count(PyUnicodeObject *self, PyObject *args)
3377{
3378 PyUnicodeObject *substring;
3379 int start = 0;
3380 int end = INT_MAX;
3381 PyObject *result;
3382
Guido van Rossumb8872e62000-05-09 14:14:27 +00003383 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3384 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003385 return NULL;
3386
3387 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3388 (PyObject *)substring);
3389 if (substring == NULL)
3390 return NULL;
3391
Guido van Rossumd57fd912000-03-10 22:53:23 +00003392 if (start < 0)
3393 start += self->length;
3394 if (start < 0)
3395 start = 0;
3396 if (end > self->length)
3397 end = self->length;
3398 if (end < 0)
3399 end += self->length;
3400 if (end < 0)
3401 end = 0;
3402
3403 result = PyInt_FromLong((long) count(self, start, end, substring));
3404
3405 Py_DECREF(substring);
3406 return result;
3407}
3408
3409static char encode__doc__[] =
3410"S.encode([encoding[,errors]]) -> string\n\
3411\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003412Return an encoded string version of S. Default encoding is the current\n\
3413default string encoding. errors may be given to set a different error\n\
3414handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3415a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416
3417static PyObject *
3418unicode_encode(PyUnicodeObject *self, PyObject *args)
3419{
3420 char *encoding = NULL;
3421 char *errors = NULL;
3422 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3423 return NULL;
3424 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3425}
3426
3427static char expandtabs__doc__[] =
3428"S.expandtabs([tabsize]) -> unicode\n\
3429\n\
3430Return a copy of S where all tab characters are expanded using spaces.\n\
3431If tabsize is not given, a tab size of 8 characters is assumed.";
3432
3433static PyObject*
3434unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3435{
3436 Py_UNICODE *e;
3437 Py_UNICODE *p;
3438 Py_UNICODE *q;
3439 int i, j;
3440 PyUnicodeObject *u;
3441 int tabsize = 8;
3442
3443 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3444 return NULL;
3445
Thomas Wouters7e474022000-07-16 12:04:32 +00003446 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447 i = j = 0;
3448 e = self->str + self->length;
3449 for (p = self->str; p < e; p++)
3450 if (*p == '\t') {
3451 if (tabsize > 0)
3452 j += tabsize - (j % tabsize);
3453 }
3454 else {
3455 j++;
3456 if (*p == '\n' || *p == '\r') {
3457 i += j;
3458 j = 0;
3459 }
3460 }
3461
3462 /* Second pass: create output string and fill it */
3463 u = _PyUnicode_New(i + j);
3464 if (!u)
3465 return NULL;
3466
3467 j = 0;
3468 q = u->str;
3469
3470 for (p = self->str; p < e; p++)
3471 if (*p == '\t') {
3472 if (tabsize > 0) {
3473 i = tabsize - (j % tabsize);
3474 j += i;
3475 while (i--)
3476 *q++ = ' ';
3477 }
3478 }
3479 else {
3480 j++;
3481 *q++ = *p;
3482 if (*p == '\n' || *p == '\r')
3483 j = 0;
3484 }
3485
3486 return (PyObject*) u;
3487}
3488
3489static char find__doc__[] =
3490"S.find(sub [,start [,end]]) -> int\n\
3491\n\
3492Return the lowest index in S where substring sub is found,\n\
3493such that sub is contained within s[start,end]. Optional\n\
3494arguments start and end are interpreted as in slice notation.\n\
3495\n\
3496Return -1 on failure.";
3497
3498static PyObject *
3499unicode_find(PyUnicodeObject *self, PyObject *args)
3500{
3501 PyUnicodeObject *substring;
3502 int start = 0;
3503 int end = INT_MAX;
3504 PyObject *result;
3505
Guido van Rossumb8872e62000-05-09 14:14:27 +00003506 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3507 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003508 return NULL;
3509 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3510 (PyObject *)substring);
3511 if (substring == NULL)
3512 return NULL;
3513
3514 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3515
3516 Py_DECREF(substring);
3517 return result;
3518}
3519
3520static PyObject *
3521unicode_getitem(PyUnicodeObject *self, int index)
3522{
3523 if (index < 0 || index >= self->length) {
3524 PyErr_SetString(PyExc_IndexError, "string index out of range");
3525 return NULL;
3526 }
3527
3528 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3529}
3530
3531static long
3532unicode_hash(PyUnicodeObject *self)
3533{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003534 /* Since Unicode objects compare equal to their ASCII string
3535 counterparts, they should use the individual character values
3536 as basis for their hash value. This is needed to assure that
3537 strings and Unicode objects behave in the same way as
3538 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003539
Fredrik Lundhdde61642000-07-10 18:27:47 +00003540 register int len;
3541 register Py_UNICODE *p;
3542 register long x;
3543
Guido van Rossumd57fd912000-03-10 22:53:23 +00003544 if (self->hash != -1)
3545 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003546 len = PyUnicode_GET_SIZE(self);
3547 p = PyUnicode_AS_UNICODE(self);
3548 x = *p << 7;
3549 while (--len >= 0)
3550 x = (1000003*x) ^ *p++;
3551 x ^= PyUnicode_GET_SIZE(self);
3552 if (x == -1)
3553 x = -2;
3554 self->hash = x;
3555 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556}
3557
3558static char index__doc__[] =
3559"S.index(sub [,start [,end]]) -> int\n\
3560\n\
3561Like S.find() but raise ValueError when the substring is not found.";
3562
3563static PyObject *
3564unicode_index(PyUnicodeObject *self, PyObject *args)
3565{
3566 int result;
3567 PyUnicodeObject *substring;
3568 int start = 0;
3569 int end = INT_MAX;
3570
Guido van Rossumb8872e62000-05-09 14:14:27 +00003571 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3572 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003573 return NULL;
3574
3575 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3576 (PyObject *)substring);
3577 if (substring == NULL)
3578 return NULL;
3579
3580 result = findstring(self, substring, start, end, 1);
3581
3582 Py_DECREF(substring);
3583 if (result < 0) {
3584 PyErr_SetString(PyExc_ValueError, "substring not found");
3585 return NULL;
3586 }
3587 return PyInt_FromLong(result);
3588}
3589
3590static char islower__doc__[] =
3591"S.islower() -> int\n\
3592\n\
3593Return 1 if all cased characters in S are lowercase and there is\n\
3594at least one cased character in S, 0 otherwise.";
3595
3596static PyObject*
3597unicode_islower(PyUnicodeObject *self, PyObject *args)
3598{
3599 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3600 register const Py_UNICODE *e;
3601 int cased;
3602
3603 if (!PyArg_NoArgs(args))
3604 return NULL;
3605
3606 /* Shortcut for single character strings */
3607 if (PyUnicode_GET_SIZE(self) == 1)
3608 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3609
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003610 /* Special case for empty strings */
3611 if (PyString_GET_SIZE(self) == 0)
3612 return PyInt_FromLong(0);
3613
Guido van Rossumd57fd912000-03-10 22:53:23 +00003614 e = p + PyUnicode_GET_SIZE(self);
3615 cased = 0;
3616 for (; p < e; p++) {
3617 register const Py_UNICODE ch = *p;
3618
3619 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3620 return PyInt_FromLong(0);
3621 else if (!cased && Py_UNICODE_ISLOWER(ch))
3622 cased = 1;
3623 }
3624 return PyInt_FromLong(cased);
3625}
3626
3627static char isupper__doc__[] =
3628"S.isupper() -> int\n\
3629\n\
3630Return 1 if all cased characters in S are uppercase and there is\n\
3631at least one cased character in S, 0 otherwise.";
3632
3633static PyObject*
3634unicode_isupper(PyUnicodeObject *self, PyObject *args)
3635{
3636 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3637 register const Py_UNICODE *e;
3638 int cased;
3639
3640 if (!PyArg_NoArgs(args))
3641 return NULL;
3642
3643 /* Shortcut for single character strings */
3644 if (PyUnicode_GET_SIZE(self) == 1)
3645 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3646
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003647 /* Special case for empty strings */
3648 if (PyString_GET_SIZE(self) == 0)
3649 return PyInt_FromLong(0);
3650
Guido van Rossumd57fd912000-03-10 22:53:23 +00003651 e = p + PyUnicode_GET_SIZE(self);
3652 cased = 0;
3653 for (; p < e; p++) {
3654 register const Py_UNICODE ch = *p;
3655
3656 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3657 return PyInt_FromLong(0);
3658 else if (!cased && Py_UNICODE_ISUPPER(ch))
3659 cased = 1;
3660 }
3661 return PyInt_FromLong(cased);
3662}
3663
3664static char istitle__doc__[] =
3665"S.istitle() -> int\n\
3666\n\
3667Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3668may only follow uncased characters and lowercase characters only cased\n\
3669ones. Return 0 otherwise.";
3670
3671static PyObject*
3672unicode_istitle(PyUnicodeObject *self, PyObject *args)
3673{
3674 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3675 register const Py_UNICODE *e;
3676 int cased, previous_is_cased;
3677
3678 if (!PyArg_NoArgs(args))
3679 return NULL;
3680
3681 /* Shortcut for single character strings */
3682 if (PyUnicode_GET_SIZE(self) == 1)
3683 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3684 (Py_UNICODE_ISUPPER(*p) != 0));
3685
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003686 /* Special case for empty strings */
3687 if (PyString_GET_SIZE(self) == 0)
3688 return PyInt_FromLong(0);
3689
Guido van Rossumd57fd912000-03-10 22:53:23 +00003690 e = p + PyUnicode_GET_SIZE(self);
3691 cased = 0;
3692 previous_is_cased = 0;
3693 for (; p < e; p++) {
3694 register const Py_UNICODE ch = *p;
3695
3696 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3697 if (previous_is_cased)
3698 return PyInt_FromLong(0);
3699 previous_is_cased = 1;
3700 cased = 1;
3701 }
3702 else if (Py_UNICODE_ISLOWER(ch)) {
3703 if (!previous_is_cased)
3704 return PyInt_FromLong(0);
3705 previous_is_cased = 1;
3706 cased = 1;
3707 }
3708 else
3709 previous_is_cased = 0;
3710 }
3711 return PyInt_FromLong(cased);
3712}
3713
3714static char isspace__doc__[] =
3715"S.isspace() -> int\n\
3716\n\
3717Return 1 if there are only whitespace characters in S,\n\
37180 otherwise.";
3719
3720static PyObject*
3721unicode_isspace(PyUnicodeObject *self, PyObject *args)
3722{
3723 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3724 register const Py_UNICODE *e;
3725
3726 if (!PyArg_NoArgs(args))
3727 return NULL;
3728
3729 /* Shortcut for single character strings */
3730 if (PyUnicode_GET_SIZE(self) == 1 &&
3731 Py_UNICODE_ISSPACE(*p))
3732 return PyInt_FromLong(1);
3733
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003734 /* Special case for empty strings */
3735 if (PyString_GET_SIZE(self) == 0)
3736 return PyInt_FromLong(0);
3737
Guido van Rossumd57fd912000-03-10 22:53:23 +00003738 e = p + PyUnicode_GET_SIZE(self);
3739 for (; p < e; p++) {
3740 if (!Py_UNICODE_ISSPACE(*p))
3741 return PyInt_FromLong(0);
3742 }
3743 return PyInt_FromLong(1);
3744}
3745
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003746static char isalpha__doc__[] =
3747"S.isalpha() -> int\n\
3748\n\
3749Return 1 if all characters in S are alphabetic\n\
3750and there is at least one character in S, 0 otherwise.";
3751
3752static PyObject*
3753unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3754{
3755 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3756 register const Py_UNICODE *e;
3757
3758 if (!PyArg_NoArgs(args))
3759 return NULL;
3760
3761 /* Shortcut for single character strings */
3762 if (PyUnicode_GET_SIZE(self) == 1 &&
3763 Py_UNICODE_ISALPHA(*p))
3764 return PyInt_FromLong(1);
3765
3766 /* Special case for empty strings */
3767 if (PyString_GET_SIZE(self) == 0)
3768 return PyInt_FromLong(0);
3769
3770 e = p + PyUnicode_GET_SIZE(self);
3771 for (; p < e; p++) {
3772 if (!Py_UNICODE_ISALPHA(*p))
3773 return PyInt_FromLong(0);
3774 }
3775 return PyInt_FromLong(1);
3776}
3777
3778static char isalnum__doc__[] =
3779"S.isalnum() -> int\n\
3780\n\
3781Return 1 if all characters in S are alphanumeric\n\
3782and there is at least one character in S, 0 otherwise.";
3783
3784static PyObject*
3785unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3786{
3787 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3788 register const Py_UNICODE *e;
3789
3790 if (!PyArg_NoArgs(args))
3791 return NULL;
3792
3793 /* Shortcut for single character strings */
3794 if (PyUnicode_GET_SIZE(self) == 1 &&
3795 Py_UNICODE_ISALNUM(*p))
3796 return PyInt_FromLong(1);
3797
3798 /* Special case for empty strings */
3799 if (PyString_GET_SIZE(self) == 0)
3800 return PyInt_FromLong(0);
3801
3802 e = p + PyUnicode_GET_SIZE(self);
3803 for (; p < e; p++) {
3804 if (!Py_UNICODE_ISALNUM(*p))
3805 return PyInt_FromLong(0);
3806 }
3807 return PyInt_FromLong(1);
3808}
3809
Guido van Rossumd57fd912000-03-10 22:53:23 +00003810static char isdecimal__doc__[] =
3811"S.isdecimal() -> int\n\
3812\n\
3813Return 1 if there are only decimal characters in S,\n\
38140 otherwise.";
3815
3816static PyObject*
3817unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3818{
3819 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3820 register const Py_UNICODE *e;
3821
3822 if (!PyArg_NoArgs(args))
3823 return NULL;
3824
3825 /* Shortcut for single character strings */
3826 if (PyUnicode_GET_SIZE(self) == 1 &&
3827 Py_UNICODE_ISDECIMAL(*p))
3828 return PyInt_FromLong(1);
3829
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003830 /* Special case for empty strings */
3831 if (PyString_GET_SIZE(self) == 0)
3832 return PyInt_FromLong(0);
3833
Guido van Rossumd57fd912000-03-10 22:53:23 +00003834 e = p + PyUnicode_GET_SIZE(self);
3835 for (; p < e; p++) {
3836 if (!Py_UNICODE_ISDECIMAL(*p))
3837 return PyInt_FromLong(0);
3838 }
3839 return PyInt_FromLong(1);
3840}
3841
3842static char isdigit__doc__[] =
3843"S.isdigit() -> int\n\
3844\n\
3845Return 1 if there are only digit characters in S,\n\
38460 otherwise.";
3847
3848static PyObject*
3849unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3850{
3851 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3852 register const Py_UNICODE *e;
3853
3854 if (!PyArg_NoArgs(args))
3855 return NULL;
3856
3857 /* Shortcut for single character strings */
3858 if (PyUnicode_GET_SIZE(self) == 1 &&
3859 Py_UNICODE_ISDIGIT(*p))
3860 return PyInt_FromLong(1);
3861
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003862 /* Special case for empty strings */
3863 if (PyString_GET_SIZE(self) == 0)
3864 return PyInt_FromLong(0);
3865
Guido van Rossumd57fd912000-03-10 22:53:23 +00003866 e = p + PyUnicode_GET_SIZE(self);
3867 for (; p < e; p++) {
3868 if (!Py_UNICODE_ISDIGIT(*p))
3869 return PyInt_FromLong(0);
3870 }
3871 return PyInt_FromLong(1);
3872}
3873
3874static char isnumeric__doc__[] =
3875"S.isnumeric() -> int\n\
3876\n\
3877Return 1 if there are only numeric characters in S,\n\
38780 otherwise.";
3879
3880static PyObject*
3881unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3882{
3883 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3884 register const Py_UNICODE *e;
3885
3886 if (!PyArg_NoArgs(args))
3887 return NULL;
3888
3889 /* Shortcut for single character strings */
3890 if (PyUnicode_GET_SIZE(self) == 1 &&
3891 Py_UNICODE_ISNUMERIC(*p))
3892 return PyInt_FromLong(1);
3893
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003894 /* Special case for empty strings */
3895 if (PyString_GET_SIZE(self) == 0)
3896 return PyInt_FromLong(0);
3897
Guido van Rossumd57fd912000-03-10 22:53:23 +00003898 e = p + PyUnicode_GET_SIZE(self);
3899 for (; p < e; p++) {
3900 if (!Py_UNICODE_ISNUMERIC(*p))
3901 return PyInt_FromLong(0);
3902 }
3903 return PyInt_FromLong(1);
3904}
3905
3906static char join__doc__[] =
3907"S.join(sequence) -> unicode\n\
3908\n\
3909Return a string which is the concatenation of the strings in the\n\
3910sequence. The separator between elements is S.";
3911
3912static PyObject*
3913unicode_join(PyUnicodeObject *self, PyObject *args)
3914{
3915 PyObject *data;
3916 if (!PyArg_ParseTuple(args, "O:join", &data))
3917 return NULL;
3918
3919 return PyUnicode_Join((PyObject *)self, data);
3920}
3921
3922static int
3923unicode_length(PyUnicodeObject *self)
3924{
3925 return self->length;
3926}
3927
3928static char ljust__doc__[] =
3929"S.ljust(width) -> unicode\n\
3930\n\
3931Return S left justified in a Unicode string of length width. Padding is\n\
3932done using spaces.";
3933
3934static PyObject *
3935unicode_ljust(PyUnicodeObject *self, PyObject *args)
3936{
3937 int width;
3938 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3939 return NULL;
3940
3941 if (self->length >= width) {
3942 Py_INCREF(self);
3943 return (PyObject*) self;
3944 }
3945
3946 return (PyObject*) pad(self, 0, width - self->length, ' ');
3947}
3948
3949static char lower__doc__[] =
3950"S.lower() -> unicode\n\
3951\n\
3952Return a copy of the string S converted to lowercase.";
3953
3954static PyObject*
3955unicode_lower(PyUnicodeObject *self, PyObject *args)
3956{
3957 if (!PyArg_NoArgs(args))
3958 return NULL;
3959 return fixup(self, fixlower);
3960}
3961
3962static char lstrip__doc__[] =
3963"S.lstrip() -> unicode\n\
3964\n\
3965Return a copy of the string S with leading whitespace removed.";
3966
3967static PyObject *
3968unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3969{
3970 if (!PyArg_NoArgs(args))
3971 return NULL;
3972 return strip(self, 1, 0);
3973}
3974
3975static PyObject*
3976unicode_repeat(PyUnicodeObject *str, int len)
3977{
3978 PyUnicodeObject *u;
3979 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00003980 int nchars;
3981 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003982
3983 if (len < 0)
3984 len = 0;
3985
3986 if (len == 1) {
3987 /* no repeat, return original string */
3988 Py_INCREF(str);
3989 return (PyObject*) str;
3990 }
Tim Peters8f422462000-09-09 06:13:41 +00003991
3992 /* ensure # of chars needed doesn't overflow int and # of bytes
3993 * needed doesn't overflow size_t
3994 */
3995 nchars = len * str->length;
3996 if (len && nchars / len != str->length) {
3997 PyErr_SetString(PyExc_OverflowError,
3998 "repeated string is too long");
3999 return NULL;
4000 }
4001 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4002 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4003 PyErr_SetString(PyExc_OverflowError,
4004 "repeated string is too long");
4005 return NULL;
4006 }
4007 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004008 if (!u)
4009 return NULL;
4010
4011 p = u->str;
4012
4013 while (len-- > 0) {
4014 Py_UNICODE_COPY(p, str->str, str->length);
4015 p += str->length;
4016 }
4017
4018 return (PyObject*) u;
4019}
4020
4021PyObject *PyUnicode_Replace(PyObject *obj,
4022 PyObject *subobj,
4023 PyObject *replobj,
4024 int maxcount)
4025{
4026 PyObject *self;
4027 PyObject *str1;
4028 PyObject *str2;
4029 PyObject *result;
4030
4031 self = PyUnicode_FromObject(obj);
4032 if (self == NULL)
4033 return NULL;
4034 str1 = PyUnicode_FromObject(subobj);
4035 if (str1 == NULL) {
4036 Py_DECREF(self);
4037 return NULL;
4038 }
4039 str2 = PyUnicode_FromObject(replobj);
4040 if (str2 == NULL) {
4041 Py_DECREF(self);
4042 Py_DECREF(str1);
4043 return NULL;
4044 }
4045 result = replace((PyUnicodeObject *)self,
4046 (PyUnicodeObject *)str1,
4047 (PyUnicodeObject *)str2,
4048 maxcount);
4049 Py_DECREF(self);
4050 Py_DECREF(str1);
4051 Py_DECREF(str2);
4052 return result;
4053}
4054
4055static char replace__doc__[] =
4056"S.replace (old, new[, maxsplit]) -> unicode\n\
4057\n\
4058Return a copy of S with all occurrences of substring\n\
4059old replaced by new. If the optional argument maxsplit is\n\
4060given, only the first maxsplit occurrences are replaced.";
4061
4062static PyObject*
4063unicode_replace(PyUnicodeObject *self, PyObject *args)
4064{
4065 PyUnicodeObject *str1;
4066 PyUnicodeObject *str2;
4067 int maxcount = -1;
4068 PyObject *result;
4069
4070 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4071 return NULL;
4072 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4073 if (str1 == NULL)
4074 return NULL;
4075 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4076 if (str2 == NULL)
4077 return NULL;
4078
4079 result = replace(self, str1, str2, maxcount);
4080
4081 Py_DECREF(str1);
4082 Py_DECREF(str2);
4083 return result;
4084}
4085
4086static
4087PyObject *unicode_repr(PyObject *unicode)
4088{
4089 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4090 PyUnicode_GET_SIZE(unicode),
4091 1);
4092}
4093
4094static char rfind__doc__[] =
4095"S.rfind(sub [,start [,end]]) -> int\n\
4096\n\
4097Return the highest index in S where substring sub is found,\n\
4098such that sub is contained within s[start,end]. Optional\n\
4099arguments start and end are interpreted as in slice notation.\n\
4100\n\
4101Return -1 on failure.";
4102
4103static PyObject *
4104unicode_rfind(PyUnicodeObject *self, PyObject *args)
4105{
4106 PyUnicodeObject *substring;
4107 int start = 0;
4108 int end = INT_MAX;
4109 PyObject *result;
4110
Guido van Rossumb8872e62000-05-09 14:14:27 +00004111 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4112 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113 return NULL;
4114 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4115 (PyObject *)substring);
4116 if (substring == NULL)
4117 return NULL;
4118
4119 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4120
4121 Py_DECREF(substring);
4122 return result;
4123}
4124
4125static char rindex__doc__[] =
4126"S.rindex(sub [,start [,end]]) -> int\n\
4127\n\
4128Like S.rfind() but raise ValueError when the substring is not found.";
4129
4130static PyObject *
4131unicode_rindex(PyUnicodeObject *self, PyObject *args)
4132{
4133 int result;
4134 PyUnicodeObject *substring;
4135 int start = 0;
4136 int end = INT_MAX;
4137
Guido van Rossumb8872e62000-05-09 14:14:27 +00004138 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4139 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004140 return NULL;
4141 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4142 (PyObject *)substring);
4143 if (substring == NULL)
4144 return NULL;
4145
4146 result = findstring(self, substring, start, end, -1);
4147
4148 Py_DECREF(substring);
4149 if (result < 0) {
4150 PyErr_SetString(PyExc_ValueError, "substring not found");
4151 return NULL;
4152 }
4153 return PyInt_FromLong(result);
4154}
4155
4156static char rjust__doc__[] =
4157"S.rjust(width) -> unicode\n\
4158\n\
4159Return S right justified in a Unicode string of length width. Padding is\n\
4160done using spaces.";
4161
4162static PyObject *
4163unicode_rjust(PyUnicodeObject *self, PyObject *args)
4164{
4165 int width;
4166 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4167 return NULL;
4168
4169 if (self->length >= width) {
4170 Py_INCREF(self);
4171 return (PyObject*) self;
4172 }
4173
4174 return (PyObject*) pad(self, width - self->length, 0, ' ');
4175}
4176
4177static char rstrip__doc__[] =
4178"S.rstrip() -> unicode\n\
4179\n\
4180Return a copy of the string S with trailing whitespace removed.";
4181
4182static PyObject *
4183unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4184{
4185 if (!PyArg_NoArgs(args))
4186 return NULL;
4187 return strip(self, 0, 1);
4188}
4189
4190static PyObject*
4191unicode_slice(PyUnicodeObject *self, int start, int end)
4192{
4193 /* standard clamping */
4194 if (start < 0)
4195 start = 0;
4196 if (end < 0)
4197 end = 0;
4198 if (end > self->length)
4199 end = self->length;
4200 if (start == 0 && end == self->length) {
4201 /* full slice, return original string */
4202 Py_INCREF(self);
4203 return (PyObject*) self;
4204 }
4205 if (start > end)
4206 start = end;
4207 /* copy slice */
4208 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4209 end - start);
4210}
4211
4212PyObject *PyUnicode_Split(PyObject *s,
4213 PyObject *sep,
4214 int maxsplit)
4215{
4216 PyObject *result;
4217
4218 s = PyUnicode_FromObject(s);
4219 if (s == NULL)
4220 return NULL;
4221 if (sep != NULL) {
4222 sep = PyUnicode_FromObject(sep);
4223 if (sep == NULL) {
4224 Py_DECREF(s);
4225 return NULL;
4226 }
4227 }
4228
4229 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4230
4231 Py_DECREF(s);
4232 Py_XDECREF(sep);
4233 return result;
4234}
4235
4236static char split__doc__[] =
4237"S.split([sep [,maxsplit]]) -> list of strings\n\
4238\n\
4239Return a list of the words in S, using sep as the\n\
4240delimiter string. If maxsplit is given, at most maxsplit\n\
4241splits are done. If sep is not specified, any whitespace string\n\
4242is a separator.";
4243
4244static PyObject*
4245unicode_split(PyUnicodeObject *self, PyObject *args)
4246{
4247 PyObject *substring = Py_None;
4248 int maxcount = -1;
4249
4250 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4251 return NULL;
4252
4253 if (substring == Py_None)
4254 return split(self, NULL, maxcount);
4255 else if (PyUnicode_Check(substring))
4256 return split(self, (PyUnicodeObject *)substring, maxcount);
4257 else
4258 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4259}
4260
4261static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004262"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004263\n\
4264Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004265Line breaks are not included in the resulting list unless keepends\n\
4266is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004267
4268static PyObject*
4269unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4270{
Guido van Rossum86662912000-04-11 15:38:46 +00004271 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004272
Guido van Rossum86662912000-04-11 15:38:46 +00004273 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004274 return NULL;
4275
Guido van Rossum86662912000-04-11 15:38:46 +00004276 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004277}
4278
4279static
4280PyObject *unicode_str(PyUnicodeObject *self)
4281{
Fred Drakee4315f52000-05-09 19:53:39 +00004282 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004283}
4284
4285static char strip__doc__[] =
4286"S.strip() -> unicode\n\
4287\n\
4288Return a copy of S with leading and trailing whitespace removed.";
4289
4290static PyObject *
4291unicode_strip(PyUnicodeObject *self, PyObject *args)
4292{
4293 if (!PyArg_NoArgs(args))
4294 return NULL;
4295 return strip(self, 1, 1);
4296}
4297
4298static char swapcase__doc__[] =
4299"S.swapcase() -> unicode\n\
4300\n\
4301Return a copy of S with uppercase characters converted to lowercase\n\
4302and vice versa.";
4303
4304static PyObject*
4305unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4306{
4307 if (!PyArg_NoArgs(args))
4308 return NULL;
4309 return fixup(self, fixswapcase);
4310}
4311
4312static char translate__doc__[] =
4313"S.translate(table) -> unicode\n\
4314\n\
4315Return a copy of the string S, where all characters have been mapped\n\
4316through the given translation table, which must be a mapping of\n\
4317Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4318are left untouched. Characters mapped to None are deleted.";
4319
4320static PyObject*
4321unicode_translate(PyUnicodeObject *self, PyObject *args)
4322{
4323 PyObject *table;
4324
4325 if (!PyArg_ParseTuple(args, "O:translate", &table))
4326 return NULL;
4327 return PyUnicode_TranslateCharmap(self->str,
4328 self->length,
4329 table,
4330 "ignore");
4331}
4332
4333static char upper__doc__[] =
4334"S.upper() -> unicode\n\
4335\n\
4336Return a copy of S converted to uppercase.";
4337
4338static PyObject*
4339unicode_upper(PyUnicodeObject *self, PyObject *args)
4340{
4341 if (!PyArg_NoArgs(args))
4342 return NULL;
4343 return fixup(self, fixupper);
4344}
4345
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method implementation of S.zfill(width) (currently compiled out).

   If self is already at least `width` characters long, self is returned
   with a new reference.  Otherwise the string is padded on the left
   with '0' via pad(), and a leading '+' or '-' of the original text is
   moved to the very front of the padded result.

   Returns NULL on argument-parsing failure or if pad() returns NULL. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    int width;
    PyUnicodeObject *u;

    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() is not visible here; do not dereference its result
       unless it actually produced an object. */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original text. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4381
#if 0
/* Debugging aid (compiled out): report unicode_freelist_size as a
   Python int.  Takes no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
4391
4392static char startswith__doc__[] =
4393"S.startswith(prefix[, start[, end]]) -> int\n\
4394\n\
4395Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4396optional start, test S beginning at that position. With optional end, stop\n\
4397comparing S at that position.";
4398
4399static PyObject *
4400unicode_startswith(PyUnicodeObject *self,
4401 PyObject *args)
4402{
4403 PyUnicodeObject *substring;
4404 int start = 0;
4405 int end = INT_MAX;
4406 PyObject *result;
4407
Guido van Rossumb8872e62000-05-09 14:14:27 +00004408 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4409 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004410 return NULL;
4411 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4412 (PyObject *)substring);
4413 if (substring == NULL)
4414 return NULL;
4415
4416 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4417
4418 Py_DECREF(substring);
4419 return result;
4420}
4421
4422
4423static char endswith__doc__[] =
4424"S.endswith(suffix[, start[, end]]) -> int\n\
4425\n\
4426Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4427optional start, test S beginning at that position. With optional end, stop\n\
4428comparing S at that position.";
4429
4430static PyObject *
4431unicode_endswith(PyUnicodeObject *self,
4432 PyObject *args)
4433{
4434 PyUnicodeObject *substring;
4435 int start = 0;
4436 int end = INT_MAX;
4437 PyObject *result;
4438
Guido van Rossumb8872e62000-05-09 14:14:27 +00004439 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4440 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004441 return NULL;
4442 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4443 (PyObject *)substring);
4444 if (substring == NULL)
4445 return NULL;
4446
4447 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4448
4449 Py_DECREF(substring);
4450 return result;
4451}
4452
4453
4454static PyMethodDef unicode_methods[] = {
4455
4456 /* Order is according to common usage: often used methods should
4457 appear first, since lookup is done sequentially. */
4458
4459 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4460 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4461 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4462 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4463 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4464 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4465 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4466 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4467 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4468 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4469 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4470 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4471 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4472 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4473/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4474 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4475 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4476 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4477 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4478 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4479 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4480 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4481 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4482 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4483 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4484 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4485 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4486 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4487 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4488 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4489 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4490 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4491 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004492 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4493 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004494#if 0
4495 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4496 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4497#endif
4498
4499#if 0
4500 /* This one is just used for debugging the implementation. */
4501 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4502#endif
4503
4504 {NULL, NULL}
4505};
4506
4507static PyObject *
4508unicode_getattr(PyUnicodeObject *self, char *name)
4509{
4510 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4511}
4512
4513static PySequenceMethods unicode_as_sequence = {
4514 (inquiry) unicode_length, /* sq_length */
4515 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4516 (intargfunc) unicode_repeat, /* sq_repeat */
4517 (intargfunc) unicode_getitem, /* sq_item */
4518 (intintargfunc) unicode_slice, /* sq_slice */
4519 0, /* sq_ass_item */
4520 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004521 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004522};
4523
4524static int
4525unicode_buffer_getreadbuf(PyUnicodeObject *self,
4526 int index,
4527 const void **ptr)
4528{
4529 if (index != 0) {
4530 PyErr_SetString(PyExc_SystemError,
4531 "accessing non-existent unicode segment");
4532 return -1;
4533 }
4534 *ptr = (void *) self->str;
4535 return PyUnicode_GET_DATA_SIZE(self);
4536}
4537
4538static int
4539unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4540 const void **ptr)
4541{
4542 PyErr_SetString(PyExc_TypeError,
4543 "cannot use unicode as modifyable buffer");
4544 return -1;
4545}
4546
4547static int
4548unicode_buffer_getsegcount(PyUnicodeObject *self,
4549 int *lenp)
4550{
4551 if (lenp)
4552 *lenp = PyUnicode_GET_DATA_SIZE(self);
4553 return 1;
4554}
4555
4556static int
4557unicode_buffer_getcharbuf(PyUnicodeObject *self,
4558 int index,
4559 const void **ptr)
4560{
4561 PyObject *str;
4562
4563 if (index != 0) {
4564 PyErr_SetString(PyExc_SystemError,
4565 "accessing non-existent unicode segment");
4566 return -1;
4567 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004568 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004569 if (str == NULL)
4570 return -1;
4571 *ptr = (void *) PyString_AS_STRING(str);
4572 return PyString_GET_SIZE(str);
4573}
4574
4575/* Helpers for PyUnicode_Format() */
4576
4577static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004578getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579{
4580 int argidx = *p_argidx;
4581 if (argidx < arglen) {
4582 (*p_argidx)++;
4583 if (arglen < 0)
4584 return args;
4585 else
4586 return PyTuple_GetItem(args, argidx);
4587 }
4588 PyErr_SetString(PyExc_TypeError,
4589 "not enough arguments for format string");
4590 return NULL;
4591}
4592
/* Conversion flag bits collected while parsing a format specifier. */
#define F_LJUST (1 << 0)    /* set by '-' (and by a negative '*' width) */
#define F_SIGN  (1 << 1)    /* set by '+' */
#define F_BLANK (1 << 2)    /* set by ' ' */
#define F_ALT   (1 << 3)    /* set by '#' */
#define F_ZERO  (1 << 4)    /* set by '0' */
4598
4599static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004600int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004601{
4602 register int i;
4603 int len;
4604 va_list va;
4605 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004606 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004607
4608 /* First, format the string as char array, then expand to Py_UNICODE
4609 array. */
4610 charbuffer = (char *)buffer;
4611 len = vsprintf(charbuffer, format, va);
4612 for (i = len - 1; i >= 0; i--)
4613 buffer[i] = (Py_UNICODE) charbuffer[i];
4614
4615 va_end(va);
4616 return len;
4617}
4618
4619static int
4620formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004621 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004622 int flags,
4623 int prec,
4624 int type,
4625 PyObject *v)
4626{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004627 /* fmt = '%#.' + `prec` + `type`
4628 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004629 char fmt[20];
4630 double x;
4631
4632 x = PyFloat_AsDouble(v);
4633 if (x == -1.0 && PyErr_Occurred())
4634 return -1;
4635 if (prec < 0)
4636 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004637 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4638 type = 'g';
4639 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004640 /* worst case length calc to ensure no buffer overrun:
4641 fmt = %#.<prec>g
4642 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4643 for any double rep.)
4644 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4645 If prec=0 the effective precision is 1 (the leading digit is
4646 always given), therefore increase by one to 10+prec. */
4647 if (buflen <= (size_t)10 + (size_t)prec) {
4648 PyErr_SetString(PyExc_OverflowError,
4649 "formatted float is too long (precision too long?)");
4650 return -1;
4651 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004652 return usprintf(buf, fmt, x);
4653}
4654
Tim Peters38fd5b62000-09-21 05:43:11 +00004655static PyObject*
4656formatlong(PyObject *val, int flags, int prec, int type)
4657{
4658 char *buf;
4659 int i, len;
4660 PyObject *str; /* temporary string object. */
4661 PyUnicodeObject *result;
4662
4663 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4664 if (!str)
4665 return NULL;
4666 result = _PyUnicode_New(len);
4667 for (i = 0; i < len; i++)
4668 result->str[i] = buf[i];
4669 result->str[len] = 0;
4670 Py_DECREF(str);
4671 return (PyObject*)result;
4672}
4673
Guido van Rossumd57fd912000-03-10 22:53:23 +00004674static int
4675formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004676 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004677 int flags,
4678 int prec,
4679 int type,
4680 PyObject *v)
4681{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004682 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004683 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4684 + 1 + 1 = 24*/
4685 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004686 long x;
4687
4688 x = PyInt_AsLong(v);
4689 if (x == -1 && PyErr_Occurred())
4690 return -1;
4691 if (prec < 0)
4692 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004693 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4694 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4695 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4696 PyErr_SetString(PyExc_OverflowError,
4697 "formatted integer is too long (precision too long?)");
4698 return -1;
4699 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004700 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4701 return usprintf(buf, fmt, x);
4702}
4703
4704static int
4705formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004706 size_t buflen,
4707 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004708{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004709 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004710 if (PyUnicode_Check(v)) {
4711 if (PyUnicode_GET_SIZE(v) != 1)
4712 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004714 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004715
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004716 else if (PyString_Check(v)) {
4717 if (PyString_GET_SIZE(v) != 1)
4718 goto onError;
4719 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4720 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004721
4722 else {
4723 /* Integer input truncated to a character */
4724 long x;
4725 x = PyInt_AsLong(v);
4726 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004727 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004728 buf[0] = (char) x;
4729 }
4730 buf[1] = '\0';
4731 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004732
4733 onError:
4734 PyErr_SetString(PyExc_TypeError,
4735 "%c requires int or char");
4736 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004737}
4738
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand wherever it is used (e.g. as the operand of sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
4748
Guido van Rossumd57fd912000-03-10 22:53:23 +00004749PyObject *PyUnicode_Format(PyObject *format,
4750 PyObject *args)
4751{
4752 Py_UNICODE *fmt, *res;
4753 int fmtcnt, rescnt, reslen, arglen, argidx;
4754 int args_owned = 0;
4755 PyUnicodeObject *result = NULL;
4756 PyObject *dict = NULL;
4757 PyObject *uformat;
4758
4759 if (format == NULL || args == NULL) {
4760 PyErr_BadInternalCall();
4761 return NULL;
4762 }
4763 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004764 if (uformat == NULL)
4765 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004766 fmt = PyUnicode_AS_UNICODE(uformat);
4767 fmtcnt = PyUnicode_GET_SIZE(uformat);
4768
4769 reslen = rescnt = fmtcnt + 100;
4770 result = _PyUnicode_New(reslen);
4771 if (result == NULL)
4772 goto onError;
4773 res = PyUnicode_AS_UNICODE(result);
4774
4775 if (PyTuple_Check(args)) {
4776 arglen = PyTuple_Size(args);
4777 argidx = 0;
4778 }
4779 else {
4780 arglen = -1;
4781 argidx = -2;
4782 }
4783 if (args->ob_type->tp_as_mapping)
4784 dict = args;
4785
4786 while (--fmtcnt >= 0) {
4787 if (*fmt != '%') {
4788 if (--rescnt < 0) {
4789 rescnt = fmtcnt + 100;
4790 reslen += rescnt;
4791 if (_PyUnicode_Resize(result, reslen) < 0)
4792 return NULL;
4793 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4794 --rescnt;
4795 }
4796 *res++ = *fmt++;
4797 }
4798 else {
4799 /* Got a format specifier */
4800 int flags = 0;
4801 int width = -1;
4802 int prec = -1;
4803 int size = 0;
4804 Py_UNICODE c = '\0';
4805 Py_UNICODE fill;
4806 PyObject *v = NULL;
4807 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004808 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004809 Py_UNICODE sign;
4810 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004811 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004812
4813 fmt++;
4814 if (*fmt == '(') {
4815 Py_UNICODE *keystart;
4816 int keylen;
4817 PyObject *key;
4818 int pcount = 1;
4819
4820 if (dict == NULL) {
4821 PyErr_SetString(PyExc_TypeError,
4822 "format requires a mapping");
4823 goto onError;
4824 }
4825 ++fmt;
4826 --fmtcnt;
4827 keystart = fmt;
4828 /* Skip over balanced parentheses */
4829 while (pcount > 0 && --fmtcnt >= 0) {
4830 if (*fmt == ')')
4831 --pcount;
4832 else if (*fmt == '(')
4833 ++pcount;
4834 fmt++;
4835 }
4836 keylen = fmt - keystart - 1;
4837 if (fmtcnt < 0 || pcount > 0) {
4838 PyErr_SetString(PyExc_ValueError,
4839 "incomplete format key");
4840 goto onError;
4841 }
Fred Drakee4315f52000-05-09 19:53:39 +00004842 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004843 then looked up since Python uses strings to hold
4844 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004845 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 key = PyUnicode_EncodeUTF8(keystart,
4847 keylen,
4848 NULL);
4849 if (key == NULL)
4850 goto onError;
4851 if (args_owned) {
4852 Py_DECREF(args);
4853 args_owned = 0;
4854 }
4855 args = PyObject_GetItem(dict, key);
4856 Py_DECREF(key);
4857 if (args == NULL) {
4858 goto onError;
4859 }
4860 args_owned = 1;
4861 arglen = -1;
4862 argidx = -2;
4863 }
4864 while (--fmtcnt >= 0) {
4865 switch (c = *fmt++) {
4866 case '-': flags |= F_LJUST; continue;
4867 case '+': flags |= F_SIGN; continue;
4868 case ' ': flags |= F_BLANK; continue;
4869 case '#': flags |= F_ALT; continue;
4870 case '0': flags |= F_ZERO; continue;
4871 }
4872 break;
4873 }
4874 if (c == '*') {
4875 v = getnextarg(args, arglen, &argidx);
4876 if (v == NULL)
4877 goto onError;
4878 if (!PyInt_Check(v)) {
4879 PyErr_SetString(PyExc_TypeError,
4880 "* wants int");
4881 goto onError;
4882 }
4883 width = PyInt_AsLong(v);
4884 if (width < 0) {
4885 flags |= F_LJUST;
4886 width = -width;
4887 }
4888 if (--fmtcnt >= 0)
4889 c = *fmt++;
4890 }
4891 else if (c >= '0' && c <= '9') {
4892 width = c - '0';
4893 while (--fmtcnt >= 0) {
4894 c = *fmt++;
4895 if (c < '0' || c > '9')
4896 break;
4897 if ((width*10) / 10 != width) {
4898 PyErr_SetString(PyExc_ValueError,
4899 "width too big");
4900 goto onError;
4901 }
4902 width = width*10 + (c - '0');
4903 }
4904 }
4905 if (c == '.') {
4906 prec = 0;
4907 if (--fmtcnt >= 0)
4908 c = *fmt++;
4909 if (c == '*') {
4910 v = getnextarg(args, arglen, &argidx);
4911 if (v == NULL)
4912 goto onError;
4913 if (!PyInt_Check(v)) {
4914 PyErr_SetString(PyExc_TypeError,
4915 "* wants int");
4916 goto onError;
4917 }
4918 prec = PyInt_AsLong(v);
4919 if (prec < 0)
4920 prec = 0;
4921 if (--fmtcnt >= 0)
4922 c = *fmt++;
4923 }
4924 else if (c >= '0' && c <= '9') {
4925 prec = c - '0';
4926 while (--fmtcnt >= 0) {
4927 c = Py_CHARMASK(*fmt++);
4928 if (c < '0' || c > '9')
4929 break;
4930 if ((prec*10) / 10 != prec) {
4931 PyErr_SetString(PyExc_ValueError,
4932 "prec too big");
4933 goto onError;
4934 }
4935 prec = prec*10 + (c - '0');
4936 }
4937 }
4938 } /* prec */
4939 if (fmtcnt >= 0) {
4940 if (c == 'h' || c == 'l' || c == 'L') {
4941 size = c;
4942 if (--fmtcnt >= 0)
4943 c = *fmt++;
4944 }
4945 }
4946 if (fmtcnt < 0) {
4947 PyErr_SetString(PyExc_ValueError,
4948 "incomplete format");
4949 goto onError;
4950 }
4951 if (c != '%') {
4952 v = getnextarg(args, arglen, &argidx);
4953 if (v == NULL)
4954 goto onError;
4955 }
4956 sign = 0;
4957 fill = ' ';
4958 switch (c) {
4959
4960 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004961 pbuf = formatbuf;
4962 /* presume that buffer length is at least 1 */
4963 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004964 len = 1;
4965 break;
4966
4967 case 's':
4968 case 'r':
4969 if (PyUnicode_Check(v) && c == 's') {
4970 temp = v;
4971 Py_INCREF(temp);
4972 }
4973 else {
4974 PyObject *unicode;
4975 if (c == 's')
4976 temp = PyObject_Str(v);
4977 else
4978 temp = PyObject_Repr(v);
4979 if (temp == NULL)
4980 goto onError;
4981 if (!PyString_Check(temp)) {
4982 /* XXX Note: this should never happen, since
4983 PyObject_Repr() and PyObject_Str() assure
4984 this */
4985 Py_DECREF(temp);
4986 PyErr_SetString(PyExc_TypeError,
4987 "%s argument has non-string str()");
4988 goto onError;
4989 }
Fred Drakee4315f52000-05-09 19:53:39 +00004990 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00004992 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004993 "strict");
4994 Py_DECREF(temp);
4995 temp = unicode;
4996 if (temp == NULL)
4997 goto onError;
4998 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004999 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005000 len = PyUnicode_GET_SIZE(temp);
5001 if (prec >= 0 && len > prec)
5002 len = prec;
5003 break;
5004
5005 case 'i':
5006 case 'd':
5007 case 'u':
5008 case 'o':
5009 case 'x':
5010 case 'X':
5011 if (c == 'i')
5012 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005013 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005014 temp = formatlong(v, flags, prec, c);
5015 if (!temp)
5016 goto onError;
5017 pbuf = PyUnicode_AS_UNICODE(temp);
5018 len = PyUnicode_GET_SIZE(temp);
5019 /* unbounded ints can always produce
5020 a sign character! */
5021 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005022 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005023 else {
5024 pbuf = formatbuf;
5025 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5026 flags, prec, c, v);
5027 if (len < 0)
5028 goto onError;
5029 /* only d conversion is signed */
5030 sign = c == 'd';
5031 }
5032 if (flags & F_ZERO)
5033 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005034 break;
5035
5036 case 'e':
5037 case 'E':
5038 case 'f':
5039 case 'g':
5040 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005041 pbuf = formatbuf;
5042 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5043 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005044 if (len < 0)
5045 goto onError;
5046 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005047 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005048 fill = '0';
5049 break;
5050
5051 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005052 pbuf = formatbuf;
5053 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005054 if (len < 0)
5055 goto onError;
5056 break;
5057
5058 default:
5059 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005060 "unsupported format character '%c' (0x%x) "
5061 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005062 (31<=c && c<=126) ? c : '?',
5063 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005064 goto onError;
5065 }
5066 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005067 if (*pbuf == '-' || *pbuf == '+') {
5068 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005069 len--;
5070 }
5071 else if (flags & F_SIGN)
5072 sign = '+';
5073 else if (flags & F_BLANK)
5074 sign = ' ';
5075 else
5076 sign = 0;
5077 }
5078 if (width < len)
5079 width = len;
5080 if (rescnt < width + (sign != 0)) {
5081 reslen -= rescnt;
5082 rescnt = width + fmtcnt + 100;
5083 reslen += rescnt;
5084 if (_PyUnicode_Resize(result, reslen) < 0)
5085 return NULL;
5086 res = PyUnicode_AS_UNICODE(result)
5087 + reslen - rescnt;
5088 }
5089 if (sign) {
5090 if (fill != ' ')
5091 *res++ = sign;
5092 rescnt--;
5093 if (width > len)
5094 width--;
5095 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005096 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5097 assert(pbuf[0] == '0');
5098 assert(pbuf[1] == c);
5099 if (fill != ' ') {
5100 *res++ = *pbuf++;
5101 *res++ = *pbuf++;
5102 }
5103 rescnt -= 2;
5104 width -= 2;
5105 if (width < 0)
5106 width = 0;
5107 len -= 2;
5108 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005109 if (width > len && !(flags & F_LJUST)) {
5110 do {
5111 --rescnt;
5112 *res++ = fill;
5113 } while (--width > len);
5114 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005115 if (fill == ' ') {
5116 if (sign)
5117 *res++ = sign;
5118 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5119 assert(pbuf[0] == '0');
5120 assert(pbuf[1] == c);
5121 *res++ = *pbuf++;
5122 *res++ = *pbuf++;
5123 }
5124 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005125 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005126 res += len;
5127 rescnt -= len;
5128 while (--width >= len) {
5129 --rescnt;
5130 *res++ = ' ';
5131 }
5132 if (dict && (argidx < arglen) && c != '%') {
5133 PyErr_SetString(PyExc_TypeError,
5134 "not all arguments converted");
5135 goto onError;
5136 }
5137 Py_XDECREF(temp);
5138 } /* '%' */
5139 } /* until end */
5140 if (argidx < arglen && !dict) {
5141 PyErr_SetString(PyExc_TypeError,
5142 "not all arguments converted");
5143 goto onError;
5144 }
5145
5146 if (args_owned) {
5147 Py_DECREF(args);
5148 }
5149 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005150 if (_PyUnicode_Resize(result, reslen - rescnt))
5151 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005152 return (PyObject *)result;
5153
5154 onError:
5155 Py_XDECREF(result);
5156 Py_DECREF(uformat);
5157 if (args_owned) {
5158 Py_DECREF(args);
5159 }
5160 return NULL;
5161}
5162
5163static PyBufferProcs unicode_as_buffer = {
5164 (getreadbufferproc) unicode_buffer_getreadbuf,
5165 (getwritebufferproc) unicode_buffer_getwritebuf,
5166 (getsegcountproc) unicode_buffer_getsegcount,
5167 (getcharbufferproc) unicode_buffer_getcharbuf,
5168};
5169
5170PyTypeObject PyUnicode_Type = {
5171 PyObject_HEAD_INIT(&PyType_Type)
5172 0, /* ob_size */
5173 "unicode", /* tp_name */
5174 sizeof(PyUnicodeObject), /* tp_size */
5175 0, /* tp_itemsize */
5176 /* Slots */
5177 (destructor)_PyUnicode_Free, /* tp_dealloc */
5178 0, /* tp_print */
5179 (getattrfunc)unicode_getattr, /* tp_getattr */
5180 0, /* tp_setattr */
5181 (cmpfunc) unicode_compare, /* tp_compare */
5182 (reprfunc) unicode_repr, /* tp_repr */
5183 0, /* tp_as_number */
5184 &unicode_as_sequence, /* tp_as_sequence */
5185 0, /* tp_as_mapping */
5186 (hashfunc) unicode_hash, /* tp_hash*/
5187 0, /* tp_call*/
5188 (reprfunc) unicode_str, /* tp_str */
5189 (getattrofunc) NULL, /* tp_getattro */
5190 (setattrofunc) NULL, /* tp_setattro */
5191 &unicode_as_buffer, /* tp_as_buffer */
5192 Py_TPFLAGS_DEFAULT, /* tp_flags */
5193};
5194
5195/* Initialize the Unicode implementation */
5196
Thomas Wouters78890102000-07-22 19:25:51 +00005197void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005198{
5199 /* Doublecheck the configuration... */
5200 if (sizeof(Py_UNICODE) != 2)
5201 Py_FatalError("Unicode configuration error: "
5202 "sizeof(Py_UNICODE) != 2 bytes");
5203
Fred Drakee4315f52000-05-09 19:53:39 +00005204 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005205 unicode_freelist = NULL;
5206 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005208 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209}
5210
5211/* Finalize the Unicode implementation */
5212
5213void
Thomas Wouters78890102000-07-22 19:25:51 +00005214_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005216 PyUnicodeObject *u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005218 Py_XDECREF(unicode_empty);
5219 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005220
5221 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005222 PyUnicodeObject *v = u;
5223 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005224 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005225 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005226 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005227 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005228 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005229 unicode_freelist = NULL;
5230 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005231}