blob: 585afe6364739792c8372c64040d714969646dc8 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
86/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +000088
89/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000090static PyUnicodeObject *unicode_freelist;
91static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
Fred Drakee4315f52000-05-09 19:53:39 +000093/* Default encoding to use and assume when NULL is passed as encoding
94 parameter; it is initialized by _PyUnicode_Init().
95
96 Always use the PyUnicode_SetDefaultEncoding() and
97 PyUnicode_GetDefaultEncoding() APIs to access this global.
98
99*/
100
101static char unicode_default_encoding[100];
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* --- Unicode Object ----------------------------------------------------- */
104
105static
106int _PyUnicode_Resize(register PyUnicodeObject *unicode,
107 int length)
108{
109 void *oldstr;
110
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000111 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000112 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000113 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000114
115 /* Resizing unicode_empty is not allowed. */
116 if (unicode == unicode_empty) {
117 PyErr_SetString(PyExc_SystemError,
118 "can't resize empty unicode object");
119 return -1;
120 }
121
122 /* We allocate one more byte to make sure the string is
123 Ux0000 terminated -- XXX is this needed ? */
124 oldstr = unicode->str;
125 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
126 if (!unicode->str) {
127 unicode->str = oldstr;
128 PyErr_NoMemory();
129 return -1;
130 }
131 unicode->str[length] = 0;
132 unicode->length = length;
133
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000134 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000135 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000136 if (unicode->defenc) {
137 Py_DECREF(unicode->defenc);
138 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 }
140 unicode->hash = -1;
141
142 return 0;
143}
144
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000145int PyUnicode_Resize(PyObject **unicode,
146 int length)
147{
148 PyUnicodeObject *v;
149
150 if (unicode == NULL) {
151 PyErr_BadInternalCall();
152 return -1;
153 }
154 v = (PyUnicodeObject *)*unicode;
155 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
156 PyErr_BadInternalCall();
157 return -1;
158 }
159 return _PyUnicode_Resize(v, length);
160}
161
Guido van Rossumd57fd912000-03-10 22:53:23 +0000162/* We allocate one more byte to make sure the string is
163 Ux0000 terminated -- XXX is this needed ?
164
165 XXX This allocator could further be enhanced by assuring that the
166 free list never reduces its size below 1.
167
168*/
169
170static
171PyUnicodeObject *_PyUnicode_New(int length)
172{
173 register PyUnicodeObject *unicode;
174
175 /* Optimization for empty strings */
176 if (length == 0 && unicode_empty != NULL) {
177 Py_INCREF(unicode_empty);
178 return unicode_empty;
179 }
180
181 /* Unicode freelist & memory allocation */
182 if (unicode_freelist) {
183 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000184 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000186 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000187 /* Keep-Alive optimization: we only upsize the buffer,
188 never downsize it. */
189 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000191 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000192 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 }
194 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000195 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000197 }
198 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 }
200 else {
201 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
202 if (unicode == NULL)
203 return NULL;
204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
205 }
206
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000207 if (!unicode->str) {
208 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000209 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 unicode->str[length] = 0;
212 unicode->length = length;
213 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000215 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000216
217 onError:
218 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000219 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000220 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221}
222
223static
224void _PyUnicode_Free(register PyUnicodeObject *unicode)
225{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000227 /* Keep-Alive optimization */
228 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000229 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000230 unicode->str = NULL;
231 unicode->length = 0;
232 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000233 if (unicode->defenc) {
234 Py_DECREF(unicode->defenc);
235 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000236 }
237 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238 *(PyUnicodeObject **)unicode = unicode_freelist;
239 unicode_freelist = unicode;
240 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 }
242 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000243 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000244 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000245 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 }
247}
248
249PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
250 int size)
251{
252 PyUnicodeObject *unicode;
253
254 unicode = _PyUnicode_New(size);
255 if (!unicode)
256 return NULL;
257
258 /* Copy the Unicode data into the new object */
259 if (u != NULL)
260 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
261
262 return (PyObject *)unicode;
263}
264
265#ifdef HAVE_WCHAR_H
266
267PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
268 int size)
269{
270 PyUnicodeObject *unicode;
271
272 if (w == NULL) {
273 PyErr_BadInternalCall();
274 return NULL;
275 }
276
277 unicode = _PyUnicode_New(size);
278 if (!unicode)
279 return NULL;
280
281 /* Copy the wchar_t data into the new object */
282#ifdef HAVE_USABLE_WCHAR_T
283 memcpy(unicode->str, w, size * sizeof(wchar_t));
284#else
285 {
286 register Py_UNICODE *u;
287 register int i;
288 u = PyUnicode_AS_UNICODE(unicode);
289 for (i = size; i >= 0; i--)
290 *u++ = *w++;
291 }
292#endif
293
294 return (PyObject *)unicode;
295}
296
297int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
298 register wchar_t *w,
299 int size)
300{
301 if (unicode == NULL) {
302 PyErr_BadInternalCall();
303 return -1;
304 }
305 if (size > PyUnicode_GET_SIZE(unicode))
306 size = PyUnicode_GET_SIZE(unicode);
307#ifdef HAVE_USABLE_WCHAR_T
308 memcpy(w, unicode->str, size * sizeof(wchar_t));
309#else
310 {
311 register Py_UNICODE *u;
312 register int i;
313 u = PyUnicode_AS_UNICODE(unicode);
314 for (i = size; i >= 0; i--)
315 *w++ = *u++;
316 }
317#endif
318
319 return size;
320}
321
322#endif
323
324PyObject *PyUnicode_FromObject(register PyObject *obj)
325{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000326 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
327}
328
329PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
330 const char *encoding,
331 const char *errors)
332{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 const char *s;
334 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000335 int owned = 0;
336 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337
338 if (obj == NULL) {
339 PyErr_BadInternalCall();
340 return NULL;
341 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000342
343 /* Coerce object */
344 if (PyInstance_Check(obj)) {
345 PyObject *func;
346 func = PyObject_GetAttrString(obj, "__str__");
347 if (func == NULL) {
348 PyErr_SetString(PyExc_TypeError,
349 "coercing to Unicode: instance doesn't define __str__");
350 return NULL;
351 }
352 obj = PyEval_CallObject(func, NULL);
353 Py_DECREF(func);
354 if (obj == NULL)
355 return NULL;
356 owned = 1;
357 }
358 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000360 v = obj;
361 if (encoding) {
362 PyErr_SetString(PyExc_TypeError,
363 "decoding Unicode is not supported");
364 return NULL;
365 }
366 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 }
368 else if (PyString_Check(obj)) {
369 s = PyString_AS_STRING(obj);
370 len = PyString_GET_SIZE(obj);
371 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000372 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
373 /* Overwrite the error message with something more useful in
374 case of a TypeError. */
375 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000376 PyErr_Format(PyExc_TypeError,
377 "coercing to Unicode: need string or buffer, "
378 "%.80s found",
379 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000380 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000381 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000382
383 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 if (len == 0) {
385 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000386 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000388 else
389 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000390
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000391 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000392 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000393 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000394 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000395 return v;
396
397 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000398 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000399 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000400 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000401 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402}
403
404PyObject *PyUnicode_Decode(const char *s,
405 int size,
406 const char *encoding,
407 const char *errors)
408{
409 PyObject *buffer = NULL, *unicode;
410
Fred Drakee4315f52000-05-09 19:53:39 +0000411 if (encoding == NULL)
412 encoding = PyUnicode_GetDefaultEncoding();
413
414 /* Shortcuts for common default encodings */
415 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000416 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000417 else if (strcmp(encoding, "latin-1") == 0)
418 return PyUnicode_DecodeLatin1(s, size, errors);
419 else if (strcmp(encoding, "ascii") == 0)
420 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000421
422 /* Decode via the codec registry */
423 buffer = PyBuffer_FromMemory((void *)s, size);
424 if (buffer == NULL)
425 goto onError;
426 unicode = PyCodec_Decode(buffer, encoding, errors);
427 if (unicode == NULL)
428 goto onError;
429 if (!PyUnicode_Check(unicode)) {
430 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000431 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 unicode->ob_type->tp_name);
433 Py_DECREF(unicode);
434 goto onError;
435 }
436 Py_DECREF(buffer);
437 return unicode;
438
439 onError:
440 Py_XDECREF(buffer);
441 return NULL;
442}
443
444PyObject *PyUnicode_Encode(const Py_UNICODE *s,
445 int size,
446 const char *encoding,
447 const char *errors)
448{
449 PyObject *v, *unicode;
450
451 unicode = PyUnicode_FromUnicode(s, size);
452 if (unicode == NULL)
453 return NULL;
454 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
455 Py_DECREF(unicode);
456 return v;
457}
458
459PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
460 const char *encoding,
461 const char *errors)
462{
463 PyObject *v;
464
465 if (!PyUnicode_Check(unicode)) {
466 PyErr_BadArgument();
467 goto onError;
468 }
Fred Drakee4315f52000-05-09 19:53:39 +0000469
470 if (encoding == NULL)
471 encoding = PyUnicode_GetDefaultEncoding();
472
473 /* Shortcuts for common default encodings */
474 if (errors == NULL) {
475 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000476 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000477 else if (strcmp(encoding, "latin-1") == 0)
478 return PyUnicode_AsLatin1String(unicode);
479 else if (strcmp(encoding, "ascii") == 0)
480 return PyUnicode_AsASCIIString(unicode);
481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000482
483 /* Encode via the codec registry */
484 v = PyCodec_Encode(unicode, encoding, errors);
485 if (v == NULL)
486 goto onError;
487 /* XXX Should we really enforce this ? */
488 if (!PyString_Check(v)) {
489 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000490 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000491 v->ob_type->tp_name);
492 Py_DECREF(v);
493 goto onError;
494 }
495 return v;
496
497 onError:
498 return NULL;
499}
500
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000501/* Return a Python string holding the default encoded value of the
502 Unicode object.
503
504 The resulting string is cached in the Unicode object for subsequent
505 usage by this function. The cached version is needed to implement
506 the character buffer interface and will live (at least) as long as
507 the Unicode object itself.
508
509 The refcount of the string is *not* incremented.
510
511 *** Exported for internal use by the interpreter only !!! ***
512
513*/
514
515PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
516 const char *errors)
517{
518 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
519
520 if (v)
521 return v;
522 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
523 if (v && errors == NULL)
524 ((PyUnicodeObject *)unicode)->defenc = v;
525 return v;
526}
527
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
529{
530 if (!PyUnicode_Check(unicode)) {
531 PyErr_BadArgument();
532 goto onError;
533 }
534 return PyUnicode_AS_UNICODE(unicode);
535
536 onError:
537 return NULL;
538}
539
540int PyUnicode_GetSize(PyObject *unicode)
541{
542 if (!PyUnicode_Check(unicode)) {
543 PyErr_BadArgument();
544 goto onError;
545 }
546 return PyUnicode_GET_SIZE(unicode);
547
548 onError:
549 return -1;
550}
551
Thomas Wouters78890102000-07-22 19:25:51 +0000552const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000553{
554 return unicode_default_encoding;
555}
556
557int PyUnicode_SetDefaultEncoding(const char *encoding)
558{
559 PyObject *v;
560
561 /* Make sure the encoding is valid. As side effect, this also
562 loads the encoding into the codec registry cache. */
563 v = _PyCodec_Lookup(encoding);
564 if (v == NULL)
565 goto onError;
566 Py_DECREF(v);
567 strncpy(unicode_default_encoding,
568 encoding,
569 sizeof(unicode_default_encoding));
570 return 0;
571
572 onError:
573 return -1;
574}
575
Guido van Rossumd57fd912000-03-10 22:53:23 +0000576/* --- UTF-8 Codec -------------------------------------------------------- */
577
/* Sequence length of a UTF-8 encoded character, indexed by its first
   byte.  A value of zero marks a byte that is not a legal prefix.
   See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7f: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xbf: illegal as prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xc0 - 0xdf: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0 - 0xef: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0 - 0xff: four, five, six bytes; 0xfe and 0xff are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
599
600static
601int utf8_decoding_error(const char **source,
602 Py_UNICODE **dest,
603 const char *errors,
604 const char *details)
605{
606 if ((errors == NULL) ||
607 (strcmp(errors,"strict") == 0)) {
608 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000609 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000610 details);
611 return -1;
612 }
613 else if (strcmp(errors,"ignore") == 0) {
614 (*source)++;
615 return 0;
616 }
617 else if (strcmp(errors,"replace") == 0) {
618 (*source)++;
619 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
620 (*dest)++;
621 return 0;
622 }
623 else {
624 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000625 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 errors);
627 return -1;
628 }
629}
630
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631PyObject *PyUnicode_DecodeUTF8(const char *s,
632 int size,
633 const char *errors)
634{
635 int n;
636 const char *e;
637 PyUnicodeObject *unicode;
638 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000639 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000640
641 /* Note: size will always be longer than the resulting Unicode
642 character count */
643 unicode = _PyUnicode_New(size);
644 if (!unicode)
645 return NULL;
646 if (size == 0)
647 return (PyObject *)unicode;
648
649 /* Unpack UTF-8 encoded data */
650 p = unicode->str;
651 e = s + size;
652
653 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000654 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655
656 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000657 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 s++;
659 continue;
660 }
661
662 n = utf8_code_length[ch];
663
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000664 if (s + n > e) {
665 errmsg = "unexpected end of data";
666 goto utf8Error;
667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000668
669 switch (n) {
670
671 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000672 errmsg = "unexpected code byte";
673 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000674 break;
675
676 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000677 errmsg = "internal error";
678 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000679 break;
680
681 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000682 if ((s[1] & 0xc0) != 0x80) {
683 errmsg = "invalid data";
684 goto utf8Error;
685 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000686 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000687 if (ch < 0x80) {
688 errmsg = "illegal encoding";
689 goto utf8Error;
690 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000691 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000692 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693 break;
694
695 case 3:
696 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000697 (s[2] & 0xc0) != 0x80) {
698 errmsg = "invalid data";
699 goto utf8Error;
700 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000701 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000702 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
703 errmsg = "illegal encoding";
704 goto utf8Error;
705 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000706 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000707 *p++ = (Py_UNICODE)ch;
708 break;
709
710 case 4:
711 if ((s[1] & 0xc0) != 0x80 ||
712 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000713 (s[3] & 0xc0) != 0x80) {
714 errmsg = "invalid data";
715 goto utf8Error;
716 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000717 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
718 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
719 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000720 if ((ch < 0x10000) || /* minimum value allowed for 4
721 byte encoding */
722 (ch > 0x10ffff)) { /* maximum value allowed for
723 UTF-16 */
724 errmsg = "illegal encoding";
725 goto utf8Error;
726 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000727 /* compute and append the two surrogates: */
728
729 /* translate from 10000..10FFFF to 0..FFFF */
730 ch -= 0x10000;
731
732 /* high surrogate = top 10 bits added to D800 */
733 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
734
735 /* low surrogate = bottom 10 bits added to DC00 */
736 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000737 break;
738
739 default:
740 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000741 errmsg = "unsupported Unicode code range";
742 goto utf8Error;
743 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000744 }
745 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000746 continue;
747
748 utf8Error:
749 if (utf8_decoding_error(&s, &p, errors, errmsg))
750 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000751 }
752
753 /* Adjust length */
754 if (_PyUnicode_Resize(unicode, p - unicode->str))
755 goto onError;
756
757 return (PyObject *)unicode;
758
759onError:
760 Py_DECREF(unicode);
761 return NULL;
762}
763
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000764/* Not used anymore, now that the encoder supports UTF-16
765 surrogates. */
Greg Steinaf36a3a2000-07-17 09:04:43 +0000766#if 0
Guido van Rossumd57fd912000-03-10 22:53:23 +0000767static
768int utf8_encoding_error(const Py_UNICODE **source,
769 char **dest,
770 const char *errors,
771 const char *details)
772{
773 if ((errors == NULL) ||
774 (strcmp(errors,"strict") == 0)) {
775 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000776 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000777 details);
778 return -1;
779 }
780 else if (strcmp(errors,"ignore") == 0) {
781 return 0;
782 }
783 else if (strcmp(errors,"replace") == 0) {
784 **dest = '?';
785 (*dest)++;
786 return 0;
787 }
788 else {
789 PyErr_Format(PyExc_ValueError,
790 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000791 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000792 errors);
793 return -1;
794 }
795}
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000796#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000797
798PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
799 int size,
800 const char *errors)
801{
802 PyObject *v;
803 char *p;
804 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000805 Py_UCS4 ch2;
806 unsigned int cbAllocated = 3 * size;
807 unsigned int cbWritten = 0;
808 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000809
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000810 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811 if (v == NULL)
812 return NULL;
813 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000814 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815
816 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000817 while (i < size) {
818 Py_UCS4 ch = s[i++];
819 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000820 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000821 cbWritten++;
822 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000823 else if (ch < 0x0800) {
824 *p++ = 0xc0 | (ch >> 6);
825 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000826 cbWritten += 2;
827 }
828 else {
829 /* Check for high surrogate */
830 if (0xD800 <= ch && ch <= 0xDBFF) {
831 if (i != size) {
832 ch2 = s[i];
833 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
834
835 if (cbWritten >= (cbAllocated - 4)) {
836 /* Provide enough room for some more
837 surrogates */
838 cbAllocated += 4*10;
839 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000840 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000841 }
842
843 /* combine the two values */
844 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
845
846 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000847 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000848 i++;
849 cbWritten += 4;
850 }
851 }
852 }
853 else {
854 *p++ = (char)(0xe0 | (ch >> 12));
855 cbWritten += 3;
856 }
857 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
858 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000859 }
860 }
861 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000862 if (_PyString_Resize(&v, p - q))
863 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000864 return v;
865
866 onError:
867 Py_DECREF(v);
868 return NULL;
869}
870
Guido van Rossumd57fd912000-03-10 22:53:23 +0000871PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
872{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000873 if (!PyUnicode_Check(unicode)) {
874 PyErr_BadArgument();
875 return NULL;
876 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000877 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
878 PyUnicode_GET_SIZE(unicode),
879 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880}
881
882/* --- UTF-16 Codec ------------------------------------------------------- */
883
884static
885int utf16_decoding_error(const Py_UNICODE **source,
886 Py_UNICODE **dest,
887 const char *errors,
888 const char *details)
889{
890 if ((errors == NULL) ||
891 (strcmp(errors,"strict") == 0)) {
892 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000893 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000894 details);
895 return -1;
896 }
897 else if (strcmp(errors,"ignore") == 0) {
898 return 0;
899 }
900 else if (strcmp(errors,"replace") == 0) {
901 if (dest) {
902 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
903 (*dest)++;
904 }
905 return 0;
906 }
907 else {
908 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000909 "UTF-16 decoding error; "
910 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000911 errors);
912 return -1;
913 }
914}
915
Guido van Rossumd57fd912000-03-10 22:53:23 +0000916PyObject *PyUnicode_DecodeUTF16(const char *s,
917 int size,
918 const char *errors,
919 int *byteorder)
920{
921 PyUnicodeObject *unicode;
922 Py_UNICODE *p;
923 const Py_UNICODE *q, *e;
924 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000925 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000926
927 /* size should be an even number */
928 if (size % sizeof(Py_UNICODE) != 0) {
929 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
930 return NULL;
931 /* The remaining input chars are ignored if we fall through
932 here... */
933 }
934
935 /* Note: size will always be longer than the resulting Unicode
936 character count */
937 unicode = _PyUnicode_New(size);
938 if (!unicode)
939 return NULL;
940 if (size == 0)
941 return (PyObject *)unicode;
942
943 /* Unpack UTF-16 encoded data */
944 p = unicode->str;
945 q = (Py_UNICODE *)s;
946 e = q + (size / sizeof(Py_UNICODE));
947
948 if (byteorder)
949 bo = *byteorder;
950
951 while (q < e) {
952 register Py_UNICODE ch = *q++;
953
954 /* Check for BOM marks (U+FEFF) in the input and adjust
955 current byte order setting accordingly. Swap input
956 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
957 !) */
958#ifdef BYTEORDER_IS_LITTLE_ENDIAN
959 if (ch == 0xFEFF) {
960 bo = -1;
961 continue;
962 } else if (ch == 0xFFFE) {
963 bo = 1;
964 continue;
965 }
966 if (bo == 1)
967 ch = (ch >> 8) | (ch << 8);
968#else
969 if (ch == 0xFEFF) {
970 bo = 1;
971 continue;
972 } else if (ch == 0xFFFE) {
973 bo = -1;
974 continue;
975 }
976 if (bo == -1)
977 ch = (ch >> 8) | (ch << 8);
978#endif
979 if (ch < 0xD800 || ch > 0xDFFF) {
980 *p++ = ch;
981 continue;
982 }
983
984 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000985 if (q >= e) {
986 errmsg = "unexpected end of data";
987 goto utf16Error;
988 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000989 if (0xDC00 <= *q && *q <= 0xDFFF) {
990 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000991 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000992 /* This is valid data (a UTF-16 surrogate pair), but
993 we are not able to store this information since our
994 Py_UNICODE type only has 16 bits... this might
995 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000996 errmsg = "code pairs are not supported";
997 goto utf16Error;
998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000999 else
1000 continue;
1001 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001002 errmsg = "illegal encoding";
1003 /* Fall through to report the error */
1004
1005 utf16Error:
1006 if (utf16_decoding_error(&q, &p, errors, errmsg))
1007 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008 }
1009
1010 if (byteorder)
1011 *byteorder = bo;
1012
1013 /* Adjust length */
1014 if (_PyUnicode_Resize(unicode, p - unicode->str))
1015 goto onError;
1016
1017 return (PyObject *)unicode;
1018
1019onError:
1020 Py_DECREF(unicode);
1021 return NULL;
1022}
1023
1024#undef UTF16_ERROR
1025
1026PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1027 int size,
1028 const char *errors,
1029 int byteorder)
1030{
1031 PyObject *v;
1032 Py_UNICODE *p;
1033 char *q;
1034
1035 /* We don't create UTF-16 pairs... */
1036 v = PyString_FromStringAndSize(NULL,
1037 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1038 if (v == NULL)
1039 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001040
1041 q = PyString_AS_STRING(v);
1042 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043 if (byteorder == 0)
1044 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001045 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001046 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047 if (byteorder == 0 ||
1048#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1049 byteorder == -1
1050#else
1051 byteorder == 1
1052#endif
1053 )
1054 memcpy(p, s, size * sizeof(Py_UNICODE));
1055 else
1056 while (size-- > 0) {
1057 Py_UNICODE ch = *s++;
1058 *p++ = (ch >> 8) | (ch << 8);
1059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 return v;
1061}
1062
1063PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1064{
1065 if (!PyUnicode_Check(unicode)) {
1066 PyErr_BadArgument();
1067 return NULL;
1068 }
1069 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1070 PyUnicode_GET_SIZE(unicode),
1071 NULL,
1072 0);
1073}
1074
1075/* --- Unicode Escape Codec ----------------------------------------------- */
1076
1077static
1078int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001079 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001080 const char *errors,
1081 const char *details)
1082{
1083 if ((errors == NULL) ||
1084 (strcmp(errors,"strict") == 0)) {
1085 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001086 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087 details);
1088 return -1;
1089 }
1090 else if (strcmp(errors,"ignore") == 0) {
1091 return 0;
1092 }
1093 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001094 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095 return 0;
1096 }
1097 else {
1098 PyErr_Format(PyExc_ValueError,
1099 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001100 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 errors);
1102 return -1;
1103 }
1104}
1105
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001106static _PyUnicode_Name_CAPI *unicode_names = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001107
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1109 int size,
1110 const char *errors)
1111{
1112 PyUnicodeObject *v;
1113 Py_UNICODE *p = NULL, *buf = NULL;
1114 const char *end;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001115 Py_UCS4 chr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116
1117 /* Escaped strings will always be longer than the resulting
1118 Unicode string, so we start with size here and then reduce the
1119 length after conversion to the true value. */
1120 v = _PyUnicode_New(size);
1121 if (v == NULL)
1122 goto onError;
1123 if (size == 0)
1124 return (PyObject *)v;
1125 p = buf = PyUnicode_AS_UNICODE(v);
1126 end = s + size;
1127 while (s < end) {
1128 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001129 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001130 int i;
1131
1132 /* Non-escape characters are interpreted as Unicode ordinals */
1133 if (*s != '\\') {
1134 *p++ = (unsigned char)*s++;
1135 continue;
1136 }
1137
1138 /* \ - Escapes */
1139 s++;
1140 switch (*s++) {
1141
1142 /* \x escapes */
1143 case '\n': break;
1144 case '\\': *p++ = '\\'; break;
1145 case '\'': *p++ = '\''; break;
1146 case '\"': *p++ = '\"'; break;
1147 case 'b': *p++ = '\b'; break;
1148 case 'f': *p++ = '\014'; break; /* FF */
1149 case 't': *p++ = '\t'; break;
1150 case 'n': *p++ = '\n'; break;
1151 case 'r': *p++ = '\r'; break;
1152 case 'v': *p++ = '\013'; break; /* VT */
1153 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1154
1155 /* \OOO (octal) escapes */
1156 case '0': case '1': case '2': case '3':
1157 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001158 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001160 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001162 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001163 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001164 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 break;
1166
Fredrik Lundhdf846752000-09-03 11:29:49 +00001167 /* \xXX with two hex digits */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 case 'x':
Fredrik Lundhdf846752000-09-03 11:29:49 +00001169 for (x = 0, i = 0; i < 2; i++) {
1170 c = (unsigned char)s[i];
1171 if (!isxdigit(c)) {
1172 if (unicodeescape_decoding_error(&s, &x, errors,
1173 "truncated \\xXX"))
1174 goto onError;
1175 i++;
1176 break;
1177 }
1178 x = (x<<4) & ~0xF;
1179 if (c >= '0' && c <= '9')
1180 x += c - '0';
1181 else if (c >= 'a' && c <= 'f')
1182 x += 10 + c - 'a';
1183 else
1184 x += 10 + c - 'A';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001185 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001186 s += i;
1187 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001188 break;
1189
1190 /* \uXXXX with 4 hex digits */
1191 case 'u':
1192 for (x = 0, i = 0; i < 4; i++) {
1193 c = (unsigned char)s[i];
1194 if (!isxdigit(c)) {
1195 if (unicodeescape_decoding_error(&s, &x, errors,
1196 "truncated \\uXXXX"))
1197 goto onError;
1198 i++;
1199 break;
1200 }
1201 x = (x<<4) & ~0xF;
1202 if (c >= '0' && c <= '9')
1203 x += c - '0';
1204 else if (c >= 'a' && c <= 'f')
1205 x += 10 + c - 'a';
1206 else
1207 x += 10 + c - 'A';
1208 }
1209 s += i;
1210 *p++ = x;
1211 break;
1212
Fredrik Lundhdf846752000-09-03 11:29:49 +00001213 /* \UXXXXXXXX with 8 hex digits */
1214 case 'U':
1215 for (chr = 0, i = 0; i < 8; i++) {
1216 c = (unsigned char)s[i];
1217 if (!isxdigit(c)) {
1218 if (unicodeescape_decoding_error(&s, &x, errors,
1219 "truncated \\uXXXX"))
1220 goto onError;
1221 i++;
1222 break;
1223 }
1224 chr = (chr<<4) & ~0xF;
1225 if (c >= '0' && c <= '9')
1226 chr += c - '0';
1227 else if (c >= 'a' && c <= 'f')
1228 chr += 10 + c - 'a';
1229 else
1230 chr += 10 + c - 'A';
1231 }
1232 s += i;
1233 goto store;
1234
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001235 case 'N':
1236 /* Ok, we need to deal with Unicode Character Names now,
1237 * make sure we've imported the hash table data...
1238 */
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001239 if (unicode_names == NULL) {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001240 PyObject *mod = 0, *v = 0;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001241 mod = PyImport_ImportModule("ucnhash");
1242 if (mod == NULL)
Fredrik Lundhf6056062001-01-20 11:15:25 +00001243 goto ucnhashError;
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001244 v = PyObject_GetAttrString(mod,"Unicode_Names_CAPI");
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001245 Py_DECREF(mod);
1246 if (v == NULL)
Fredrik Lundhf6056062001-01-20 11:15:25 +00001247 goto ucnhashError;
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001248 unicode_names = PyCObject_AsVoidPtr(v);
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001249 Py_DECREF(v);
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001250 if (unicode_names == NULL)
Fredrik Lundhf6056062001-01-20 11:15:25 +00001251 goto ucnhashError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001252 }
1253
Fredrik Lundhdf846752000-09-03 11:29:49 +00001254 if (*s == '{') {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001255 const char *start = s + 1;
1256 const char *endBrace = start;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001257
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001258 /* look for the closing brace */
1259 while (*endBrace != '}' && endBrace < end)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001260 endBrace++;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001261 if (endBrace != end && *endBrace == '}') {
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001262 if (!unicode_names->getcode(start, endBrace-start, &chr)) {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001263 if (unicodeescape_decoding_error(
1264 &s, &x, errors,
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00001265 "Invalid Unicode Character Name")
1266 )
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001267 goto onError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001268 goto ucnFallthrough;
1269 }
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001270 s = endBrace + 1;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001271 goto store;
1272 } else {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001273 if (unicodeescape_decoding_error(
1274 &s, &x, errors,
1275 "Unicode name missing closing brace"))
1276 goto onError;
1277 goto ucnFallthrough;
1278 }
1279 break;
1280 }
1281 if (unicodeescape_decoding_error(
1282 &s, &x, errors,
1283 "Missing opening brace for Unicode Character Name escape"))
1284 goto onError;
1285ucnFallthrough:
1286 /* fall through on purpose */
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001287 default:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288 *p++ = '\\';
1289 *p++ = (unsigned char)s[-1];
1290 break;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001291store:
1292 /* when we get here, chr is a 32-bit unicode character */
1293 if (chr <= 0xffff)
1294 /* UCS-2 character */
1295 *p++ = (Py_UNICODE) chr;
1296 else if (chr <= 0x10ffff) {
1297 /* UCS-4 character. store as two surrogate characters */
1298 chr -= 0x10000L;
1299 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1300 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1301 } else {
1302 if (unicodeescape_decoding_error(
1303 &s, &x, errors,
1304 "Illegal Unicode character")
1305 )
1306 goto onError;
1307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 }
1309 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001310 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001311 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001312 return (PyObject *)v;
1313
Fredrik Lundhf6056062001-01-20 11:15:25 +00001314 ucnhashError:
1315 PyErr_SetString(PyExc_UnicodeError,
1316 "\\N escapes not supported (can't load ucnhash module)");
1317 return NULL;
1318
Guido van Rossumd57fd912000-03-10 22:53:23 +00001319 onError:
1320 Py_XDECREF(v);
1321 return NULL;
1322}
1323
1324/* Return a Unicode-Escape string version of the Unicode object.
1325
1326 If quotes is true, the string is enclosed in u"" or u'' quotes as
1327 appropriate.
1328
1329*/
1330
Barry Warsaw51ac5802000-03-20 16:36:48 +00001331static const Py_UNICODE *findchar(const Py_UNICODE *s,
1332 int size,
1333 Py_UNICODE ch);
1334
Guido van Rossumd57fd912000-03-10 22:53:23 +00001335static
1336PyObject *unicodeescape_string(const Py_UNICODE *s,
1337 int size,
1338 int quotes)
1339{
1340 PyObject *repr;
1341 char *p;
1342 char *q;
1343
1344 static const char *hexdigit = "0123456789ABCDEF";
1345
1346 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1347 if (repr == NULL)
1348 return NULL;
1349
1350 p = q = PyString_AS_STRING(repr);
1351
1352 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001353 *p++ = 'u';
1354 *p++ = (findchar(s, size, '\'') &&
1355 !findchar(s, size, '"')) ? '"' : '\'';
1356 }
1357 while (size-- > 0) {
1358 Py_UNICODE ch = *s++;
1359 /* Escape quotes */
1360 if (quotes && (ch == q[1] || ch == '\\')) {
1361 *p++ = '\\';
1362 *p++ = (char) ch;
1363 }
1364 /* Map 16-bit characters to '\uxxxx' */
1365 else if (ch >= 256) {
1366 *p++ = '\\';
1367 *p++ = 'u';
1368 *p++ = hexdigit[(ch >> 12) & 0xf];
1369 *p++ = hexdigit[(ch >> 8) & 0xf];
1370 *p++ = hexdigit[(ch >> 4) & 0xf];
1371 *p++ = hexdigit[ch & 15];
1372 }
1373 /* Map non-printable US ASCII to '\ooo' */
1374 else if (ch < ' ' || ch >= 128) {
1375 *p++ = '\\';
1376 *p++ = hexdigit[(ch >> 6) & 7];
1377 *p++ = hexdigit[(ch >> 3) & 7];
1378 *p++ = hexdigit[ch & 7];
1379 }
1380 /* Copy everything else as-is */
1381 else
1382 *p++ = (char) ch;
1383 }
1384 if (quotes)
1385 *p++ = q[1];
1386
1387 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001388 if (_PyString_Resize(&repr, p - q))
1389 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001390
1391 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001392
1393 onError:
1394 Py_DECREF(repr);
1395 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001396}
1397
1398PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1399 int size)
1400{
1401 return unicodeescape_string(s, size, 0);
1402}
1403
1404PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1405{
1406 if (!PyUnicode_Check(unicode)) {
1407 PyErr_BadArgument();
1408 return NULL;
1409 }
1410 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1411 PyUnicode_GET_SIZE(unicode));
1412}
1413
1414/* --- Raw Unicode Escape Codec ------------------------------------------- */
1415
1416PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1417 int size,
1418 const char *errors)
1419{
1420 PyUnicodeObject *v;
1421 Py_UNICODE *p, *buf;
1422 const char *end;
1423 const char *bs;
1424
1425 /* Escaped strings will always be longer than the resulting
1426 Unicode string, so we start with size here and then reduce the
1427 length after conversion to the true value. */
1428 v = _PyUnicode_New(size);
1429 if (v == NULL)
1430 goto onError;
1431 if (size == 0)
1432 return (PyObject *)v;
1433 p = buf = PyUnicode_AS_UNICODE(v);
1434 end = s + size;
1435 while (s < end) {
1436 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001437 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001438 int i;
1439
1440 /* Non-escape characters are interpreted as Unicode ordinals */
1441 if (*s != '\\') {
1442 *p++ = (unsigned char)*s++;
1443 continue;
1444 }
1445
1446 /* \u-escapes are only interpreted iff the number of leading
1447 backslashes if odd */
1448 bs = s;
1449 for (;s < end;) {
1450 if (*s != '\\')
1451 break;
1452 *p++ = (unsigned char)*s++;
1453 }
1454 if (((s - bs) & 1) == 0 ||
1455 s >= end ||
1456 *s != 'u') {
1457 continue;
1458 }
1459 p--;
1460 s++;
1461
1462 /* \uXXXX with 4 hex digits */
1463 for (x = 0, i = 0; i < 4; i++) {
1464 c = (unsigned char)s[i];
1465 if (!isxdigit(c)) {
1466 if (unicodeescape_decoding_error(&s, &x, errors,
1467 "truncated \\uXXXX"))
1468 goto onError;
1469 i++;
1470 break;
1471 }
1472 x = (x<<4) & ~0xF;
1473 if (c >= '0' && c <= '9')
1474 x += c - '0';
1475 else if (c >= 'a' && c <= 'f')
1476 x += 10 + c - 'a';
1477 else
1478 x += 10 + c - 'A';
1479 }
1480 s += i;
1481 *p++ = x;
1482 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001483 if (_PyUnicode_Resize(v, (int)(p - buf)))
1484 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001485 return (PyObject *)v;
1486
1487 onError:
1488 Py_XDECREF(v);
1489 return NULL;
1490}
1491
1492PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1493 int size)
1494{
1495 PyObject *repr;
1496 char *p;
1497 char *q;
1498
1499 static const char *hexdigit = "0123456789ABCDEF";
1500
1501 repr = PyString_FromStringAndSize(NULL, 6 * size);
1502 if (repr == NULL)
1503 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001504 if (size == 0)
1505 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506
1507 p = q = PyString_AS_STRING(repr);
1508 while (size-- > 0) {
1509 Py_UNICODE ch = *s++;
1510 /* Map 16-bit characters to '\uxxxx' */
1511 if (ch >= 256) {
1512 *p++ = '\\';
1513 *p++ = 'u';
1514 *p++ = hexdigit[(ch >> 12) & 0xf];
1515 *p++ = hexdigit[(ch >> 8) & 0xf];
1516 *p++ = hexdigit[(ch >> 4) & 0xf];
1517 *p++ = hexdigit[ch & 15];
1518 }
1519 /* Copy everything else as-is */
1520 else
1521 *p++ = (char) ch;
1522 }
1523 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001524 if (_PyString_Resize(&repr, p - q))
1525 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526
1527 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001528
1529 onError:
1530 Py_DECREF(repr);
1531 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001532}
1533
1534PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1535{
1536 if (!PyUnicode_Check(unicode)) {
1537 PyErr_BadArgument();
1538 return NULL;
1539 }
1540 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1541 PyUnicode_GET_SIZE(unicode));
1542}
1543
1544/* --- Latin-1 Codec ------------------------------------------------------ */
1545
1546PyObject *PyUnicode_DecodeLatin1(const char *s,
1547 int size,
1548 const char *errors)
1549{
1550 PyUnicodeObject *v;
1551 Py_UNICODE *p;
1552
1553 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1554 v = _PyUnicode_New(size);
1555 if (v == NULL)
1556 goto onError;
1557 if (size == 0)
1558 return (PyObject *)v;
1559 p = PyUnicode_AS_UNICODE(v);
1560 while (size-- > 0)
1561 *p++ = (unsigned char)*s++;
1562 return (PyObject *)v;
1563
1564 onError:
1565 Py_XDECREF(v);
1566 return NULL;
1567}
1568
1569static
1570int latin1_encoding_error(const Py_UNICODE **source,
1571 char **dest,
1572 const char *errors,
1573 const char *details)
1574{
1575 if ((errors == NULL) ||
1576 (strcmp(errors,"strict") == 0)) {
1577 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001578 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001579 details);
1580 return -1;
1581 }
1582 else if (strcmp(errors,"ignore") == 0) {
1583 return 0;
1584 }
1585 else if (strcmp(errors,"replace") == 0) {
1586 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001587 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001588 return 0;
1589 }
1590 else {
1591 PyErr_Format(PyExc_ValueError,
1592 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001593 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001594 errors);
1595 return -1;
1596 }
1597}
1598
1599PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1600 int size,
1601 const char *errors)
1602{
1603 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001604 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001605
Guido van Rossumd57fd912000-03-10 22:53:23 +00001606 repr = PyString_FromStringAndSize(NULL, size);
1607 if (repr == NULL)
1608 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001609 if (size == 0)
1610 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001611
1612 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001613 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001614 while (size-- > 0) {
1615 Py_UNICODE ch = *p++;
1616 if (ch >= 256) {
1617 if (latin1_encoding_error(&p, &s, errors,
1618 "ordinal not in range(256)"))
1619 goto onError;
1620 }
1621 else
1622 *s++ = (char)ch;
1623 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001624 /* Resize if error handling skipped some characters */
1625 if (s - start < PyString_GET_SIZE(repr))
1626 if (_PyString_Resize(&repr, s - start))
1627 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001628 return repr;
1629
1630 onError:
1631 Py_DECREF(repr);
1632 return NULL;
1633}
1634
1635PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1636{
1637 if (!PyUnicode_Check(unicode)) {
1638 PyErr_BadArgument();
1639 return NULL;
1640 }
1641 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1642 PyUnicode_GET_SIZE(unicode),
1643 NULL);
1644}
1645
1646/* --- 7-bit ASCII Codec -------------------------------------------------- */
1647
1648static
1649int ascii_decoding_error(const char **source,
1650 Py_UNICODE **dest,
1651 const char *errors,
1652 const char *details)
1653{
1654 if ((errors == NULL) ||
1655 (strcmp(errors,"strict") == 0)) {
1656 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001657 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001658 details);
1659 return -1;
1660 }
1661 else if (strcmp(errors,"ignore") == 0) {
1662 return 0;
1663 }
1664 else if (strcmp(errors,"replace") == 0) {
1665 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1666 (*dest)++;
1667 return 0;
1668 }
1669 else {
1670 PyErr_Format(PyExc_ValueError,
1671 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001672 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 errors);
1674 return -1;
1675 }
1676}
1677
1678PyObject *PyUnicode_DecodeASCII(const char *s,
1679 int size,
1680 const char *errors)
1681{
1682 PyUnicodeObject *v;
1683 Py_UNICODE *p;
1684
1685 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1686 v = _PyUnicode_New(size);
1687 if (v == NULL)
1688 goto onError;
1689 if (size == 0)
1690 return (PyObject *)v;
1691 p = PyUnicode_AS_UNICODE(v);
1692 while (size-- > 0) {
1693 register unsigned char c;
1694
1695 c = (unsigned char)*s++;
1696 if (c < 128)
1697 *p++ = c;
1698 else if (ascii_decoding_error(&s, &p, errors,
1699 "ordinal not in range(128)"))
1700 goto onError;
1701 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001702 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1703 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1704 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001705 return (PyObject *)v;
1706
1707 onError:
1708 Py_XDECREF(v);
1709 return NULL;
1710}
1711
1712static
1713int ascii_encoding_error(const Py_UNICODE **source,
1714 char **dest,
1715 const char *errors,
1716 const char *details)
1717{
1718 if ((errors == NULL) ||
1719 (strcmp(errors,"strict") == 0)) {
1720 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001721 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001722 details);
1723 return -1;
1724 }
1725 else if (strcmp(errors,"ignore") == 0) {
1726 return 0;
1727 }
1728 else if (strcmp(errors,"replace") == 0) {
1729 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001730 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731 return 0;
1732 }
1733 else {
1734 PyErr_Format(PyExc_ValueError,
1735 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001736 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001737 errors);
1738 return -1;
1739 }
1740}
1741
1742PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1743 int size,
1744 const char *errors)
1745{
1746 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001747 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001748
Guido van Rossumd57fd912000-03-10 22:53:23 +00001749 repr = PyString_FromStringAndSize(NULL, size);
1750 if (repr == NULL)
1751 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001752 if (size == 0)
1753 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001754
1755 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001756 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757 while (size-- > 0) {
1758 Py_UNICODE ch = *p++;
1759 if (ch >= 128) {
1760 if (ascii_encoding_error(&p, &s, errors,
1761 "ordinal not in range(128)"))
1762 goto onError;
1763 }
1764 else
1765 *s++ = (char)ch;
1766 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001767 /* Resize if error handling skipped some characters */
1768 if (s - start < PyString_GET_SIZE(repr))
1769 if (_PyString_Resize(&repr, s - start))
1770 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001771 return repr;
1772
1773 onError:
1774 Py_DECREF(repr);
1775 return NULL;
1776}
1777
1778PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1779{
1780 if (!PyUnicode_Check(unicode)) {
1781 PyErr_BadArgument();
1782 return NULL;
1783 }
1784 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1785 PyUnicode_GET_SIZE(unicode),
1786 NULL);
1787}
1788
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001789#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001790
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001791/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001792
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001793PyObject *PyUnicode_DecodeMBCS(const char *s,
1794 int size,
1795 const char *errors)
1796{
1797 PyUnicodeObject *v;
1798 Py_UNICODE *p;
1799
1800 /* First get the size of the result */
1801 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001802 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001803 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1804
1805 v = _PyUnicode_New(usize);
1806 if (v == NULL)
1807 return NULL;
1808 if (usize == 0)
1809 return (PyObject *)v;
1810 p = PyUnicode_AS_UNICODE(v);
1811 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1812 Py_DECREF(v);
1813 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1814 }
1815
1816 return (PyObject *)v;
1817}
1818
1819PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1820 int size,
1821 const char *errors)
1822{
1823 PyObject *repr;
1824 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001825 DWORD mbcssize;
1826
1827 /* If there are no characters, bail now! */
1828 if (size==0)
1829 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001830
1831 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001832 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001833 if (mbcssize==0)
1834 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1835
1836 repr = PyString_FromStringAndSize(NULL, mbcssize);
1837 if (repr == NULL)
1838 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001839 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001840 return repr;
1841
1842 /* Do the conversion */
1843 s = PyString_AS_STRING(repr);
1844 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1845 Py_DECREF(repr);
1846 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1847 }
1848 return repr;
1849}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001850
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001851#endif /* MS_WIN32 */
1852
Guido van Rossumd57fd912000-03-10 22:53:23 +00001853/* --- Character Mapping Codec -------------------------------------------- */
1854
1855static
1856int charmap_decoding_error(const char **source,
1857 Py_UNICODE **dest,
1858 const char *errors,
1859 const char *details)
1860{
1861 if ((errors == NULL) ||
1862 (strcmp(errors,"strict") == 0)) {
1863 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001864 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001865 details);
1866 return -1;
1867 }
1868 else if (strcmp(errors,"ignore") == 0) {
1869 return 0;
1870 }
1871 else if (strcmp(errors,"replace") == 0) {
1872 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1873 (*dest)++;
1874 return 0;
1875 }
1876 else {
1877 PyErr_Format(PyExc_ValueError,
1878 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001879 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001880 errors);
1881 return -1;
1882 }
1883}
1884
1885PyObject *PyUnicode_DecodeCharmap(const char *s,
1886 int size,
1887 PyObject *mapping,
1888 const char *errors)
1889{
1890 PyUnicodeObject *v;
1891 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001892 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001893
1894 /* Default to Latin-1 */
1895 if (mapping == NULL)
1896 return PyUnicode_DecodeLatin1(s, size, errors);
1897
1898 v = _PyUnicode_New(size);
1899 if (v == NULL)
1900 goto onError;
1901 if (size == 0)
1902 return (PyObject *)v;
1903 p = PyUnicode_AS_UNICODE(v);
1904 while (size-- > 0) {
1905 unsigned char ch = *s++;
1906 PyObject *w, *x;
1907
1908 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1909 w = PyInt_FromLong((long)ch);
1910 if (w == NULL)
1911 goto onError;
1912 x = PyObject_GetItem(mapping, w);
1913 Py_DECREF(w);
1914 if (x == NULL) {
1915 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00001916 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00001918 x = Py_None;
1919 Py_INCREF(x);
1920 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001921 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001922 }
1923
1924 /* Apply mapping */
1925 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001926 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001927 if (value < 0 || value > 65535) {
1928 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001929 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001930 Py_DECREF(x);
1931 goto onError;
1932 }
1933 *p++ = (Py_UNICODE)value;
1934 }
1935 else if (x == Py_None) {
1936 /* undefined mapping */
1937 if (charmap_decoding_error(&s, &p, errors,
1938 "character maps to <undefined>")) {
1939 Py_DECREF(x);
1940 goto onError;
1941 }
1942 }
1943 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001944 int targetsize = PyUnicode_GET_SIZE(x);
1945
1946 if (targetsize == 1)
1947 /* 1-1 mapping */
1948 *p++ = *PyUnicode_AS_UNICODE(x);
1949
1950 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001951 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001952 if (targetsize > extrachars) {
1953 /* resize first */
1954 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
1955 int needed = (targetsize - extrachars) + \
1956 (targetsize << 2);
1957 extrachars += needed;
1958 if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001959 Py_DECREF(x);
1960 goto onError;
1961 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001962 p = PyUnicode_AS_UNICODE(v) + oldpos;
1963 }
1964 Py_UNICODE_COPY(p,
1965 PyUnicode_AS_UNICODE(x),
1966 targetsize);
1967 p += targetsize;
1968 extrachars -= targetsize;
1969 }
1970 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001971 }
1972 else {
1973 /* wrong return value */
1974 PyErr_SetString(PyExc_TypeError,
1975 "character mapping must return integer, None or unicode");
1976 Py_DECREF(x);
1977 goto onError;
1978 }
1979 Py_DECREF(x);
1980 }
1981 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1982 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1983 goto onError;
1984 return (PyObject *)v;
1985
1986 onError:
1987 Py_XDECREF(v);
1988 return NULL;
1989}
1990
1991static
1992int charmap_encoding_error(const Py_UNICODE **source,
1993 char **dest,
1994 const char *errors,
1995 const char *details)
1996{
1997 if ((errors == NULL) ||
1998 (strcmp(errors,"strict") == 0)) {
1999 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002000 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002001 details);
2002 return -1;
2003 }
2004 else if (strcmp(errors,"ignore") == 0) {
2005 return 0;
2006 }
2007 else if (strcmp(errors,"replace") == 0) {
2008 **dest = '?';
2009 (*dest)++;
2010 return 0;
2011 }
2012 else {
2013 PyErr_Format(PyExc_ValueError,
2014 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002015 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002016 errors);
2017 return -1;
2018 }
2019}
2020
2021PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2022 int size,
2023 PyObject *mapping,
2024 const char *errors)
2025{
2026 PyObject *v;
2027 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002028 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002029
2030 /* Default to Latin-1 */
2031 if (mapping == NULL)
2032 return PyUnicode_EncodeLatin1(p, size, errors);
2033
2034 v = PyString_FromStringAndSize(NULL, size);
2035 if (v == NULL)
2036 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002037 if (size == 0)
2038 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039 s = PyString_AS_STRING(v);
2040 while (size-- > 0) {
2041 Py_UNICODE ch = *p++;
2042 PyObject *w, *x;
2043
2044 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2045 w = PyInt_FromLong((long)ch);
2046 if (w == NULL)
2047 goto onError;
2048 x = PyObject_GetItem(mapping, w);
2049 Py_DECREF(w);
2050 if (x == NULL) {
2051 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002052 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002053 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002054 x = Py_None;
2055 Py_INCREF(x);
2056 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002057 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 }
2059
2060 /* Apply mapping */
2061 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002062 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002063 if (value < 0 || value > 255) {
2064 PyErr_SetString(PyExc_TypeError,
2065 "character mapping must be in range(256)");
2066 Py_DECREF(x);
2067 goto onError;
2068 }
2069 *s++ = (char)value;
2070 }
2071 else if (x == Py_None) {
2072 /* undefined mapping */
2073 if (charmap_encoding_error(&p, &s, errors,
2074 "character maps to <undefined>")) {
2075 Py_DECREF(x);
2076 goto onError;
2077 }
2078 }
2079 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002080 int targetsize = PyString_GET_SIZE(x);
2081
2082 if (targetsize == 1)
2083 /* 1-1 mapping */
2084 *s++ = *PyString_AS_STRING(x);
2085
2086 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002087 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002088 if (targetsize > extrachars) {
2089 /* resize first */
2090 int oldpos = (int)(s - PyString_AS_STRING(v));
2091 int needed = (targetsize - extrachars) + \
2092 (targetsize << 2);
2093 extrachars += needed;
2094 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002095 Py_DECREF(x);
2096 goto onError;
2097 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002098 s = PyString_AS_STRING(v) + oldpos;
2099 }
2100 memcpy(s,
2101 PyString_AS_STRING(x),
2102 targetsize);
2103 s += targetsize;
2104 extrachars -= targetsize;
2105 }
2106 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002107 }
2108 else {
2109 /* wrong return value */
2110 PyErr_SetString(PyExc_TypeError,
2111 "character mapping must return integer, None or unicode");
2112 Py_DECREF(x);
2113 goto onError;
2114 }
2115 Py_DECREF(x);
2116 }
2117 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2118 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2119 goto onError;
2120 return v;
2121
2122 onError:
2123 Py_DECREF(v);
2124 return NULL;
2125}
2126
2127PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2128 PyObject *mapping)
2129{
2130 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2131 PyErr_BadArgument();
2132 return NULL;
2133 }
2134 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2135 PyUnicode_GET_SIZE(unicode),
2136 mapping,
2137 NULL);
2138}
2139
2140static
2141int translate_error(const Py_UNICODE **source,
2142 Py_UNICODE **dest,
2143 const char *errors,
2144 const char *details)
2145{
2146 if ((errors == NULL) ||
2147 (strcmp(errors,"strict") == 0)) {
2148 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002149 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002150 details);
2151 return -1;
2152 }
2153 else if (strcmp(errors,"ignore") == 0) {
2154 return 0;
2155 }
2156 else if (strcmp(errors,"replace") == 0) {
2157 **dest = '?';
2158 (*dest)++;
2159 return 0;
2160 }
2161 else {
2162 PyErr_Format(PyExc_ValueError,
2163 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002164 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002165 errors);
2166 return -1;
2167 }
2168}
2169
2170PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2171 int size,
2172 PyObject *mapping,
2173 const char *errors)
2174{
2175 PyUnicodeObject *v;
2176 Py_UNICODE *p;
2177
2178 if (mapping == NULL) {
2179 PyErr_BadArgument();
2180 return NULL;
2181 }
2182
2183 /* Output will never be longer than input */
2184 v = _PyUnicode_New(size);
2185 if (v == NULL)
2186 goto onError;
2187 if (size == 0)
2188 goto done;
2189 p = PyUnicode_AS_UNICODE(v);
2190 while (size-- > 0) {
2191 Py_UNICODE ch = *s++;
2192 PyObject *w, *x;
2193
2194 /* Get mapping */
2195 w = PyInt_FromLong(ch);
2196 if (w == NULL)
2197 goto onError;
2198 x = PyObject_GetItem(mapping, w);
2199 Py_DECREF(w);
2200 if (x == NULL) {
2201 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2202 /* No mapping found: default to 1-1 mapping */
2203 PyErr_Clear();
2204 *p++ = ch;
2205 continue;
2206 }
2207 goto onError;
2208 }
2209
2210 /* Apply mapping */
2211 if (PyInt_Check(x))
2212 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2213 else if (x == Py_None) {
2214 /* undefined mapping */
2215 if (translate_error(&s, &p, errors,
2216 "character maps to <undefined>")) {
2217 Py_DECREF(x);
2218 goto onError;
2219 }
2220 }
2221 else if (PyUnicode_Check(x)) {
2222 if (PyUnicode_GET_SIZE(x) != 1) {
2223 /* 1-n mapping */
2224 PyErr_SetString(PyExc_NotImplementedError,
2225 "1-n mappings are currently not implemented");
2226 Py_DECREF(x);
2227 goto onError;
2228 }
2229 *p++ = *PyUnicode_AS_UNICODE(x);
2230 }
2231 else {
2232 /* wrong return value */
2233 PyErr_SetString(PyExc_TypeError,
2234 "translate mapping must return integer, None or unicode");
2235 Py_DECREF(x);
2236 goto onError;
2237 }
2238 Py_DECREF(x);
2239 }
2240 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002241 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2242 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002243
2244 done:
2245 return (PyObject *)v;
2246
2247 onError:
2248 Py_XDECREF(v);
2249 return NULL;
2250}
2251
2252PyObject *PyUnicode_Translate(PyObject *str,
2253 PyObject *mapping,
2254 const char *errors)
2255{
2256 PyObject *result;
2257
2258 str = PyUnicode_FromObject(str);
2259 if (str == NULL)
2260 goto onError;
2261 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2262 PyUnicode_GET_SIZE(str),
2263 mapping,
2264 errors);
2265 Py_DECREF(str);
2266 return result;
2267
2268 onError:
2269 Py_XDECREF(str);
2270 return NULL;
2271}
2272
Guido van Rossum9e896b32000-04-05 20:11:21 +00002273/* --- Decimal Encoder ---------------------------------------------------- */
2274
2275int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2276 int length,
2277 char *output,
2278 const char *errors)
2279{
2280 Py_UNICODE *p, *end;
2281
2282 if (output == NULL) {
2283 PyErr_BadArgument();
2284 return -1;
2285 }
2286
2287 p = s;
2288 end = s + length;
2289 while (p < end) {
2290 register Py_UNICODE ch = *p++;
2291 int decimal;
2292
2293 if (Py_UNICODE_ISSPACE(ch)) {
2294 *output++ = ' ';
2295 continue;
2296 }
2297 decimal = Py_UNICODE_TODECIMAL(ch);
2298 if (decimal >= 0) {
2299 *output++ = '0' + decimal;
2300 continue;
2301 }
Guido van Rossumba477042000-04-06 18:18:10 +00002302 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002303 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002304 continue;
2305 }
2306 /* All other characters are considered invalid */
2307 if (errors == NULL || strcmp(errors, "strict") == 0) {
2308 PyErr_SetString(PyExc_ValueError,
2309 "invalid decimal Unicode string");
2310 goto onError;
2311 }
2312 else if (strcmp(errors, "ignore") == 0)
2313 continue;
2314 else if (strcmp(errors, "replace") == 0) {
2315 *output++ = '?';
2316 continue;
2317 }
2318 }
2319 /* 0-terminate the output string */
2320 *output++ = '\0';
2321 return 0;
2322
2323 onError:
2324 return -1;
2325}
2326
Guido van Rossumd57fd912000-03-10 22:53:23 +00002327/* --- Helpers ------------------------------------------------------------ */
2328
2329static
2330int count(PyUnicodeObject *self,
2331 int start,
2332 int end,
2333 PyUnicodeObject *substring)
2334{
2335 int count = 0;
2336
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002337 if (start < 0)
2338 start += self->length;
2339 if (start < 0)
2340 start = 0;
2341 if (end > self->length)
2342 end = self->length;
2343 if (end < 0)
2344 end += self->length;
2345 if (end < 0)
2346 end = 0;
2347
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002348 if (substring->length == 0)
2349 return (end - start + 1);
2350
Guido van Rossumd57fd912000-03-10 22:53:23 +00002351 end -= substring->length;
2352
2353 while (start <= end)
2354 if (Py_UNICODE_MATCH(self, start, substring)) {
2355 count++;
2356 start += substring->length;
2357 } else
2358 start++;
2359
2360 return count;
2361}
2362
2363int PyUnicode_Count(PyObject *str,
2364 PyObject *substr,
2365 int start,
2366 int end)
2367{
2368 int result;
2369
2370 str = PyUnicode_FromObject(str);
2371 if (str == NULL)
2372 return -1;
2373 substr = PyUnicode_FromObject(substr);
2374 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002375 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002376 return -1;
2377 }
2378
2379 result = count((PyUnicodeObject *)str,
2380 start, end,
2381 (PyUnicodeObject *)substr);
2382
2383 Py_DECREF(str);
2384 Py_DECREF(substr);
2385 return result;
2386}
2387
2388static
2389int findstring(PyUnicodeObject *self,
2390 PyUnicodeObject *substring,
2391 int start,
2392 int end,
2393 int direction)
2394{
2395 if (start < 0)
2396 start += self->length;
2397 if (start < 0)
2398 start = 0;
2399
2400 if (substring->length == 0)
2401 return start;
2402
2403 if (end > self->length)
2404 end = self->length;
2405 if (end < 0)
2406 end += self->length;
2407 if (end < 0)
2408 end = 0;
2409
2410 end -= substring->length;
2411
2412 if (direction < 0) {
2413 for (; end >= start; end--)
2414 if (Py_UNICODE_MATCH(self, end, substring))
2415 return end;
2416 } else {
2417 for (; start <= end; start++)
2418 if (Py_UNICODE_MATCH(self, start, substring))
2419 return start;
2420 }
2421
2422 return -1;
2423}
2424
2425int PyUnicode_Find(PyObject *str,
2426 PyObject *substr,
2427 int start,
2428 int end,
2429 int direction)
2430{
2431 int result;
2432
2433 str = PyUnicode_FromObject(str);
2434 if (str == NULL)
2435 return -1;
2436 substr = PyUnicode_FromObject(substr);
2437 if (substr == NULL) {
2438 Py_DECREF(substr);
2439 return -1;
2440 }
2441
2442 result = findstring((PyUnicodeObject *)str,
2443 (PyUnicodeObject *)substr,
2444 start, end, direction);
2445 Py_DECREF(str);
2446 Py_DECREF(substr);
2447 return result;
2448}
2449
2450static
2451int tailmatch(PyUnicodeObject *self,
2452 PyUnicodeObject *substring,
2453 int start,
2454 int end,
2455 int direction)
2456{
2457 if (start < 0)
2458 start += self->length;
2459 if (start < 0)
2460 start = 0;
2461
2462 if (substring->length == 0)
2463 return 1;
2464
2465 if (end > self->length)
2466 end = self->length;
2467 if (end < 0)
2468 end += self->length;
2469 if (end < 0)
2470 end = 0;
2471
2472 end -= substring->length;
2473 if (end < start)
2474 return 0;
2475
2476 if (direction > 0) {
2477 if (Py_UNICODE_MATCH(self, end, substring))
2478 return 1;
2479 } else {
2480 if (Py_UNICODE_MATCH(self, start, substring))
2481 return 1;
2482 }
2483
2484 return 0;
2485}
2486
2487int PyUnicode_Tailmatch(PyObject *str,
2488 PyObject *substr,
2489 int start,
2490 int end,
2491 int direction)
2492{
2493 int result;
2494
2495 str = PyUnicode_FromObject(str);
2496 if (str == NULL)
2497 return -1;
2498 substr = PyUnicode_FromObject(substr);
2499 if (substr == NULL) {
2500 Py_DECREF(substr);
2501 return -1;
2502 }
2503
2504 result = tailmatch((PyUnicodeObject *)str,
2505 (PyUnicodeObject *)substr,
2506 start, end, direction);
2507 Py_DECREF(str);
2508 Py_DECREF(substr);
2509 return result;
2510}
2511
2512static
2513const Py_UNICODE *findchar(const Py_UNICODE *s,
2514 int size,
2515 Py_UNICODE ch)
2516{
2517 /* like wcschr, but doesn't stop at NULL characters */
2518
2519 while (size-- > 0) {
2520 if (*s == ch)
2521 return s;
2522 s++;
2523 }
2524
2525 return NULL;
2526}
2527
2528/* Apply fixfct filter to the Unicode object self and return a
2529 reference to the modified object */
2530
2531static
2532PyObject *fixup(PyUnicodeObject *self,
2533 int (*fixfct)(PyUnicodeObject *s))
2534{
2535
2536 PyUnicodeObject *u;
2537
2538 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2539 self->length);
2540 if (u == NULL)
2541 return NULL;
2542 if (!fixfct(u)) {
2543 /* fixfct should return TRUE if it modified the buffer. If
2544 FALSE, return a reference to the original buffer instead
2545 (to save space, not time) */
2546 Py_INCREF(self);
2547 Py_DECREF(u);
2548 return (PyObject*) self;
2549 }
2550 return (PyObject*) u;
2551}
2552
2553static
2554int fixupper(PyUnicodeObject *self)
2555{
2556 int len = self->length;
2557 Py_UNICODE *s = self->str;
2558 int status = 0;
2559
2560 while (len-- > 0) {
2561 register Py_UNICODE ch;
2562
2563 ch = Py_UNICODE_TOUPPER(*s);
2564 if (ch != *s) {
2565 status = 1;
2566 *s = ch;
2567 }
2568 s++;
2569 }
2570
2571 return status;
2572}
2573
2574static
2575int fixlower(PyUnicodeObject *self)
2576{
2577 int len = self->length;
2578 Py_UNICODE *s = self->str;
2579 int status = 0;
2580
2581 while (len-- > 0) {
2582 register Py_UNICODE ch;
2583
2584 ch = Py_UNICODE_TOLOWER(*s);
2585 if (ch != *s) {
2586 status = 1;
2587 *s = ch;
2588 }
2589 s++;
2590 }
2591
2592 return status;
2593}
2594
2595static
2596int fixswapcase(PyUnicodeObject *self)
2597{
2598 int len = self->length;
2599 Py_UNICODE *s = self->str;
2600 int status = 0;
2601
2602 while (len-- > 0) {
2603 if (Py_UNICODE_ISUPPER(*s)) {
2604 *s = Py_UNICODE_TOLOWER(*s);
2605 status = 1;
2606 } else if (Py_UNICODE_ISLOWER(*s)) {
2607 *s = Py_UNICODE_TOUPPER(*s);
2608 status = 1;
2609 }
2610 s++;
2611 }
2612
2613 return status;
2614}
2615
2616static
2617int fixcapitalize(PyUnicodeObject *self)
2618{
2619 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2620 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2621 return 1;
2622 }
2623 return 0;
2624}
2625
2626static
2627int fixtitle(PyUnicodeObject *self)
2628{
2629 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2630 register Py_UNICODE *e;
2631 int previous_is_cased;
2632
2633 /* Shortcut for single character strings */
2634 if (PyUnicode_GET_SIZE(self) == 1) {
2635 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2636 if (*p != ch) {
2637 *p = ch;
2638 return 1;
2639 }
2640 else
2641 return 0;
2642 }
2643
2644 e = p + PyUnicode_GET_SIZE(self);
2645 previous_is_cased = 0;
2646 for (; p < e; p++) {
2647 register const Py_UNICODE ch = *p;
2648
2649 if (previous_is_cased)
2650 *p = Py_UNICODE_TOLOWER(ch);
2651 else
2652 *p = Py_UNICODE_TOTITLE(ch);
2653
2654 if (Py_UNICODE_ISLOWER(ch) ||
2655 Py_UNICODE_ISUPPER(ch) ||
2656 Py_UNICODE_ISTITLE(ch))
2657 previous_is_cased = 1;
2658 else
2659 previous_is_cased = 0;
2660 }
2661 return 1;
2662}
2663
2664PyObject *PyUnicode_Join(PyObject *separator,
2665 PyObject *seq)
2666{
2667 Py_UNICODE *sep;
2668 int seplen;
2669 PyUnicodeObject *res = NULL;
2670 int reslen = 0;
2671 Py_UNICODE *p;
2672 int seqlen = 0;
2673 int sz = 100;
2674 int i;
2675
Jeremy Hylton03657cf2000-07-12 13:05:33 +00002676 seqlen = PySequence_Size(seq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002677 if (seqlen < 0 && PyErr_Occurred())
2678 return NULL;
2679
2680 if (separator == NULL) {
2681 Py_UNICODE blank = ' ';
2682 sep = &blank;
2683 seplen = 1;
2684 }
2685 else {
2686 separator = PyUnicode_FromObject(separator);
2687 if (separator == NULL)
2688 return NULL;
2689 sep = PyUnicode_AS_UNICODE(separator);
2690 seplen = PyUnicode_GET_SIZE(separator);
2691 }
2692
2693 res = _PyUnicode_New(sz);
2694 if (res == NULL)
2695 goto onError;
2696 p = PyUnicode_AS_UNICODE(res);
2697 reslen = 0;
2698
2699 for (i = 0; i < seqlen; i++) {
2700 int itemlen;
2701 PyObject *item;
2702
2703 item = PySequence_GetItem(seq, i);
2704 if (item == NULL)
2705 goto onError;
2706 if (!PyUnicode_Check(item)) {
2707 PyObject *v;
2708 v = PyUnicode_FromObject(item);
2709 Py_DECREF(item);
2710 item = v;
2711 if (item == NULL)
2712 goto onError;
2713 }
2714 itemlen = PyUnicode_GET_SIZE(item);
2715 while (reslen + itemlen + seplen >= sz) {
2716 if (_PyUnicode_Resize(res, sz*2))
2717 goto onError;
2718 sz *= 2;
2719 p = PyUnicode_AS_UNICODE(res) + reslen;
2720 }
2721 if (i > 0) {
2722 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2723 p += seplen;
2724 reslen += seplen;
2725 }
2726 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2727 p += itemlen;
2728 reslen += itemlen;
2729 Py_DECREF(item);
2730 }
2731 if (_PyUnicode_Resize(res, reslen))
2732 goto onError;
2733
2734 Py_XDECREF(separator);
2735 return (PyObject *)res;
2736
2737 onError:
2738 Py_XDECREF(separator);
2739 Py_DECREF(res);
2740 return NULL;
2741}
2742
2743static
2744PyUnicodeObject *pad(PyUnicodeObject *self,
2745 int left,
2746 int right,
2747 Py_UNICODE fill)
2748{
2749 PyUnicodeObject *u;
2750
2751 if (left < 0)
2752 left = 0;
2753 if (right < 0)
2754 right = 0;
2755
2756 if (left == 0 && right == 0) {
2757 Py_INCREF(self);
2758 return self;
2759 }
2760
2761 u = _PyUnicode_New(left + self->length + right);
2762 if (u) {
2763 if (left)
2764 Py_UNICODE_FILL(u->str, fill, left);
2765 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2766 if (right)
2767 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2768 }
2769
2770 return u;
2771}
2772
/* Append data[left:right] as a new Unicode object to the list being
   built.

   The expansion relies on three names of the calling function: the
   PyObject* variables str and list and the label onError, which is
   jumped to when the slice cannot be created or appended. Wrapped in
   do { } while (0) so that it behaves as one statement; write it with
   a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        int split_append_failed;                                        \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        split_append_failed = PyList_Append(list, str);                 \
        /* the list holds its own reference if the append worked */     \
        Py_DECREF(str);                                                 \
        if (split_append_failed)                                        \
            goto onError;                                               \
    } while (0)
2783
2784static
2785PyObject *split_whitespace(PyUnicodeObject *self,
2786 PyObject *list,
2787 int maxcount)
2788{
2789 register int i;
2790 register int j;
2791 int len = self->length;
2792 PyObject *str;
2793
2794 for (i = j = 0; i < len; ) {
2795 /* find a token */
2796 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2797 i++;
2798 j = i;
2799 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2800 i++;
2801 if (j < i) {
2802 if (maxcount-- <= 0)
2803 break;
2804 SPLIT_APPEND(self->str, j, i);
2805 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2806 i++;
2807 j = i;
2808 }
2809 }
2810 if (j < len) {
2811 SPLIT_APPEND(self->str, j, len);
2812 }
2813 return list;
2814
2815 onError:
2816 Py_DECREF(list);
2817 return NULL;
2818}
2819
2820PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002821 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002822{
2823 register int i;
2824 register int j;
2825 int len;
2826 PyObject *list;
2827 PyObject *str;
2828 Py_UNICODE *data;
2829
2830 string = PyUnicode_FromObject(string);
2831 if (string == NULL)
2832 return NULL;
2833 data = PyUnicode_AS_UNICODE(string);
2834 len = PyUnicode_GET_SIZE(string);
2835
Guido van Rossumd57fd912000-03-10 22:53:23 +00002836 list = PyList_New(0);
2837 if (!list)
2838 goto onError;
2839
2840 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002841 int eol;
2842
Guido van Rossumd57fd912000-03-10 22:53:23 +00002843 /* Find a line and append it */
2844 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2845 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002846
2847 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002848 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002849 if (i < len) {
2850 if (data[i] == '\r' && i + 1 < len &&
2851 data[i+1] == '\n')
2852 i += 2;
2853 else
2854 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002855 if (keepends)
2856 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002857 }
Guido van Rossum86662912000-04-11 15:38:46 +00002858 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002859 j = i;
2860 }
2861 if (j < len) {
2862 SPLIT_APPEND(data, j, len);
2863 }
2864
2865 Py_DECREF(string);
2866 return list;
2867
2868 onError:
2869 Py_DECREF(list);
2870 Py_DECREF(string);
2871 return NULL;
2872}
2873
2874static
2875PyObject *split_char(PyUnicodeObject *self,
2876 PyObject *list,
2877 Py_UNICODE ch,
2878 int maxcount)
2879{
2880 register int i;
2881 register int j;
2882 int len = self->length;
2883 PyObject *str;
2884
2885 for (i = j = 0; i < len; ) {
2886 if (self->str[i] == ch) {
2887 if (maxcount-- <= 0)
2888 break;
2889 SPLIT_APPEND(self->str, j, i);
2890 i = j = i + 1;
2891 } else
2892 i++;
2893 }
2894 if (j <= len) {
2895 SPLIT_APPEND(self->str, j, len);
2896 }
2897 return list;
2898
2899 onError:
2900 Py_DECREF(list);
2901 return NULL;
2902}
2903
2904static
2905PyObject *split_substring(PyUnicodeObject *self,
2906 PyObject *list,
2907 PyUnicodeObject *substring,
2908 int maxcount)
2909{
2910 register int i;
2911 register int j;
2912 int len = self->length;
2913 int sublen = substring->length;
2914 PyObject *str;
2915
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00002916 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002917 if (Py_UNICODE_MATCH(self, i, substring)) {
2918 if (maxcount-- <= 0)
2919 break;
2920 SPLIT_APPEND(self->str, j, i);
2921 i = j = i + sublen;
2922 } else
2923 i++;
2924 }
2925 if (j <= len) {
2926 SPLIT_APPEND(self->str, j, len);
2927 }
2928 return list;
2929
2930 onError:
2931 Py_DECREF(list);
2932 return NULL;
2933}
2934
2935#undef SPLIT_APPEND
2936
2937static
2938PyObject *split(PyUnicodeObject *self,
2939 PyUnicodeObject *substring,
2940 int maxcount)
2941{
2942 PyObject *list;
2943
2944 if (maxcount < 0)
2945 maxcount = INT_MAX;
2946
2947 list = PyList_New(0);
2948 if (!list)
2949 return NULL;
2950
2951 if (substring == NULL)
2952 return split_whitespace(self,list,maxcount);
2953
2954 else if (substring->length == 1)
2955 return split_char(self,list,substring->str[0],maxcount);
2956
2957 else if (substring->length == 0) {
2958 Py_DECREF(list);
2959 PyErr_SetString(PyExc_ValueError, "empty separator");
2960 return NULL;
2961 }
2962 else
2963 return split_substring(self,list,substring,maxcount);
2964}
2965
2966static
2967PyObject *strip(PyUnicodeObject *self,
2968 int left,
2969 int right)
2970{
2971 Py_UNICODE *p = self->str;
2972 int start = 0;
2973 int end = self->length;
2974
2975 if (left)
2976 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2977 start++;
2978
2979 if (right)
2980 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2981 end--;
2982
2983 if (start == 0 && end == self->length) {
2984 /* couldn't strip anything off, return original string */
2985 Py_INCREF(self);
2986 return (PyObject*) self;
2987 }
2988
2989 return (PyObject*) PyUnicode_FromUnicode(
2990 self->str + start,
2991 end - start
2992 );
2993}
2994
2995static
2996PyObject *replace(PyUnicodeObject *self,
2997 PyUnicodeObject *str1,
2998 PyUnicodeObject *str2,
2999 int maxcount)
3000{
3001 PyUnicodeObject *u;
3002
3003 if (maxcount < 0)
3004 maxcount = INT_MAX;
3005
3006 if (str1->length == 1 && str2->length == 1) {
3007 int i;
3008
3009 /* replace characters */
3010 if (!findchar(self->str, self->length, str1->str[0])) {
3011 /* nothing to replace, return original string */
3012 Py_INCREF(self);
3013 u = self;
3014 } else {
3015 Py_UNICODE u1 = str1->str[0];
3016 Py_UNICODE u2 = str2->str[0];
3017
3018 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3019 self->str,
3020 self->length
3021 );
3022 if (u)
3023 for (i = 0; i < u->length; i++)
3024 if (u->str[i] == u1) {
3025 if (--maxcount < 0)
3026 break;
3027 u->str[i] = u2;
3028 }
3029 }
3030
3031 } else {
3032 int n, i;
3033 Py_UNICODE *p;
3034
3035 /* replace strings */
3036 n = count(self, 0, self->length, str1);
3037 if (n > maxcount)
3038 n = maxcount;
3039 if (n == 0) {
3040 /* nothing to replace, return original string */
3041 Py_INCREF(self);
3042 u = self;
3043 } else {
3044 u = _PyUnicode_New(
3045 self->length + n * (str2->length - str1->length));
3046 if (u) {
3047 i = 0;
3048 p = u->str;
3049 while (i <= self->length - str1->length)
3050 if (Py_UNICODE_MATCH(self, i, str1)) {
3051 /* replace string segment */
3052 Py_UNICODE_COPY(p, str2->str, str2->length);
3053 p += str2->length;
3054 i += str1->length;
3055 if (--n <= 0) {
3056 /* copy remaining part */
3057 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3058 break;
3059 }
3060 } else
3061 *p++ = self->str[i++];
3062 }
3063 }
3064 }
3065
3066 return (PyObject *) u;
3067}
3068
3069/* --- Unicode Object Methods --------------------------------------------- */
3070
3071static char title__doc__[] =
3072"S.title() -> unicode\n\
3073\n\
3074Return a titlecased version of S, i.e. words start with title case\n\
3075characters, all remaining cased characters have lower case.";
3076
3077static PyObject*
3078unicode_title(PyUnicodeObject *self, PyObject *args)
3079{
3080 if (!PyArg_NoArgs(args))
3081 return NULL;
3082 return fixup(self, fixtitle);
3083}
3084
3085static char capitalize__doc__[] =
3086"S.capitalize() -> unicode\n\
3087\n\
3088Return a capitalized version of S, i.e. make the first character\n\
3089have upper case.";
3090
3091static PyObject*
3092unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3093{
3094 if (!PyArg_NoArgs(args))
3095 return NULL;
3096 return fixup(self, fixcapitalize);
3097}
3098
#if 0
static char capwords__doc__[] =
    "S.capwords() -> unicode\n"
    "\n"
    "Apply .capitalize() to all words in S and return the result with\n"
    "normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method wrapper for S.capwords() (currently compiled out): split S
   at whitespace, capitalize every word and join the words again with
   single spaces. */
static PyObject *
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *joined;
    int idx;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, swapping the list entry for the result */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old = PyList_GET_ITEM(words, idx);
        PyObject *fixed = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (fixed == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old);
        PyList_SET_ITEM(words, idx, fixed);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
3139
3140static char center__doc__[] =
3141"S.center(width) -> unicode\n\
3142\n\
3143Return S centered in a Unicode string of length width. Padding is done\n\
3144using spaces.";
3145
3146static PyObject *
3147unicode_center(PyUnicodeObject *self, PyObject *args)
3148{
3149 int marg, left;
3150 int width;
3151
3152 if (!PyArg_ParseTuple(args, "i:center", &width))
3153 return NULL;
3154
3155 if (self->length >= width) {
3156 Py_INCREF(self);
3157 return (PyObject*) self;
3158 }
3159
3160 marg = width - self->length;
3161 left = marg / 2 + (marg & width & 1);
3162
3163 return (PyObject*) pad(self, left, marg - left, ' ');
3164}
3165
Marc-André Lemburge5034372000-08-08 08:04:29 +00003166#if 0
3167
3168/* This code should go into some future Unicode collation support
3169 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003170 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003171
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003172/* speedy UTF-16 code point order comparison */
3173/* gleaned from: */
3174/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3175
/* Adjustment table indexed by the top five bits (value >> 11) of a
   code unit.  Slot 27 (values 0xD800..0xDFFF) is moved up by 0x2000 and
   slots 28..31 (values 0xE000..0xFFFF) are moved down by 0x800; every
   other slot leaves the value unchanged. */
static short utf16Fixup[32] =
{
    /*  0.. 3 */ 0, 0, 0, 0,
    /*  4.. 7 */ 0, 0, 0, 0,
    /*  8..11 */ 0, 0, 0, 0,
    /* 12..15 */ 0, 0, 0, 0,
    /* 16..19 */ 0, 0, 0, 0,
    /* 20..23 */ 0, 0, 0, 0,
    /* 24..27 */ 0, 0, 0, 0x2000,
    /* 28..31 */ -0x800, -0x800, -0x800, -0x800
};
3183
Guido van Rossumd57fd912000-03-10 22:53:23 +00003184static int
3185unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3186{
3187 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003188
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189 Py_UNICODE *s1 = str1->str;
3190 Py_UNICODE *s2 = str2->str;
3191
3192 len1 = str1->length;
3193 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003194
Guido van Rossumd57fd912000-03-10 22:53:23 +00003195 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003196 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003197 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003198
3199 c1 = *s1++;
3200 c2 = *s2++;
3201 if (c1 > (1<<11) * 26)
3202 c1 += utf16Fixup[c1>>11];
3203 if (c2 > (1<<11) * 26)
3204 c2 += utf16Fixup[c2>>11];
3205
3206 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003207 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003208 if (diff)
3209 return (diff < 0) ? -1 : (diff != 0);
3210 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211 }
3212
3213 return (len1 < len2) ? -1 : (len1 != len2);
3214}
3215
Marc-André Lemburge5034372000-08-08 08:04:29 +00003216#else
3217
3218static int
3219unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3220{
3221 register int len1, len2;
3222
3223 Py_UNICODE *s1 = str1->str;
3224 Py_UNICODE *s2 = str2->str;
3225
3226 len1 = str1->length;
3227 len2 = str2->length;
3228
3229 while (len1 > 0 && len2 > 0) {
3230 register long diff;
3231
3232 diff = (long)*s1++ - (long)*s2++;
3233 if (diff)
3234 return (diff < 0) ? -1 : (diff != 0);
3235 len1--; len2--;
3236 }
3237
3238 return (len1 < len2) ? -1 : (len1 != len2);
3239}
3240
3241#endif
3242
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243int PyUnicode_Compare(PyObject *left,
3244 PyObject *right)
3245{
3246 PyUnicodeObject *u = NULL, *v = NULL;
3247 int result;
3248
3249 /* Coerce the two arguments */
3250 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3251 if (u == NULL)
3252 goto onError;
3253 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3254 if (v == NULL)
3255 goto onError;
3256
Thomas Wouters7e474022000-07-16 12:04:32 +00003257 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003258 if (v == u) {
3259 Py_DECREF(u);
3260 Py_DECREF(v);
3261 return 0;
3262 }
3263
3264 result = unicode_compare(u, v);
3265
3266 Py_DECREF(u);
3267 Py_DECREF(v);
3268 return result;
3269
3270onError:
3271 Py_XDECREF(u);
3272 Py_XDECREF(v);
3273 return -1;
3274}
3275
Guido van Rossum403d68b2000-03-13 15:55:09 +00003276int PyUnicode_Contains(PyObject *container,
3277 PyObject *element)
3278{
3279 PyUnicodeObject *u = NULL, *v = NULL;
3280 int result;
3281 register const Py_UNICODE *p, *e;
3282 register Py_UNICODE ch;
3283
3284 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003285 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003286 if (v == NULL) {
3287 PyErr_SetString(PyExc_TypeError,
3288 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003289 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003290 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003291 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3292 if (u == NULL) {
3293 Py_DECREF(v);
3294 goto onError;
3295 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003296
3297 /* Check v in u */
3298 if (PyUnicode_GET_SIZE(v) != 1) {
3299 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003300 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003301 goto onError;
3302 }
3303 ch = *PyUnicode_AS_UNICODE(v);
3304 p = PyUnicode_AS_UNICODE(u);
3305 e = p + PyUnicode_GET_SIZE(u);
3306 result = 0;
3307 while (p < e) {
3308 if (*p++ == ch) {
3309 result = 1;
3310 break;
3311 }
3312 }
3313
3314 Py_DECREF(u);
3315 Py_DECREF(v);
3316 return result;
3317
3318onError:
3319 Py_XDECREF(u);
3320 Py_XDECREF(v);
3321 return -1;
3322}
3323
Guido van Rossumd57fd912000-03-10 22:53:23 +00003324/* Concat to string or Unicode object giving a new Unicode object. */
3325
3326PyObject *PyUnicode_Concat(PyObject *left,
3327 PyObject *right)
3328{
3329 PyUnicodeObject *u = NULL, *v = NULL, *w;
3330
3331 /* Coerce the two arguments */
3332 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3333 if (u == NULL)
3334 goto onError;
3335 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3336 if (v == NULL)
3337 goto onError;
3338
3339 /* Shortcuts */
3340 if (v == unicode_empty) {
3341 Py_DECREF(v);
3342 return (PyObject *)u;
3343 }
3344 if (u == unicode_empty) {
3345 Py_DECREF(u);
3346 return (PyObject *)v;
3347 }
3348
3349 /* Concat the two Unicode strings */
3350 w = _PyUnicode_New(u->length + v->length);
3351 if (w == NULL)
3352 goto onError;
3353 Py_UNICODE_COPY(w->str, u->str, u->length);
3354 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3355
3356 Py_DECREF(u);
3357 Py_DECREF(v);
3358 return (PyObject *)w;
3359
3360onError:
3361 Py_XDECREF(u);
3362 Py_XDECREF(v);
3363 return NULL;
3364}
3365
3366static char count__doc__[] =
3367"S.count(sub[, start[, end]]) -> int\n\
3368\n\
3369Return the number of occurrences of substring sub in Unicode string\n\
3370S[start:end]. Optional arguments start and end are\n\
3371interpreted as in slice notation.";
3372
3373static PyObject *
3374unicode_count(PyUnicodeObject *self, PyObject *args)
3375{
3376 PyUnicodeObject *substring;
3377 int start = 0;
3378 int end = INT_MAX;
3379 PyObject *result;
3380
Guido van Rossumb8872e62000-05-09 14:14:27 +00003381 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3382 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003383 return NULL;
3384
3385 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3386 (PyObject *)substring);
3387 if (substring == NULL)
3388 return NULL;
3389
Guido van Rossumd57fd912000-03-10 22:53:23 +00003390 if (start < 0)
3391 start += self->length;
3392 if (start < 0)
3393 start = 0;
3394 if (end > self->length)
3395 end = self->length;
3396 if (end < 0)
3397 end += self->length;
3398 if (end < 0)
3399 end = 0;
3400
3401 result = PyInt_FromLong((long) count(self, start, end, substring));
3402
3403 Py_DECREF(substring);
3404 return result;
3405}
3406
3407static char encode__doc__[] =
3408"S.encode([encoding[,errors]]) -> string\n\
3409\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003410Return an encoded string version of S. Default encoding is the current\n\
3411default string encoding. errors may be given to set a different error\n\
3412handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3413a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003414
3415static PyObject *
3416unicode_encode(PyUnicodeObject *self, PyObject *args)
3417{
3418 char *encoding = NULL;
3419 char *errors = NULL;
3420 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3421 return NULL;
3422 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3423}
3424
3425static char expandtabs__doc__[] =
3426"S.expandtabs([tabsize]) -> unicode\n\
3427\n\
3428Return a copy of S where all tab characters are expanded using spaces.\n\
3429If tabsize is not given, a tab size of 8 characters is assumed.";
3430
3431static PyObject*
3432unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3433{
3434 Py_UNICODE *e;
3435 Py_UNICODE *p;
3436 Py_UNICODE *q;
3437 int i, j;
3438 PyUnicodeObject *u;
3439 int tabsize = 8;
3440
3441 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3442 return NULL;
3443
Thomas Wouters7e474022000-07-16 12:04:32 +00003444 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003445 i = j = 0;
3446 e = self->str + self->length;
3447 for (p = self->str; p < e; p++)
3448 if (*p == '\t') {
3449 if (tabsize > 0)
3450 j += tabsize - (j % tabsize);
3451 }
3452 else {
3453 j++;
3454 if (*p == '\n' || *p == '\r') {
3455 i += j;
3456 j = 0;
3457 }
3458 }
3459
3460 /* Second pass: create output string and fill it */
3461 u = _PyUnicode_New(i + j);
3462 if (!u)
3463 return NULL;
3464
3465 j = 0;
3466 q = u->str;
3467
3468 for (p = self->str; p < e; p++)
3469 if (*p == '\t') {
3470 if (tabsize > 0) {
3471 i = tabsize - (j % tabsize);
3472 j += i;
3473 while (i--)
3474 *q++ = ' ';
3475 }
3476 }
3477 else {
3478 j++;
3479 *q++ = *p;
3480 if (*p == '\n' || *p == '\r')
3481 j = 0;
3482 }
3483
3484 return (PyObject*) u;
3485}
3486
3487static char find__doc__[] =
3488"S.find(sub [,start [,end]]) -> int\n\
3489\n\
3490Return the lowest index in S where substring sub is found,\n\
3491such that sub is contained within s[start,end]. Optional\n\
3492arguments start and end are interpreted as in slice notation.\n\
3493\n\
3494Return -1 on failure.";
3495
3496static PyObject *
3497unicode_find(PyUnicodeObject *self, PyObject *args)
3498{
3499 PyUnicodeObject *substring;
3500 int start = 0;
3501 int end = INT_MAX;
3502 PyObject *result;
3503
Guido van Rossumb8872e62000-05-09 14:14:27 +00003504 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3505 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003506 return NULL;
3507 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3508 (PyObject *)substring);
3509 if (substring == NULL)
3510 return NULL;
3511
3512 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3513
3514 Py_DECREF(substring);
3515 return result;
3516}
3517
3518static PyObject *
3519unicode_getitem(PyUnicodeObject *self, int index)
3520{
3521 if (index < 0 || index >= self->length) {
3522 PyErr_SetString(PyExc_IndexError, "string index out of range");
3523 return NULL;
3524 }
3525
3526 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3527}
3528
3529static long
3530unicode_hash(PyUnicodeObject *self)
3531{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003532 /* Since Unicode objects compare equal to their ASCII string
3533 counterparts, they should use the individual character values
3534 as basis for their hash value. This is needed to assure that
3535 strings and Unicode objects behave in the same way as
3536 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003537
Fredrik Lundhdde61642000-07-10 18:27:47 +00003538 register int len;
3539 register Py_UNICODE *p;
3540 register long x;
3541
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542 if (self->hash != -1)
3543 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003544 len = PyUnicode_GET_SIZE(self);
3545 p = PyUnicode_AS_UNICODE(self);
3546 x = *p << 7;
3547 while (--len >= 0)
3548 x = (1000003*x) ^ *p++;
3549 x ^= PyUnicode_GET_SIZE(self);
3550 if (x == -1)
3551 x = -2;
3552 self->hash = x;
3553 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003554}
3555
3556static char index__doc__[] =
3557"S.index(sub [,start [,end]]) -> int\n\
3558\n\
3559Like S.find() but raise ValueError when the substring is not found.";
3560
3561static PyObject *
3562unicode_index(PyUnicodeObject *self, PyObject *args)
3563{
3564 int result;
3565 PyUnicodeObject *substring;
3566 int start = 0;
3567 int end = INT_MAX;
3568
Guido van Rossumb8872e62000-05-09 14:14:27 +00003569 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3570 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003571 return NULL;
3572
3573 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3574 (PyObject *)substring);
3575 if (substring == NULL)
3576 return NULL;
3577
3578 result = findstring(self, substring, start, end, 1);
3579
3580 Py_DECREF(substring);
3581 if (result < 0) {
3582 PyErr_SetString(PyExc_ValueError, "substring not found");
3583 return NULL;
3584 }
3585 return PyInt_FromLong(result);
3586}
3587
3588static char islower__doc__[] =
3589"S.islower() -> int\n\
3590\n\
3591Return 1 if all cased characters in S are lowercase and there is\n\
3592at least one cased character in S, 0 otherwise.";
3593
3594static PyObject*
3595unicode_islower(PyUnicodeObject *self, PyObject *args)
3596{
3597 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3598 register const Py_UNICODE *e;
3599 int cased;
3600
3601 if (!PyArg_NoArgs(args))
3602 return NULL;
3603
3604 /* Shortcut for single character strings */
3605 if (PyUnicode_GET_SIZE(self) == 1)
3606 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3607
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003608 /* Special case for empty strings */
3609 if (PyString_GET_SIZE(self) == 0)
3610 return PyInt_FromLong(0);
3611
Guido van Rossumd57fd912000-03-10 22:53:23 +00003612 e = p + PyUnicode_GET_SIZE(self);
3613 cased = 0;
3614 for (; p < e; p++) {
3615 register const Py_UNICODE ch = *p;
3616
3617 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3618 return PyInt_FromLong(0);
3619 else if (!cased && Py_UNICODE_ISLOWER(ch))
3620 cased = 1;
3621 }
3622 return PyInt_FromLong(cased);
3623}
3624
3625static char isupper__doc__[] =
3626"S.isupper() -> int\n\
3627\n\
3628Return 1 if all cased characters in S are uppercase and there is\n\
3629at least one cased character in S, 0 otherwise.";
3630
3631static PyObject*
3632unicode_isupper(PyUnicodeObject *self, PyObject *args)
3633{
3634 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3635 register const Py_UNICODE *e;
3636 int cased;
3637
3638 if (!PyArg_NoArgs(args))
3639 return NULL;
3640
3641 /* Shortcut for single character strings */
3642 if (PyUnicode_GET_SIZE(self) == 1)
3643 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3644
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003645 /* Special case for empty strings */
3646 if (PyString_GET_SIZE(self) == 0)
3647 return PyInt_FromLong(0);
3648
Guido van Rossumd57fd912000-03-10 22:53:23 +00003649 e = p + PyUnicode_GET_SIZE(self);
3650 cased = 0;
3651 for (; p < e; p++) {
3652 register const Py_UNICODE ch = *p;
3653
3654 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3655 return PyInt_FromLong(0);
3656 else if (!cased && Py_UNICODE_ISUPPER(ch))
3657 cased = 1;
3658 }
3659 return PyInt_FromLong(cased);
3660}
3661
3662static char istitle__doc__[] =
3663"S.istitle() -> int\n\
3664\n\
3665Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3666may only follow uncased characters and lowercase characters only cased\n\
3667ones. Return 0 otherwise.";
3668
3669static PyObject*
3670unicode_istitle(PyUnicodeObject *self, PyObject *args)
3671{
3672 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3673 register const Py_UNICODE *e;
3674 int cased, previous_is_cased;
3675
3676 if (!PyArg_NoArgs(args))
3677 return NULL;
3678
3679 /* Shortcut for single character strings */
3680 if (PyUnicode_GET_SIZE(self) == 1)
3681 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3682 (Py_UNICODE_ISUPPER(*p) != 0));
3683
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003684 /* Special case for empty strings */
3685 if (PyString_GET_SIZE(self) == 0)
3686 return PyInt_FromLong(0);
3687
Guido van Rossumd57fd912000-03-10 22:53:23 +00003688 e = p + PyUnicode_GET_SIZE(self);
3689 cased = 0;
3690 previous_is_cased = 0;
3691 for (; p < e; p++) {
3692 register const Py_UNICODE ch = *p;
3693
3694 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3695 if (previous_is_cased)
3696 return PyInt_FromLong(0);
3697 previous_is_cased = 1;
3698 cased = 1;
3699 }
3700 else if (Py_UNICODE_ISLOWER(ch)) {
3701 if (!previous_is_cased)
3702 return PyInt_FromLong(0);
3703 previous_is_cased = 1;
3704 cased = 1;
3705 }
3706 else
3707 previous_is_cased = 0;
3708 }
3709 return PyInt_FromLong(cased);
3710}
3711
3712static char isspace__doc__[] =
3713"S.isspace() -> int\n\
3714\n\
3715Return 1 if there are only whitespace characters in S,\n\
37160 otherwise.";
3717
3718static PyObject*
3719unicode_isspace(PyUnicodeObject *self, PyObject *args)
3720{
3721 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3722 register const Py_UNICODE *e;
3723
3724 if (!PyArg_NoArgs(args))
3725 return NULL;
3726
3727 /* Shortcut for single character strings */
3728 if (PyUnicode_GET_SIZE(self) == 1 &&
3729 Py_UNICODE_ISSPACE(*p))
3730 return PyInt_FromLong(1);
3731
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003732 /* Special case for empty strings */
3733 if (PyString_GET_SIZE(self) == 0)
3734 return PyInt_FromLong(0);
3735
Guido van Rossumd57fd912000-03-10 22:53:23 +00003736 e = p + PyUnicode_GET_SIZE(self);
3737 for (; p < e; p++) {
3738 if (!Py_UNICODE_ISSPACE(*p))
3739 return PyInt_FromLong(0);
3740 }
3741 return PyInt_FromLong(1);
3742}
3743
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003744static char isalpha__doc__[] =
3745"S.isalpha() -> int\n\
3746\n\
3747Return 1 if all characters in S are alphabetic\n\
3748and there is at least one character in S, 0 otherwise.";
3749
3750static PyObject*
3751unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3752{
3753 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3754 register const Py_UNICODE *e;
3755
3756 if (!PyArg_NoArgs(args))
3757 return NULL;
3758
3759 /* Shortcut for single character strings */
3760 if (PyUnicode_GET_SIZE(self) == 1 &&
3761 Py_UNICODE_ISALPHA(*p))
3762 return PyInt_FromLong(1);
3763
3764 /* Special case for empty strings */
3765 if (PyString_GET_SIZE(self) == 0)
3766 return PyInt_FromLong(0);
3767
3768 e = p + PyUnicode_GET_SIZE(self);
3769 for (; p < e; p++) {
3770 if (!Py_UNICODE_ISALPHA(*p))
3771 return PyInt_FromLong(0);
3772 }
3773 return PyInt_FromLong(1);
3774}
3775
3776static char isalnum__doc__[] =
3777"S.isalnum() -> int\n\
3778\n\
3779Return 1 if all characters in S are alphanumeric\n\
3780and there is at least one character in S, 0 otherwise.";
3781
3782static PyObject*
3783unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3784{
3785 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3786 register const Py_UNICODE *e;
3787
3788 if (!PyArg_NoArgs(args))
3789 return NULL;
3790
3791 /* Shortcut for single character strings */
3792 if (PyUnicode_GET_SIZE(self) == 1 &&
3793 Py_UNICODE_ISALNUM(*p))
3794 return PyInt_FromLong(1);
3795
3796 /* Special case for empty strings */
3797 if (PyString_GET_SIZE(self) == 0)
3798 return PyInt_FromLong(0);
3799
3800 e = p + PyUnicode_GET_SIZE(self);
3801 for (; p < e; p++) {
3802 if (!Py_UNICODE_ISALNUM(*p))
3803 return PyInt_FromLong(0);
3804 }
3805 return PyInt_FromLong(1);
3806}
3807
Guido van Rossumd57fd912000-03-10 22:53:23 +00003808static char isdecimal__doc__[] =
3809"S.isdecimal() -> int\n\
3810\n\
3811Return 1 if there are only decimal characters in S,\n\
38120 otherwise.";
3813
3814static PyObject*
3815unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3816{
3817 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3818 register const Py_UNICODE *e;
3819
3820 if (!PyArg_NoArgs(args))
3821 return NULL;
3822
3823 /* Shortcut for single character strings */
3824 if (PyUnicode_GET_SIZE(self) == 1 &&
3825 Py_UNICODE_ISDECIMAL(*p))
3826 return PyInt_FromLong(1);
3827
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003828 /* Special case for empty strings */
3829 if (PyString_GET_SIZE(self) == 0)
3830 return PyInt_FromLong(0);
3831
Guido van Rossumd57fd912000-03-10 22:53:23 +00003832 e = p + PyUnicode_GET_SIZE(self);
3833 for (; p < e; p++) {
3834 if (!Py_UNICODE_ISDECIMAL(*p))
3835 return PyInt_FromLong(0);
3836 }
3837 return PyInt_FromLong(1);
3838}
3839
3840static char isdigit__doc__[] =
3841"S.isdigit() -> int\n\
3842\n\
3843Return 1 if there are only digit characters in S,\n\
38440 otherwise.";
3845
3846static PyObject*
3847unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3848{
3849 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3850 register const Py_UNICODE *e;
3851
3852 if (!PyArg_NoArgs(args))
3853 return NULL;
3854
3855 /* Shortcut for single character strings */
3856 if (PyUnicode_GET_SIZE(self) == 1 &&
3857 Py_UNICODE_ISDIGIT(*p))
3858 return PyInt_FromLong(1);
3859
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003860 /* Special case for empty strings */
3861 if (PyString_GET_SIZE(self) == 0)
3862 return PyInt_FromLong(0);
3863
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864 e = p + PyUnicode_GET_SIZE(self);
3865 for (; p < e; p++) {
3866 if (!Py_UNICODE_ISDIGIT(*p))
3867 return PyInt_FromLong(0);
3868 }
3869 return PyInt_FromLong(1);
3870}
3871
3872static char isnumeric__doc__[] =
3873"S.isnumeric() -> int\n\
3874\n\
3875Return 1 if there are only numeric characters in S,\n\
38760 otherwise.";
3877
3878static PyObject*
3879unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3880{
3881 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3882 register const Py_UNICODE *e;
3883
3884 if (!PyArg_NoArgs(args))
3885 return NULL;
3886
3887 /* Shortcut for single character strings */
3888 if (PyUnicode_GET_SIZE(self) == 1 &&
3889 Py_UNICODE_ISNUMERIC(*p))
3890 return PyInt_FromLong(1);
3891
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003892 /* Special case for empty strings */
3893 if (PyString_GET_SIZE(self) == 0)
3894 return PyInt_FromLong(0);
3895
Guido van Rossumd57fd912000-03-10 22:53:23 +00003896 e = p + PyUnicode_GET_SIZE(self);
3897 for (; p < e; p++) {
3898 if (!Py_UNICODE_ISNUMERIC(*p))
3899 return PyInt_FromLong(0);
3900 }
3901 return PyInt_FromLong(1);
3902}
3903
3904static char join__doc__[] =
3905"S.join(sequence) -> unicode\n\
3906\n\
3907Return a string which is the concatenation of the strings in the\n\
3908sequence. The separator between elements is S.";
3909
3910static PyObject*
3911unicode_join(PyUnicodeObject *self, PyObject *args)
3912{
3913 PyObject *data;
3914 if (!PyArg_ParseTuple(args, "O:join", &data))
3915 return NULL;
3916
3917 return PyUnicode_Join((PyObject *)self, data);
3918}
3919
3920static int
3921unicode_length(PyUnicodeObject *self)
3922{
3923 return self->length;
3924}
3925
3926static char ljust__doc__[] =
3927"S.ljust(width) -> unicode\n\
3928\n\
3929Return S left justified in a Unicode string of length width. Padding is\n\
3930done using spaces.";
3931
3932static PyObject *
3933unicode_ljust(PyUnicodeObject *self, PyObject *args)
3934{
3935 int width;
3936 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3937 return NULL;
3938
3939 if (self->length >= width) {
3940 Py_INCREF(self);
3941 return (PyObject*) self;
3942 }
3943
3944 return (PyObject*) pad(self, 0, width - self->length, ' ');
3945}
3946
3947static char lower__doc__[] =
3948"S.lower() -> unicode\n\
3949\n\
3950Return a copy of the string S converted to lowercase.";
3951
3952static PyObject*
3953unicode_lower(PyUnicodeObject *self, PyObject *args)
3954{
3955 if (!PyArg_NoArgs(args))
3956 return NULL;
3957 return fixup(self, fixlower);
3958}
3959
3960static char lstrip__doc__[] =
3961"S.lstrip() -> unicode\n\
3962\n\
3963Return a copy of the string S with leading whitespace removed.";
3964
3965static PyObject *
3966unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3967{
3968 if (!PyArg_NoArgs(args))
3969 return NULL;
3970 return strip(self, 1, 0);
3971}
3972
3973static PyObject*
3974unicode_repeat(PyUnicodeObject *str, int len)
3975{
3976 PyUnicodeObject *u;
3977 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00003978 int nchars;
3979 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003980
3981 if (len < 0)
3982 len = 0;
3983
3984 if (len == 1) {
3985 /* no repeat, return original string */
3986 Py_INCREF(str);
3987 return (PyObject*) str;
3988 }
Tim Peters8f422462000-09-09 06:13:41 +00003989
3990 /* ensure # of chars needed doesn't overflow int and # of bytes
3991 * needed doesn't overflow size_t
3992 */
3993 nchars = len * str->length;
3994 if (len && nchars / len != str->length) {
3995 PyErr_SetString(PyExc_OverflowError,
3996 "repeated string is too long");
3997 return NULL;
3998 }
3999 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4000 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4001 PyErr_SetString(PyExc_OverflowError,
4002 "repeated string is too long");
4003 return NULL;
4004 }
4005 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004006 if (!u)
4007 return NULL;
4008
4009 p = u->str;
4010
4011 while (len-- > 0) {
4012 Py_UNICODE_COPY(p, str->str, str->length);
4013 p += str->length;
4014 }
4015
4016 return (PyObject*) u;
4017}
4018
4019PyObject *PyUnicode_Replace(PyObject *obj,
4020 PyObject *subobj,
4021 PyObject *replobj,
4022 int maxcount)
4023{
4024 PyObject *self;
4025 PyObject *str1;
4026 PyObject *str2;
4027 PyObject *result;
4028
4029 self = PyUnicode_FromObject(obj);
4030 if (self == NULL)
4031 return NULL;
4032 str1 = PyUnicode_FromObject(subobj);
4033 if (str1 == NULL) {
4034 Py_DECREF(self);
4035 return NULL;
4036 }
4037 str2 = PyUnicode_FromObject(replobj);
4038 if (str2 == NULL) {
4039 Py_DECREF(self);
4040 Py_DECREF(str1);
4041 return NULL;
4042 }
4043 result = replace((PyUnicodeObject *)self,
4044 (PyUnicodeObject *)str1,
4045 (PyUnicodeObject *)str2,
4046 maxcount);
4047 Py_DECREF(self);
4048 Py_DECREF(str1);
4049 Py_DECREF(str2);
4050 return result;
4051}
4052
4053static char replace__doc__[] =
4054"S.replace (old, new[, maxsplit]) -> unicode\n\
4055\n\
4056Return a copy of S with all occurrences of substring\n\
4057old replaced by new. If the optional argument maxsplit is\n\
4058given, only the first maxsplit occurrences are replaced.";
4059
4060static PyObject*
4061unicode_replace(PyUnicodeObject *self, PyObject *args)
4062{
4063 PyUnicodeObject *str1;
4064 PyUnicodeObject *str2;
4065 int maxcount = -1;
4066 PyObject *result;
4067
4068 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4069 return NULL;
4070 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4071 if (str1 == NULL)
4072 return NULL;
4073 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4074 if (str2 == NULL)
4075 return NULL;
4076
4077 result = replace(self, str1, str2, maxcount);
4078
4079 Py_DECREF(str1);
4080 Py_DECREF(str2);
4081 return result;
4082}
4083
4084static
4085PyObject *unicode_repr(PyObject *unicode)
4086{
4087 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4088 PyUnicode_GET_SIZE(unicode),
4089 1);
4090}
4091
4092static char rfind__doc__[] =
4093"S.rfind(sub [,start [,end]]) -> int\n\
4094\n\
4095Return the highest index in S where substring sub is found,\n\
4096such that sub is contained within s[start,end]. Optional\n\
4097arguments start and end are interpreted as in slice notation.\n\
4098\n\
4099Return -1 on failure.";
4100
4101static PyObject *
4102unicode_rfind(PyUnicodeObject *self, PyObject *args)
4103{
4104 PyUnicodeObject *substring;
4105 int start = 0;
4106 int end = INT_MAX;
4107 PyObject *result;
4108
Guido van Rossumb8872e62000-05-09 14:14:27 +00004109 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4110 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004111 return NULL;
4112 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4113 (PyObject *)substring);
4114 if (substring == NULL)
4115 return NULL;
4116
4117 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4118
4119 Py_DECREF(substring);
4120 return result;
4121}
4122
4123static char rindex__doc__[] =
4124"S.rindex(sub [,start [,end]]) -> int\n\
4125\n\
4126Like S.rfind() but raise ValueError when the substring is not found.";
4127
4128static PyObject *
4129unicode_rindex(PyUnicodeObject *self, PyObject *args)
4130{
4131 int result;
4132 PyUnicodeObject *substring;
4133 int start = 0;
4134 int end = INT_MAX;
4135
Guido van Rossumb8872e62000-05-09 14:14:27 +00004136 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4137 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004138 return NULL;
4139 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4140 (PyObject *)substring);
4141 if (substring == NULL)
4142 return NULL;
4143
4144 result = findstring(self, substring, start, end, -1);
4145
4146 Py_DECREF(substring);
4147 if (result < 0) {
4148 PyErr_SetString(PyExc_ValueError, "substring not found");
4149 return NULL;
4150 }
4151 return PyInt_FromLong(result);
4152}
4153
4154static char rjust__doc__[] =
4155"S.rjust(width) -> unicode\n\
4156\n\
4157Return S right justified in a Unicode string of length width. Padding is\n\
4158done using spaces.";
4159
4160static PyObject *
4161unicode_rjust(PyUnicodeObject *self, PyObject *args)
4162{
4163 int width;
4164 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4165 return NULL;
4166
4167 if (self->length >= width) {
4168 Py_INCREF(self);
4169 return (PyObject*) self;
4170 }
4171
4172 return (PyObject*) pad(self, width - self->length, 0, ' ');
4173}
4174
4175static char rstrip__doc__[] =
4176"S.rstrip() -> unicode\n\
4177\n\
4178Return a copy of the string S with trailing whitespace removed.";
4179
4180static PyObject *
4181unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4182{
4183 if (!PyArg_NoArgs(args))
4184 return NULL;
4185 return strip(self, 0, 1);
4186}
4187
4188static PyObject*
4189unicode_slice(PyUnicodeObject *self, int start, int end)
4190{
4191 /* standard clamping */
4192 if (start < 0)
4193 start = 0;
4194 if (end < 0)
4195 end = 0;
4196 if (end > self->length)
4197 end = self->length;
4198 if (start == 0 && end == self->length) {
4199 /* full slice, return original string */
4200 Py_INCREF(self);
4201 return (PyObject*) self;
4202 }
4203 if (start > end)
4204 start = end;
4205 /* copy slice */
4206 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4207 end - start);
4208}
4209
4210PyObject *PyUnicode_Split(PyObject *s,
4211 PyObject *sep,
4212 int maxsplit)
4213{
4214 PyObject *result;
4215
4216 s = PyUnicode_FromObject(s);
4217 if (s == NULL)
4218 return NULL;
4219 if (sep != NULL) {
4220 sep = PyUnicode_FromObject(sep);
4221 if (sep == NULL) {
4222 Py_DECREF(s);
4223 return NULL;
4224 }
4225 }
4226
4227 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4228
4229 Py_DECREF(s);
4230 Py_XDECREF(sep);
4231 return result;
4232}
4233
4234static char split__doc__[] =
4235"S.split([sep [,maxsplit]]) -> list of strings\n\
4236\n\
4237Return a list of the words in S, using sep as the\n\
4238delimiter string. If maxsplit is given, at most maxsplit\n\
4239splits are done. If sep is not specified, any whitespace string\n\
4240is a separator.";
4241
4242static PyObject*
4243unicode_split(PyUnicodeObject *self, PyObject *args)
4244{
4245 PyObject *substring = Py_None;
4246 int maxcount = -1;
4247
4248 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4249 return NULL;
4250
4251 if (substring == Py_None)
4252 return split(self, NULL, maxcount);
4253 else if (PyUnicode_Check(substring))
4254 return split(self, (PyUnicodeObject *)substring, maxcount);
4255 else
4256 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4257}
4258
4259static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004260"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004261\n\
4262Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004263Line breaks are not included in the resulting list unless keepends\n\
4264is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004265
4266static PyObject*
4267unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4268{
Guido van Rossum86662912000-04-11 15:38:46 +00004269 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004270
Guido van Rossum86662912000-04-11 15:38:46 +00004271 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004272 return NULL;
4273
Guido van Rossum86662912000-04-11 15:38:46 +00004274 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004275}
4276
4277static
4278PyObject *unicode_str(PyUnicodeObject *self)
4279{
Fred Drakee4315f52000-05-09 19:53:39 +00004280 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004281}
4282
4283static char strip__doc__[] =
4284"S.strip() -> unicode\n\
4285\n\
4286Return a copy of S with leading and trailing whitespace removed.";
4287
4288static PyObject *
4289unicode_strip(PyUnicodeObject *self, PyObject *args)
4290{
4291 if (!PyArg_NoArgs(args))
4292 return NULL;
4293 return strip(self, 1, 1);
4294}
4295
4296static char swapcase__doc__[] =
4297"S.swapcase() -> unicode\n\
4298\n\
4299Return a copy of S with uppercase characters converted to lowercase\n\
4300and vice versa.";
4301
4302static PyObject*
4303unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4304{
4305 if (!PyArg_NoArgs(args))
4306 return NULL;
4307 return fixup(self, fixswapcase);
4308}
4309
4310static char translate__doc__[] =
4311"S.translate(table) -> unicode\n\
4312\n\
4313Return a copy of the string S, where all characters have been mapped\n\
4314through the given translation table, which must be a mapping of\n\
4315Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4316are left untouched. Characters mapped to None are deleted.";
4317
4318static PyObject*
4319unicode_translate(PyUnicodeObject *self, PyObject *args)
4320{
4321 PyObject *table;
4322
4323 if (!PyArg_ParseTuple(args, "O:translate", &table))
4324 return NULL;
4325 return PyUnicode_TranslateCharmap(self->str,
4326 self->length,
4327 table,
4328 "ignore");
4329}
4330
4331static char upper__doc__[] =
4332"S.upper() -> unicode\n\
4333\n\
4334Return a copy of S converted to uppercase.";
4335
4336static PyObject*
4337unicode_upper(PyUnicodeObject *self, PyObject *args)
4338{
4339 if (!PyArg_NoArgs(args))
4340 return NULL;
4341 return fixup(self, fixupper);
4342}
4343
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method implementation of S.zfill(width) (currently compiled out).

   If self is already at least width characters long, self is returned
   with a new reference.  Otherwise pad() is asked for a copy with
   (width - length) '0' characters on the left; a leading '+' or '-' of
   the original text is then moved in front of the padding.

   Returns NULL if argument parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() failed; do not touch u->str below. */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original text. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4379
#if 0
/* Debugging aid (currently compiled out): return the value of
   unicode_freelist_size as a Python integer.  Takes no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
4389
4390static char startswith__doc__[] =
4391"S.startswith(prefix[, start[, end]]) -> int\n\
4392\n\
4393Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4394optional start, test S beginning at that position. With optional end, stop\n\
4395comparing S at that position.";
4396
4397static PyObject *
4398unicode_startswith(PyUnicodeObject *self,
4399 PyObject *args)
4400{
4401 PyUnicodeObject *substring;
4402 int start = 0;
4403 int end = INT_MAX;
4404 PyObject *result;
4405
Guido van Rossumb8872e62000-05-09 14:14:27 +00004406 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4407 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004408 return NULL;
4409 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4410 (PyObject *)substring);
4411 if (substring == NULL)
4412 return NULL;
4413
4414 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4415
4416 Py_DECREF(substring);
4417 return result;
4418}
4419
4420
4421static char endswith__doc__[] =
4422"S.endswith(suffix[, start[, end]]) -> int\n\
4423\n\
4424Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4425optional start, test S beginning at that position. With optional end, stop\n\
4426comparing S at that position.";
4427
4428static PyObject *
4429unicode_endswith(PyUnicodeObject *self,
4430 PyObject *args)
4431{
4432 PyUnicodeObject *substring;
4433 int start = 0;
4434 int end = INT_MAX;
4435 PyObject *result;
4436
Guido van Rossumb8872e62000-05-09 14:14:27 +00004437 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4438 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004439 return NULL;
4440 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4441 (PyObject *)substring);
4442 if (substring == NULL)
4443 return NULL;
4444
4445 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4446
4447 Py_DECREF(substring);
4448 return result;
4449}
4450
4451
/* Method table for unicode objects; unicode_getattr() hands it to
   Py_FindMethod() for attribute lookup.

   Each entry is {name, C function, flags, docstring}.  Among the
   handlers defined near this table, those registered with flag 1 parse
   their arguments with PyArg_ParseTuple() and those registered with
   flag 0 use PyArg_NoArgs(); presumably the same holds for the handlers
   defined elsewhere in the file -- verify before changing a flag.

   The table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
#if 0
    /* Disabled methods; unicode_zfill is itself compiled out. */
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    {NULL, NULL}
};
4504
4505static PyObject *
4506unicode_getattr(PyUnicodeObject *self, char *name)
4507{
4508 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4509}
4510
/* Sequence protocol slots for unicode objects.  The two assignment
   slots are left empty (0); every other slot is wired to a function. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, 		/* sq_length */
    (binaryfunc) PyUnicode_Concat, 	/* sq_concat */
    (intargfunc) unicode_repeat, 	/* sq_repeat */
    (intargfunc) unicode_getitem, 	/* sq_item */
    (intintargfunc) unicode_slice, 	/* sq_slice */
    0, 					/* sq_ass_item */
    0, 					/* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, 	/*sq_contains*/
};
4521
4522static int
4523unicode_buffer_getreadbuf(PyUnicodeObject *self,
4524 int index,
4525 const void **ptr)
4526{
4527 if (index != 0) {
4528 PyErr_SetString(PyExc_SystemError,
4529 "accessing non-existent unicode segment");
4530 return -1;
4531 }
4532 *ptr = (void *) self->str;
4533 return PyUnicode_GET_DATA_SIZE(self);
4534}
4535
4536static int
4537unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4538 const void **ptr)
4539{
4540 PyErr_SetString(PyExc_TypeError,
4541 "cannot use unicode as modifyable buffer");
4542 return -1;
4543}
4544
4545static int
4546unicode_buffer_getsegcount(PyUnicodeObject *self,
4547 int *lenp)
4548{
4549 if (lenp)
4550 *lenp = PyUnicode_GET_DATA_SIZE(self);
4551 return 1;
4552}
4553
4554static int
4555unicode_buffer_getcharbuf(PyUnicodeObject *self,
4556 int index,
4557 const void **ptr)
4558{
4559 PyObject *str;
4560
4561 if (index != 0) {
4562 PyErr_SetString(PyExc_SystemError,
4563 "accessing non-existent unicode segment");
4564 return -1;
4565 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004566 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567 if (str == NULL)
4568 return -1;
4569 *ptr = (void *) PyString_AS_STRING(str);
4570 return PyString_GET_SIZE(str);
4571}
4572
4573/* Helpers for PyUnicode_Format() */
4574
4575static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004576getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004577{
4578 int argidx = *p_argidx;
4579 if (argidx < arglen) {
4580 (*p_argidx)++;
4581 if (arglen < 0)
4582 return args;
4583 else
4584 return PyTuple_GetItem(args, argidx);
4585 }
4586 PyErr_SetString(PyExc_TypeError,
4587 "not enough arguments for format string");
4588 return NULL;
4589}
4590
/* Bit flags describing the flag characters of a format specifier;
   they are OR-ed together into a single int. */
#define F_LJUST (1<<0)
#define F_SIGN	(1<<1)
#define F_BLANK (1<<2)
#define F_ALT	(1<<3)
#define F_ZERO	(1<<4)
4596
4597static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004598int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599{
4600 register int i;
4601 int len;
4602 va_list va;
4603 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004604 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004605
4606 /* First, format the string as char array, then expand to Py_UNICODE
4607 array. */
4608 charbuffer = (char *)buffer;
4609 len = vsprintf(charbuffer, format, va);
4610 for (i = len - 1; i >= 0; i--)
4611 buffer[i] = (Py_UNICODE) charbuffer[i];
4612
4613 va_end(va);
4614 return len;
4615}
4616
4617static int
4618formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004619 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004620 int flags,
4621 int prec,
4622 int type,
4623 PyObject *v)
4624{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004625 /* fmt = '%#.' + `prec` + `type`
4626 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004627 char fmt[20];
4628 double x;
4629
4630 x = PyFloat_AsDouble(v);
4631 if (x == -1.0 && PyErr_Occurred())
4632 return -1;
4633 if (prec < 0)
4634 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004635 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4636 type = 'g';
4637 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004638 /* worst case length calc to ensure no buffer overrun:
4639 fmt = %#.<prec>g
4640 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4641 for any double rep.)
4642 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4643 If prec=0 the effective precision is 1 (the leading digit is
4644 always given), therefore increase by one to 10+prec. */
4645 if (buflen <= (size_t)10 + (size_t)prec) {
4646 PyErr_SetString(PyExc_OverflowError,
4647 "formatted float is too long (precision too long?)");
4648 return -1;
4649 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004650 return usprintf(buf, fmt, x);
4651}
4652
Tim Peters38fd5b62000-09-21 05:43:11 +00004653static PyObject*
4654formatlong(PyObject *val, int flags, int prec, int type)
4655{
4656 char *buf;
4657 int i, len;
4658 PyObject *str; /* temporary string object. */
4659 PyUnicodeObject *result;
4660
4661 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4662 if (!str)
4663 return NULL;
4664 result = _PyUnicode_New(len);
4665 for (i = 0; i < len; i++)
4666 result->str[i] = buf[i];
4667 result->str[len] = 0;
4668 Py_DECREF(str);
4669 return (PyObject*)result;
4670}
4671
Guido van Rossumd57fd912000-03-10 22:53:23 +00004672static int
4673formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004674 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004675 int flags,
4676 int prec,
4677 int type,
4678 PyObject *v)
4679{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004680 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004681 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4682 + 1 + 1 = 24*/
4683 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004684 long x;
4685
4686 x = PyInt_AsLong(v);
4687 if (x == -1 && PyErr_Occurred())
4688 return -1;
4689 if (prec < 0)
4690 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004691 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4692 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4693 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4694 PyErr_SetString(PyExc_OverflowError,
4695 "formatted integer is too long (precision too long?)");
4696 return -1;
4697 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4699 return usprintf(buf, fmt, x);
4700}
4701
4702static int
4703formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004704 size_t buflen,
4705 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004706{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004707 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004708 if (PyUnicode_Check(v)) {
4709 if (PyUnicode_GET_SIZE(v) != 1)
4710 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004712 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004713
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004714 else if (PyString_Check(v)) {
4715 if (PyString_GET_SIZE(v) != 1)
4716 goto onError;
4717 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4718 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004719
4720 else {
4721 /* Integer input truncated to a character */
4722 long x;
4723 x = PyInt_AsLong(v);
4724 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004725 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004726 buf[0] = (char) x;
4727 }
4728 buf[1] = '\0';
4729 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004730
4731 onError:
4732 PyErr_SetString(PyExc_TypeError,
4733 "%c requires int or char");
4734 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735}
4736
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The length is counted in Py_UNICODE elements, not bytes: the buffer
   is declared as an array of Py_UNICODE of this size.
*/
#define FORMATBUFLEN (size_t)120
4746
Guido van Rossumd57fd912000-03-10 22:53:23 +00004747PyObject *PyUnicode_Format(PyObject *format,
4748 PyObject *args)
4749{
4750 Py_UNICODE *fmt, *res;
4751 int fmtcnt, rescnt, reslen, arglen, argidx;
4752 int args_owned = 0;
4753 PyUnicodeObject *result = NULL;
4754 PyObject *dict = NULL;
4755 PyObject *uformat;
4756
4757 if (format == NULL || args == NULL) {
4758 PyErr_BadInternalCall();
4759 return NULL;
4760 }
4761 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004762 if (uformat == NULL)
4763 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004764 fmt = PyUnicode_AS_UNICODE(uformat);
4765 fmtcnt = PyUnicode_GET_SIZE(uformat);
4766
4767 reslen = rescnt = fmtcnt + 100;
4768 result = _PyUnicode_New(reslen);
4769 if (result == NULL)
4770 goto onError;
4771 res = PyUnicode_AS_UNICODE(result);
4772
4773 if (PyTuple_Check(args)) {
4774 arglen = PyTuple_Size(args);
4775 argidx = 0;
4776 }
4777 else {
4778 arglen = -1;
4779 argidx = -2;
4780 }
4781 if (args->ob_type->tp_as_mapping)
4782 dict = args;
4783
4784 while (--fmtcnt >= 0) {
4785 if (*fmt != '%') {
4786 if (--rescnt < 0) {
4787 rescnt = fmtcnt + 100;
4788 reslen += rescnt;
4789 if (_PyUnicode_Resize(result, reslen) < 0)
4790 return NULL;
4791 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4792 --rescnt;
4793 }
4794 *res++ = *fmt++;
4795 }
4796 else {
4797 /* Got a format specifier */
4798 int flags = 0;
4799 int width = -1;
4800 int prec = -1;
4801 int size = 0;
4802 Py_UNICODE c = '\0';
4803 Py_UNICODE fill;
4804 PyObject *v = NULL;
4805 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004806 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004807 Py_UNICODE sign;
4808 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004809 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004810
4811 fmt++;
4812 if (*fmt == '(') {
4813 Py_UNICODE *keystart;
4814 int keylen;
4815 PyObject *key;
4816 int pcount = 1;
4817
4818 if (dict == NULL) {
4819 PyErr_SetString(PyExc_TypeError,
4820 "format requires a mapping");
4821 goto onError;
4822 }
4823 ++fmt;
4824 --fmtcnt;
4825 keystart = fmt;
4826 /* Skip over balanced parentheses */
4827 while (pcount > 0 && --fmtcnt >= 0) {
4828 if (*fmt == ')')
4829 --pcount;
4830 else if (*fmt == '(')
4831 ++pcount;
4832 fmt++;
4833 }
4834 keylen = fmt - keystart - 1;
4835 if (fmtcnt < 0 || pcount > 0) {
4836 PyErr_SetString(PyExc_ValueError,
4837 "incomplete format key");
4838 goto onError;
4839 }
Fred Drakee4315f52000-05-09 19:53:39 +00004840 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004841 then looked up since Python uses strings to hold
4842 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004843 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004844 key = PyUnicode_EncodeUTF8(keystart,
4845 keylen,
4846 NULL);
4847 if (key == NULL)
4848 goto onError;
4849 if (args_owned) {
4850 Py_DECREF(args);
4851 args_owned = 0;
4852 }
4853 args = PyObject_GetItem(dict, key);
4854 Py_DECREF(key);
4855 if (args == NULL) {
4856 goto onError;
4857 }
4858 args_owned = 1;
4859 arglen = -1;
4860 argidx = -2;
4861 }
4862 while (--fmtcnt >= 0) {
4863 switch (c = *fmt++) {
4864 case '-': flags |= F_LJUST; continue;
4865 case '+': flags |= F_SIGN; continue;
4866 case ' ': flags |= F_BLANK; continue;
4867 case '#': flags |= F_ALT; continue;
4868 case '0': flags |= F_ZERO; continue;
4869 }
4870 break;
4871 }
4872 if (c == '*') {
4873 v = getnextarg(args, arglen, &argidx);
4874 if (v == NULL)
4875 goto onError;
4876 if (!PyInt_Check(v)) {
4877 PyErr_SetString(PyExc_TypeError,
4878 "* wants int");
4879 goto onError;
4880 }
4881 width = PyInt_AsLong(v);
4882 if (width < 0) {
4883 flags |= F_LJUST;
4884 width = -width;
4885 }
4886 if (--fmtcnt >= 0)
4887 c = *fmt++;
4888 }
4889 else if (c >= '0' && c <= '9') {
4890 width = c - '0';
4891 while (--fmtcnt >= 0) {
4892 c = *fmt++;
4893 if (c < '0' || c > '9')
4894 break;
4895 if ((width*10) / 10 != width) {
4896 PyErr_SetString(PyExc_ValueError,
4897 "width too big");
4898 goto onError;
4899 }
4900 width = width*10 + (c - '0');
4901 }
4902 }
4903 if (c == '.') {
4904 prec = 0;
4905 if (--fmtcnt >= 0)
4906 c = *fmt++;
4907 if (c == '*') {
4908 v = getnextarg(args, arglen, &argidx);
4909 if (v == NULL)
4910 goto onError;
4911 if (!PyInt_Check(v)) {
4912 PyErr_SetString(PyExc_TypeError,
4913 "* wants int");
4914 goto onError;
4915 }
4916 prec = PyInt_AsLong(v);
4917 if (prec < 0)
4918 prec = 0;
4919 if (--fmtcnt >= 0)
4920 c = *fmt++;
4921 }
4922 else if (c >= '0' && c <= '9') {
4923 prec = c - '0';
4924 while (--fmtcnt >= 0) {
4925 c = Py_CHARMASK(*fmt++);
4926 if (c < '0' || c > '9')
4927 break;
4928 if ((prec*10) / 10 != prec) {
4929 PyErr_SetString(PyExc_ValueError,
4930 "prec too big");
4931 goto onError;
4932 }
4933 prec = prec*10 + (c - '0');
4934 }
4935 }
4936 } /* prec */
4937 if (fmtcnt >= 0) {
4938 if (c == 'h' || c == 'l' || c == 'L') {
4939 size = c;
4940 if (--fmtcnt >= 0)
4941 c = *fmt++;
4942 }
4943 }
4944 if (fmtcnt < 0) {
4945 PyErr_SetString(PyExc_ValueError,
4946 "incomplete format");
4947 goto onError;
4948 }
4949 if (c != '%') {
4950 v = getnextarg(args, arglen, &argidx);
4951 if (v == NULL)
4952 goto onError;
4953 }
4954 sign = 0;
4955 fill = ' ';
4956 switch (c) {
4957
4958 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004959 pbuf = formatbuf;
4960 /* presume that buffer length is at least 1 */
4961 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004962 len = 1;
4963 break;
4964
4965 case 's':
4966 case 'r':
4967 if (PyUnicode_Check(v) && c == 's') {
4968 temp = v;
4969 Py_INCREF(temp);
4970 }
4971 else {
4972 PyObject *unicode;
4973 if (c == 's')
4974 temp = PyObject_Str(v);
4975 else
4976 temp = PyObject_Repr(v);
4977 if (temp == NULL)
4978 goto onError;
4979 if (!PyString_Check(temp)) {
4980 /* XXX Note: this should never happen, since
4981 PyObject_Repr() and PyObject_Str() assure
4982 this */
4983 Py_DECREF(temp);
4984 PyErr_SetString(PyExc_TypeError,
4985 "%s argument has non-string str()");
4986 goto onError;
4987 }
Fred Drakee4315f52000-05-09 19:53:39 +00004988 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00004989 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00004990 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004991 "strict");
4992 Py_DECREF(temp);
4993 temp = unicode;
4994 if (temp == NULL)
4995 goto onError;
4996 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004997 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004998 len = PyUnicode_GET_SIZE(temp);
4999 if (prec >= 0 && len > prec)
5000 len = prec;
5001 break;
5002
5003 case 'i':
5004 case 'd':
5005 case 'u':
5006 case 'o':
5007 case 'x':
5008 case 'X':
5009 if (c == 'i')
5010 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005011 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005012 temp = formatlong(v, flags, prec, c);
5013 if (!temp)
5014 goto onError;
5015 pbuf = PyUnicode_AS_UNICODE(temp);
5016 len = PyUnicode_GET_SIZE(temp);
5017 /* unbounded ints can always produce
5018 a sign character! */
5019 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005021 else {
5022 pbuf = formatbuf;
5023 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5024 flags, prec, c, v);
5025 if (len < 0)
5026 goto onError;
5027 /* only d conversion is signed */
5028 sign = c == 'd';
5029 }
5030 if (flags & F_ZERO)
5031 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005032 break;
5033
5034 case 'e':
5035 case 'E':
5036 case 'f':
5037 case 'g':
5038 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005039 pbuf = formatbuf;
5040 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5041 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042 if (len < 0)
5043 goto onError;
5044 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005045 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005046 fill = '0';
5047 break;
5048
5049 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005050 pbuf = formatbuf;
5051 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005052 if (len < 0)
5053 goto onError;
5054 break;
5055
5056 default:
5057 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005058 "unsupported format character '%c' (0x%x) "
5059 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005060 (31<=c && c<=126) ? c : '?',
5061 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005062 goto onError;
5063 }
5064 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005065 if (*pbuf == '-' || *pbuf == '+') {
5066 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005067 len--;
5068 }
5069 else if (flags & F_SIGN)
5070 sign = '+';
5071 else if (flags & F_BLANK)
5072 sign = ' ';
5073 else
5074 sign = 0;
5075 }
5076 if (width < len)
5077 width = len;
5078 if (rescnt < width + (sign != 0)) {
5079 reslen -= rescnt;
5080 rescnt = width + fmtcnt + 100;
5081 reslen += rescnt;
5082 if (_PyUnicode_Resize(result, reslen) < 0)
5083 return NULL;
5084 res = PyUnicode_AS_UNICODE(result)
5085 + reslen - rescnt;
5086 }
5087 if (sign) {
5088 if (fill != ' ')
5089 *res++ = sign;
5090 rescnt--;
5091 if (width > len)
5092 width--;
5093 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005094 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5095 assert(pbuf[0] == '0');
5096 assert(pbuf[1] == c);
5097 if (fill != ' ') {
5098 *res++ = *pbuf++;
5099 *res++ = *pbuf++;
5100 }
5101 rescnt -= 2;
5102 width -= 2;
5103 if (width < 0)
5104 width = 0;
5105 len -= 2;
5106 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005107 if (width > len && !(flags & F_LJUST)) {
5108 do {
5109 --rescnt;
5110 *res++ = fill;
5111 } while (--width > len);
5112 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005113 if (fill == ' ') {
5114 if (sign)
5115 *res++ = sign;
5116 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5117 assert(pbuf[0] == '0');
5118 assert(pbuf[1] == c);
5119 *res++ = *pbuf++;
5120 *res++ = *pbuf++;
5121 }
5122 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005123 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005124 res += len;
5125 rescnt -= len;
5126 while (--width >= len) {
5127 --rescnt;
5128 *res++ = ' ';
5129 }
5130 if (dict && (argidx < arglen) && c != '%') {
5131 PyErr_SetString(PyExc_TypeError,
5132 "not all arguments converted");
5133 goto onError;
5134 }
5135 Py_XDECREF(temp);
5136 } /* '%' */
5137 } /* until end */
5138 if (argidx < arglen && !dict) {
5139 PyErr_SetString(PyExc_TypeError,
5140 "not all arguments converted");
5141 goto onError;
5142 }
5143
5144 if (args_owned) {
5145 Py_DECREF(args);
5146 }
5147 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005148 if (_PyUnicode_Resize(result, reslen - rescnt))
5149 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005150 return (PyObject *)result;
5151
5152 onError:
5153 Py_XDECREF(result);
5154 Py_DECREF(uformat);
5155 if (args_owned) {
5156 Py_DECREF(args);
5157 }
5158 return NULL;
5159}
5160
/* Buffer protocol slots for unicode objects, in the order read buffer,
   write buffer, segment count, character buffer. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
5167
/* Type object for unicode strings.  Slots that are 0 or NULL are left
   unimplemented; the sequence and buffer protocols are provided through
   unicode_as_sequence and unicode_as_buffer. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0, 					/* ob_size */
    "unicode", 				/* tp_name */
    sizeof(PyUnicodeObject), 		/* tp_size */
    0, 					/* tp_itemsize */
    /* Slots */
    (destructor)_PyUnicode_Free, 	/* tp_dealloc */
    0, 					/* tp_print */
    (getattrfunc)unicode_getattr, 	/* tp_getattr */
    0, 					/* tp_setattr */
    (cmpfunc) unicode_compare, 		/* tp_compare */
    (reprfunc) unicode_repr, 		/* tp_repr */
    0, 					/* tp_as_number */
    &unicode_as_sequence, 		/* tp_as_sequence */
    0, 					/* tp_as_mapping */
    (hashfunc) unicode_hash, 		/* tp_hash*/
    0, 					/* tp_call*/
    (reprfunc) unicode_str,	 	/* tp_str */
    (getattrofunc) NULL, 		/* tp_getattro */
    (setattrofunc) NULL, 		/* tp_setattro */
    &unicode_as_buffer,			/* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,			/* tp_flags */
};
5192
5193/* Initialize the Unicode implementation */
5194
Thomas Wouters78890102000-07-22 19:25:51 +00005195void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005196{
5197 /* Doublecheck the configuration... */
5198 if (sizeof(Py_UNICODE) != 2)
5199 Py_FatalError("Unicode configuration error: "
5200 "sizeof(Py_UNICODE) != 2 bytes");
5201
Fred Drakee4315f52000-05-09 19:53:39 +00005202 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005203 unicode_freelist = NULL;
5204 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005206 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005207}
5208
5209/* Finalize the Unicode implementation */
5210
5211void
Thomas Wouters78890102000-07-22 19:25:51 +00005212_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005213{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005214 PyUnicodeObject *u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005215
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005216 Py_XDECREF(unicode_empty);
5217 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005218
5219 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005220 PyUnicodeObject *v = u;
5221 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005222 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005223 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005224 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005225 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005226 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005227 unicode_freelist = NULL;
5228 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005229}