blob: c237789a79edfef77e4ecb224b24e7781dc2f263 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
Guido van Rossumd57fd912000-03-10 22:53:23 +000048/* Limit for the Unicode object free list */
49
50#define MAX_UNICODE_FREELIST_SIZE 1024
51
52/* Limit for the Unicode object free list stay alive optimization.
53
54 The implementation will keep allocated Unicode memory intact for
55 all objects on the free list having a size less than this
56 limit. This reduces malloc() overhead for small Unicode objects.
57
Barry Warsaw51ac5802000-03-20 16:36:48 +000058 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000059 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000060 malloc()-overhead) bytes of unused garbage.
61
62 Setting the limit to 0 effectively turns the feature off.
63
Guido van Rossumfd4b9572000-04-10 13:51:10 +000064 Note: This is an experimental feature ! If you get core dumps when
65 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000066
67*/
68
Guido van Rossumfd4b9572000-04-10 13:51:10 +000069#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000070
71/* Endianness switches; defaults to little endian */
72
73#ifdef WORDS_BIGENDIAN
74# define BYTEORDER_IS_BIG_ENDIAN
75#else
76# define BYTEORDER_IS_LITTLE_ENDIAN
77#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
86/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +000088
89/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000090static PyUnicodeObject *unicode_freelist;
91static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
Fred Drakee4315f52000-05-09 19:53:39 +000093/* Default encoding to use and assume when NULL is passed as encoding
94 parameter; it is initialized by _PyUnicode_Init().
95
96 Always use the PyUnicode_SetDefaultEncoding() and
97 PyUnicode_GetDefaultEncoding() APIs to access this global.
98
99*/
100
101static char unicode_default_encoding[100];
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* --- Unicode Object ----------------------------------------------------- */
104
105static
106int _PyUnicode_Resize(register PyUnicodeObject *unicode,
107 int length)
108{
109 void *oldstr;
110
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000111 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000112 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000113 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000114
115 /* Resizing unicode_empty is not allowed. */
116 if (unicode == unicode_empty) {
117 PyErr_SetString(PyExc_SystemError,
118 "can't resize empty unicode object");
119 return -1;
120 }
121
122 /* We allocate one more byte to make sure the string is
123 Ux0000 terminated -- XXX is this needed ? */
124 oldstr = unicode->str;
125 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
126 if (!unicode->str) {
127 unicode->str = oldstr;
128 PyErr_NoMemory();
129 return -1;
130 }
131 unicode->str[length] = 0;
132 unicode->length = length;
133
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000134 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000135 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000136 if (unicode->defenc) {
137 Py_DECREF(unicode->defenc);
138 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 }
140 unicode->hash = -1;
141
142 return 0;
143}
144
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000145int PyUnicode_Resize(PyObject **unicode,
146 int length)
147{
148 PyUnicodeObject *v;
149
150 if (unicode == NULL) {
151 PyErr_BadInternalCall();
152 return -1;
153 }
154 v = (PyUnicodeObject *)*unicode;
155 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
156 PyErr_BadInternalCall();
157 return -1;
158 }
159 return _PyUnicode_Resize(v, length);
160}
161
/* One extra Py_UNICODE element is allocated to make sure the string is
   Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
169
170static
171PyUnicodeObject *_PyUnicode_New(int length)
172{
173 register PyUnicodeObject *unicode;
174
175 /* Optimization for empty strings */
176 if (length == 0 && unicode_empty != NULL) {
177 Py_INCREF(unicode_empty);
178 return unicode_empty;
179 }
180
181 /* Unicode freelist & memory allocation */
182 if (unicode_freelist) {
183 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000184 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000186 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000187 /* Keep-Alive optimization: we only upsize the buffer,
188 never downsize it. */
189 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000191 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000192 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 }
194 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000195 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000197 }
198 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 }
200 else {
201 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
202 if (unicode == NULL)
203 return NULL;
204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
205 }
206
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000207 if (!unicode->str) {
208 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000209 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 unicode->str[length] = 0;
212 unicode->length = length;
213 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000215 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000216
217 onError:
218 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000219 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000220 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221}
222
223static
224void _PyUnicode_Free(register PyUnicodeObject *unicode)
225{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000227 /* Keep-Alive optimization */
228 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000229 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000230 unicode->str = NULL;
231 unicode->length = 0;
232 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000233 if (unicode->defenc) {
234 Py_DECREF(unicode->defenc);
235 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000236 }
237 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238 *(PyUnicodeObject **)unicode = unicode_freelist;
239 unicode_freelist = unicode;
240 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 }
242 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000243 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000244 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000245 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 }
247}
248
249PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
250 int size)
251{
252 PyUnicodeObject *unicode;
253
254 unicode = _PyUnicode_New(size);
255 if (!unicode)
256 return NULL;
257
258 /* Copy the Unicode data into the new object */
259 if (u != NULL)
260 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
261
262 return (PyObject *)unicode;
263}
264
265#ifdef HAVE_WCHAR_H
266
267PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
268 int size)
269{
270 PyUnicodeObject *unicode;
271
272 if (w == NULL) {
273 PyErr_BadInternalCall();
274 return NULL;
275 }
276
277 unicode = _PyUnicode_New(size);
278 if (!unicode)
279 return NULL;
280
281 /* Copy the wchar_t data into the new object */
282#ifdef HAVE_USABLE_WCHAR_T
283 memcpy(unicode->str, w, size * sizeof(wchar_t));
284#else
285 {
286 register Py_UNICODE *u;
287 register int i;
288 u = PyUnicode_AS_UNICODE(unicode);
289 for (i = size; i >= 0; i--)
290 *u++ = *w++;
291 }
292#endif
293
294 return (PyObject *)unicode;
295}
296
297int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
298 register wchar_t *w,
299 int size)
300{
301 if (unicode == NULL) {
302 PyErr_BadInternalCall();
303 return -1;
304 }
305 if (size > PyUnicode_GET_SIZE(unicode))
306 size = PyUnicode_GET_SIZE(unicode);
307#ifdef HAVE_USABLE_WCHAR_T
308 memcpy(w, unicode->str, size * sizeof(wchar_t));
309#else
310 {
311 register Py_UNICODE *u;
312 register int i;
313 u = PyUnicode_AS_UNICODE(unicode);
314 for (i = size; i >= 0; i--)
315 *w++ = *u++;
316 }
317#endif
318
319 return size;
320}
321
322#endif
323
324PyObject *PyUnicode_FromObject(register PyObject *obj)
325{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000326 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
327}
328
329PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
330 const char *encoding,
331 const char *errors)
332{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 const char *s;
334 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000335 int owned = 0;
336 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337
338 if (obj == NULL) {
339 PyErr_BadInternalCall();
340 return NULL;
341 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000342
343 /* Coerce object */
344 if (PyInstance_Check(obj)) {
345 PyObject *func;
346 func = PyObject_GetAttrString(obj, "__str__");
347 if (func == NULL) {
348 PyErr_SetString(PyExc_TypeError,
349 "coercing to Unicode: instance doesn't define __str__");
350 return NULL;
351 }
352 obj = PyEval_CallObject(func, NULL);
353 Py_DECREF(func);
354 if (obj == NULL)
355 return NULL;
356 owned = 1;
357 }
358 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000360 v = obj;
361 if (encoding) {
362 PyErr_SetString(PyExc_TypeError,
363 "decoding Unicode is not supported");
364 return NULL;
365 }
366 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 }
368 else if (PyString_Check(obj)) {
369 s = PyString_AS_STRING(obj);
370 len = PyString_GET_SIZE(obj);
371 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000372 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
373 /* Overwrite the error message with something more useful in
374 case of a TypeError. */
375 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000376 PyErr_Format(PyExc_TypeError,
377 "coercing to Unicode: need string or buffer, "
378 "%.80s found",
379 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000380 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000381 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000382
383 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 if (len == 0) {
385 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000386 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000388 else
389 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000390
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000391 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000392 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000393 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000394 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000395 return v;
396
397 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000398 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000399 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000400 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000401 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402}
403
404PyObject *PyUnicode_Decode(const char *s,
405 int size,
406 const char *encoding,
407 const char *errors)
408{
409 PyObject *buffer = NULL, *unicode;
410
Fred Drakee4315f52000-05-09 19:53:39 +0000411 if (encoding == NULL)
412 encoding = PyUnicode_GetDefaultEncoding();
413
414 /* Shortcuts for common default encodings */
415 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000416 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000417 else if (strcmp(encoding, "latin-1") == 0)
418 return PyUnicode_DecodeLatin1(s, size, errors);
419 else if (strcmp(encoding, "ascii") == 0)
420 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000421
422 /* Decode via the codec registry */
423 buffer = PyBuffer_FromMemory((void *)s, size);
424 if (buffer == NULL)
425 goto onError;
426 unicode = PyCodec_Decode(buffer, encoding, errors);
427 if (unicode == NULL)
428 goto onError;
429 if (!PyUnicode_Check(unicode)) {
430 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000431 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 unicode->ob_type->tp_name);
433 Py_DECREF(unicode);
434 goto onError;
435 }
436 Py_DECREF(buffer);
437 return unicode;
438
439 onError:
440 Py_XDECREF(buffer);
441 return NULL;
442}
443
444PyObject *PyUnicode_Encode(const Py_UNICODE *s,
445 int size,
446 const char *encoding,
447 const char *errors)
448{
449 PyObject *v, *unicode;
450
451 unicode = PyUnicode_FromUnicode(s, size);
452 if (unicode == NULL)
453 return NULL;
454 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
455 Py_DECREF(unicode);
456 return v;
457}
458
459PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
460 const char *encoding,
461 const char *errors)
462{
463 PyObject *v;
464
465 if (!PyUnicode_Check(unicode)) {
466 PyErr_BadArgument();
467 goto onError;
468 }
Fred Drakee4315f52000-05-09 19:53:39 +0000469
470 if (encoding == NULL)
471 encoding = PyUnicode_GetDefaultEncoding();
472
473 /* Shortcuts for common default encodings */
474 if (errors == NULL) {
475 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000476 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000477 else if (strcmp(encoding, "latin-1") == 0)
478 return PyUnicode_AsLatin1String(unicode);
479 else if (strcmp(encoding, "ascii") == 0)
480 return PyUnicode_AsASCIIString(unicode);
481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000482
483 /* Encode via the codec registry */
484 v = PyCodec_Encode(unicode, encoding, errors);
485 if (v == NULL)
486 goto onError;
487 /* XXX Should we really enforce this ? */
488 if (!PyString_Check(v)) {
489 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000490 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000491 v->ob_type->tp_name);
492 Py_DECREF(v);
493 goto onError;
494 }
495 return v;
496
497 onError:
498 return NULL;
499}
500
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000501/* Return a Python string holding the default encoded value of the
502 Unicode object.
503
504 The resulting string is cached in the Unicode object for subsequent
505 usage by this function. The cached version is needed to implement
506 the character buffer interface and will live (at least) as long as
507 the Unicode object itself.
508
509 The refcount of the string is *not* incremented.
510
511 *** Exported for internal use by the interpreter only !!! ***
512
513*/
514
515PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
516 const char *errors)
517{
518 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
519
520 if (v)
521 return v;
522 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
523 if (v && errors == NULL)
524 ((PyUnicodeObject *)unicode)->defenc = v;
525 return v;
526}
527
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
529{
530 if (!PyUnicode_Check(unicode)) {
531 PyErr_BadArgument();
532 goto onError;
533 }
534 return PyUnicode_AS_UNICODE(unicode);
535
536 onError:
537 return NULL;
538}
539
540int PyUnicode_GetSize(PyObject *unicode)
541{
542 if (!PyUnicode_Check(unicode)) {
543 PyErr_BadArgument();
544 goto onError;
545 }
546 return PyUnicode_GET_SIZE(unicode);
547
548 onError:
549 return -1;
550}
551
Thomas Wouters78890102000-07-22 19:25:51 +0000552const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000553{
554 return unicode_default_encoding;
555}
556
557int PyUnicode_SetDefaultEncoding(const char *encoding)
558{
559 PyObject *v;
560
561 /* Make sure the encoding is valid. As side effect, this also
562 loads the encoding into the codec registry cache. */
563 v = _PyCodec_Lookup(encoding);
564 if (v == NULL)
565 goto onError;
566 Py_DECREF(v);
567 strncpy(unicode_default_encoding,
568 encoding,
569 sizeof(unicode_default_encoding));
570 return 0;
571
572 onError:
573 return -1;
574}
575
Guido van Rossumd57fd912000-03-10 22:53:23 +0000576/* --- UTF-8 Codec -------------------------------------------------------- */
577
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry
   is the total length of that sequence in bytes.  A zero entry marks a
   byte that is not a legal prefix.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six byte sequences; 0xFE and 0xFF
       are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
599
600static
601int utf8_decoding_error(const char **source,
602 Py_UNICODE **dest,
603 const char *errors,
604 const char *details)
605{
606 if ((errors == NULL) ||
607 (strcmp(errors,"strict") == 0)) {
608 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000609 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000610 details);
611 return -1;
612 }
613 else if (strcmp(errors,"ignore") == 0) {
614 (*source)++;
615 return 0;
616 }
617 else if (strcmp(errors,"replace") == 0) {
618 (*source)++;
619 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
620 (*dest)++;
621 return 0;
622 }
623 else {
624 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000625 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 errors);
627 return -1;
628 }
629}
630
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631PyObject *PyUnicode_DecodeUTF8(const char *s,
632 int size,
633 const char *errors)
634{
635 int n;
636 const char *e;
637 PyUnicodeObject *unicode;
638 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000639 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000640
641 /* Note: size will always be longer than the resulting Unicode
642 character count */
643 unicode = _PyUnicode_New(size);
644 if (!unicode)
645 return NULL;
646 if (size == 0)
647 return (PyObject *)unicode;
648
649 /* Unpack UTF-8 encoded data */
650 p = unicode->str;
651 e = s + size;
652
653 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000654 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655
656 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000657 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 s++;
659 continue;
660 }
661
662 n = utf8_code_length[ch];
663
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000664 if (s + n > e) {
665 errmsg = "unexpected end of data";
666 goto utf8Error;
667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000668
669 switch (n) {
670
671 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000672 errmsg = "unexpected code byte";
673 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000674 break;
675
676 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000677 errmsg = "internal error";
678 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000679 break;
680
681 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000682 if ((s[1] & 0xc0) != 0x80) {
683 errmsg = "invalid data";
684 goto utf8Error;
685 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000686 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000687 if (ch < 0x80) {
688 errmsg = "illegal encoding";
689 goto utf8Error;
690 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000691 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000692 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000693 break;
694
695 case 3:
696 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000697 (s[2] & 0xc0) != 0x80) {
698 errmsg = "invalid data";
699 goto utf8Error;
700 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000701 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000702 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
703 errmsg = "illegal encoding";
704 goto utf8Error;
705 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000706 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000707 *p++ = (Py_UNICODE)ch;
708 break;
709
710 case 4:
711 if ((s[1] & 0xc0) != 0x80 ||
712 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000713 (s[3] & 0xc0) != 0x80) {
714 errmsg = "invalid data";
715 goto utf8Error;
716 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000717 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
718 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
719 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000720 if ((ch < 0x10000) || /* minimum value allowed for 4
721 byte encoding */
722 (ch > 0x10ffff)) { /* maximum value allowed for
723 UTF-16 */
724 errmsg = "illegal encoding";
725 goto utf8Error;
726 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000727 /* compute and append the two surrogates: */
728
729 /* translate from 10000..10FFFF to 0..FFFF */
730 ch -= 0x10000;
731
732 /* high surrogate = top 10 bits added to D800 */
733 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
734
735 /* low surrogate = bottom 10 bits added to DC00 */
736 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000737 break;
738
739 default:
740 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000741 errmsg = "unsupported Unicode code range";
742 goto utf8Error;
743 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000744 }
745 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000746 continue;
747
748 utf8Error:
749 if (utf8_decoding_error(&s, &p, errors, errmsg))
750 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000751 }
752
753 /* Adjust length */
754 if (_PyUnicode_Resize(unicode, p - unicode->str))
755 goto onError;
756
757 return (PyObject *)unicode;
758
759onError:
760 Py_DECREF(unicode);
761 return NULL;
762}
763
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000764/* Not used anymore, now that the encoder supports UTF-16
765 surrogates. */
Greg Steinaf36a3a2000-07-17 09:04:43 +0000766#if 0
Guido van Rossumd57fd912000-03-10 22:53:23 +0000767static
768int utf8_encoding_error(const Py_UNICODE **source,
769 char **dest,
770 const char *errors,
771 const char *details)
772{
773 if ((errors == NULL) ||
774 (strcmp(errors,"strict") == 0)) {
775 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000776 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000777 details);
778 return -1;
779 }
780 else if (strcmp(errors,"ignore") == 0) {
781 return 0;
782 }
783 else if (strcmp(errors,"replace") == 0) {
784 **dest = '?';
785 (*dest)++;
786 return 0;
787 }
788 else {
789 PyErr_Format(PyExc_ValueError,
790 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000791 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000792 errors);
793 return -1;
794 }
795}
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000796#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000797
798PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
799 int size,
800 const char *errors)
801{
802 PyObject *v;
803 char *p;
804 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000805 Py_UCS4 ch2;
806 unsigned int cbAllocated = 3 * size;
807 unsigned int cbWritten = 0;
808 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000809
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000810 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000811 if (v == NULL)
812 return NULL;
813 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000814 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000815
816 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000817 while (i < size) {
818 Py_UCS4 ch = s[i++];
819 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000820 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000821 cbWritten++;
822 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000823 else if (ch < 0x0800) {
824 *p++ = 0xc0 | (ch >> 6);
825 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000826 cbWritten += 2;
827 }
828 else {
829 /* Check for high surrogate */
830 if (0xD800 <= ch && ch <= 0xDBFF) {
831 if (i != size) {
832 ch2 = s[i];
833 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
834
835 if (cbWritten >= (cbAllocated - 4)) {
836 /* Provide enough room for some more
837 surrogates */
838 cbAllocated += 4*10;
839 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000840 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000841 }
842
843 /* combine the two values */
844 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
845
846 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000847 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000848 i++;
849 cbWritten += 4;
850 }
851 }
852 }
853 else {
854 *p++ = (char)(0xe0 | (ch >> 12));
855 cbWritten += 3;
856 }
857 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
858 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000859 }
860 }
861 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000862 if (_PyString_Resize(&v, p - q))
863 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000864 return v;
865
866 onError:
867 Py_DECREF(v);
868 return NULL;
869}
870
Guido van Rossumd57fd912000-03-10 22:53:23 +0000871PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
872{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000873 if (!PyUnicode_Check(unicode)) {
874 PyErr_BadArgument();
875 return NULL;
876 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000877 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
878 PyUnicode_GET_SIZE(unicode),
879 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000880}
881
882/* --- UTF-16 Codec ------------------------------------------------------- */
883
884static
885int utf16_decoding_error(const Py_UNICODE **source,
886 Py_UNICODE **dest,
887 const char *errors,
888 const char *details)
889{
890 if ((errors == NULL) ||
891 (strcmp(errors,"strict") == 0)) {
892 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000893 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000894 details);
895 return -1;
896 }
897 else if (strcmp(errors,"ignore") == 0) {
898 return 0;
899 }
900 else if (strcmp(errors,"replace") == 0) {
901 if (dest) {
902 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
903 (*dest)++;
904 }
905 return 0;
906 }
907 else {
908 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000909 "UTF-16 decoding error; "
910 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000911 errors);
912 return -1;
913 }
914}
915
Guido van Rossumd57fd912000-03-10 22:53:23 +0000916PyObject *PyUnicode_DecodeUTF16(const char *s,
917 int size,
918 const char *errors,
919 int *byteorder)
920{
921 PyUnicodeObject *unicode;
922 Py_UNICODE *p;
923 const Py_UNICODE *q, *e;
924 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000925 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000926
927 /* size should be an even number */
928 if (size % sizeof(Py_UNICODE) != 0) {
929 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
930 return NULL;
931 /* The remaining input chars are ignored if we fall through
932 here... */
933 }
934
935 /* Note: size will always be longer than the resulting Unicode
936 character count */
937 unicode = _PyUnicode_New(size);
938 if (!unicode)
939 return NULL;
940 if (size == 0)
941 return (PyObject *)unicode;
942
943 /* Unpack UTF-16 encoded data */
944 p = unicode->str;
945 q = (Py_UNICODE *)s;
946 e = q + (size / sizeof(Py_UNICODE));
947
948 if (byteorder)
949 bo = *byteorder;
950
951 while (q < e) {
952 register Py_UNICODE ch = *q++;
953
954 /* Check for BOM marks (U+FEFF) in the input and adjust
955 current byte order setting accordingly. Swap input
956 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
957 !) */
958#ifdef BYTEORDER_IS_LITTLE_ENDIAN
959 if (ch == 0xFEFF) {
960 bo = -1;
961 continue;
962 } else if (ch == 0xFFFE) {
963 bo = 1;
964 continue;
965 }
966 if (bo == 1)
967 ch = (ch >> 8) | (ch << 8);
968#else
969 if (ch == 0xFEFF) {
970 bo = 1;
971 continue;
972 } else if (ch == 0xFFFE) {
973 bo = -1;
974 continue;
975 }
976 if (bo == -1)
977 ch = (ch >> 8) | (ch << 8);
978#endif
979 if (ch < 0xD800 || ch > 0xDFFF) {
980 *p++ = ch;
981 continue;
982 }
983
984 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000985 if (q >= e) {
986 errmsg = "unexpected end of data";
987 goto utf16Error;
988 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000989 if (0xDC00 <= *q && *q <= 0xDFFF) {
990 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000991 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000992 /* This is valid data (a UTF-16 surrogate pair), but
993 we are not able to store this information since our
994 Py_UNICODE type only has 16 bits... this might
995 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000996 errmsg = "code pairs are not supported";
997 goto utf16Error;
998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000999 else
1000 continue;
1001 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001002 errmsg = "illegal encoding";
1003 /* Fall through to report the error */
1004
1005 utf16Error:
1006 if (utf16_decoding_error(&q, &p, errors, errmsg))
1007 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001008 }
1009
1010 if (byteorder)
1011 *byteorder = bo;
1012
1013 /* Adjust length */
1014 if (_PyUnicode_Resize(unicode, p - unicode->str))
1015 goto onError;
1016
1017 return (PyObject *)unicode;
1018
1019onError:
1020 Py_DECREF(unicode);
1021 return NULL;
1022}
1023
1024#undef UTF16_ERROR
1025
1026PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1027 int size,
1028 const char *errors,
1029 int byteorder)
1030{
1031 PyObject *v;
1032 Py_UNICODE *p;
1033 char *q;
1034
1035 /* We don't create UTF-16 pairs... */
1036 v = PyString_FromStringAndSize(NULL,
1037 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1038 if (v == NULL)
1039 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001040
1041 q = PyString_AS_STRING(v);
1042 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001043 if (byteorder == 0)
1044 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001045 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001046 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001047 if (byteorder == 0 ||
1048#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1049 byteorder == -1
1050#else
1051 byteorder == 1
1052#endif
1053 )
1054 memcpy(p, s, size * sizeof(Py_UNICODE));
1055 else
1056 while (size-- > 0) {
1057 Py_UNICODE ch = *s++;
1058 *p++ = (ch >> 8) | (ch << 8);
1059 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060 return v;
1061}
1062
1063PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1064{
1065 if (!PyUnicode_Check(unicode)) {
1066 PyErr_BadArgument();
1067 return NULL;
1068 }
1069 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1070 PyUnicode_GET_SIZE(unicode),
1071 NULL,
1072 0);
1073}
1074
1075/* --- Unicode Escape Codec ----------------------------------------------- */
1076
1077static
1078int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001079 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001080 const char *errors,
1081 const char *details)
1082{
1083 if ((errors == NULL) ||
1084 (strcmp(errors,"strict") == 0)) {
1085 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001086 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087 details);
1088 return -1;
1089 }
1090 else if (strcmp(errors,"ignore") == 0) {
1091 return 0;
1092 }
1093 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001094 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001095 return 0;
1096 }
1097 else {
1098 PyErr_Format(PyExc_ValueError,
1099 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001100 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001101 errors);
1102 return -1;
1103 }
1104}
1105
Fredrik Lundh06d12682001-01-24 07:59:11 +00001106static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001107
Guido van Rossumd57fd912000-03-10 22:53:23 +00001108PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1109 int size,
1110 const char *errors)
1111{
1112 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001113 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001114 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001115 char* message;
1116 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1117
Guido van Rossumd57fd912000-03-10 22:53:23 +00001118 /* Escaped strings will always be longer than the resulting
1119 Unicode string, so we start with size here and then reduce the
1120 length after conversion to the true value. */
1121 v = _PyUnicode_New(size);
1122 if (v == NULL)
1123 goto onError;
1124 if (size == 0)
1125 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001126
Guido van Rossumd57fd912000-03-10 22:53:23 +00001127 p = buf = PyUnicode_AS_UNICODE(v);
1128 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001129
Guido van Rossumd57fd912000-03-10 22:53:23 +00001130 while (s < end) {
1131 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001132 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001133 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001134
1135 /* Non-escape characters are interpreted as Unicode ordinals */
1136 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001137 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138 continue;
1139 }
1140
1141 /* \ - Escapes */
1142 s++;
1143 switch (*s++) {
1144
1145 /* \x escapes */
1146 case '\n': break;
1147 case '\\': *p++ = '\\'; break;
1148 case '\'': *p++ = '\''; break;
1149 case '\"': *p++ = '\"'; break;
1150 case 'b': *p++ = '\b'; break;
1151 case 'f': *p++ = '\014'; break; /* FF */
1152 case 't': *p++ = '\t'; break;
1153 case 'n': *p++ = '\n'; break;
1154 case 'r': *p++ = '\r'; break;
1155 case 'v': *p++ = '\013'; break; /* VT */
1156 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1157
1158 /* \OOO (octal) escapes */
1159 case '0': case '1': case '2': case '3':
1160 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001161 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001163 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001164 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001165 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001166 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001167 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001168 break;
1169
Fredrik Lundhccc74732001-02-18 22:13:49 +00001170 /* hex escapes */
1171 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001172 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001173 digits = 2;
1174 message = "truncated \\xXX escape";
1175 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176
Fredrik Lundhccc74732001-02-18 22:13:49 +00001177 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001178 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001179 digits = 4;
1180 message = "truncated \\uXXXX escape";
1181 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001182
Fredrik Lundhccc74732001-02-18 22:13:49 +00001183 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001184 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001185 digits = 8;
1186 message = "truncated \\UXXXXXXXX escape";
1187 hexescape:
1188 chr = 0;
1189 for (i = 0; i < digits; i++) {
1190 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001191 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001192 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001193 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001194 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001195 i++;
1196 break;
1197 }
1198 chr = (chr<<4) & ~0xF;
1199 if (c >= '0' && c <= '9')
1200 chr += c - '0';
1201 else if (c >= 'a' && c <= 'f')
1202 chr += 10 + c - 'a';
1203 else
1204 chr += 10 + c - 'A';
1205 }
1206 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001207 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001208 /* when we get here, chr is a 32-bit unicode character */
1209 if (chr <= 0xffff)
1210 /* UCS-2 character */
1211 *p++ = (Py_UNICODE) chr;
1212 else if (chr <= 0x10ffff) {
1213 /* UCS-4 character. store as two surrogate characters */
1214 chr -= 0x10000L;
1215 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1216 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1217 } else {
1218 if (unicodeescape_decoding_error(
1219 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001220 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001221 )
1222 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001223 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001224 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001225 break;
1226
1227 /* \N{name} */
1228 case 'N':
1229 message = "malformed \\N character escape";
1230 if (ucnhash_CAPI == NULL) {
1231 /* load the unicode data module */
1232 PyObject *m, *v;
1233 m = PyImport_ImportModule("unicodedata");
1234 if (m == NULL)
1235 goto ucnhashError;
1236 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1237 Py_DECREF(m);
1238 if (v == NULL)
1239 goto ucnhashError;
1240 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1241 Py_DECREF(v);
1242 if (ucnhash_CAPI == NULL)
1243 goto ucnhashError;
1244 }
1245 if (*s == '{') {
1246 const char *start = s+1;
1247 /* look for the closing brace */
1248 while (*s != '}' && s < end)
1249 s++;
1250 if (s > start && s < end && *s == '}') {
1251 /* found a name. look it up in the unicode database */
1252 message = "unknown Unicode character name";
1253 s++;
1254 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1255 goto store;
1256 }
1257 }
1258 if (unicodeescape_decoding_error(&s, &x, errors, message))
1259 goto onError;
1260 *p++ = x;
1261 break;
1262
1263 default:
1264 *p++ = '\\';
1265 *p++ = (unsigned char)s[-1];
1266 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001267 }
1268 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001269 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001270 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001271 return (PyObject *)v;
1272
Fredrik Lundhccc74732001-02-18 22:13:49 +00001273ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001274 PyErr_SetString(
1275 PyExc_UnicodeError,
1276 "\\N escapes not supported (can't load unicodedata module)"
1277 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001278 return NULL;
1279
Fredrik Lundhccc74732001-02-18 22:13:49 +00001280onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001281 Py_XDECREF(v);
1282 return NULL;
1283}
1284
1285/* Return a Unicode-Escape string version of the Unicode object.
1286
1287 If quotes is true, the string is enclosed in u"" or u'' quotes as
1288 appropriate.
1289
1290*/
1291
Barry Warsaw51ac5802000-03-20 16:36:48 +00001292static const Py_UNICODE *findchar(const Py_UNICODE *s,
1293 int size,
1294 Py_UNICODE ch);
1295
Guido van Rossumd57fd912000-03-10 22:53:23 +00001296static
1297PyObject *unicodeescape_string(const Py_UNICODE *s,
1298 int size,
1299 int quotes)
1300{
1301 PyObject *repr;
1302 char *p;
1303 char *q;
1304
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001305 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001306
1307 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1308 if (repr == NULL)
1309 return NULL;
1310
1311 p = q = PyString_AS_STRING(repr);
1312
1313 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001314 *p++ = 'u';
1315 *p++ = (findchar(s, size, '\'') &&
1316 !findchar(s, size, '"')) ? '"' : '\'';
1317 }
1318 while (size-- > 0) {
1319 Py_UNICODE ch = *s++;
1320 /* Escape quotes */
1321 if (quotes && (ch == q[1] || ch == '\\')) {
1322 *p++ = '\\';
1323 *p++ = (char) ch;
1324 }
1325 /* Map 16-bit characters to '\uxxxx' */
1326 else if (ch >= 256) {
1327 *p++ = '\\';
1328 *p++ = 'u';
1329 *p++ = hexdigit[(ch >> 12) & 0xf];
1330 *p++ = hexdigit[(ch >> 8) & 0xf];
1331 *p++ = hexdigit[(ch >> 4) & 0xf];
1332 *p++ = hexdigit[ch & 15];
1333 }
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001334 /* Map special whitespace to '\t', \n', '\r' */
1335 else if (ch == '\t') {
1336 *p++ = '\\';
1337 *p++ = 't';
1338 }
1339 else if (ch == '\n') {
1340 *p++ = '\\';
1341 *p++ = 'n';
1342 }
1343 else if (ch == '\r') {
1344 *p++ = '\\';
1345 *p++ = 'r';
1346 }
1347 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001348 else if (ch < ' ' || ch >= 128) {
1349 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001350 *p++ = 'x';
1351 *p++ = hexdigit[(ch >> 4) & 0xf];
1352 *p++ = hexdigit[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001353 }
1354 /* Copy everything else as-is */
1355 else
1356 *p++ = (char) ch;
1357 }
1358 if (quotes)
1359 *p++ = q[1];
1360
1361 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001362 if (_PyString_Resize(&repr, p - q))
1363 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001364
1365 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001366
1367 onError:
1368 Py_DECREF(repr);
1369 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001370}
1371
1372PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1373 int size)
1374{
1375 return unicodeescape_string(s, size, 0);
1376}
1377
1378PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1379{
1380 if (!PyUnicode_Check(unicode)) {
1381 PyErr_BadArgument();
1382 return NULL;
1383 }
1384 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1385 PyUnicode_GET_SIZE(unicode));
1386}
1387
1388/* --- Raw Unicode Escape Codec ------------------------------------------- */
1389
1390PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1391 int size,
1392 const char *errors)
1393{
1394 PyUnicodeObject *v;
1395 Py_UNICODE *p, *buf;
1396 const char *end;
1397 const char *bs;
1398
1399 /* Escaped strings will always be longer than the resulting
1400 Unicode string, so we start with size here and then reduce the
1401 length after conversion to the true value. */
1402 v = _PyUnicode_New(size);
1403 if (v == NULL)
1404 goto onError;
1405 if (size == 0)
1406 return (PyObject *)v;
1407 p = buf = PyUnicode_AS_UNICODE(v);
1408 end = s + size;
1409 while (s < end) {
1410 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001411 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412 int i;
1413
1414 /* Non-escape characters are interpreted as Unicode ordinals */
1415 if (*s != '\\') {
1416 *p++ = (unsigned char)*s++;
1417 continue;
1418 }
1419
1420 /* \u-escapes are only interpreted iff the number of leading
1421 backslashes if odd */
1422 bs = s;
1423 for (;s < end;) {
1424 if (*s != '\\')
1425 break;
1426 *p++ = (unsigned char)*s++;
1427 }
1428 if (((s - bs) & 1) == 0 ||
1429 s >= end ||
1430 *s != 'u') {
1431 continue;
1432 }
1433 p--;
1434 s++;
1435
1436 /* \uXXXX with 4 hex digits */
1437 for (x = 0, i = 0; i < 4; i++) {
1438 c = (unsigned char)s[i];
1439 if (!isxdigit(c)) {
1440 if (unicodeescape_decoding_error(&s, &x, errors,
1441 "truncated \\uXXXX"))
1442 goto onError;
1443 i++;
1444 break;
1445 }
1446 x = (x<<4) & ~0xF;
1447 if (c >= '0' && c <= '9')
1448 x += c - '0';
1449 else if (c >= 'a' && c <= 'f')
1450 x += 10 + c - 'a';
1451 else
1452 x += 10 + c - 'A';
1453 }
1454 s += i;
1455 *p++ = x;
1456 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001457 if (_PyUnicode_Resize(v, (int)(p - buf)))
1458 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001459 return (PyObject *)v;
1460
1461 onError:
1462 Py_XDECREF(v);
1463 return NULL;
1464}
1465
1466PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1467 int size)
1468{
1469 PyObject *repr;
1470 char *p;
1471 char *q;
1472
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001473 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001474
1475 repr = PyString_FromStringAndSize(NULL, 6 * size);
1476 if (repr == NULL)
1477 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001478 if (size == 0)
1479 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480
1481 p = q = PyString_AS_STRING(repr);
1482 while (size-- > 0) {
1483 Py_UNICODE ch = *s++;
1484 /* Map 16-bit characters to '\uxxxx' */
1485 if (ch >= 256) {
1486 *p++ = '\\';
1487 *p++ = 'u';
1488 *p++ = hexdigit[(ch >> 12) & 0xf];
1489 *p++ = hexdigit[(ch >> 8) & 0xf];
1490 *p++ = hexdigit[(ch >> 4) & 0xf];
1491 *p++ = hexdigit[ch & 15];
1492 }
1493 /* Copy everything else as-is */
1494 else
1495 *p++ = (char) ch;
1496 }
1497 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001498 if (_PyString_Resize(&repr, p - q))
1499 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001500
1501 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001502
1503 onError:
1504 Py_DECREF(repr);
1505 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506}
1507
1508PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1509{
1510 if (!PyUnicode_Check(unicode)) {
1511 PyErr_BadArgument();
1512 return NULL;
1513 }
1514 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1515 PyUnicode_GET_SIZE(unicode));
1516}
1517
1518/* --- Latin-1 Codec ------------------------------------------------------ */
1519
1520PyObject *PyUnicode_DecodeLatin1(const char *s,
1521 int size,
1522 const char *errors)
1523{
1524 PyUnicodeObject *v;
1525 Py_UNICODE *p;
1526
1527 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1528 v = _PyUnicode_New(size);
1529 if (v == NULL)
1530 goto onError;
1531 if (size == 0)
1532 return (PyObject *)v;
1533 p = PyUnicode_AS_UNICODE(v);
1534 while (size-- > 0)
1535 *p++ = (unsigned char)*s++;
1536 return (PyObject *)v;
1537
1538 onError:
1539 Py_XDECREF(v);
1540 return NULL;
1541}
1542
1543static
1544int latin1_encoding_error(const Py_UNICODE **source,
1545 char **dest,
1546 const char *errors,
1547 const char *details)
1548{
1549 if ((errors == NULL) ||
1550 (strcmp(errors,"strict") == 0)) {
1551 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001552 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001553 details);
1554 return -1;
1555 }
1556 else if (strcmp(errors,"ignore") == 0) {
1557 return 0;
1558 }
1559 else if (strcmp(errors,"replace") == 0) {
1560 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001561 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001562 return 0;
1563 }
1564 else {
1565 PyErr_Format(PyExc_ValueError,
1566 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001567 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001568 errors);
1569 return -1;
1570 }
1571}
1572
1573PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1574 int size,
1575 const char *errors)
1576{
1577 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001578 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001579
Guido van Rossumd57fd912000-03-10 22:53:23 +00001580 repr = PyString_FromStringAndSize(NULL, size);
1581 if (repr == NULL)
1582 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001583 if (size == 0)
1584 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001585
1586 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001587 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001588 while (size-- > 0) {
1589 Py_UNICODE ch = *p++;
1590 if (ch >= 256) {
1591 if (latin1_encoding_error(&p, &s, errors,
1592 "ordinal not in range(256)"))
1593 goto onError;
1594 }
1595 else
1596 *s++ = (char)ch;
1597 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001598 /* Resize if error handling skipped some characters */
1599 if (s - start < PyString_GET_SIZE(repr))
1600 if (_PyString_Resize(&repr, s - start))
1601 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001602 return repr;
1603
1604 onError:
1605 Py_DECREF(repr);
1606 return NULL;
1607}
1608
1609PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1610{
1611 if (!PyUnicode_Check(unicode)) {
1612 PyErr_BadArgument();
1613 return NULL;
1614 }
1615 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1616 PyUnicode_GET_SIZE(unicode),
1617 NULL);
1618}
1619
1620/* --- 7-bit ASCII Codec -------------------------------------------------- */
1621
1622static
1623int ascii_decoding_error(const char **source,
1624 Py_UNICODE **dest,
1625 const char *errors,
1626 const char *details)
1627{
1628 if ((errors == NULL) ||
1629 (strcmp(errors,"strict") == 0)) {
1630 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001631 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001632 details);
1633 return -1;
1634 }
1635 else if (strcmp(errors,"ignore") == 0) {
1636 return 0;
1637 }
1638 else if (strcmp(errors,"replace") == 0) {
1639 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1640 (*dest)++;
1641 return 0;
1642 }
1643 else {
1644 PyErr_Format(PyExc_ValueError,
1645 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001646 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 errors);
1648 return -1;
1649 }
1650}
1651
1652PyObject *PyUnicode_DecodeASCII(const char *s,
1653 int size,
1654 const char *errors)
1655{
1656 PyUnicodeObject *v;
1657 Py_UNICODE *p;
1658
1659 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1660 v = _PyUnicode_New(size);
1661 if (v == NULL)
1662 goto onError;
1663 if (size == 0)
1664 return (PyObject *)v;
1665 p = PyUnicode_AS_UNICODE(v);
1666 while (size-- > 0) {
1667 register unsigned char c;
1668
1669 c = (unsigned char)*s++;
1670 if (c < 128)
1671 *p++ = c;
1672 else if (ascii_decoding_error(&s, &p, errors,
1673 "ordinal not in range(128)"))
1674 goto onError;
1675 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001676 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1677 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1678 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001679 return (PyObject *)v;
1680
1681 onError:
1682 Py_XDECREF(v);
1683 return NULL;
1684}
1685
1686static
1687int ascii_encoding_error(const Py_UNICODE **source,
1688 char **dest,
1689 const char *errors,
1690 const char *details)
1691{
1692 if ((errors == NULL) ||
1693 (strcmp(errors,"strict") == 0)) {
1694 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001695 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001696 details);
1697 return -1;
1698 }
1699 else if (strcmp(errors,"ignore") == 0) {
1700 return 0;
1701 }
1702 else if (strcmp(errors,"replace") == 0) {
1703 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001704 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001705 return 0;
1706 }
1707 else {
1708 PyErr_Format(PyExc_ValueError,
1709 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001710 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001711 errors);
1712 return -1;
1713 }
1714}
1715
1716PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1717 int size,
1718 const char *errors)
1719{
1720 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001721 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001722
Guido van Rossumd57fd912000-03-10 22:53:23 +00001723 repr = PyString_FromStringAndSize(NULL, size);
1724 if (repr == NULL)
1725 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001726 if (size == 0)
1727 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001728
1729 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001730 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731 while (size-- > 0) {
1732 Py_UNICODE ch = *p++;
1733 if (ch >= 128) {
1734 if (ascii_encoding_error(&p, &s, errors,
1735 "ordinal not in range(128)"))
1736 goto onError;
1737 }
1738 else
1739 *s++ = (char)ch;
1740 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001741 /* Resize if error handling skipped some characters */
1742 if (s - start < PyString_GET_SIZE(repr))
1743 if (_PyString_Resize(&repr, s - start))
1744 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 return repr;
1746
1747 onError:
1748 Py_DECREF(repr);
1749 return NULL;
1750}
1751
1752PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1753{
1754 if (!PyUnicode_Check(unicode)) {
1755 PyErr_BadArgument();
1756 return NULL;
1757 }
1758 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1759 PyUnicode_GET_SIZE(unicode),
1760 NULL);
1761}
1762
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001763#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001764
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001765/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001766
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001767PyObject *PyUnicode_DecodeMBCS(const char *s,
1768 int size,
1769 const char *errors)
1770{
1771 PyUnicodeObject *v;
1772 Py_UNICODE *p;
1773
1774 /* First get the size of the result */
1775 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001776 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001777 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1778
1779 v = _PyUnicode_New(usize);
1780 if (v == NULL)
1781 return NULL;
1782 if (usize == 0)
1783 return (PyObject *)v;
1784 p = PyUnicode_AS_UNICODE(v);
1785 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1786 Py_DECREF(v);
1787 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1788 }
1789
1790 return (PyObject *)v;
1791}
1792
1793PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1794 int size,
1795 const char *errors)
1796{
1797 PyObject *repr;
1798 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001799 DWORD mbcssize;
1800
1801 /* If there are no characters, bail now! */
1802 if (size==0)
1803 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001804
1805 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001806 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001807 if (mbcssize==0)
1808 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1809
1810 repr = PyString_FromStringAndSize(NULL, mbcssize);
1811 if (repr == NULL)
1812 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001813 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001814 return repr;
1815
1816 /* Do the conversion */
1817 s = PyString_AS_STRING(repr);
1818 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1819 Py_DECREF(repr);
1820 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1821 }
1822 return repr;
1823}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001824
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001825#endif /* MS_WIN32 */
1826
Guido van Rossumd57fd912000-03-10 22:53:23 +00001827/* --- Character Mapping Codec -------------------------------------------- */
1828
1829static
1830int charmap_decoding_error(const char **source,
1831 Py_UNICODE **dest,
1832 const char *errors,
1833 const char *details)
1834{
1835 if ((errors == NULL) ||
1836 (strcmp(errors,"strict") == 0)) {
1837 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001838 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001839 details);
1840 return -1;
1841 }
1842 else if (strcmp(errors,"ignore") == 0) {
1843 return 0;
1844 }
1845 else if (strcmp(errors,"replace") == 0) {
1846 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1847 (*dest)++;
1848 return 0;
1849 }
1850 else {
1851 PyErr_Format(PyExc_ValueError,
1852 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001853 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001854 errors);
1855 return -1;
1856 }
1857}
1858
1859PyObject *PyUnicode_DecodeCharmap(const char *s,
1860 int size,
1861 PyObject *mapping,
1862 const char *errors)
1863{
1864 PyUnicodeObject *v;
1865 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001866 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001867
1868 /* Default to Latin-1 */
1869 if (mapping == NULL)
1870 return PyUnicode_DecodeLatin1(s, size, errors);
1871
1872 v = _PyUnicode_New(size);
1873 if (v == NULL)
1874 goto onError;
1875 if (size == 0)
1876 return (PyObject *)v;
1877 p = PyUnicode_AS_UNICODE(v);
1878 while (size-- > 0) {
1879 unsigned char ch = *s++;
1880 PyObject *w, *x;
1881
1882 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1883 w = PyInt_FromLong((long)ch);
1884 if (w == NULL)
1885 goto onError;
1886 x = PyObject_GetItem(mapping, w);
1887 Py_DECREF(w);
1888 if (x == NULL) {
1889 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00001890 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001891 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00001892 x = Py_None;
1893 Py_INCREF(x);
1894 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001895 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001896 }
1897
1898 /* Apply mapping */
1899 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001900 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001901 if (value < 0 || value > 65535) {
1902 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001903 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001904 Py_DECREF(x);
1905 goto onError;
1906 }
1907 *p++ = (Py_UNICODE)value;
1908 }
1909 else if (x == Py_None) {
1910 /* undefined mapping */
1911 if (charmap_decoding_error(&s, &p, errors,
1912 "character maps to <undefined>")) {
1913 Py_DECREF(x);
1914 goto onError;
1915 }
1916 }
1917 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001918 int targetsize = PyUnicode_GET_SIZE(x);
1919
1920 if (targetsize == 1)
1921 /* 1-1 mapping */
1922 *p++ = *PyUnicode_AS_UNICODE(x);
1923
1924 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001925 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001926 if (targetsize > extrachars) {
1927 /* resize first */
1928 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
1929 int needed = (targetsize - extrachars) + \
1930 (targetsize << 2);
1931 extrachars += needed;
1932 if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001933 Py_DECREF(x);
1934 goto onError;
1935 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001936 p = PyUnicode_AS_UNICODE(v) + oldpos;
1937 }
1938 Py_UNICODE_COPY(p,
1939 PyUnicode_AS_UNICODE(x),
1940 targetsize);
1941 p += targetsize;
1942 extrachars -= targetsize;
1943 }
1944 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001945 }
1946 else {
1947 /* wrong return value */
1948 PyErr_SetString(PyExc_TypeError,
1949 "character mapping must return integer, None or unicode");
1950 Py_DECREF(x);
1951 goto onError;
1952 }
1953 Py_DECREF(x);
1954 }
1955 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1956 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1957 goto onError;
1958 return (PyObject *)v;
1959
1960 onError:
1961 Py_XDECREF(v);
1962 return NULL;
1963}
1964
1965static
1966int charmap_encoding_error(const Py_UNICODE **source,
1967 char **dest,
1968 const char *errors,
1969 const char *details)
1970{
1971 if ((errors == NULL) ||
1972 (strcmp(errors,"strict") == 0)) {
1973 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001974 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001975 details);
1976 return -1;
1977 }
1978 else if (strcmp(errors,"ignore") == 0) {
1979 return 0;
1980 }
1981 else if (strcmp(errors,"replace") == 0) {
1982 **dest = '?';
1983 (*dest)++;
1984 return 0;
1985 }
1986 else {
1987 PyErr_Format(PyExc_ValueError,
1988 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001989 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001990 errors);
1991 return -1;
1992 }
1993}
1994
1995PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
1996 int size,
1997 PyObject *mapping,
1998 const char *errors)
1999{
2000 PyObject *v;
2001 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002002 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002003
2004 /* Default to Latin-1 */
2005 if (mapping == NULL)
2006 return PyUnicode_EncodeLatin1(p, size, errors);
2007
2008 v = PyString_FromStringAndSize(NULL, size);
2009 if (v == NULL)
2010 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002011 if (size == 0)
2012 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002013 s = PyString_AS_STRING(v);
2014 while (size-- > 0) {
2015 Py_UNICODE ch = *p++;
2016 PyObject *w, *x;
2017
2018 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2019 w = PyInt_FromLong((long)ch);
2020 if (w == NULL)
2021 goto onError;
2022 x = PyObject_GetItem(mapping, w);
2023 Py_DECREF(w);
2024 if (x == NULL) {
2025 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002026 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002027 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002028 x = Py_None;
2029 Py_INCREF(x);
2030 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002031 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002032 }
2033
2034 /* Apply mapping */
2035 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002036 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002037 if (value < 0 || value > 255) {
2038 PyErr_SetString(PyExc_TypeError,
2039 "character mapping must be in range(256)");
2040 Py_DECREF(x);
2041 goto onError;
2042 }
2043 *s++ = (char)value;
2044 }
2045 else if (x == Py_None) {
2046 /* undefined mapping */
2047 if (charmap_encoding_error(&p, &s, errors,
2048 "character maps to <undefined>")) {
2049 Py_DECREF(x);
2050 goto onError;
2051 }
2052 }
2053 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002054 int targetsize = PyString_GET_SIZE(x);
2055
2056 if (targetsize == 1)
2057 /* 1-1 mapping */
2058 *s++ = *PyString_AS_STRING(x);
2059
2060 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002061 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002062 if (targetsize > extrachars) {
2063 /* resize first */
2064 int oldpos = (int)(s - PyString_AS_STRING(v));
2065 int needed = (targetsize - extrachars) + \
2066 (targetsize << 2);
2067 extrachars += needed;
2068 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002069 Py_DECREF(x);
2070 goto onError;
2071 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002072 s = PyString_AS_STRING(v) + oldpos;
2073 }
2074 memcpy(s,
2075 PyString_AS_STRING(x),
2076 targetsize);
2077 s += targetsize;
2078 extrachars -= targetsize;
2079 }
2080 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002081 }
2082 else {
2083 /* wrong return value */
2084 PyErr_SetString(PyExc_TypeError,
2085 "character mapping must return integer, None or unicode");
2086 Py_DECREF(x);
2087 goto onError;
2088 }
2089 Py_DECREF(x);
2090 }
2091 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2092 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2093 goto onError;
2094 return v;
2095
2096 onError:
2097 Py_DECREF(v);
2098 return NULL;
2099}
2100
2101PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2102 PyObject *mapping)
2103{
2104 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2105 PyErr_BadArgument();
2106 return NULL;
2107 }
2108 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2109 PyUnicode_GET_SIZE(unicode),
2110 mapping,
2111 NULL);
2112}
2113
2114static
2115int translate_error(const Py_UNICODE **source,
2116 Py_UNICODE **dest,
2117 const char *errors,
2118 const char *details)
2119{
2120 if ((errors == NULL) ||
2121 (strcmp(errors,"strict") == 0)) {
2122 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002123 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124 details);
2125 return -1;
2126 }
2127 else if (strcmp(errors,"ignore") == 0) {
2128 return 0;
2129 }
2130 else if (strcmp(errors,"replace") == 0) {
2131 **dest = '?';
2132 (*dest)++;
2133 return 0;
2134 }
2135 else {
2136 PyErr_Format(PyExc_ValueError,
2137 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002138 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002139 errors);
2140 return -1;
2141 }
2142}
2143
2144PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2145 int size,
2146 PyObject *mapping,
2147 const char *errors)
2148{
2149 PyUnicodeObject *v;
2150 Py_UNICODE *p;
2151
2152 if (mapping == NULL) {
2153 PyErr_BadArgument();
2154 return NULL;
2155 }
2156
2157 /* Output will never be longer than input */
2158 v = _PyUnicode_New(size);
2159 if (v == NULL)
2160 goto onError;
2161 if (size == 0)
2162 goto done;
2163 p = PyUnicode_AS_UNICODE(v);
2164 while (size-- > 0) {
2165 Py_UNICODE ch = *s++;
2166 PyObject *w, *x;
2167
2168 /* Get mapping */
2169 w = PyInt_FromLong(ch);
2170 if (w == NULL)
2171 goto onError;
2172 x = PyObject_GetItem(mapping, w);
2173 Py_DECREF(w);
2174 if (x == NULL) {
2175 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2176 /* No mapping found: default to 1-1 mapping */
2177 PyErr_Clear();
2178 *p++ = ch;
2179 continue;
2180 }
2181 goto onError;
2182 }
2183
2184 /* Apply mapping */
2185 if (PyInt_Check(x))
2186 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2187 else if (x == Py_None) {
2188 /* undefined mapping */
2189 if (translate_error(&s, &p, errors,
2190 "character maps to <undefined>")) {
2191 Py_DECREF(x);
2192 goto onError;
2193 }
2194 }
2195 else if (PyUnicode_Check(x)) {
2196 if (PyUnicode_GET_SIZE(x) != 1) {
2197 /* 1-n mapping */
2198 PyErr_SetString(PyExc_NotImplementedError,
2199 "1-n mappings are currently not implemented");
2200 Py_DECREF(x);
2201 goto onError;
2202 }
2203 *p++ = *PyUnicode_AS_UNICODE(x);
2204 }
2205 else {
2206 /* wrong return value */
2207 PyErr_SetString(PyExc_TypeError,
2208 "translate mapping must return integer, None or unicode");
2209 Py_DECREF(x);
2210 goto onError;
2211 }
2212 Py_DECREF(x);
2213 }
2214 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002215 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2216 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002217
2218 done:
2219 return (PyObject *)v;
2220
2221 onError:
2222 Py_XDECREF(v);
2223 return NULL;
2224}
2225
2226PyObject *PyUnicode_Translate(PyObject *str,
2227 PyObject *mapping,
2228 const char *errors)
2229{
2230 PyObject *result;
2231
2232 str = PyUnicode_FromObject(str);
2233 if (str == NULL)
2234 goto onError;
2235 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2236 PyUnicode_GET_SIZE(str),
2237 mapping,
2238 errors);
2239 Py_DECREF(str);
2240 return result;
2241
2242 onError:
2243 Py_XDECREF(str);
2244 return NULL;
2245}
2246
Guido van Rossum9e896b32000-04-05 20:11:21 +00002247/* --- Decimal Encoder ---------------------------------------------------- */
2248
2249int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2250 int length,
2251 char *output,
2252 const char *errors)
2253{
2254 Py_UNICODE *p, *end;
2255
2256 if (output == NULL) {
2257 PyErr_BadArgument();
2258 return -1;
2259 }
2260
2261 p = s;
2262 end = s + length;
2263 while (p < end) {
2264 register Py_UNICODE ch = *p++;
2265 int decimal;
2266
2267 if (Py_UNICODE_ISSPACE(ch)) {
2268 *output++ = ' ';
2269 continue;
2270 }
2271 decimal = Py_UNICODE_TODECIMAL(ch);
2272 if (decimal >= 0) {
2273 *output++ = '0' + decimal;
2274 continue;
2275 }
Guido van Rossumba477042000-04-06 18:18:10 +00002276 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002277 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002278 continue;
2279 }
2280 /* All other characters are considered invalid */
2281 if (errors == NULL || strcmp(errors, "strict") == 0) {
2282 PyErr_SetString(PyExc_ValueError,
2283 "invalid decimal Unicode string");
2284 goto onError;
2285 }
2286 else if (strcmp(errors, "ignore") == 0)
2287 continue;
2288 else if (strcmp(errors, "replace") == 0) {
2289 *output++ = '?';
2290 continue;
2291 }
2292 }
2293 /* 0-terminate the output string */
2294 *output++ = '\0';
2295 return 0;
2296
2297 onError:
2298 return -1;
2299}
2300
Guido van Rossumd57fd912000-03-10 22:53:23 +00002301/* --- Helpers ------------------------------------------------------------ */
2302
2303static
2304int count(PyUnicodeObject *self,
2305 int start,
2306 int end,
2307 PyUnicodeObject *substring)
2308{
2309 int count = 0;
2310
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002311 if (start < 0)
2312 start += self->length;
2313 if (start < 0)
2314 start = 0;
2315 if (end > self->length)
2316 end = self->length;
2317 if (end < 0)
2318 end += self->length;
2319 if (end < 0)
2320 end = 0;
2321
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002322 if (substring->length == 0)
2323 return (end - start + 1);
2324
Guido van Rossumd57fd912000-03-10 22:53:23 +00002325 end -= substring->length;
2326
2327 while (start <= end)
2328 if (Py_UNICODE_MATCH(self, start, substring)) {
2329 count++;
2330 start += substring->length;
2331 } else
2332 start++;
2333
2334 return count;
2335}
2336
2337int PyUnicode_Count(PyObject *str,
2338 PyObject *substr,
2339 int start,
2340 int end)
2341{
2342 int result;
2343
2344 str = PyUnicode_FromObject(str);
2345 if (str == NULL)
2346 return -1;
2347 substr = PyUnicode_FromObject(substr);
2348 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002349 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002350 return -1;
2351 }
2352
2353 result = count((PyUnicodeObject *)str,
2354 start, end,
2355 (PyUnicodeObject *)substr);
2356
2357 Py_DECREF(str);
2358 Py_DECREF(substr);
2359 return result;
2360}
2361
2362static
2363int findstring(PyUnicodeObject *self,
2364 PyUnicodeObject *substring,
2365 int start,
2366 int end,
2367 int direction)
2368{
2369 if (start < 0)
2370 start += self->length;
2371 if (start < 0)
2372 start = 0;
2373
2374 if (substring->length == 0)
2375 return start;
2376
2377 if (end > self->length)
2378 end = self->length;
2379 if (end < 0)
2380 end += self->length;
2381 if (end < 0)
2382 end = 0;
2383
2384 end -= substring->length;
2385
2386 if (direction < 0) {
2387 for (; end >= start; end--)
2388 if (Py_UNICODE_MATCH(self, end, substring))
2389 return end;
2390 } else {
2391 for (; start <= end; start++)
2392 if (Py_UNICODE_MATCH(self, start, substring))
2393 return start;
2394 }
2395
2396 return -1;
2397}
2398
2399int PyUnicode_Find(PyObject *str,
2400 PyObject *substr,
2401 int start,
2402 int end,
2403 int direction)
2404{
2405 int result;
2406
2407 str = PyUnicode_FromObject(str);
2408 if (str == NULL)
2409 return -1;
2410 substr = PyUnicode_FromObject(substr);
2411 if (substr == NULL) {
2412 Py_DECREF(substr);
2413 return -1;
2414 }
2415
2416 result = findstring((PyUnicodeObject *)str,
2417 (PyUnicodeObject *)substr,
2418 start, end, direction);
2419 Py_DECREF(str);
2420 Py_DECREF(substr);
2421 return result;
2422}
2423
2424static
2425int tailmatch(PyUnicodeObject *self,
2426 PyUnicodeObject *substring,
2427 int start,
2428 int end,
2429 int direction)
2430{
2431 if (start < 0)
2432 start += self->length;
2433 if (start < 0)
2434 start = 0;
2435
2436 if (substring->length == 0)
2437 return 1;
2438
2439 if (end > self->length)
2440 end = self->length;
2441 if (end < 0)
2442 end += self->length;
2443 if (end < 0)
2444 end = 0;
2445
2446 end -= substring->length;
2447 if (end < start)
2448 return 0;
2449
2450 if (direction > 0) {
2451 if (Py_UNICODE_MATCH(self, end, substring))
2452 return 1;
2453 } else {
2454 if (Py_UNICODE_MATCH(self, start, substring))
2455 return 1;
2456 }
2457
2458 return 0;
2459}
2460
2461int PyUnicode_Tailmatch(PyObject *str,
2462 PyObject *substr,
2463 int start,
2464 int end,
2465 int direction)
2466{
2467 int result;
2468
2469 str = PyUnicode_FromObject(str);
2470 if (str == NULL)
2471 return -1;
2472 substr = PyUnicode_FromObject(substr);
2473 if (substr == NULL) {
2474 Py_DECREF(substr);
2475 return -1;
2476 }
2477
2478 result = tailmatch((PyUnicodeObject *)str,
2479 (PyUnicodeObject *)substr,
2480 start, end, direction);
2481 Py_DECREF(str);
2482 Py_DECREF(substr);
2483 return result;
2484}
2485
2486static
2487const Py_UNICODE *findchar(const Py_UNICODE *s,
2488 int size,
2489 Py_UNICODE ch)
2490{
2491 /* like wcschr, but doesn't stop at NULL characters */
2492
2493 while (size-- > 0) {
2494 if (*s == ch)
2495 return s;
2496 s++;
2497 }
2498
2499 return NULL;
2500}
2501
2502/* Apply fixfct filter to the Unicode object self and return a
2503 reference to the modified object */
2504
2505static
2506PyObject *fixup(PyUnicodeObject *self,
2507 int (*fixfct)(PyUnicodeObject *s))
2508{
2509
2510 PyUnicodeObject *u;
2511
2512 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2513 self->length);
2514 if (u == NULL)
2515 return NULL;
2516 if (!fixfct(u)) {
2517 /* fixfct should return TRUE if it modified the buffer. If
2518 FALSE, return a reference to the original buffer instead
2519 (to save space, not time) */
2520 Py_INCREF(self);
2521 Py_DECREF(u);
2522 return (PyObject*) self;
2523 }
2524 return (PyObject*) u;
2525}
2526
2527static
2528int fixupper(PyUnicodeObject *self)
2529{
2530 int len = self->length;
2531 Py_UNICODE *s = self->str;
2532 int status = 0;
2533
2534 while (len-- > 0) {
2535 register Py_UNICODE ch;
2536
2537 ch = Py_UNICODE_TOUPPER(*s);
2538 if (ch != *s) {
2539 status = 1;
2540 *s = ch;
2541 }
2542 s++;
2543 }
2544
2545 return status;
2546}
2547
2548static
2549int fixlower(PyUnicodeObject *self)
2550{
2551 int len = self->length;
2552 Py_UNICODE *s = self->str;
2553 int status = 0;
2554
2555 while (len-- > 0) {
2556 register Py_UNICODE ch;
2557
2558 ch = Py_UNICODE_TOLOWER(*s);
2559 if (ch != *s) {
2560 status = 1;
2561 *s = ch;
2562 }
2563 s++;
2564 }
2565
2566 return status;
2567}
2568
2569static
2570int fixswapcase(PyUnicodeObject *self)
2571{
2572 int len = self->length;
2573 Py_UNICODE *s = self->str;
2574 int status = 0;
2575
2576 while (len-- > 0) {
2577 if (Py_UNICODE_ISUPPER(*s)) {
2578 *s = Py_UNICODE_TOLOWER(*s);
2579 status = 1;
2580 } else if (Py_UNICODE_ISLOWER(*s)) {
2581 *s = Py_UNICODE_TOUPPER(*s);
2582 status = 1;
2583 }
2584 s++;
2585 }
2586
2587 return status;
2588}
2589
2590static
2591int fixcapitalize(PyUnicodeObject *self)
2592{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002593 int len = self->length;
2594 Py_UNICODE *s = self->str;
2595 int status = 0;
2596
2597 if (len == 0)
2598 return 0;
2599 if (Py_UNICODE_ISLOWER(*s)) {
2600 *s = Py_UNICODE_TOUPPER(*s);
2601 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002603 s++;
2604 while (--len > 0) {
2605 if (Py_UNICODE_ISUPPER(*s)) {
2606 *s = Py_UNICODE_TOLOWER(*s);
2607 status = 1;
2608 }
2609 s++;
2610 }
2611 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002612}
2613
2614static
2615int fixtitle(PyUnicodeObject *self)
2616{
2617 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2618 register Py_UNICODE *e;
2619 int previous_is_cased;
2620
2621 /* Shortcut for single character strings */
2622 if (PyUnicode_GET_SIZE(self) == 1) {
2623 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2624 if (*p != ch) {
2625 *p = ch;
2626 return 1;
2627 }
2628 else
2629 return 0;
2630 }
2631
2632 e = p + PyUnicode_GET_SIZE(self);
2633 previous_is_cased = 0;
2634 for (; p < e; p++) {
2635 register const Py_UNICODE ch = *p;
2636
2637 if (previous_is_cased)
2638 *p = Py_UNICODE_TOLOWER(ch);
2639 else
2640 *p = Py_UNICODE_TOTITLE(ch);
2641
2642 if (Py_UNICODE_ISLOWER(ch) ||
2643 Py_UNICODE_ISUPPER(ch) ||
2644 Py_UNICODE_ISTITLE(ch))
2645 previous_is_cased = 1;
2646 else
2647 previous_is_cased = 0;
2648 }
2649 return 1;
2650}
2651
2652PyObject *PyUnicode_Join(PyObject *separator,
2653 PyObject *seq)
2654{
2655 Py_UNICODE *sep;
2656 int seplen;
2657 PyUnicodeObject *res = NULL;
2658 int reslen = 0;
2659 Py_UNICODE *p;
2660 int seqlen = 0;
2661 int sz = 100;
2662 int i;
2663
Jeremy Hylton03657cf2000-07-12 13:05:33 +00002664 seqlen = PySequence_Size(seq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002665 if (seqlen < 0 && PyErr_Occurred())
2666 return NULL;
2667
2668 if (separator == NULL) {
2669 Py_UNICODE blank = ' ';
2670 sep = &blank;
2671 seplen = 1;
2672 }
2673 else {
2674 separator = PyUnicode_FromObject(separator);
2675 if (separator == NULL)
2676 return NULL;
2677 sep = PyUnicode_AS_UNICODE(separator);
2678 seplen = PyUnicode_GET_SIZE(separator);
2679 }
2680
2681 res = _PyUnicode_New(sz);
2682 if (res == NULL)
2683 goto onError;
2684 p = PyUnicode_AS_UNICODE(res);
2685 reslen = 0;
2686
2687 for (i = 0; i < seqlen; i++) {
2688 int itemlen;
2689 PyObject *item;
2690
2691 item = PySequence_GetItem(seq, i);
2692 if (item == NULL)
2693 goto onError;
2694 if (!PyUnicode_Check(item)) {
2695 PyObject *v;
2696 v = PyUnicode_FromObject(item);
2697 Py_DECREF(item);
2698 item = v;
2699 if (item == NULL)
2700 goto onError;
2701 }
2702 itemlen = PyUnicode_GET_SIZE(item);
2703 while (reslen + itemlen + seplen >= sz) {
2704 if (_PyUnicode_Resize(res, sz*2))
2705 goto onError;
2706 sz *= 2;
2707 p = PyUnicode_AS_UNICODE(res) + reslen;
2708 }
2709 if (i > 0) {
2710 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2711 p += seplen;
2712 reslen += seplen;
2713 }
2714 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2715 p += itemlen;
2716 reslen += itemlen;
2717 Py_DECREF(item);
2718 }
2719 if (_PyUnicode_Resize(res, reslen))
2720 goto onError;
2721
2722 Py_XDECREF(separator);
2723 return (PyObject *)res;
2724
2725 onError:
2726 Py_XDECREF(separator);
2727 Py_DECREF(res);
2728 return NULL;
2729}
2730
2731static
2732PyUnicodeObject *pad(PyUnicodeObject *self,
2733 int left,
2734 int right,
2735 Py_UNICODE fill)
2736{
2737 PyUnicodeObject *u;
2738
2739 if (left < 0)
2740 left = 0;
2741 if (right < 0)
2742 right = 0;
2743
2744 if (left == 0 && right == 0) {
2745 Py_INCREF(self);
2746 return self;
2747 }
2748
2749 u = _PyUnicode_New(left + self->length + right);
2750 if (u) {
2751 if (left)
2752 Py_UNICODE_FILL(u->str, fill, left);
2753 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2754 if (right)
2755 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2756 }
2757
2758 return u;
2759}
2760
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The expansion site must provide a PyObject *str scratch variable, the
   PyObject *list being built and an onError label, which is jumped to
   when the slice cannot be created or appended.  The body is wrapped in
   do { } while (0) so that the macro behaves like a single statement,
   and the arguments are parenthesized so that expressions can be passed
   safely. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left),                    \
                                    (right) - (left));                  \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2771
2772static
2773PyObject *split_whitespace(PyUnicodeObject *self,
2774 PyObject *list,
2775 int maxcount)
2776{
2777 register int i;
2778 register int j;
2779 int len = self->length;
2780 PyObject *str;
2781
2782 for (i = j = 0; i < len; ) {
2783 /* find a token */
2784 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2785 i++;
2786 j = i;
2787 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2788 i++;
2789 if (j < i) {
2790 if (maxcount-- <= 0)
2791 break;
2792 SPLIT_APPEND(self->str, j, i);
2793 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2794 i++;
2795 j = i;
2796 }
2797 }
2798 if (j < len) {
2799 SPLIT_APPEND(self->str, j, len);
2800 }
2801 return list;
2802
2803 onError:
2804 Py_DECREF(list);
2805 return NULL;
2806}
2807
2808PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002809 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002810{
2811 register int i;
2812 register int j;
2813 int len;
2814 PyObject *list;
2815 PyObject *str;
2816 Py_UNICODE *data;
2817
2818 string = PyUnicode_FromObject(string);
2819 if (string == NULL)
2820 return NULL;
2821 data = PyUnicode_AS_UNICODE(string);
2822 len = PyUnicode_GET_SIZE(string);
2823
Guido van Rossumd57fd912000-03-10 22:53:23 +00002824 list = PyList_New(0);
2825 if (!list)
2826 goto onError;
2827
2828 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002829 int eol;
2830
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831 /* Find a line and append it */
2832 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2833 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834
2835 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002836 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002837 if (i < len) {
2838 if (data[i] == '\r' && i + 1 < len &&
2839 data[i+1] == '\n')
2840 i += 2;
2841 else
2842 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002843 if (keepends)
2844 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845 }
Guido van Rossum86662912000-04-11 15:38:46 +00002846 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002847 j = i;
2848 }
2849 if (j < len) {
2850 SPLIT_APPEND(data, j, len);
2851 }
2852
2853 Py_DECREF(string);
2854 return list;
2855
2856 onError:
2857 Py_DECREF(list);
2858 Py_DECREF(string);
2859 return NULL;
2860}
2861
2862static
2863PyObject *split_char(PyUnicodeObject *self,
2864 PyObject *list,
2865 Py_UNICODE ch,
2866 int maxcount)
2867{
2868 register int i;
2869 register int j;
2870 int len = self->length;
2871 PyObject *str;
2872
2873 for (i = j = 0; i < len; ) {
2874 if (self->str[i] == ch) {
2875 if (maxcount-- <= 0)
2876 break;
2877 SPLIT_APPEND(self->str, j, i);
2878 i = j = i + 1;
2879 } else
2880 i++;
2881 }
2882 if (j <= len) {
2883 SPLIT_APPEND(self->str, j, len);
2884 }
2885 return list;
2886
2887 onError:
2888 Py_DECREF(list);
2889 return NULL;
2890}
2891
2892static
2893PyObject *split_substring(PyUnicodeObject *self,
2894 PyObject *list,
2895 PyUnicodeObject *substring,
2896 int maxcount)
2897{
2898 register int i;
2899 register int j;
2900 int len = self->length;
2901 int sublen = substring->length;
2902 PyObject *str;
2903
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00002904 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905 if (Py_UNICODE_MATCH(self, i, substring)) {
2906 if (maxcount-- <= 0)
2907 break;
2908 SPLIT_APPEND(self->str, j, i);
2909 i = j = i + sublen;
2910 } else
2911 i++;
2912 }
2913 if (j <= len) {
2914 SPLIT_APPEND(self->str, j, len);
2915 }
2916 return list;
2917
2918 onError:
2919 Py_DECREF(list);
2920 return NULL;
2921}
2922
2923#undef SPLIT_APPEND
2924
2925static
2926PyObject *split(PyUnicodeObject *self,
2927 PyUnicodeObject *substring,
2928 int maxcount)
2929{
2930 PyObject *list;
2931
2932 if (maxcount < 0)
2933 maxcount = INT_MAX;
2934
2935 list = PyList_New(0);
2936 if (!list)
2937 return NULL;
2938
2939 if (substring == NULL)
2940 return split_whitespace(self,list,maxcount);
2941
2942 else if (substring->length == 1)
2943 return split_char(self,list,substring->str[0],maxcount);
2944
2945 else if (substring->length == 0) {
2946 Py_DECREF(list);
2947 PyErr_SetString(PyExc_ValueError, "empty separator");
2948 return NULL;
2949 }
2950 else
2951 return split_substring(self,list,substring,maxcount);
2952}
2953
2954static
2955PyObject *strip(PyUnicodeObject *self,
2956 int left,
2957 int right)
2958{
2959 Py_UNICODE *p = self->str;
2960 int start = 0;
2961 int end = self->length;
2962
2963 if (left)
2964 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2965 start++;
2966
2967 if (right)
2968 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2969 end--;
2970
2971 if (start == 0 && end == self->length) {
2972 /* couldn't strip anything off, return original string */
2973 Py_INCREF(self);
2974 return (PyObject*) self;
2975 }
2976
2977 return (PyObject*) PyUnicode_FromUnicode(
2978 self->str + start,
2979 end - start
2980 );
2981}
2982
2983static
2984PyObject *replace(PyUnicodeObject *self,
2985 PyUnicodeObject *str1,
2986 PyUnicodeObject *str2,
2987 int maxcount)
2988{
2989 PyUnicodeObject *u;
2990
2991 if (maxcount < 0)
2992 maxcount = INT_MAX;
2993
2994 if (str1->length == 1 && str2->length == 1) {
2995 int i;
2996
2997 /* replace characters */
2998 if (!findchar(self->str, self->length, str1->str[0])) {
2999 /* nothing to replace, return original string */
3000 Py_INCREF(self);
3001 u = self;
3002 } else {
3003 Py_UNICODE u1 = str1->str[0];
3004 Py_UNICODE u2 = str2->str[0];
3005
3006 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3007 self->str,
3008 self->length
3009 );
3010 if (u)
3011 for (i = 0; i < u->length; i++)
3012 if (u->str[i] == u1) {
3013 if (--maxcount < 0)
3014 break;
3015 u->str[i] = u2;
3016 }
3017 }
3018
3019 } else {
3020 int n, i;
3021 Py_UNICODE *p;
3022
3023 /* replace strings */
3024 n = count(self, 0, self->length, str1);
3025 if (n > maxcount)
3026 n = maxcount;
3027 if (n == 0) {
3028 /* nothing to replace, return original string */
3029 Py_INCREF(self);
3030 u = self;
3031 } else {
3032 u = _PyUnicode_New(
3033 self->length + n * (str2->length - str1->length));
3034 if (u) {
3035 i = 0;
3036 p = u->str;
3037 while (i <= self->length - str1->length)
3038 if (Py_UNICODE_MATCH(self, i, str1)) {
3039 /* replace string segment */
3040 Py_UNICODE_COPY(p, str2->str, str2->length);
3041 p += str2->length;
3042 i += str1->length;
3043 if (--n <= 0) {
3044 /* copy remaining part */
3045 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3046 break;
3047 }
3048 } else
3049 *p++ = self->str[i++];
3050 }
3051 }
3052 }
3053
3054 return (PyObject *) u;
3055}
3056
3057/* --- Unicode Object Methods --------------------------------------------- */
3058
3059static char title__doc__[] =
3060"S.title() -> unicode\n\
3061\n\
3062Return a titlecased version of S, i.e. words start with title case\n\
3063characters, all remaining cased characters have lower case.";
3064
3065static PyObject*
3066unicode_title(PyUnicodeObject *self, PyObject *args)
3067{
3068 if (!PyArg_NoArgs(args))
3069 return NULL;
3070 return fixup(self, fixtitle);
3071}
3072
3073static char capitalize__doc__[] =
3074"S.capitalize() -> unicode\n\
3075\n\
3076Return a capitalized version of S, i.e. make the first character\n\
3077have upper case.";
3078
3079static PyObject*
3080unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3081{
3082 if (!PyArg_NoArgs(args))
3083 return NULL;
3084 return fixup(self, fixcapitalize);
3085}
3086
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords(): currently compiled out.  Splits S at whitespace,
   capitalizes every word and joins the words with single blanks. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *item = NULL;
    int i;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

onError:
    /* item is NULL here whenever a step above failed */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
3127
3128static char center__doc__[] =
3129"S.center(width) -> unicode\n\
3130\n\
3131Return S centered in a Unicode string of length width. Padding is done\n\
3132using spaces.";
3133
3134static PyObject *
3135unicode_center(PyUnicodeObject *self, PyObject *args)
3136{
3137 int marg, left;
3138 int width;
3139
3140 if (!PyArg_ParseTuple(args, "i:center", &width))
3141 return NULL;
3142
3143 if (self->length >= width) {
3144 Py_INCREF(self);
3145 return (PyObject*) self;
3146 }
3147
3148 marg = width - self->length;
3149 left = marg / 2 + (marg & width & 1);
3150
3151 return (PyObject*) pad(self, left, marg - left, ' ');
3152}
3153
Marc-André Lemburge5034372000-08-08 08:04:29 +00003154#if 0
3155
3156/* This code should go into some future Unicode collation support
3157 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003158 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003159
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003160/* speedy UTF-16 code point order comparison */
3161/* gleaned from: */
3162/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3163
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003164static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003165{
3166 0, 0, 0, 0, 0, 0, 0, 0,
3167 0, 0, 0, 0, 0, 0, 0, 0,
3168 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003169 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003170};
3171
Guido van Rossumd57fd912000-03-10 22:53:23 +00003172static int
3173unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3174{
3175 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003176
Guido van Rossumd57fd912000-03-10 22:53:23 +00003177 Py_UNICODE *s1 = str1->str;
3178 Py_UNICODE *s2 = str2->str;
3179
3180 len1 = str1->length;
3181 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003182
Guido van Rossumd57fd912000-03-10 22:53:23 +00003183 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003184 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003185 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003186
3187 c1 = *s1++;
3188 c2 = *s2++;
3189 if (c1 > (1<<11) * 26)
3190 c1 += utf16Fixup[c1>>11];
3191 if (c2 > (1<<11) * 26)
3192 c2 += utf16Fixup[c2>>11];
3193
3194 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003195 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003196 if (diff)
3197 return (diff < 0) ? -1 : (diff != 0);
3198 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003199 }
3200
3201 return (len1 < len2) ? -1 : (len1 != len2);
3202}
3203
Marc-André Lemburge5034372000-08-08 08:04:29 +00003204#else
3205
3206static int
3207unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3208{
3209 register int len1, len2;
3210
3211 Py_UNICODE *s1 = str1->str;
3212 Py_UNICODE *s2 = str2->str;
3213
3214 len1 = str1->length;
3215 len2 = str2->length;
3216
3217 while (len1 > 0 && len2 > 0) {
3218 register long diff;
3219
3220 diff = (long)*s1++ - (long)*s2++;
3221 if (diff)
3222 return (diff < 0) ? -1 : (diff != 0);
3223 len1--; len2--;
3224 }
3225
3226 return (len1 < len2) ? -1 : (len1 != len2);
3227}
3228
3229#endif
3230
Guido van Rossumd57fd912000-03-10 22:53:23 +00003231int PyUnicode_Compare(PyObject *left,
3232 PyObject *right)
3233{
3234 PyUnicodeObject *u = NULL, *v = NULL;
3235 int result;
3236
3237 /* Coerce the two arguments */
3238 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3239 if (u == NULL)
3240 goto onError;
3241 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3242 if (v == NULL)
3243 goto onError;
3244
Thomas Wouters7e474022000-07-16 12:04:32 +00003245 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003246 if (v == u) {
3247 Py_DECREF(u);
3248 Py_DECREF(v);
3249 return 0;
3250 }
3251
3252 result = unicode_compare(u, v);
3253
3254 Py_DECREF(u);
3255 Py_DECREF(v);
3256 return result;
3257
3258onError:
3259 Py_XDECREF(u);
3260 Py_XDECREF(v);
3261 return -1;
3262}
3263
Guido van Rossum403d68b2000-03-13 15:55:09 +00003264int PyUnicode_Contains(PyObject *container,
3265 PyObject *element)
3266{
3267 PyUnicodeObject *u = NULL, *v = NULL;
3268 int result;
3269 register const Py_UNICODE *p, *e;
3270 register Py_UNICODE ch;
3271
3272 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003273 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003274 if (v == NULL) {
3275 PyErr_SetString(PyExc_TypeError,
3276 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003277 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003278 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003279 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3280 if (u == NULL) {
3281 Py_DECREF(v);
3282 goto onError;
3283 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003284
3285 /* Check v in u */
3286 if (PyUnicode_GET_SIZE(v) != 1) {
3287 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003288 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003289 goto onError;
3290 }
3291 ch = *PyUnicode_AS_UNICODE(v);
3292 p = PyUnicode_AS_UNICODE(u);
3293 e = p + PyUnicode_GET_SIZE(u);
3294 result = 0;
3295 while (p < e) {
3296 if (*p++ == ch) {
3297 result = 1;
3298 break;
3299 }
3300 }
3301
3302 Py_DECREF(u);
3303 Py_DECREF(v);
3304 return result;
3305
3306onError:
3307 Py_XDECREF(u);
3308 Py_XDECREF(v);
3309 return -1;
3310}
3311
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312/* Concat to string or Unicode object giving a new Unicode object. */
3313
3314PyObject *PyUnicode_Concat(PyObject *left,
3315 PyObject *right)
3316{
3317 PyUnicodeObject *u = NULL, *v = NULL, *w;
3318
3319 /* Coerce the two arguments */
3320 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3321 if (u == NULL)
3322 goto onError;
3323 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3324 if (v == NULL)
3325 goto onError;
3326
3327 /* Shortcuts */
3328 if (v == unicode_empty) {
3329 Py_DECREF(v);
3330 return (PyObject *)u;
3331 }
3332 if (u == unicode_empty) {
3333 Py_DECREF(u);
3334 return (PyObject *)v;
3335 }
3336
3337 /* Concat the two Unicode strings */
3338 w = _PyUnicode_New(u->length + v->length);
3339 if (w == NULL)
3340 goto onError;
3341 Py_UNICODE_COPY(w->str, u->str, u->length);
3342 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3343
3344 Py_DECREF(u);
3345 Py_DECREF(v);
3346 return (PyObject *)w;
3347
3348onError:
3349 Py_XDECREF(u);
3350 Py_XDECREF(v);
3351 return NULL;
3352}
3353
3354static char count__doc__[] =
3355"S.count(sub[, start[, end]]) -> int\n\
3356\n\
3357Return the number of occurrences of substring sub in Unicode string\n\
3358S[start:end]. Optional arguments start and end are\n\
3359interpreted as in slice notation.";
3360
3361static PyObject *
3362unicode_count(PyUnicodeObject *self, PyObject *args)
3363{
3364 PyUnicodeObject *substring;
3365 int start = 0;
3366 int end = INT_MAX;
3367 PyObject *result;
3368
Guido van Rossumb8872e62000-05-09 14:14:27 +00003369 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3370 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003371 return NULL;
3372
3373 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3374 (PyObject *)substring);
3375 if (substring == NULL)
3376 return NULL;
3377
Guido van Rossumd57fd912000-03-10 22:53:23 +00003378 if (start < 0)
3379 start += self->length;
3380 if (start < 0)
3381 start = 0;
3382 if (end > self->length)
3383 end = self->length;
3384 if (end < 0)
3385 end += self->length;
3386 if (end < 0)
3387 end = 0;
3388
3389 result = PyInt_FromLong((long) count(self, start, end, substring));
3390
3391 Py_DECREF(substring);
3392 return result;
3393}
3394
3395static char encode__doc__[] =
3396"S.encode([encoding[,errors]]) -> string\n\
3397\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003398Return an encoded string version of S. Default encoding is the current\n\
3399default string encoding. errors may be given to set a different error\n\
3400handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3401a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003402
3403static PyObject *
3404unicode_encode(PyUnicodeObject *self, PyObject *args)
3405{
3406 char *encoding = NULL;
3407 char *errors = NULL;
3408 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3409 return NULL;
3410 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3411}
3412
3413static char expandtabs__doc__[] =
3414"S.expandtabs([tabsize]) -> unicode\n\
3415\n\
3416Return a copy of S where all tab characters are expanded using spaces.\n\
3417If tabsize is not given, a tab size of 8 characters is assumed.";
3418
3419static PyObject*
3420unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3421{
3422 Py_UNICODE *e;
3423 Py_UNICODE *p;
3424 Py_UNICODE *q;
3425 int i, j;
3426 PyUnicodeObject *u;
3427 int tabsize = 8;
3428
3429 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3430 return NULL;
3431
Thomas Wouters7e474022000-07-16 12:04:32 +00003432 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003433 i = j = 0;
3434 e = self->str + self->length;
3435 for (p = self->str; p < e; p++)
3436 if (*p == '\t') {
3437 if (tabsize > 0)
3438 j += tabsize - (j % tabsize);
3439 }
3440 else {
3441 j++;
3442 if (*p == '\n' || *p == '\r') {
3443 i += j;
3444 j = 0;
3445 }
3446 }
3447
3448 /* Second pass: create output string and fill it */
3449 u = _PyUnicode_New(i + j);
3450 if (!u)
3451 return NULL;
3452
3453 j = 0;
3454 q = u->str;
3455
3456 for (p = self->str; p < e; p++)
3457 if (*p == '\t') {
3458 if (tabsize > 0) {
3459 i = tabsize - (j % tabsize);
3460 j += i;
3461 while (i--)
3462 *q++ = ' ';
3463 }
3464 }
3465 else {
3466 j++;
3467 *q++ = *p;
3468 if (*p == '\n' || *p == '\r')
3469 j = 0;
3470 }
3471
3472 return (PyObject*) u;
3473}
3474
3475static char find__doc__[] =
3476"S.find(sub [,start [,end]]) -> int\n\
3477\n\
3478Return the lowest index in S where substring sub is found,\n\
3479such that sub is contained within s[start,end]. Optional\n\
3480arguments start and end are interpreted as in slice notation.\n\
3481\n\
3482Return -1 on failure.";
3483
3484static PyObject *
3485unicode_find(PyUnicodeObject *self, PyObject *args)
3486{
3487 PyUnicodeObject *substring;
3488 int start = 0;
3489 int end = INT_MAX;
3490 PyObject *result;
3491
Guido van Rossumb8872e62000-05-09 14:14:27 +00003492 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3493 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003494 return NULL;
3495 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3496 (PyObject *)substring);
3497 if (substring == NULL)
3498 return NULL;
3499
3500 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3501
3502 Py_DECREF(substring);
3503 return result;
3504}
3505
3506static PyObject *
3507unicode_getitem(PyUnicodeObject *self, int index)
3508{
3509 if (index < 0 || index >= self->length) {
3510 PyErr_SetString(PyExc_IndexError, "string index out of range");
3511 return NULL;
3512 }
3513
3514 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3515}
3516
3517static long
3518unicode_hash(PyUnicodeObject *self)
3519{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003520 /* Since Unicode objects compare equal to their ASCII string
3521 counterparts, they should use the individual character values
3522 as basis for their hash value. This is needed to assure that
3523 strings and Unicode objects behave in the same way as
3524 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003525
Fredrik Lundhdde61642000-07-10 18:27:47 +00003526 register int len;
3527 register Py_UNICODE *p;
3528 register long x;
3529
Guido van Rossumd57fd912000-03-10 22:53:23 +00003530 if (self->hash != -1)
3531 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003532 len = PyUnicode_GET_SIZE(self);
3533 p = PyUnicode_AS_UNICODE(self);
3534 x = *p << 7;
3535 while (--len >= 0)
3536 x = (1000003*x) ^ *p++;
3537 x ^= PyUnicode_GET_SIZE(self);
3538 if (x == -1)
3539 x = -2;
3540 self->hash = x;
3541 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003542}
3543
3544static char index__doc__[] =
3545"S.index(sub [,start [,end]]) -> int\n\
3546\n\
3547Like S.find() but raise ValueError when the substring is not found.";
3548
3549static PyObject *
3550unicode_index(PyUnicodeObject *self, PyObject *args)
3551{
3552 int result;
3553 PyUnicodeObject *substring;
3554 int start = 0;
3555 int end = INT_MAX;
3556
Guido van Rossumb8872e62000-05-09 14:14:27 +00003557 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3558 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003559 return NULL;
3560
3561 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3562 (PyObject *)substring);
3563 if (substring == NULL)
3564 return NULL;
3565
3566 result = findstring(self, substring, start, end, 1);
3567
3568 Py_DECREF(substring);
3569 if (result < 0) {
3570 PyErr_SetString(PyExc_ValueError, "substring not found");
3571 return NULL;
3572 }
3573 return PyInt_FromLong(result);
3574}
3575
3576static char islower__doc__[] =
3577"S.islower() -> int\n\
3578\n\
3579Return 1 if all cased characters in S are lowercase and there is\n\
3580at least one cased character in S, 0 otherwise.";
3581
3582static PyObject*
3583unicode_islower(PyUnicodeObject *self, PyObject *args)
3584{
3585 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3586 register const Py_UNICODE *e;
3587 int cased;
3588
3589 if (!PyArg_NoArgs(args))
3590 return NULL;
3591
3592 /* Shortcut for single character strings */
3593 if (PyUnicode_GET_SIZE(self) == 1)
3594 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3595
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003596 /* Special case for empty strings */
3597 if (PyString_GET_SIZE(self) == 0)
3598 return PyInt_FromLong(0);
3599
Guido van Rossumd57fd912000-03-10 22:53:23 +00003600 e = p + PyUnicode_GET_SIZE(self);
3601 cased = 0;
3602 for (; p < e; p++) {
3603 register const Py_UNICODE ch = *p;
3604
3605 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3606 return PyInt_FromLong(0);
3607 else if (!cased && Py_UNICODE_ISLOWER(ch))
3608 cased = 1;
3609 }
3610 return PyInt_FromLong(cased);
3611}
3612
3613static char isupper__doc__[] =
3614"S.isupper() -> int\n\
3615\n\
3616Return 1 if all cased characters in S are uppercase and there is\n\
3617at least one cased character in S, 0 otherwise.";
3618
3619static PyObject*
3620unicode_isupper(PyUnicodeObject *self, PyObject *args)
3621{
3622 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3623 register const Py_UNICODE *e;
3624 int cased;
3625
3626 if (!PyArg_NoArgs(args))
3627 return NULL;
3628
3629 /* Shortcut for single character strings */
3630 if (PyUnicode_GET_SIZE(self) == 1)
3631 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3632
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003633 /* Special case for empty strings */
3634 if (PyString_GET_SIZE(self) == 0)
3635 return PyInt_FromLong(0);
3636
Guido van Rossumd57fd912000-03-10 22:53:23 +00003637 e = p + PyUnicode_GET_SIZE(self);
3638 cased = 0;
3639 for (; p < e; p++) {
3640 register const Py_UNICODE ch = *p;
3641
3642 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3643 return PyInt_FromLong(0);
3644 else if (!cased && Py_UNICODE_ISUPPER(ch))
3645 cased = 1;
3646 }
3647 return PyInt_FromLong(cased);
3648}
3649
3650static char istitle__doc__[] =
3651"S.istitle() -> int\n\
3652\n\
3653Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3654may only follow uncased characters and lowercase characters only cased\n\
3655ones. Return 0 otherwise.";
3656
3657static PyObject*
3658unicode_istitle(PyUnicodeObject *self, PyObject *args)
3659{
3660 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3661 register const Py_UNICODE *e;
3662 int cased, previous_is_cased;
3663
3664 if (!PyArg_NoArgs(args))
3665 return NULL;
3666
3667 /* Shortcut for single character strings */
3668 if (PyUnicode_GET_SIZE(self) == 1)
3669 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3670 (Py_UNICODE_ISUPPER(*p) != 0));
3671
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003672 /* Special case for empty strings */
3673 if (PyString_GET_SIZE(self) == 0)
3674 return PyInt_FromLong(0);
3675
Guido van Rossumd57fd912000-03-10 22:53:23 +00003676 e = p + PyUnicode_GET_SIZE(self);
3677 cased = 0;
3678 previous_is_cased = 0;
3679 for (; p < e; p++) {
3680 register const Py_UNICODE ch = *p;
3681
3682 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3683 if (previous_is_cased)
3684 return PyInt_FromLong(0);
3685 previous_is_cased = 1;
3686 cased = 1;
3687 }
3688 else if (Py_UNICODE_ISLOWER(ch)) {
3689 if (!previous_is_cased)
3690 return PyInt_FromLong(0);
3691 previous_is_cased = 1;
3692 cased = 1;
3693 }
3694 else
3695 previous_is_cased = 0;
3696 }
3697 return PyInt_FromLong(cased);
3698}
3699
3700static char isspace__doc__[] =
3701"S.isspace() -> int\n\
3702\n\
3703Return 1 if there are only whitespace characters in S,\n\
37040 otherwise.";
3705
3706static PyObject*
3707unicode_isspace(PyUnicodeObject *self, PyObject *args)
3708{
3709 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3710 register const Py_UNICODE *e;
3711
3712 if (!PyArg_NoArgs(args))
3713 return NULL;
3714
3715 /* Shortcut for single character strings */
3716 if (PyUnicode_GET_SIZE(self) == 1 &&
3717 Py_UNICODE_ISSPACE(*p))
3718 return PyInt_FromLong(1);
3719
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003720 /* Special case for empty strings */
3721 if (PyString_GET_SIZE(self) == 0)
3722 return PyInt_FromLong(0);
3723
Guido van Rossumd57fd912000-03-10 22:53:23 +00003724 e = p + PyUnicode_GET_SIZE(self);
3725 for (; p < e; p++) {
3726 if (!Py_UNICODE_ISSPACE(*p))
3727 return PyInt_FromLong(0);
3728 }
3729 return PyInt_FromLong(1);
3730}
3731
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003732static char isalpha__doc__[] =
3733"S.isalpha() -> int\n\
3734\n\
3735Return 1 if all characters in S are alphabetic\n\
3736and there is at least one character in S, 0 otherwise.";
3737
3738static PyObject*
3739unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3740{
3741 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3742 register const Py_UNICODE *e;
3743
3744 if (!PyArg_NoArgs(args))
3745 return NULL;
3746
3747 /* Shortcut for single character strings */
3748 if (PyUnicode_GET_SIZE(self) == 1 &&
3749 Py_UNICODE_ISALPHA(*p))
3750 return PyInt_FromLong(1);
3751
3752 /* Special case for empty strings */
3753 if (PyString_GET_SIZE(self) == 0)
3754 return PyInt_FromLong(0);
3755
3756 e = p + PyUnicode_GET_SIZE(self);
3757 for (; p < e; p++) {
3758 if (!Py_UNICODE_ISALPHA(*p))
3759 return PyInt_FromLong(0);
3760 }
3761 return PyInt_FromLong(1);
3762}
3763
3764static char isalnum__doc__[] =
3765"S.isalnum() -> int\n\
3766\n\
3767Return 1 if all characters in S are alphanumeric\n\
3768and there is at least one character in S, 0 otherwise.";
3769
3770static PyObject*
3771unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3772{
3773 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3774 register const Py_UNICODE *e;
3775
3776 if (!PyArg_NoArgs(args))
3777 return NULL;
3778
3779 /* Shortcut for single character strings */
3780 if (PyUnicode_GET_SIZE(self) == 1 &&
3781 Py_UNICODE_ISALNUM(*p))
3782 return PyInt_FromLong(1);
3783
3784 /* Special case for empty strings */
3785 if (PyString_GET_SIZE(self) == 0)
3786 return PyInt_FromLong(0);
3787
3788 e = p + PyUnicode_GET_SIZE(self);
3789 for (; p < e; p++) {
3790 if (!Py_UNICODE_ISALNUM(*p))
3791 return PyInt_FromLong(0);
3792 }
3793 return PyInt_FromLong(1);
3794}
3795
Guido van Rossumd57fd912000-03-10 22:53:23 +00003796static char isdecimal__doc__[] =
3797"S.isdecimal() -> int\n\
3798\n\
3799Return 1 if there are only decimal characters in S,\n\
38000 otherwise.";
3801
3802static PyObject*
3803unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3804{
3805 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3806 register const Py_UNICODE *e;
3807
3808 if (!PyArg_NoArgs(args))
3809 return NULL;
3810
3811 /* Shortcut for single character strings */
3812 if (PyUnicode_GET_SIZE(self) == 1 &&
3813 Py_UNICODE_ISDECIMAL(*p))
3814 return PyInt_FromLong(1);
3815
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003816 /* Special case for empty strings */
3817 if (PyString_GET_SIZE(self) == 0)
3818 return PyInt_FromLong(0);
3819
Guido van Rossumd57fd912000-03-10 22:53:23 +00003820 e = p + PyUnicode_GET_SIZE(self);
3821 for (; p < e; p++) {
3822 if (!Py_UNICODE_ISDECIMAL(*p))
3823 return PyInt_FromLong(0);
3824 }
3825 return PyInt_FromLong(1);
3826}
3827
3828static char isdigit__doc__[] =
3829"S.isdigit() -> int\n\
3830\n\
3831Return 1 if there are only digit characters in S,\n\
38320 otherwise.";
3833
3834static PyObject*
3835unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3836{
3837 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3838 register const Py_UNICODE *e;
3839
3840 if (!PyArg_NoArgs(args))
3841 return NULL;
3842
3843 /* Shortcut for single character strings */
3844 if (PyUnicode_GET_SIZE(self) == 1 &&
3845 Py_UNICODE_ISDIGIT(*p))
3846 return PyInt_FromLong(1);
3847
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003848 /* Special case for empty strings */
3849 if (PyString_GET_SIZE(self) == 0)
3850 return PyInt_FromLong(0);
3851
Guido van Rossumd57fd912000-03-10 22:53:23 +00003852 e = p + PyUnicode_GET_SIZE(self);
3853 for (; p < e; p++) {
3854 if (!Py_UNICODE_ISDIGIT(*p))
3855 return PyInt_FromLong(0);
3856 }
3857 return PyInt_FromLong(1);
3858}
3859
3860static char isnumeric__doc__[] =
3861"S.isnumeric() -> int\n\
3862\n\
3863Return 1 if there are only numeric characters in S,\n\
38640 otherwise.";
3865
3866static PyObject*
3867unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3868{
3869 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3870 register const Py_UNICODE *e;
3871
3872 if (!PyArg_NoArgs(args))
3873 return NULL;
3874
3875 /* Shortcut for single character strings */
3876 if (PyUnicode_GET_SIZE(self) == 1 &&
3877 Py_UNICODE_ISNUMERIC(*p))
3878 return PyInt_FromLong(1);
3879
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003880 /* Special case for empty strings */
3881 if (PyString_GET_SIZE(self) == 0)
3882 return PyInt_FromLong(0);
3883
Guido van Rossumd57fd912000-03-10 22:53:23 +00003884 e = p + PyUnicode_GET_SIZE(self);
3885 for (; p < e; p++) {
3886 if (!Py_UNICODE_ISNUMERIC(*p))
3887 return PyInt_FromLong(0);
3888 }
3889 return PyInt_FromLong(1);
3890}
3891
3892static char join__doc__[] =
3893"S.join(sequence) -> unicode\n\
3894\n\
3895Return a string which is the concatenation of the strings in the\n\
3896sequence. The separator between elements is S.";
3897
3898static PyObject*
3899unicode_join(PyUnicodeObject *self, PyObject *args)
3900{
3901 PyObject *data;
3902 if (!PyArg_ParseTuple(args, "O:join", &data))
3903 return NULL;
3904
3905 return PyUnicode_Join((PyObject *)self, data);
3906}
3907
3908static int
3909unicode_length(PyUnicodeObject *self)
3910{
3911 return self->length;
3912}
3913
3914static char ljust__doc__[] =
3915"S.ljust(width) -> unicode\n\
3916\n\
3917Return S left justified in a Unicode string of length width. Padding is\n\
3918done using spaces.";
3919
3920static PyObject *
3921unicode_ljust(PyUnicodeObject *self, PyObject *args)
3922{
3923 int width;
3924 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3925 return NULL;
3926
3927 if (self->length >= width) {
3928 Py_INCREF(self);
3929 return (PyObject*) self;
3930 }
3931
3932 return (PyObject*) pad(self, 0, width - self->length, ' ');
3933}
3934
3935static char lower__doc__[] =
3936"S.lower() -> unicode\n\
3937\n\
3938Return a copy of the string S converted to lowercase.";
3939
3940static PyObject*
3941unicode_lower(PyUnicodeObject *self, PyObject *args)
3942{
3943 if (!PyArg_NoArgs(args))
3944 return NULL;
3945 return fixup(self, fixlower);
3946}
3947
3948static char lstrip__doc__[] =
3949"S.lstrip() -> unicode\n\
3950\n\
3951Return a copy of the string S with leading whitespace removed.";
3952
3953static PyObject *
3954unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3955{
3956 if (!PyArg_NoArgs(args))
3957 return NULL;
3958 return strip(self, 1, 0);
3959}
3960
3961static PyObject*
3962unicode_repeat(PyUnicodeObject *str, int len)
3963{
3964 PyUnicodeObject *u;
3965 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00003966 int nchars;
3967 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003968
3969 if (len < 0)
3970 len = 0;
3971
3972 if (len == 1) {
3973 /* no repeat, return original string */
3974 Py_INCREF(str);
3975 return (PyObject*) str;
3976 }
Tim Peters8f422462000-09-09 06:13:41 +00003977
3978 /* ensure # of chars needed doesn't overflow int and # of bytes
3979 * needed doesn't overflow size_t
3980 */
3981 nchars = len * str->length;
3982 if (len && nchars / len != str->length) {
3983 PyErr_SetString(PyExc_OverflowError,
3984 "repeated string is too long");
3985 return NULL;
3986 }
3987 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
3988 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
3989 PyErr_SetString(PyExc_OverflowError,
3990 "repeated string is too long");
3991 return NULL;
3992 }
3993 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003994 if (!u)
3995 return NULL;
3996
3997 p = u->str;
3998
3999 while (len-- > 0) {
4000 Py_UNICODE_COPY(p, str->str, str->length);
4001 p += str->length;
4002 }
4003
4004 return (PyObject*) u;
4005}
4006
4007PyObject *PyUnicode_Replace(PyObject *obj,
4008 PyObject *subobj,
4009 PyObject *replobj,
4010 int maxcount)
4011{
4012 PyObject *self;
4013 PyObject *str1;
4014 PyObject *str2;
4015 PyObject *result;
4016
4017 self = PyUnicode_FromObject(obj);
4018 if (self == NULL)
4019 return NULL;
4020 str1 = PyUnicode_FromObject(subobj);
4021 if (str1 == NULL) {
4022 Py_DECREF(self);
4023 return NULL;
4024 }
4025 str2 = PyUnicode_FromObject(replobj);
4026 if (str2 == NULL) {
4027 Py_DECREF(self);
4028 Py_DECREF(str1);
4029 return NULL;
4030 }
4031 result = replace((PyUnicodeObject *)self,
4032 (PyUnicodeObject *)str1,
4033 (PyUnicodeObject *)str2,
4034 maxcount);
4035 Py_DECREF(self);
4036 Py_DECREF(str1);
4037 Py_DECREF(str2);
4038 return result;
4039}
4040
4041static char replace__doc__[] =
4042"S.replace (old, new[, maxsplit]) -> unicode\n\
4043\n\
4044Return a copy of S with all occurrences of substring\n\
4045old replaced by new. If the optional argument maxsplit is\n\
4046given, only the first maxsplit occurrences are replaced.";
4047
4048static PyObject*
4049unicode_replace(PyUnicodeObject *self, PyObject *args)
4050{
4051 PyUnicodeObject *str1;
4052 PyUnicodeObject *str2;
4053 int maxcount = -1;
4054 PyObject *result;
4055
4056 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4057 return NULL;
4058 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4059 if (str1 == NULL)
4060 return NULL;
4061 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4062 if (str2 == NULL)
4063 return NULL;
4064
4065 result = replace(self, str1, str2, maxcount);
4066
4067 Py_DECREF(str1);
4068 Py_DECREF(str2);
4069 return result;
4070}
4071
4072static
4073PyObject *unicode_repr(PyObject *unicode)
4074{
4075 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4076 PyUnicode_GET_SIZE(unicode),
4077 1);
4078}
4079
4080static char rfind__doc__[] =
4081"S.rfind(sub [,start [,end]]) -> int\n\
4082\n\
4083Return the highest index in S where substring sub is found,\n\
4084such that sub is contained within s[start,end]. Optional\n\
4085arguments start and end are interpreted as in slice notation.\n\
4086\n\
4087Return -1 on failure.";
4088
4089static PyObject *
4090unicode_rfind(PyUnicodeObject *self, PyObject *args)
4091{
4092 PyUnicodeObject *substring;
4093 int start = 0;
4094 int end = INT_MAX;
4095 PyObject *result;
4096
Guido van Rossumb8872e62000-05-09 14:14:27 +00004097 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4098 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004099 return NULL;
4100 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4101 (PyObject *)substring);
4102 if (substring == NULL)
4103 return NULL;
4104
4105 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4106
4107 Py_DECREF(substring);
4108 return result;
4109}
4110
4111static char rindex__doc__[] =
4112"S.rindex(sub [,start [,end]]) -> int\n\
4113\n\
4114Like S.rfind() but raise ValueError when the substring is not found.";
4115
4116static PyObject *
4117unicode_rindex(PyUnicodeObject *self, PyObject *args)
4118{
4119 int result;
4120 PyUnicodeObject *substring;
4121 int start = 0;
4122 int end = INT_MAX;
4123
Guido van Rossumb8872e62000-05-09 14:14:27 +00004124 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4125 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004126 return NULL;
4127 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4128 (PyObject *)substring);
4129 if (substring == NULL)
4130 return NULL;
4131
4132 result = findstring(self, substring, start, end, -1);
4133
4134 Py_DECREF(substring);
4135 if (result < 0) {
4136 PyErr_SetString(PyExc_ValueError, "substring not found");
4137 return NULL;
4138 }
4139 return PyInt_FromLong(result);
4140}
4141
4142static char rjust__doc__[] =
4143"S.rjust(width) -> unicode\n\
4144\n\
4145Return S right justified in a Unicode string of length width. Padding is\n\
4146done using spaces.";
4147
4148static PyObject *
4149unicode_rjust(PyUnicodeObject *self, PyObject *args)
4150{
4151 int width;
4152 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4153 return NULL;
4154
4155 if (self->length >= width) {
4156 Py_INCREF(self);
4157 return (PyObject*) self;
4158 }
4159
4160 return (PyObject*) pad(self, width - self->length, 0, ' ');
4161}
4162
4163static char rstrip__doc__[] =
4164"S.rstrip() -> unicode\n\
4165\n\
4166Return a copy of the string S with trailing whitespace removed.";
4167
4168static PyObject *
4169unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4170{
4171 if (!PyArg_NoArgs(args))
4172 return NULL;
4173 return strip(self, 0, 1);
4174}
4175
4176static PyObject*
4177unicode_slice(PyUnicodeObject *self, int start, int end)
4178{
4179 /* standard clamping */
4180 if (start < 0)
4181 start = 0;
4182 if (end < 0)
4183 end = 0;
4184 if (end > self->length)
4185 end = self->length;
4186 if (start == 0 && end == self->length) {
4187 /* full slice, return original string */
4188 Py_INCREF(self);
4189 return (PyObject*) self;
4190 }
4191 if (start > end)
4192 start = end;
4193 /* copy slice */
4194 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4195 end - start);
4196}
4197
4198PyObject *PyUnicode_Split(PyObject *s,
4199 PyObject *sep,
4200 int maxsplit)
4201{
4202 PyObject *result;
4203
4204 s = PyUnicode_FromObject(s);
4205 if (s == NULL)
4206 return NULL;
4207 if (sep != NULL) {
4208 sep = PyUnicode_FromObject(sep);
4209 if (sep == NULL) {
4210 Py_DECREF(s);
4211 return NULL;
4212 }
4213 }
4214
4215 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4216
4217 Py_DECREF(s);
4218 Py_XDECREF(sep);
4219 return result;
4220}
4221
4222static char split__doc__[] =
4223"S.split([sep [,maxsplit]]) -> list of strings\n\
4224\n\
4225Return a list of the words in S, using sep as the\n\
4226delimiter string. If maxsplit is given, at most maxsplit\n\
4227splits are done. If sep is not specified, any whitespace string\n\
4228is a separator.";
4229
4230static PyObject*
4231unicode_split(PyUnicodeObject *self, PyObject *args)
4232{
4233 PyObject *substring = Py_None;
4234 int maxcount = -1;
4235
4236 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4237 return NULL;
4238
4239 if (substring == Py_None)
4240 return split(self, NULL, maxcount);
4241 else if (PyUnicode_Check(substring))
4242 return split(self, (PyUnicodeObject *)substring, maxcount);
4243 else
4244 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4245}
4246
4247static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004248"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004249\n\
4250Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004251Line breaks are not included in the resulting list unless keepends\n\
4252is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004253
4254static PyObject*
4255unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4256{
Guido van Rossum86662912000-04-11 15:38:46 +00004257 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004258
Guido van Rossum86662912000-04-11 15:38:46 +00004259 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260 return NULL;
4261
Guido van Rossum86662912000-04-11 15:38:46 +00004262 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004263}
4264
4265static
4266PyObject *unicode_str(PyUnicodeObject *self)
4267{
Fred Drakee4315f52000-05-09 19:53:39 +00004268 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004269}
4270
4271static char strip__doc__[] =
4272"S.strip() -> unicode\n\
4273\n\
4274Return a copy of S with leading and trailing whitespace removed.";
4275
4276static PyObject *
4277unicode_strip(PyUnicodeObject *self, PyObject *args)
4278{
4279 if (!PyArg_NoArgs(args))
4280 return NULL;
4281 return strip(self, 1, 1);
4282}
4283
4284static char swapcase__doc__[] =
4285"S.swapcase() -> unicode\n\
4286\n\
4287Return a copy of S with uppercase characters converted to lowercase\n\
4288and vice versa.";
4289
4290static PyObject*
4291unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4292{
4293 if (!PyArg_NoArgs(args))
4294 return NULL;
4295 return fixup(self, fixswapcase);
4296}
4297
4298static char translate__doc__[] =
4299"S.translate(table) -> unicode\n\
4300\n\
4301Return a copy of the string S, where all characters have been mapped\n\
4302through the given translation table, which must be a mapping of\n\
4303Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4304are left untouched. Characters mapped to None are deleted.";
4305
4306static PyObject*
4307unicode_translate(PyUnicodeObject *self, PyObject *args)
4308{
4309 PyObject *table;
4310
4311 if (!PyArg_ParseTuple(args, "O:translate", &table))
4312 return NULL;
4313 return PyUnicode_TranslateCharmap(self->str,
4314 self->length,
4315 table,
4316 "ignore");
4317}
4318
4319static char upper__doc__[] =
4320"S.upper() -> unicode\n\
4321\n\
4322Return a copy of S converted to uppercase.";
4323
4324static PyObject*
4325unicode_upper(PyUnicodeObject *self, PyObject *args)
4326{
4327 if (!PyArg_NoArgs(args))
4328 return NULL;
4329 return fixup(self, fixupper);
4330}
4331
#if 0
/* Docstring for the (currently disabled) zfill() method.  The text refers
   to the string as S, matching the signature line. */
static char zfill__doc__[] =
    "S.zfill(width) -> unicode\n"
    "\n"
    "Pad a numeric string S with zeros on the left, to fill a field\n"
    "of the specified width. The string S is never truncated.";

/* Method S.zfill(width) (currently disabled).

   Parses one int from args.  When self is already at least width
   characters long, a new reference to self is returned.  Otherwise self
   is padded on the left via pad(self, fill, 0, '0'); if the first
   original character (now at index fill) is '+' or '-', it is moved to
   index 0 and replaced by '0'.

   Returns NULL when the arguments cannot be parsed or when pad()
   returns NULL. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    int width;
    PyUnicodeObject *u;

    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* NOTE(review): pad() is defined elsewhere; presumably it returns
       NULL on failure, so do not touch u->str before checking. */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4367
#if 0
/* Debugging method (currently disabled): accepts no arguments and
   returns unicode_freelist_size as a Python int.  Returns NULL when
   PyArg_NoArgs() rejects args. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
4377
4378static char startswith__doc__[] =
4379"S.startswith(prefix[, start[, end]]) -> int\n\
4380\n\
4381Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4382optional start, test S beginning at that position. With optional end, stop\n\
4383comparing S at that position.";
4384
4385static PyObject *
4386unicode_startswith(PyUnicodeObject *self,
4387 PyObject *args)
4388{
4389 PyUnicodeObject *substring;
4390 int start = 0;
4391 int end = INT_MAX;
4392 PyObject *result;
4393
Guido van Rossumb8872e62000-05-09 14:14:27 +00004394 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4395 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396 return NULL;
4397 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4398 (PyObject *)substring);
4399 if (substring == NULL)
4400 return NULL;
4401
4402 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4403
4404 Py_DECREF(substring);
4405 return result;
4406}
4407
4408
4409static char endswith__doc__[] =
4410"S.endswith(suffix[, start[, end]]) -> int\n\
4411\n\
4412Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4413optional start, test S beginning at that position. With optional end, stop\n\
4414comparing S at that position.";
4415
4416static PyObject *
4417unicode_endswith(PyUnicodeObject *self,
4418 PyObject *args)
4419{
4420 PyUnicodeObject *substring;
4421 int start = 0;
4422 int end = INT_MAX;
4423 PyObject *result;
4424
Guido van Rossumb8872e62000-05-09 14:14:27 +00004425 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4426 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004427 return NULL;
4428 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4429 (PyObject *)substring);
4430 if (substring == NULL)
4431 return NULL;
4432
4433 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4434
4435 Py_DECREF(substring);
4436 return result;
4437}
4438
4439
4440static PyMethodDef unicode_methods[] = {
4441
4442 /* Order is according to common usage: often used methods should
4443 appear first, since lookup is done sequentially. */
4444
4445 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4446 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4447 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4448 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4449 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4450 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4451 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4452 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4453 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4454 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4455 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4456 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4457 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4458 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4459/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4460 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4461 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4462 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4463 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4464 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4465 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4466 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4467 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4468 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4469 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4470 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4471 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4472 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4473 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4474 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4475 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4476 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4477 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004478 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4479 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004480#if 0
4481 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4482 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4483#endif
4484
4485#if 0
4486 /* This one is just used for debugging the implementation. */
4487 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4488#endif
4489
4490 {NULL, NULL}
4491};
4492
4493static PyObject *
4494unicode_getattr(PyUnicodeObject *self, char *name)
4495{
4496 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4497}
4498
4499static PySequenceMethods unicode_as_sequence = {
4500 (inquiry) unicode_length, /* sq_length */
4501 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4502 (intargfunc) unicode_repeat, /* sq_repeat */
4503 (intargfunc) unicode_getitem, /* sq_item */
4504 (intintargfunc) unicode_slice, /* sq_slice */
4505 0, /* sq_ass_item */
4506 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004507 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004508};
4509
4510static int
4511unicode_buffer_getreadbuf(PyUnicodeObject *self,
4512 int index,
4513 const void **ptr)
4514{
4515 if (index != 0) {
4516 PyErr_SetString(PyExc_SystemError,
4517 "accessing non-existent unicode segment");
4518 return -1;
4519 }
4520 *ptr = (void *) self->str;
4521 return PyUnicode_GET_DATA_SIZE(self);
4522}
4523
4524static int
4525unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4526 const void **ptr)
4527{
4528 PyErr_SetString(PyExc_TypeError,
4529 "cannot use unicode as modifyable buffer");
4530 return -1;
4531}
4532
4533static int
4534unicode_buffer_getsegcount(PyUnicodeObject *self,
4535 int *lenp)
4536{
4537 if (lenp)
4538 *lenp = PyUnicode_GET_DATA_SIZE(self);
4539 return 1;
4540}
4541
4542static int
4543unicode_buffer_getcharbuf(PyUnicodeObject *self,
4544 int index,
4545 const void **ptr)
4546{
4547 PyObject *str;
4548
4549 if (index != 0) {
4550 PyErr_SetString(PyExc_SystemError,
4551 "accessing non-existent unicode segment");
4552 return -1;
4553 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004554 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004555 if (str == NULL)
4556 return -1;
4557 *ptr = (void *) PyString_AS_STRING(str);
4558 return PyString_GET_SIZE(str);
4559}
4560
4561/* Helpers for PyUnicode_Format() */
4562
4563static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004564getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004565{
4566 int argidx = *p_argidx;
4567 if (argidx < arglen) {
4568 (*p_argidx)++;
4569 if (arglen < 0)
4570 return args;
4571 else
4572 return PyTuple_GetItem(args, argidx);
4573 }
4574 PyErr_SetString(PyExc_TypeError,
4575 "not enough arguments for format string");
4576 return NULL;
4577}
4578
/* Bit flags collected while parsing a format specifier; each is set by
   one flag character in PyUnicode_Format(). */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
4584
4585static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004586int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004587{
4588 register int i;
4589 int len;
4590 va_list va;
4591 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004592 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004593
4594 /* First, format the string as char array, then expand to Py_UNICODE
4595 array. */
4596 charbuffer = (char *)buffer;
4597 len = vsprintf(charbuffer, format, va);
4598 for (i = len - 1; i >= 0; i--)
4599 buffer[i] = (Py_UNICODE) charbuffer[i];
4600
4601 va_end(va);
4602 return len;
4603}
4604
4605static int
4606formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004607 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004608 int flags,
4609 int prec,
4610 int type,
4611 PyObject *v)
4612{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004613 /* fmt = '%#.' + `prec` + `type`
4614 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004615 char fmt[20];
4616 double x;
4617
4618 x = PyFloat_AsDouble(v);
4619 if (x == -1.0 && PyErr_Occurred())
4620 return -1;
4621 if (prec < 0)
4622 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004623 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4624 type = 'g';
4625 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004626 /* worst case length calc to ensure no buffer overrun:
4627 fmt = %#.<prec>g
4628 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4629 for any double rep.)
4630 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4631 If prec=0 the effective precision is 1 (the leading digit is
4632 always given), therefore increase by one to 10+prec. */
4633 if (buflen <= (size_t)10 + (size_t)prec) {
4634 PyErr_SetString(PyExc_OverflowError,
4635 "formatted float is too long (precision too long?)");
4636 return -1;
4637 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004638 return usprintf(buf, fmt, x);
4639}
4640
Tim Peters38fd5b62000-09-21 05:43:11 +00004641static PyObject*
4642formatlong(PyObject *val, int flags, int prec, int type)
4643{
4644 char *buf;
4645 int i, len;
4646 PyObject *str; /* temporary string object. */
4647 PyUnicodeObject *result;
4648
4649 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4650 if (!str)
4651 return NULL;
4652 result = _PyUnicode_New(len);
4653 for (i = 0; i < len; i++)
4654 result->str[i] = buf[i];
4655 result->str[len] = 0;
4656 Py_DECREF(str);
4657 return (PyObject*)result;
4658}
4659
Guido van Rossumd57fd912000-03-10 22:53:23 +00004660static int
4661formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004662 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663 int flags,
4664 int prec,
4665 int type,
4666 PyObject *v)
4667{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004668 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004669 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4670 + 1 + 1 = 24*/
4671 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004672 long x;
4673
4674 x = PyInt_AsLong(v);
4675 if (x == -1 && PyErr_Occurred())
4676 return -1;
4677 if (prec < 0)
4678 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004679 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4680 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4681 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4682 PyErr_SetString(PyExc_OverflowError,
4683 "formatted integer is too long (precision too long?)");
4684 return -1;
4685 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004686 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4687 return usprintf(buf, fmt, x);
4688}
4689
4690static int
4691formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004692 size_t buflen,
4693 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004695 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004696 if (PyUnicode_Check(v)) {
4697 if (PyUnicode_GET_SIZE(v) != 1)
4698 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004699 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004700 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004701
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004702 else if (PyString_Check(v)) {
4703 if (PyString_GET_SIZE(v) != 1)
4704 goto onError;
4705 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4706 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004707
4708 else {
4709 /* Integer input truncated to a character */
4710 long x;
4711 x = PyInt_AsLong(v);
4712 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004713 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004714 buf[0] = (char) x;
4715 }
4716 buf[1] = '\0';
4717 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004718
4719 onError:
4720 PyErr_SetString(PyExc_TypeError,
4721 "%c requires int or char");
4722 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004723}
4724
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length (in Py_UNICODE elements) of the buffer in
   which the floats, ints, & chars are formatted.  XXX This is a magic
   number.  Each formatting routine does bounds checking to ensure no
   overflow, but a better solution may be to malloc a buffer of
   appropriate size for each format.  For now, the current solution is
   sufficient.

   The expansion is fully parenthesized so that the cast cannot combine
   with operators next to the macro name.
*/
#define FORMATBUFLEN ((size_t)120)
4734
Guido van Rossumd57fd912000-03-10 22:53:23 +00004735PyObject *PyUnicode_Format(PyObject *format,
4736 PyObject *args)
4737{
4738 Py_UNICODE *fmt, *res;
4739 int fmtcnt, rescnt, reslen, arglen, argidx;
4740 int args_owned = 0;
4741 PyUnicodeObject *result = NULL;
4742 PyObject *dict = NULL;
4743 PyObject *uformat;
4744
4745 if (format == NULL || args == NULL) {
4746 PyErr_BadInternalCall();
4747 return NULL;
4748 }
4749 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004750 if (uformat == NULL)
4751 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004752 fmt = PyUnicode_AS_UNICODE(uformat);
4753 fmtcnt = PyUnicode_GET_SIZE(uformat);
4754
4755 reslen = rescnt = fmtcnt + 100;
4756 result = _PyUnicode_New(reslen);
4757 if (result == NULL)
4758 goto onError;
4759 res = PyUnicode_AS_UNICODE(result);
4760
4761 if (PyTuple_Check(args)) {
4762 arglen = PyTuple_Size(args);
4763 argidx = 0;
4764 }
4765 else {
4766 arglen = -1;
4767 argidx = -2;
4768 }
4769 if (args->ob_type->tp_as_mapping)
4770 dict = args;
4771
4772 while (--fmtcnt >= 0) {
4773 if (*fmt != '%') {
4774 if (--rescnt < 0) {
4775 rescnt = fmtcnt + 100;
4776 reslen += rescnt;
4777 if (_PyUnicode_Resize(result, reslen) < 0)
4778 return NULL;
4779 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4780 --rescnt;
4781 }
4782 *res++ = *fmt++;
4783 }
4784 else {
4785 /* Got a format specifier */
4786 int flags = 0;
4787 int width = -1;
4788 int prec = -1;
4789 int size = 0;
4790 Py_UNICODE c = '\0';
4791 Py_UNICODE fill;
4792 PyObject *v = NULL;
4793 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004794 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004795 Py_UNICODE sign;
4796 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004797 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798
4799 fmt++;
4800 if (*fmt == '(') {
4801 Py_UNICODE *keystart;
4802 int keylen;
4803 PyObject *key;
4804 int pcount = 1;
4805
4806 if (dict == NULL) {
4807 PyErr_SetString(PyExc_TypeError,
4808 "format requires a mapping");
4809 goto onError;
4810 }
4811 ++fmt;
4812 --fmtcnt;
4813 keystart = fmt;
4814 /* Skip over balanced parentheses */
4815 while (pcount > 0 && --fmtcnt >= 0) {
4816 if (*fmt == ')')
4817 --pcount;
4818 else if (*fmt == '(')
4819 ++pcount;
4820 fmt++;
4821 }
4822 keylen = fmt - keystart - 1;
4823 if (fmtcnt < 0 || pcount > 0) {
4824 PyErr_SetString(PyExc_ValueError,
4825 "incomplete format key");
4826 goto onError;
4827 }
Fred Drakee4315f52000-05-09 19:53:39 +00004828 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004829 then looked up since Python uses strings to hold
4830 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004831 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832 key = PyUnicode_EncodeUTF8(keystart,
4833 keylen,
4834 NULL);
4835 if (key == NULL)
4836 goto onError;
4837 if (args_owned) {
4838 Py_DECREF(args);
4839 args_owned = 0;
4840 }
4841 args = PyObject_GetItem(dict, key);
4842 Py_DECREF(key);
4843 if (args == NULL) {
4844 goto onError;
4845 }
4846 args_owned = 1;
4847 arglen = -1;
4848 argidx = -2;
4849 }
4850 while (--fmtcnt >= 0) {
4851 switch (c = *fmt++) {
4852 case '-': flags |= F_LJUST; continue;
4853 case '+': flags |= F_SIGN; continue;
4854 case ' ': flags |= F_BLANK; continue;
4855 case '#': flags |= F_ALT; continue;
4856 case '0': flags |= F_ZERO; continue;
4857 }
4858 break;
4859 }
4860 if (c == '*') {
4861 v = getnextarg(args, arglen, &argidx);
4862 if (v == NULL)
4863 goto onError;
4864 if (!PyInt_Check(v)) {
4865 PyErr_SetString(PyExc_TypeError,
4866 "* wants int");
4867 goto onError;
4868 }
4869 width = PyInt_AsLong(v);
4870 if (width < 0) {
4871 flags |= F_LJUST;
4872 width = -width;
4873 }
4874 if (--fmtcnt >= 0)
4875 c = *fmt++;
4876 }
4877 else if (c >= '0' && c <= '9') {
4878 width = c - '0';
4879 while (--fmtcnt >= 0) {
4880 c = *fmt++;
4881 if (c < '0' || c > '9')
4882 break;
4883 if ((width*10) / 10 != width) {
4884 PyErr_SetString(PyExc_ValueError,
4885 "width too big");
4886 goto onError;
4887 }
4888 width = width*10 + (c - '0');
4889 }
4890 }
4891 if (c == '.') {
4892 prec = 0;
4893 if (--fmtcnt >= 0)
4894 c = *fmt++;
4895 if (c == '*') {
4896 v = getnextarg(args, arglen, &argidx);
4897 if (v == NULL)
4898 goto onError;
4899 if (!PyInt_Check(v)) {
4900 PyErr_SetString(PyExc_TypeError,
4901 "* wants int");
4902 goto onError;
4903 }
4904 prec = PyInt_AsLong(v);
4905 if (prec < 0)
4906 prec = 0;
4907 if (--fmtcnt >= 0)
4908 c = *fmt++;
4909 }
4910 else if (c >= '0' && c <= '9') {
4911 prec = c - '0';
4912 while (--fmtcnt >= 0) {
4913 c = Py_CHARMASK(*fmt++);
4914 if (c < '0' || c > '9')
4915 break;
4916 if ((prec*10) / 10 != prec) {
4917 PyErr_SetString(PyExc_ValueError,
4918 "prec too big");
4919 goto onError;
4920 }
4921 prec = prec*10 + (c - '0');
4922 }
4923 }
4924 } /* prec */
4925 if (fmtcnt >= 0) {
4926 if (c == 'h' || c == 'l' || c == 'L') {
4927 size = c;
4928 if (--fmtcnt >= 0)
4929 c = *fmt++;
4930 }
4931 }
4932 if (fmtcnt < 0) {
4933 PyErr_SetString(PyExc_ValueError,
4934 "incomplete format");
4935 goto onError;
4936 }
4937 if (c != '%') {
4938 v = getnextarg(args, arglen, &argidx);
4939 if (v == NULL)
4940 goto onError;
4941 }
4942 sign = 0;
4943 fill = ' ';
4944 switch (c) {
4945
4946 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004947 pbuf = formatbuf;
4948 /* presume that buffer length is at least 1 */
4949 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004950 len = 1;
4951 break;
4952
4953 case 's':
4954 case 'r':
4955 if (PyUnicode_Check(v) && c == 's') {
4956 temp = v;
4957 Py_INCREF(temp);
4958 }
4959 else {
4960 PyObject *unicode;
4961 if (c == 's')
4962 temp = PyObject_Str(v);
4963 else
4964 temp = PyObject_Repr(v);
4965 if (temp == NULL)
4966 goto onError;
4967 if (!PyString_Check(temp)) {
4968 /* XXX Note: this should never happen, since
4969 PyObject_Repr() and PyObject_Str() assure
4970 this */
4971 Py_DECREF(temp);
4972 PyErr_SetString(PyExc_TypeError,
4973 "%s argument has non-string str()");
4974 goto onError;
4975 }
Fred Drakee4315f52000-05-09 19:53:39 +00004976 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00004977 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00004978 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979 "strict");
4980 Py_DECREF(temp);
4981 temp = unicode;
4982 if (temp == NULL)
4983 goto onError;
4984 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004985 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004986 len = PyUnicode_GET_SIZE(temp);
4987 if (prec >= 0 && len > prec)
4988 len = prec;
4989 break;
4990
4991 case 'i':
4992 case 'd':
4993 case 'u':
4994 case 'o':
4995 case 'x':
4996 case 'X':
4997 if (c == 'i')
4998 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00004999 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005000 temp = formatlong(v, flags, prec, c);
5001 if (!temp)
5002 goto onError;
5003 pbuf = PyUnicode_AS_UNICODE(temp);
5004 len = PyUnicode_GET_SIZE(temp);
5005 /* unbounded ints can always produce
5006 a sign character! */
5007 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005008 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005009 else {
5010 pbuf = formatbuf;
5011 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5012 flags, prec, c, v);
5013 if (len < 0)
5014 goto onError;
5015 /* only d conversion is signed */
5016 sign = c == 'd';
5017 }
5018 if (flags & F_ZERO)
5019 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005020 break;
5021
5022 case 'e':
5023 case 'E':
5024 case 'f':
5025 case 'g':
5026 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005027 pbuf = formatbuf;
5028 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5029 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005030 if (len < 0)
5031 goto onError;
5032 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005033 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005034 fill = '0';
5035 break;
5036
5037 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005038 pbuf = formatbuf;
5039 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005040 if (len < 0)
5041 goto onError;
5042 break;
5043
5044 default:
5045 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005046 "unsupported format character '%c' (0x%x) "
5047 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005048 (31<=c && c<=126) ? c : '?',
5049 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050 goto onError;
5051 }
5052 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005053 if (*pbuf == '-' || *pbuf == '+') {
5054 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005055 len--;
5056 }
5057 else if (flags & F_SIGN)
5058 sign = '+';
5059 else if (flags & F_BLANK)
5060 sign = ' ';
5061 else
5062 sign = 0;
5063 }
5064 if (width < len)
5065 width = len;
5066 if (rescnt < width + (sign != 0)) {
5067 reslen -= rescnt;
5068 rescnt = width + fmtcnt + 100;
5069 reslen += rescnt;
5070 if (_PyUnicode_Resize(result, reslen) < 0)
5071 return NULL;
5072 res = PyUnicode_AS_UNICODE(result)
5073 + reslen - rescnt;
5074 }
5075 if (sign) {
5076 if (fill != ' ')
5077 *res++ = sign;
5078 rescnt--;
5079 if (width > len)
5080 width--;
5081 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005082 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5083 assert(pbuf[0] == '0');
5084 assert(pbuf[1] == c);
5085 if (fill != ' ') {
5086 *res++ = *pbuf++;
5087 *res++ = *pbuf++;
5088 }
5089 rescnt -= 2;
5090 width -= 2;
5091 if (width < 0)
5092 width = 0;
5093 len -= 2;
5094 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005095 if (width > len && !(flags & F_LJUST)) {
5096 do {
5097 --rescnt;
5098 *res++ = fill;
5099 } while (--width > len);
5100 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005101 if (fill == ' ') {
5102 if (sign)
5103 *res++ = sign;
5104 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5105 assert(pbuf[0] == '0');
5106 assert(pbuf[1] == c);
5107 *res++ = *pbuf++;
5108 *res++ = *pbuf++;
5109 }
5110 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005111 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005112 res += len;
5113 rescnt -= len;
5114 while (--width >= len) {
5115 --rescnt;
5116 *res++ = ' ';
5117 }
5118 if (dict && (argidx < arglen) && c != '%') {
5119 PyErr_SetString(PyExc_TypeError,
5120 "not all arguments converted");
5121 goto onError;
5122 }
5123 Py_XDECREF(temp);
5124 } /* '%' */
5125 } /* until end */
5126 if (argidx < arglen && !dict) {
5127 PyErr_SetString(PyExc_TypeError,
5128 "not all arguments converted");
5129 goto onError;
5130 }
5131
5132 if (args_owned) {
5133 Py_DECREF(args);
5134 }
5135 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005136 if (_PyUnicode_Resize(result, reslen - rescnt))
5137 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005138 return (PyObject *)result;
5139
5140 onError:
5141 Py_XDECREF(result);
5142 Py_DECREF(uformat);
5143 if (args_owned) {
5144 Py_DECREF(args);
5145 }
5146 return NULL;
5147}
5148
5149static PyBufferProcs unicode_as_buffer = {
5150 (getreadbufferproc) unicode_buffer_getreadbuf,
5151 (getwritebufferproc) unicode_buffer_getwritebuf,
5152 (getsegcountproc) unicode_buffer_getsegcount,
5153 (getcharbufferproc) unicode_buffer_getcharbuf,
5154};
5155
5156PyTypeObject PyUnicode_Type = {
5157 PyObject_HEAD_INIT(&PyType_Type)
5158 0, /* ob_size */
5159 "unicode", /* tp_name */
5160 sizeof(PyUnicodeObject), /* tp_size */
5161 0, /* tp_itemsize */
5162 /* Slots */
5163 (destructor)_PyUnicode_Free, /* tp_dealloc */
5164 0, /* tp_print */
5165 (getattrfunc)unicode_getattr, /* tp_getattr */
5166 0, /* tp_setattr */
5167 (cmpfunc) unicode_compare, /* tp_compare */
5168 (reprfunc) unicode_repr, /* tp_repr */
5169 0, /* tp_as_number */
5170 &unicode_as_sequence, /* tp_as_sequence */
5171 0, /* tp_as_mapping */
5172 (hashfunc) unicode_hash, /* tp_hash*/
5173 0, /* tp_call*/
5174 (reprfunc) unicode_str, /* tp_str */
5175 (getattrofunc) NULL, /* tp_getattro */
5176 (setattrofunc) NULL, /* tp_setattro */
5177 &unicode_as_buffer, /* tp_as_buffer */
5178 Py_TPFLAGS_DEFAULT, /* tp_flags */
5179};
5180
5181/* Initialize the Unicode implementation */
5182
Thomas Wouters78890102000-07-22 19:25:51 +00005183void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005184{
5185 /* Doublecheck the configuration... */
5186 if (sizeof(Py_UNICODE) != 2)
5187 Py_FatalError("Unicode configuration error: "
5188 "sizeof(Py_UNICODE) != 2 bytes");
5189
Fred Drakee4315f52000-05-09 19:53:39 +00005190 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005191 unicode_freelist = NULL;
5192 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005193 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005194 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195}
5196
5197/* Finalize the Unicode implementation */
5198
5199void
Thomas Wouters78890102000-07-22 19:25:51 +00005200_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005201{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005202 PyUnicodeObject *u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005204 Py_XDECREF(unicode_empty);
5205 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005206
5207 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005208 PyUnicodeObject *v = u;
5209 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005210 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005211 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005212 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005213 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005214 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005215 unicode_freelist = NULL;
5216 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005217}