blob: b3c8ba4790f88041a762b8aa969332f905c51703 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +00009--------------------------------------------------------------------
10The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000011
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000012 Copyright (c) 1999 by Secret Labs AB
13 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000014
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000015By obtaining, using, and/or copying this software and/or its
16associated documentation, you agree that you have read, understood,
17and will comply with the following terms and conditions:
18
19Permission to use, copy, modify, and distribute this software and its
20associated documentation for any purpose and without fee is hereby
21granted, provided that the above copyright notice appears in all
22copies, and that both that copyright notice and this permission notice
23appear in supporting documentation, and that the name of Secret Labs
24AB or the author not be used in advertising or publicity pertaining to
25distribution of the software without specific, written prior
26permission.
27
28SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
29THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
30FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
31ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
32WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
33ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
34OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
35--------------------------------------------------------------------
36
37*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000038
39#include "Python.h"
40
Guido van Rossumd57fd912000-03-10 22:53:23 +000041#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000042#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000043
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000044#ifdef MS_WIN32
45#include <windows.h>
46#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000047
/* Maximum number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep alive" optimization.

   An object that is put on the free list keeps its character buffer
   as long as its length is below this limit; longer buffers are
   released right away. Keeping small buffers around saves malloc()
   calls when small Unicode objects are created frequently.

   Worst case, this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused memory allocated.

   A limit of 0 effectively disables the optimization.

   Note: This is an experimental feature ! If Unicode objects cause
   core dumps, try switching it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection: little endian unless WORDS_BIGENDIAN is set */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
78
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000079/* --- Globals ------------------------------------------------------------
80
81 The globals are initialized by the _PyUnicode_Init() API and should
82 not be used before calling that API.
83
84*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000085
86/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +000088
89/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000090static PyUnicodeObject *unicode_freelist;
91static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
Fred Drakee4315f52000-05-09 19:53:39 +000093/* Default encoding to use and assume when NULL is passed as encoding
94 parameter; it is initialized by _PyUnicode_Init().
95
96 Always use the PyUnicode_SetDefaultEncoding() and
97 PyUnicode_GetDefaultEncoding() APIs to access this global.
98
99*/
100
101static char unicode_default_encoding[100];
102
Guido van Rossumd57fd912000-03-10 22:53:23 +0000103/* --- Unicode Object ----------------------------------------------------- */
104
105static
106int _PyUnicode_Resize(register PyUnicodeObject *unicode,
107 int length)
108{
109 void *oldstr;
110
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000111 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000112 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000113 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000114
115 /* Resizing unicode_empty is not allowed. */
116 if (unicode == unicode_empty) {
117 PyErr_SetString(PyExc_SystemError,
118 "can't resize empty unicode object");
119 return -1;
120 }
121
122 /* We allocate one more byte to make sure the string is
123 Ux0000 terminated -- XXX is this needed ? */
124 oldstr = unicode->str;
125 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
126 if (!unicode->str) {
127 unicode->str = oldstr;
128 PyErr_NoMemory();
129 return -1;
130 }
131 unicode->str[length] = 0;
132 unicode->length = length;
133
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000134 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000135 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000136 if (unicode->defenc) {
137 Py_DECREF(unicode->defenc);
138 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 }
140 unicode->hash = -1;
141
142 return 0;
143}
144
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000145int PyUnicode_Resize(PyObject **unicode,
146 int length)
147{
148 PyUnicodeObject *v;
149
150 if (unicode == NULL) {
151 PyErr_BadInternalCall();
152 return -1;
153 }
154 v = (PyUnicodeObject *)*unicode;
155 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
156 PyErr_BadInternalCall();
157 return -1;
158 }
159 return _PyUnicode_Resize(v, length);
160}
161
Guido van Rossumd57fd912000-03-10 22:53:23 +0000162/* We allocate one more byte to make sure the string is
163 Ux0000 terminated -- XXX is this needed ?
164
165 XXX This allocator could further be enhanced by assuring that the
166 free list never reduces its size below 1.
167
168*/
169
170static
171PyUnicodeObject *_PyUnicode_New(int length)
172{
173 register PyUnicodeObject *unicode;
174
175 /* Optimization for empty strings */
176 if (length == 0 && unicode_empty != NULL) {
177 Py_INCREF(unicode_empty);
178 return unicode_empty;
179 }
180
181 /* Unicode freelist & memory allocation */
182 if (unicode_freelist) {
183 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000184 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000185 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000186 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000187 /* Keep-Alive optimization: we only upsize the buffer,
188 never downsize it. */
189 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000190 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000191 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000192 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000193 }
194 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000195 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000196 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000197 }
198 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000199 }
200 else {
201 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
202 if (unicode == NULL)
203 return NULL;
204 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
205 }
206
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000207 if (!unicode->str) {
208 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000209 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 unicode->str[length] = 0;
212 unicode->length = length;
213 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000214 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000215 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000216
217 onError:
218 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000219 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000220 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221}
222
223static
224void _PyUnicode_Free(register PyUnicodeObject *unicode)
225{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000227 /* Keep-Alive optimization */
228 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000229 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000230 unicode->str = NULL;
231 unicode->length = 0;
232 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000233 if (unicode->defenc) {
234 Py_DECREF(unicode->defenc);
235 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000236 }
237 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238 *(PyUnicodeObject **)unicode = unicode_freelist;
239 unicode_freelist = unicode;
240 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 }
242 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000243 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000244 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000245 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 }
247}
248
249PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
250 int size)
251{
252 PyUnicodeObject *unicode;
253
254 unicode = _PyUnicode_New(size);
255 if (!unicode)
256 return NULL;
257
258 /* Copy the Unicode data into the new object */
259 if (u != NULL)
260 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
261
262 return (PyObject *)unicode;
263}
264
265#ifdef HAVE_WCHAR_H
266
267PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
268 int size)
269{
270 PyUnicodeObject *unicode;
271
272 if (w == NULL) {
273 PyErr_BadInternalCall();
274 return NULL;
275 }
276
277 unicode = _PyUnicode_New(size);
278 if (!unicode)
279 return NULL;
280
281 /* Copy the wchar_t data into the new object */
282#ifdef HAVE_USABLE_WCHAR_T
283 memcpy(unicode->str, w, size * sizeof(wchar_t));
284#else
285 {
286 register Py_UNICODE *u;
287 register int i;
288 u = PyUnicode_AS_UNICODE(unicode);
289 for (i = size; i >= 0; i--)
290 *u++ = *w++;
291 }
292#endif
293
294 return (PyObject *)unicode;
295}
296
297int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
298 register wchar_t *w,
299 int size)
300{
301 if (unicode == NULL) {
302 PyErr_BadInternalCall();
303 return -1;
304 }
305 if (size > PyUnicode_GET_SIZE(unicode))
306 size = PyUnicode_GET_SIZE(unicode);
307#ifdef HAVE_USABLE_WCHAR_T
308 memcpy(w, unicode->str, size * sizeof(wchar_t));
309#else
310 {
311 register Py_UNICODE *u;
312 register int i;
313 u = PyUnicode_AS_UNICODE(unicode);
314 for (i = size; i >= 0; i--)
315 *w++ = *u++;
316 }
317#endif
318
319 return size;
320}
321
322#endif
323
324PyObject *PyUnicode_FromObject(register PyObject *obj)
325{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000326 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
327}
328
329PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
330 const char *encoding,
331 const char *errors)
332{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000333 const char *s;
334 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000335 int owned = 0;
336 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000337
338 if (obj == NULL) {
339 PyErr_BadInternalCall();
340 return NULL;
341 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000342
343 /* Coerce object */
344 if (PyInstance_Check(obj)) {
345 PyObject *func;
346 func = PyObject_GetAttrString(obj, "__str__");
347 if (func == NULL) {
348 PyErr_SetString(PyExc_TypeError,
349 "coercing to Unicode: instance doesn't define __str__");
350 return NULL;
351 }
352 obj = PyEval_CallObject(func, NULL);
353 Py_DECREF(func);
354 if (obj == NULL)
355 return NULL;
356 owned = 1;
357 }
358 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000360 v = obj;
361 if (encoding) {
362 PyErr_SetString(PyExc_TypeError,
363 "decoding Unicode is not supported");
364 return NULL;
365 }
366 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000367 }
368 else if (PyString_Check(obj)) {
369 s = PyString_AS_STRING(obj);
370 len = PyString_GET_SIZE(obj);
371 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000372 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
373 /* Overwrite the error message with something more useful in
374 case of a TypeError. */
375 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000376 PyErr_Format(PyExc_TypeError,
377 "coercing to Unicode: need string or buffer, "
378 "%.80s found",
379 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000380 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000381 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000382
383 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000384 if (len == 0) {
385 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000386 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000387 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000388 else
389 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000390
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000391 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000392 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000393 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000394 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000395 return v;
396
397 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000398 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000399 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000400 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000401 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000402}
403
404PyObject *PyUnicode_Decode(const char *s,
405 int size,
406 const char *encoding,
407 const char *errors)
408{
409 PyObject *buffer = NULL, *unicode;
410
Fred Drakee4315f52000-05-09 19:53:39 +0000411 if (encoding == NULL)
412 encoding = PyUnicode_GetDefaultEncoding();
413
414 /* Shortcuts for common default encodings */
415 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000416 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000417 else if (strcmp(encoding, "latin-1") == 0)
418 return PyUnicode_DecodeLatin1(s, size, errors);
419 else if (strcmp(encoding, "ascii") == 0)
420 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000421
422 /* Decode via the codec registry */
423 buffer = PyBuffer_FromMemory((void *)s, size);
424 if (buffer == NULL)
425 goto onError;
426 unicode = PyCodec_Decode(buffer, encoding, errors);
427 if (unicode == NULL)
428 goto onError;
429 if (!PyUnicode_Check(unicode)) {
430 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000431 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000432 unicode->ob_type->tp_name);
433 Py_DECREF(unicode);
434 goto onError;
435 }
436 Py_DECREF(buffer);
437 return unicode;
438
439 onError:
440 Py_XDECREF(buffer);
441 return NULL;
442}
443
444PyObject *PyUnicode_Encode(const Py_UNICODE *s,
445 int size,
446 const char *encoding,
447 const char *errors)
448{
449 PyObject *v, *unicode;
450
451 unicode = PyUnicode_FromUnicode(s, size);
452 if (unicode == NULL)
453 return NULL;
454 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
455 Py_DECREF(unicode);
456 return v;
457}
458
459PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
460 const char *encoding,
461 const char *errors)
462{
463 PyObject *v;
464
465 if (!PyUnicode_Check(unicode)) {
466 PyErr_BadArgument();
467 goto onError;
468 }
Fred Drakee4315f52000-05-09 19:53:39 +0000469
470 if (encoding == NULL)
471 encoding = PyUnicode_GetDefaultEncoding();
472
473 /* Shortcuts for common default encodings */
474 if (errors == NULL) {
475 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000476 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000477 else if (strcmp(encoding, "latin-1") == 0)
478 return PyUnicode_AsLatin1String(unicode);
479 else if (strcmp(encoding, "ascii") == 0)
480 return PyUnicode_AsASCIIString(unicode);
481 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000482
483 /* Encode via the codec registry */
484 v = PyCodec_Encode(unicode, encoding, errors);
485 if (v == NULL)
486 goto onError;
487 /* XXX Should we really enforce this ? */
488 if (!PyString_Check(v)) {
489 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000490 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000491 v->ob_type->tp_name);
492 Py_DECREF(v);
493 goto onError;
494 }
495 return v;
496
497 onError:
498 return NULL;
499}
500
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000501/* Return a Python string holding the default encoded value of the
502 Unicode object.
503
504 The resulting string is cached in the Unicode object for subsequent
505 usage by this function. The cached version is needed to implement
506 the character buffer interface and will live (at least) as long as
507 the Unicode object itself.
508
509 The refcount of the string is *not* incremented.
510
511 *** Exported for internal use by the interpreter only !!! ***
512
513*/
514
515PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
516 const char *errors)
517{
518 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
519
520 if (v)
521 return v;
522 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
523 if (v && errors == NULL)
524 ((PyUnicodeObject *)unicode)->defenc = v;
525 return v;
526}
527
Guido van Rossumd57fd912000-03-10 22:53:23 +0000528Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
529{
530 if (!PyUnicode_Check(unicode)) {
531 PyErr_BadArgument();
532 goto onError;
533 }
534 return PyUnicode_AS_UNICODE(unicode);
535
536 onError:
537 return NULL;
538}
539
540int PyUnicode_GetSize(PyObject *unicode)
541{
542 if (!PyUnicode_Check(unicode)) {
543 PyErr_BadArgument();
544 goto onError;
545 }
546 return PyUnicode_GET_SIZE(unicode);
547
548 onError:
549 return -1;
550}
551
Thomas Wouters78890102000-07-22 19:25:51 +0000552const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000553{
554 return unicode_default_encoding;
555}
556
557int PyUnicode_SetDefaultEncoding(const char *encoding)
558{
559 PyObject *v;
560
561 /* Make sure the encoding is valid. As side effect, this also
562 loads the encoding into the codec registry cache. */
563 v = _PyCodec_Lookup(encoding);
564 if (v == NULL)
565 goto onError;
566 Py_DECREF(v);
567 strncpy(unicode_default_encoding,
568 encoding,
569 sizeof(unicode_default_encoding));
570 return 0;
571
572 onError:
573 return -1;
574}
575
Guido van Rossumd57fd912000-03-10 22:53:23 +0000576/* --- UTF-8 Codec -------------------------------------------------------- */
577
/* Lookup table: first byte of a UTF-8 sequence -> total length of the
   sequence in bytes. An entry of 0 marks a byte that is not a legal
   first byte. See RFC 2279 for details. */

static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFD: four, five and six byte sequences;
       0xFE, 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
599
600static
601int utf8_decoding_error(const char **source,
602 Py_UNICODE **dest,
603 const char *errors,
604 const char *details)
605{
606 if ((errors == NULL) ||
607 (strcmp(errors,"strict") == 0)) {
608 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000609 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000610 details);
611 return -1;
612 }
613 else if (strcmp(errors,"ignore") == 0) {
614 (*source)++;
615 return 0;
616 }
617 else if (strcmp(errors,"replace") == 0) {
618 (*source)++;
619 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
620 (*dest)++;
621 return 0;
622 }
623 else {
624 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000625 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 errors);
627 return -1;
628 }
629}
630
Guido van Rossumd57fd912000-03-10 22:53:23 +0000631PyObject *PyUnicode_DecodeUTF8(const char *s,
632 int size,
633 const char *errors)
634{
635 int n;
636 const char *e;
637 PyUnicodeObject *unicode;
638 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000639 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000640
641 /* Note: size will always be longer than the resulting Unicode
642 character count */
643 unicode = _PyUnicode_New(size);
644 if (!unicode)
645 return NULL;
646 if (size == 0)
647 return (PyObject *)unicode;
648
649 /* Unpack UTF-8 encoded data */
650 p = unicode->str;
651 e = s + size;
652
653 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000654 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000655
656 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000657 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000658 s++;
659 continue;
660 }
661
662 n = utf8_code_length[ch];
663
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000664 if (s + n > e) {
665 errmsg = "unexpected end of data";
666 goto utf8Error;
667 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000668
669 switch (n) {
670
671 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000672 errmsg = "unexpected code byte";
673 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000674
675 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000676 errmsg = "internal error";
677 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000678
679 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000680 if ((s[1] & 0xc0) != 0x80) {
681 errmsg = "invalid data";
682 goto utf8Error;
683 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000684 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000685 if (ch < 0x80) {
686 errmsg = "illegal encoding";
687 goto utf8Error;
688 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000690 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000691 break;
692
693 case 3:
694 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000695 (s[2] & 0xc0) != 0x80) {
696 errmsg = "invalid data";
697 goto utf8Error;
698 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000699 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000700 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
701 errmsg = "illegal encoding";
702 goto utf8Error;
703 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000704 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000705 *p++ = (Py_UNICODE)ch;
706 break;
707
708 case 4:
709 if ((s[1] & 0xc0) != 0x80 ||
710 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000711 (s[3] & 0xc0) != 0x80) {
712 errmsg = "invalid data";
713 goto utf8Error;
714 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000715 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
716 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
717 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000718 if ((ch < 0x10000) || /* minimum value allowed for 4
719 byte encoding */
720 (ch > 0x10ffff)) { /* maximum value allowed for
721 UTF-16 */
722 errmsg = "illegal encoding";
723 goto utf8Error;
724 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000725 /* compute and append the two surrogates: */
726
727 /* translate from 10000..10FFFF to 0..FFFF */
728 ch -= 0x10000;
729
730 /* high surrogate = top 10 bits added to D800 */
731 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
732
733 /* low surrogate = bottom 10 bits added to DC00 */
734 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000735 break;
736
737 default:
738 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000739 errmsg = "unsupported Unicode code range";
740 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000741 }
742 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000743 continue;
744
745 utf8Error:
746 if (utf8_decoding_error(&s, &p, errors, errmsg))
747 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000748 }
749
750 /* Adjust length */
751 if (_PyUnicode_Resize(unicode, p - unicode->str))
752 goto onError;
753
754 return (PyObject *)unicode;
755
756onError:
757 Py_DECREF(unicode);
758 return NULL;
759}
760
/* Disabled: no longer needed now that the encoder supports UTF-16
   surrogates. */
#if 0
/* Handle an error found while encoding to UTF-8, as selected by
   errors:

   NULL / "strict"  raise UnicodeError with details, return -1
   "ignore"         do nothing, return 0
   "replace"        append '?' to the output, return 0
   anything else    raise ValueError, return -1 */

static
int utf8_encoding_error(const Py_UNICODE **source,
                        char **dest,
                        const char *errors,
                        const char *details)
{
    if (errors == NULL || !strcmp(errors, "strict")) {
        PyErr_Format(PyExc_UnicodeError,
                     "UTF-8 encoding error: %.400s",
                     details);
        return -1;
    }
    if (!strcmp(errors, "ignore"))
        return 0;
    if (!strcmp(errors, "replace")) {
        *(*dest)++ = '?';
        return 0;
    }

    PyErr_Format(PyExc_ValueError,
                 "UTF-8 encoding error; "
                 "unknown error handling code: %.400s",
                 errors);
    return -1;
}
#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000794
795PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
796 int size,
797 const char *errors)
798{
799 PyObject *v;
800 char *p;
801 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000802 Py_UCS4 ch2;
803 unsigned int cbAllocated = 3 * size;
804 unsigned int cbWritten = 0;
805 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000806
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000807 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000808 if (v == NULL)
809 return NULL;
810 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000811 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000812
813 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000814 while (i < size) {
815 Py_UCS4 ch = s[i++];
816 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000817 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000818 cbWritten++;
819 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000820 else if (ch < 0x0800) {
821 *p++ = 0xc0 | (ch >> 6);
822 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000823 cbWritten += 2;
824 }
825 else {
826 /* Check for high surrogate */
827 if (0xD800 <= ch && ch <= 0xDBFF) {
828 if (i != size) {
829 ch2 = s[i];
830 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
831
832 if (cbWritten >= (cbAllocated - 4)) {
833 /* Provide enough room for some more
834 surrogates */
835 cbAllocated += 4*10;
836 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000837 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000838 }
839
840 /* combine the two values */
841 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
842
843 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000844 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000845 i++;
846 cbWritten += 4;
847 }
848 }
849 }
850 else {
851 *p++ = (char)(0xe0 | (ch >> 12));
852 cbWritten += 3;
853 }
854 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
855 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000856 }
857 }
858 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000859 if (_PyString_Resize(&v, p - q))
860 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000861 return v;
862
863 onError:
864 Py_DECREF(v);
865 return NULL;
866}
867
Guido van Rossumd57fd912000-03-10 22:53:23 +0000868PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
869{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000870 if (!PyUnicode_Check(unicode)) {
871 PyErr_BadArgument();
872 return NULL;
873 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000874 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
875 PyUnicode_GET_SIZE(unicode),
876 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000877}
878
879/* --- UTF-16 Codec ------------------------------------------------------- */
880
881static
882int utf16_decoding_error(const Py_UNICODE **source,
883 Py_UNICODE **dest,
884 const char *errors,
885 const char *details)
886{
887 if ((errors == NULL) ||
888 (strcmp(errors,"strict") == 0)) {
889 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000890 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000891 details);
892 return -1;
893 }
894 else if (strcmp(errors,"ignore") == 0) {
895 return 0;
896 }
897 else if (strcmp(errors,"replace") == 0) {
898 if (dest) {
899 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
900 (*dest)++;
901 }
902 return 0;
903 }
904 else {
905 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000906 "UTF-16 decoding error; "
907 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000908 errors);
909 return -1;
910 }
911}
912
Guido van Rossumd57fd912000-03-10 22:53:23 +0000913PyObject *PyUnicode_DecodeUTF16(const char *s,
914 int size,
915 const char *errors,
916 int *byteorder)
917{
918 PyUnicodeObject *unicode;
919 Py_UNICODE *p;
920 const Py_UNICODE *q, *e;
921 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000922 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000923
924 /* size should be an even number */
925 if (size % sizeof(Py_UNICODE) != 0) {
926 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
927 return NULL;
928 /* The remaining input chars are ignored if we fall through
929 here... */
930 }
931
932 /* Note: size will always be longer than the resulting Unicode
933 character count */
934 unicode = _PyUnicode_New(size);
935 if (!unicode)
936 return NULL;
937 if (size == 0)
938 return (PyObject *)unicode;
939
940 /* Unpack UTF-16 encoded data */
941 p = unicode->str;
942 q = (Py_UNICODE *)s;
943 e = q + (size / sizeof(Py_UNICODE));
944
945 if (byteorder)
946 bo = *byteorder;
947
948 while (q < e) {
949 register Py_UNICODE ch = *q++;
950
951 /* Check for BOM marks (U+FEFF) in the input and adjust
952 current byte order setting accordingly. Swap input
953 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
954 !) */
955#ifdef BYTEORDER_IS_LITTLE_ENDIAN
956 if (ch == 0xFEFF) {
957 bo = -1;
958 continue;
959 } else if (ch == 0xFFFE) {
960 bo = 1;
961 continue;
962 }
963 if (bo == 1)
964 ch = (ch >> 8) | (ch << 8);
965#else
966 if (ch == 0xFEFF) {
967 bo = 1;
968 continue;
969 } else if (ch == 0xFFFE) {
970 bo = -1;
971 continue;
972 }
973 if (bo == -1)
974 ch = (ch >> 8) | (ch << 8);
975#endif
976 if (ch < 0xD800 || ch > 0xDFFF) {
977 *p++ = ch;
978 continue;
979 }
980
981 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000982 if (q >= e) {
983 errmsg = "unexpected end of data";
984 goto utf16Error;
985 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000986 if (0xDC00 <= *q && *q <= 0xDFFF) {
987 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000988 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000989 /* This is valid data (a UTF-16 surrogate pair), but
990 we are not able to store this information since our
991 Py_UNICODE type only has 16 bits... this might
992 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000993 errmsg = "code pairs are not supported";
994 goto utf16Error;
995 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000996 else
997 continue;
998 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000999 errmsg = "illegal encoding";
1000 /* Fall through to report the error */
1001
1002 utf16Error:
1003 if (utf16_decoding_error(&q, &p, errors, errmsg))
1004 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001005 }
1006
1007 if (byteorder)
1008 *byteorder = bo;
1009
1010 /* Adjust length */
1011 if (_PyUnicode_Resize(unicode, p - unicode->str))
1012 goto onError;
1013
1014 return (PyObject *)unicode;
1015
1016onError:
1017 Py_DECREF(unicode);
1018 return NULL;
1019}
1020
1021#undef UTF16_ERROR
1022
1023PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1024 int size,
1025 const char *errors,
1026 int byteorder)
1027{
1028 PyObject *v;
1029 Py_UNICODE *p;
1030 char *q;
1031
1032 /* We don't create UTF-16 pairs... */
1033 v = PyString_FromStringAndSize(NULL,
1034 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1035 if (v == NULL)
1036 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001037
1038 q = PyString_AS_STRING(v);
1039 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001040 if (byteorder == 0)
1041 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001042 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001043 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001044 if (byteorder == 0 ||
1045#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1046 byteorder == -1
1047#else
1048 byteorder == 1
1049#endif
1050 )
1051 memcpy(p, s, size * sizeof(Py_UNICODE));
1052 else
1053 while (size-- > 0) {
1054 Py_UNICODE ch = *s++;
1055 *p++ = (ch >> 8) | (ch << 8);
1056 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001057 return v;
1058}
1059
1060PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1061{
1062 if (!PyUnicode_Check(unicode)) {
1063 PyErr_BadArgument();
1064 return NULL;
1065 }
1066 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1067 PyUnicode_GET_SIZE(unicode),
1068 NULL,
1069 0);
1070}
1071
1072/* --- Unicode Escape Codec ----------------------------------------------- */
1073
1074static
1075int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001076 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001077 const char *errors,
1078 const char *details)
1079{
1080 if ((errors == NULL) ||
1081 (strcmp(errors,"strict") == 0)) {
1082 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001083 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001084 details);
1085 return -1;
1086 }
1087 else if (strcmp(errors,"ignore") == 0) {
1088 return 0;
1089 }
1090 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001091 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001092 return 0;
1093 }
1094 else {
1095 PyErr_Format(PyExc_ValueError,
1096 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001097 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001098 errors);
1099 return -1;
1100 }
1101}
1102
Fredrik Lundh06d12682001-01-24 07:59:11 +00001103static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001104
Guido van Rossumd57fd912000-03-10 22:53:23 +00001105PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1106 int size,
1107 const char *errors)
1108{
1109 PyUnicodeObject *v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001110 Py_UNICODE *p, *buf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001111 const char *end;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001112 char* message;
1113 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
1114
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 /* Escaped strings will always be longer than the resulting
1116 Unicode string, so we start with size here and then reduce the
1117 length after conversion to the true value. */
1118 v = _PyUnicode_New(size);
1119 if (v == NULL)
1120 goto onError;
1121 if (size == 0)
1122 return (PyObject *)v;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001123
Guido van Rossumd57fd912000-03-10 22:53:23 +00001124 p = buf = PyUnicode_AS_UNICODE(v);
1125 end = s + size;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001126
Guido van Rossumd57fd912000-03-10 22:53:23 +00001127 while (s < end) {
1128 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001129 Py_UNICODE x;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001130 int i, digits;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001131
1132 /* Non-escape characters are interpreted as Unicode ordinals */
1133 if (*s != '\\') {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001134 *p++ = (unsigned char) *s++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001135 continue;
1136 }
1137
1138 /* \ - Escapes */
1139 s++;
1140 switch (*s++) {
1141
1142 /* \x escapes */
1143 case '\n': break;
1144 case '\\': *p++ = '\\'; break;
1145 case '\'': *p++ = '\''; break;
1146 case '\"': *p++ = '\"'; break;
1147 case 'b': *p++ = '\b'; break;
1148 case 'f': *p++ = '\014'; break; /* FF */
1149 case 't': *p++ = '\t'; break;
1150 case 'n': *p++ = '\n'; break;
1151 case 'r': *p++ = '\r'; break;
1152 case 'v': *p++ = '\013'; break; /* VT */
1153 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1154
1155 /* \OOO (octal) escapes */
1156 case '0': case '1': case '2': case '3':
1157 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001158 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001160 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001162 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001163 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001164 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 break;
1166
Fredrik Lundhccc74732001-02-18 22:13:49 +00001167 /* hex escapes */
1168 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169 case 'x':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001170 digits = 2;
1171 message = "truncated \\xXX escape";
1172 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001173
Fredrik Lundhccc74732001-02-18 22:13:49 +00001174 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001175 case 'u':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001176 digits = 4;
1177 message = "truncated \\uXXXX escape";
1178 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001179
Fredrik Lundhccc74732001-02-18 22:13:49 +00001180 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001181 case 'U':
Fredrik Lundhccc74732001-02-18 22:13:49 +00001182 digits = 8;
1183 message = "truncated \\UXXXXXXXX escape";
1184 hexescape:
1185 chr = 0;
1186 for (i = 0; i < digits; i++) {
1187 c = (unsigned char) s[i];
Fredrik Lundhdf846752000-09-03 11:29:49 +00001188 if (!isxdigit(c)) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00001189 if (unicodeescape_decoding_error(&s, &x, errors, message))
Fredrik Lundhdf846752000-09-03 11:29:49 +00001190 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001191 chr = x;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001192 i++;
1193 break;
1194 }
1195 chr = (chr<<4) & ~0xF;
1196 if (c >= '0' && c <= '9')
1197 chr += c - '0';
1198 else if (c >= 'a' && c <= 'f')
1199 chr += 10 + c - 'a';
1200 else
1201 chr += 10 + c - 'A';
1202 }
1203 s += i;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001204 store:
Fredrik Lundhdf846752000-09-03 11:29:49 +00001205 /* when we get here, chr is a 32-bit unicode character */
1206 if (chr <= 0xffff)
1207 /* UCS-2 character */
1208 *p++ = (Py_UNICODE) chr;
1209 else if (chr <= 0x10ffff) {
1210 /* UCS-4 character. store as two surrogate characters */
1211 chr -= 0x10000L;
1212 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1213 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1214 } else {
1215 if (unicodeescape_decoding_error(
1216 &s, &x, errors,
Fredrik Lundhccc74732001-02-18 22:13:49 +00001217 "illegal Unicode character")
Fredrik Lundhdf846752000-09-03 11:29:49 +00001218 )
1219 goto onError;
Fredrik Lundhccc74732001-02-18 22:13:49 +00001220 *p++ = x; /* store replacement character */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001221 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00001222 break;
1223
1224 /* \N{name} */
1225 case 'N':
1226 message = "malformed \\N character escape";
1227 if (ucnhash_CAPI == NULL) {
1228 /* load the unicode data module */
1229 PyObject *m, *v;
1230 m = PyImport_ImportModule("unicodedata");
1231 if (m == NULL)
1232 goto ucnhashError;
1233 v = PyObject_GetAttrString(m, "ucnhash_CAPI");
1234 Py_DECREF(m);
1235 if (v == NULL)
1236 goto ucnhashError;
1237 ucnhash_CAPI = PyCObject_AsVoidPtr(v);
1238 Py_DECREF(v);
1239 if (ucnhash_CAPI == NULL)
1240 goto ucnhashError;
1241 }
1242 if (*s == '{') {
1243 const char *start = s+1;
1244 /* look for the closing brace */
1245 while (*s != '}' && s < end)
1246 s++;
1247 if (s > start && s < end && *s == '}') {
1248 /* found a name. look it up in the unicode database */
1249 message = "unknown Unicode character name";
1250 s++;
1251 if (ucnhash_CAPI->getcode(start, s-start-1, &chr))
1252 goto store;
1253 }
1254 }
1255 if (unicodeescape_decoding_error(&s, &x, errors, message))
1256 goto onError;
1257 *p++ = x;
1258 break;
1259
1260 default:
1261 *p++ = '\\';
1262 *p++ = (unsigned char)s[-1];
1263 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001264 }
1265 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001266 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001267 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001268 return (PyObject *)v;
1269
Fredrik Lundhccc74732001-02-18 22:13:49 +00001270ucnhashError:
Fredrik Lundh06d12682001-01-24 07:59:11 +00001271 PyErr_SetString(
1272 PyExc_UnicodeError,
1273 "\\N escapes not supported (can't load unicodedata module)"
1274 );
Fredrik Lundhf6056062001-01-20 11:15:25 +00001275 return NULL;
1276
Fredrik Lundhccc74732001-02-18 22:13:49 +00001277onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001278 Py_XDECREF(v);
1279 return NULL;
1280}
1281
1282/* Return a Unicode-Escape string version of the Unicode object.
1283
1284 If quotes is true, the string is enclosed in u"" or u'' quotes as
1285 appropriate.
1286
1287*/
1288
Barry Warsaw51ac5802000-03-20 16:36:48 +00001289static const Py_UNICODE *findchar(const Py_UNICODE *s,
1290 int size,
1291 Py_UNICODE ch);
1292
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293static
1294PyObject *unicodeescape_string(const Py_UNICODE *s,
1295 int size,
1296 int quotes)
1297{
1298 PyObject *repr;
1299 char *p;
1300 char *q;
1301
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001302 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001303
1304 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1305 if (repr == NULL)
1306 return NULL;
1307
1308 p = q = PyString_AS_STRING(repr);
1309
1310 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001311 *p++ = 'u';
1312 *p++ = (findchar(s, size, '\'') &&
1313 !findchar(s, size, '"')) ? '"' : '\'';
1314 }
1315 while (size-- > 0) {
1316 Py_UNICODE ch = *s++;
1317 /* Escape quotes */
1318 if (quotes && (ch == q[1] || ch == '\\')) {
1319 *p++ = '\\';
1320 *p++ = (char) ch;
1321 }
1322 /* Map 16-bit characters to '\uxxxx' */
1323 else if (ch >= 256) {
1324 *p++ = '\\';
1325 *p++ = 'u';
1326 *p++ = hexdigit[(ch >> 12) & 0xf];
1327 *p++ = hexdigit[(ch >> 8) & 0xf];
1328 *p++ = hexdigit[(ch >> 4) & 0xf];
1329 *p++ = hexdigit[ch & 15];
1330 }
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001331 /* Map special whitespace to '\t', \n', '\r' */
1332 else if (ch == '\t') {
1333 *p++ = '\\';
1334 *p++ = 't';
1335 }
1336 else if (ch == '\n') {
1337 *p++ = '\\';
1338 *p++ = 'n';
1339 }
1340 else if (ch == '\r') {
1341 *p++ = '\\';
1342 *p++ = 'r';
1343 }
1344 /* Map non-printable US ASCII to '\xhh' */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001345 else if (ch < ' ' || ch >= 128) {
1346 *p++ = '\\';
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001347 *p++ = 'x';
1348 *p++ = hexdigit[(ch >> 4) & 0xf];
1349 *p++ = hexdigit[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350 }
1351 /* Copy everything else as-is */
1352 else
1353 *p++ = (char) ch;
1354 }
1355 if (quotes)
1356 *p++ = q[1];
1357
1358 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001359 if (_PyString_Resize(&repr, p - q))
1360 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001361
1362 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001363
1364 onError:
1365 Py_DECREF(repr);
1366 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001367}
1368
1369PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1370 int size)
1371{
1372 return unicodeescape_string(s, size, 0);
1373}
1374
1375PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1376{
1377 if (!PyUnicode_Check(unicode)) {
1378 PyErr_BadArgument();
1379 return NULL;
1380 }
1381 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1382 PyUnicode_GET_SIZE(unicode));
1383}
1384
1385/* --- Raw Unicode Escape Codec ------------------------------------------- */
1386
1387PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1388 int size,
1389 const char *errors)
1390{
1391 PyUnicodeObject *v;
1392 Py_UNICODE *p, *buf;
1393 const char *end;
1394 const char *bs;
1395
1396 /* Escaped strings will always be longer than the resulting
1397 Unicode string, so we start with size here and then reduce the
1398 length after conversion to the true value. */
1399 v = _PyUnicode_New(size);
1400 if (v == NULL)
1401 goto onError;
1402 if (size == 0)
1403 return (PyObject *)v;
1404 p = buf = PyUnicode_AS_UNICODE(v);
1405 end = s + size;
1406 while (s < end) {
1407 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001408 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001409 int i;
1410
1411 /* Non-escape characters are interpreted as Unicode ordinals */
1412 if (*s != '\\') {
1413 *p++ = (unsigned char)*s++;
1414 continue;
1415 }
1416
1417 /* \u-escapes are only interpreted iff the number of leading
1418 backslashes if odd */
1419 bs = s;
1420 for (;s < end;) {
1421 if (*s != '\\')
1422 break;
1423 *p++ = (unsigned char)*s++;
1424 }
1425 if (((s - bs) & 1) == 0 ||
1426 s >= end ||
1427 *s != 'u') {
1428 continue;
1429 }
1430 p--;
1431 s++;
1432
1433 /* \uXXXX with 4 hex digits */
1434 for (x = 0, i = 0; i < 4; i++) {
1435 c = (unsigned char)s[i];
1436 if (!isxdigit(c)) {
1437 if (unicodeescape_decoding_error(&s, &x, errors,
1438 "truncated \\uXXXX"))
1439 goto onError;
1440 i++;
1441 break;
1442 }
1443 x = (x<<4) & ~0xF;
1444 if (c >= '0' && c <= '9')
1445 x += c - '0';
1446 else if (c >= 'a' && c <= 'f')
1447 x += 10 + c - 'a';
1448 else
1449 x += 10 + c - 'A';
1450 }
1451 s += i;
1452 *p++ = x;
1453 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001454 if (_PyUnicode_Resize(v, (int)(p - buf)))
1455 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001456 return (PyObject *)v;
1457
1458 onError:
1459 Py_XDECREF(v);
1460 return NULL;
1461}
1462
1463PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1464 int size)
1465{
1466 PyObject *repr;
1467 char *p;
1468 char *q;
1469
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00001470 static const char *hexdigit = "0123456789abcdef";
Guido van Rossumd57fd912000-03-10 22:53:23 +00001471
1472 repr = PyString_FromStringAndSize(NULL, 6 * size);
1473 if (repr == NULL)
1474 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001475 if (size == 0)
1476 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001477
1478 p = q = PyString_AS_STRING(repr);
1479 while (size-- > 0) {
1480 Py_UNICODE ch = *s++;
1481 /* Map 16-bit characters to '\uxxxx' */
1482 if (ch >= 256) {
1483 *p++ = '\\';
1484 *p++ = 'u';
1485 *p++ = hexdigit[(ch >> 12) & 0xf];
1486 *p++ = hexdigit[(ch >> 8) & 0xf];
1487 *p++ = hexdigit[(ch >> 4) & 0xf];
1488 *p++ = hexdigit[ch & 15];
1489 }
1490 /* Copy everything else as-is */
1491 else
1492 *p++ = (char) ch;
1493 }
1494 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001495 if (_PyString_Resize(&repr, p - q))
1496 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001497
1498 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001499
1500 onError:
1501 Py_DECREF(repr);
1502 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001503}
1504
1505PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1506{
1507 if (!PyUnicode_Check(unicode)) {
1508 PyErr_BadArgument();
1509 return NULL;
1510 }
1511 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1512 PyUnicode_GET_SIZE(unicode));
1513}
1514
1515/* --- Latin-1 Codec ------------------------------------------------------ */
1516
1517PyObject *PyUnicode_DecodeLatin1(const char *s,
1518 int size,
1519 const char *errors)
1520{
1521 PyUnicodeObject *v;
1522 Py_UNICODE *p;
1523
1524 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1525 v = _PyUnicode_New(size);
1526 if (v == NULL)
1527 goto onError;
1528 if (size == 0)
1529 return (PyObject *)v;
1530 p = PyUnicode_AS_UNICODE(v);
1531 while (size-- > 0)
1532 *p++ = (unsigned char)*s++;
1533 return (PyObject *)v;
1534
1535 onError:
1536 Py_XDECREF(v);
1537 return NULL;
1538}
1539
1540static
1541int latin1_encoding_error(const Py_UNICODE **source,
1542 char **dest,
1543 const char *errors,
1544 const char *details)
1545{
1546 if ((errors == NULL) ||
1547 (strcmp(errors,"strict") == 0)) {
1548 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001549 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001550 details);
1551 return -1;
1552 }
1553 else if (strcmp(errors,"ignore") == 0) {
1554 return 0;
1555 }
1556 else if (strcmp(errors,"replace") == 0) {
1557 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001558 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001559 return 0;
1560 }
1561 else {
1562 PyErr_Format(PyExc_ValueError,
1563 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001564 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001565 errors);
1566 return -1;
1567 }
1568}
1569
1570PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1571 int size,
1572 const char *errors)
1573{
1574 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001575 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001576
Guido van Rossumd57fd912000-03-10 22:53:23 +00001577 repr = PyString_FromStringAndSize(NULL, size);
1578 if (repr == NULL)
1579 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001580 if (size == 0)
1581 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001582
1583 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001584 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001585 while (size-- > 0) {
1586 Py_UNICODE ch = *p++;
1587 if (ch >= 256) {
1588 if (latin1_encoding_error(&p, &s, errors,
1589 "ordinal not in range(256)"))
1590 goto onError;
1591 }
1592 else
1593 *s++ = (char)ch;
1594 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001595 /* Resize if error handling skipped some characters */
1596 if (s - start < PyString_GET_SIZE(repr))
1597 if (_PyString_Resize(&repr, s - start))
1598 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001599 return repr;
1600
1601 onError:
1602 Py_DECREF(repr);
1603 return NULL;
1604}
1605
1606PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1607{
1608 if (!PyUnicode_Check(unicode)) {
1609 PyErr_BadArgument();
1610 return NULL;
1611 }
1612 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1613 PyUnicode_GET_SIZE(unicode),
1614 NULL);
1615}
1616
1617/* --- 7-bit ASCII Codec -------------------------------------------------- */
1618
1619static
1620int ascii_decoding_error(const char **source,
1621 Py_UNICODE **dest,
1622 const char *errors,
1623 const char *details)
1624{
1625 if ((errors == NULL) ||
1626 (strcmp(errors,"strict") == 0)) {
1627 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001628 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001629 details);
1630 return -1;
1631 }
1632 else if (strcmp(errors,"ignore") == 0) {
1633 return 0;
1634 }
1635 else if (strcmp(errors,"replace") == 0) {
1636 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1637 (*dest)++;
1638 return 0;
1639 }
1640 else {
1641 PyErr_Format(PyExc_ValueError,
1642 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001643 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001644 errors);
1645 return -1;
1646 }
1647}
1648
1649PyObject *PyUnicode_DecodeASCII(const char *s,
1650 int size,
1651 const char *errors)
1652{
1653 PyUnicodeObject *v;
1654 Py_UNICODE *p;
1655
1656 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1657 v = _PyUnicode_New(size);
1658 if (v == NULL)
1659 goto onError;
1660 if (size == 0)
1661 return (PyObject *)v;
1662 p = PyUnicode_AS_UNICODE(v);
1663 while (size-- > 0) {
1664 register unsigned char c;
1665
1666 c = (unsigned char)*s++;
1667 if (c < 128)
1668 *p++ = c;
1669 else if (ascii_decoding_error(&s, &p, errors,
1670 "ordinal not in range(128)"))
1671 goto onError;
1672 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001673 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1674 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1675 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001676 return (PyObject *)v;
1677
1678 onError:
1679 Py_XDECREF(v);
1680 return NULL;
1681}
1682
1683static
1684int ascii_encoding_error(const Py_UNICODE **source,
1685 char **dest,
1686 const char *errors,
1687 const char *details)
1688{
1689 if ((errors == NULL) ||
1690 (strcmp(errors,"strict") == 0)) {
1691 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001692 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693 details);
1694 return -1;
1695 }
1696 else if (strcmp(errors,"ignore") == 0) {
1697 return 0;
1698 }
1699 else if (strcmp(errors,"replace") == 0) {
1700 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001701 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001702 return 0;
1703 }
1704 else {
1705 PyErr_Format(PyExc_ValueError,
1706 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001707 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001708 errors);
1709 return -1;
1710 }
1711}
1712
1713PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1714 int size,
1715 const char *errors)
1716{
1717 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001718 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001719
Guido van Rossumd57fd912000-03-10 22:53:23 +00001720 repr = PyString_FromStringAndSize(NULL, size);
1721 if (repr == NULL)
1722 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001723 if (size == 0)
1724 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001725
1726 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001727 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001728 while (size-- > 0) {
1729 Py_UNICODE ch = *p++;
1730 if (ch >= 128) {
1731 if (ascii_encoding_error(&p, &s, errors,
1732 "ordinal not in range(128)"))
1733 goto onError;
1734 }
1735 else
1736 *s++ = (char)ch;
1737 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001738 /* Resize if error handling skipped some characters */
1739 if (s - start < PyString_GET_SIZE(repr))
1740 if (_PyString_Resize(&repr, s - start))
1741 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001742 return repr;
1743
1744 onError:
1745 Py_DECREF(repr);
1746 return NULL;
1747}
1748
1749PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1750{
1751 if (!PyUnicode_Check(unicode)) {
1752 PyErr_BadArgument();
1753 return NULL;
1754 }
1755 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1756 PyUnicode_GET_SIZE(unicode),
1757 NULL);
1758}
1759
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001760#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001761
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001762/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001763
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001764PyObject *PyUnicode_DecodeMBCS(const char *s,
1765 int size,
1766 const char *errors)
1767{
1768 PyUnicodeObject *v;
1769 Py_UNICODE *p;
1770
1771 /* First get the size of the result */
1772 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001773 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001774 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1775
1776 v = _PyUnicode_New(usize);
1777 if (v == NULL)
1778 return NULL;
1779 if (usize == 0)
1780 return (PyObject *)v;
1781 p = PyUnicode_AS_UNICODE(v);
1782 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1783 Py_DECREF(v);
1784 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1785 }
1786
1787 return (PyObject *)v;
1788}
1789
1790PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1791 int size,
1792 const char *errors)
1793{
1794 PyObject *repr;
1795 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001796 DWORD mbcssize;
1797
1798 /* If there are no characters, bail now! */
1799 if (size==0)
1800 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001801
1802 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001803 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001804 if (mbcssize==0)
1805 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1806
1807 repr = PyString_FromStringAndSize(NULL, mbcssize);
1808 if (repr == NULL)
1809 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001810 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001811 return repr;
1812
1813 /* Do the conversion */
1814 s = PyString_AS_STRING(repr);
1815 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1816 Py_DECREF(repr);
1817 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1818 }
1819 return repr;
1820}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001821
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001822#endif /* MS_WIN32 */
1823
Guido van Rossumd57fd912000-03-10 22:53:23 +00001824/* --- Character Mapping Codec -------------------------------------------- */
1825
1826static
1827int charmap_decoding_error(const char **source,
1828 Py_UNICODE **dest,
1829 const char *errors,
1830 const char *details)
1831{
1832 if ((errors == NULL) ||
1833 (strcmp(errors,"strict") == 0)) {
1834 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001835 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001836 details);
1837 return -1;
1838 }
1839 else if (strcmp(errors,"ignore") == 0) {
1840 return 0;
1841 }
1842 else if (strcmp(errors,"replace") == 0) {
1843 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1844 (*dest)++;
1845 return 0;
1846 }
1847 else {
1848 PyErr_Format(PyExc_ValueError,
1849 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001850 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001851 errors);
1852 return -1;
1853 }
1854}
1855
1856PyObject *PyUnicode_DecodeCharmap(const char *s,
1857 int size,
1858 PyObject *mapping,
1859 const char *errors)
1860{
1861 PyUnicodeObject *v;
1862 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001863 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864
1865 /* Default to Latin-1 */
1866 if (mapping == NULL)
1867 return PyUnicode_DecodeLatin1(s, size, errors);
1868
1869 v = _PyUnicode_New(size);
1870 if (v == NULL)
1871 goto onError;
1872 if (size == 0)
1873 return (PyObject *)v;
1874 p = PyUnicode_AS_UNICODE(v);
1875 while (size-- > 0) {
1876 unsigned char ch = *s++;
1877 PyObject *w, *x;
1878
1879 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1880 w = PyInt_FromLong((long)ch);
1881 if (w == NULL)
1882 goto onError;
1883 x = PyObject_GetItem(mapping, w);
1884 Py_DECREF(w);
1885 if (x == NULL) {
1886 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00001887 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001888 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00001889 x = Py_None;
1890 Py_INCREF(x);
1891 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001892 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001893 }
1894
1895 /* Apply mapping */
1896 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001897 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001898 if (value < 0 || value > 65535) {
1899 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001900 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001901 Py_DECREF(x);
1902 goto onError;
1903 }
1904 *p++ = (Py_UNICODE)value;
1905 }
1906 else if (x == Py_None) {
1907 /* undefined mapping */
1908 if (charmap_decoding_error(&s, &p, errors,
1909 "character maps to <undefined>")) {
1910 Py_DECREF(x);
1911 goto onError;
1912 }
1913 }
1914 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001915 int targetsize = PyUnicode_GET_SIZE(x);
1916
1917 if (targetsize == 1)
1918 /* 1-1 mapping */
1919 *p++ = *PyUnicode_AS_UNICODE(x);
1920
1921 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001922 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001923 if (targetsize > extrachars) {
1924 /* resize first */
1925 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
1926 int needed = (targetsize - extrachars) + \
1927 (targetsize << 2);
1928 extrachars += needed;
1929 if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001930 Py_DECREF(x);
1931 goto onError;
1932 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001933 p = PyUnicode_AS_UNICODE(v) + oldpos;
1934 }
1935 Py_UNICODE_COPY(p,
1936 PyUnicode_AS_UNICODE(x),
1937 targetsize);
1938 p += targetsize;
1939 extrachars -= targetsize;
1940 }
1941 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001942 }
1943 else {
1944 /* wrong return value */
1945 PyErr_SetString(PyExc_TypeError,
1946 "character mapping must return integer, None or unicode");
1947 Py_DECREF(x);
1948 goto onError;
1949 }
1950 Py_DECREF(x);
1951 }
1952 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1953 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1954 goto onError;
1955 return (PyObject *)v;
1956
1957 onError:
1958 Py_XDECREF(v);
1959 return NULL;
1960}
1961
1962static
1963int charmap_encoding_error(const Py_UNICODE **source,
1964 char **dest,
1965 const char *errors,
1966 const char *details)
1967{
1968 if ((errors == NULL) ||
1969 (strcmp(errors,"strict") == 0)) {
1970 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001971 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001972 details);
1973 return -1;
1974 }
1975 else if (strcmp(errors,"ignore") == 0) {
1976 return 0;
1977 }
1978 else if (strcmp(errors,"replace") == 0) {
1979 **dest = '?';
1980 (*dest)++;
1981 return 0;
1982 }
1983 else {
1984 PyErr_Format(PyExc_ValueError,
1985 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001986 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001987 errors);
1988 return -1;
1989 }
1990}
1991
1992PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
1993 int size,
1994 PyObject *mapping,
1995 const char *errors)
1996{
1997 PyObject *v;
1998 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001999 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002000
2001 /* Default to Latin-1 */
2002 if (mapping == NULL)
2003 return PyUnicode_EncodeLatin1(p, size, errors);
2004
2005 v = PyString_FromStringAndSize(NULL, size);
2006 if (v == NULL)
2007 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002008 if (size == 0)
2009 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010 s = PyString_AS_STRING(v);
2011 while (size-- > 0) {
2012 Py_UNICODE ch = *p++;
2013 PyObject *w, *x;
2014
2015 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2016 w = PyInt_FromLong((long)ch);
2017 if (w == NULL)
2018 goto onError;
2019 x = PyObject_GetItem(mapping, w);
2020 Py_DECREF(w);
2021 if (x == NULL) {
2022 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002023 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002024 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002025 x = Py_None;
2026 Py_INCREF(x);
2027 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002028 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002029 }
2030
2031 /* Apply mapping */
2032 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002033 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002034 if (value < 0 || value > 255) {
2035 PyErr_SetString(PyExc_TypeError,
2036 "character mapping must be in range(256)");
2037 Py_DECREF(x);
2038 goto onError;
2039 }
2040 *s++ = (char)value;
2041 }
2042 else if (x == Py_None) {
2043 /* undefined mapping */
2044 if (charmap_encoding_error(&p, &s, errors,
2045 "character maps to <undefined>")) {
2046 Py_DECREF(x);
2047 goto onError;
2048 }
2049 }
2050 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002051 int targetsize = PyString_GET_SIZE(x);
2052
2053 if (targetsize == 1)
2054 /* 1-1 mapping */
2055 *s++ = *PyString_AS_STRING(x);
2056
2057 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002058 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002059 if (targetsize > extrachars) {
2060 /* resize first */
2061 int oldpos = (int)(s - PyString_AS_STRING(v));
2062 int needed = (targetsize - extrachars) + \
2063 (targetsize << 2);
2064 extrachars += needed;
2065 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002066 Py_DECREF(x);
2067 goto onError;
2068 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002069 s = PyString_AS_STRING(v) + oldpos;
2070 }
2071 memcpy(s,
2072 PyString_AS_STRING(x),
2073 targetsize);
2074 s += targetsize;
2075 extrachars -= targetsize;
2076 }
2077 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002078 }
2079 else {
2080 /* wrong return value */
2081 PyErr_SetString(PyExc_TypeError,
2082 "character mapping must return integer, None or unicode");
2083 Py_DECREF(x);
2084 goto onError;
2085 }
2086 Py_DECREF(x);
2087 }
2088 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2089 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2090 goto onError;
2091 return v;
2092
2093 onError:
2094 Py_DECREF(v);
2095 return NULL;
2096}
2097
2098PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2099 PyObject *mapping)
2100{
2101 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2102 PyErr_BadArgument();
2103 return NULL;
2104 }
2105 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2106 PyUnicode_GET_SIZE(unicode),
2107 mapping,
2108 NULL);
2109}
2110
2111static
2112int translate_error(const Py_UNICODE **source,
2113 Py_UNICODE **dest,
2114 const char *errors,
2115 const char *details)
2116{
2117 if ((errors == NULL) ||
2118 (strcmp(errors,"strict") == 0)) {
2119 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002120 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002121 details);
2122 return -1;
2123 }
2124 else if (strcmp(errors,"ignore") == 0) {
2125 return 0;
2126 }
2127 else if (strcmp(errors,"replace") == 0) {
2128 **dest = '?';
2129 (*dest)++;
2130 return 0;
2131 }
2132 else {
2133 PyErr_Format(PyExc_ValueError,
2134 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002135 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002136 errors);
2137 return -1;
2138 }
2139}
2140
2141PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2142 int size,
2143 PyObject *mapping,
2144 const char *errors)
2145{
2146 PyUnicodeObject *v;
2147 Py_UNICODE *p;
2148
2149 if (mapping == NULL) {
2150 PyErr_BadArgument();
2151 return NULL;
2152 }
2153
2154 /* Output will never be longer than input */
2155 v = _PyUnicode_New(size);
2156 if (v == NULL)
2157 goto onError;
2158 if (size == 0)
2159 goto done;
2160 p = PyUnicode_AS_UNICODE(v);
2161 while (size-- > 0) {
2162 Py_UNICODE ch = *s++;
2163 PyObject *w, *x;
2164
2165 /* Get mapping */
2166 w = PyInt_FromLong(ch);
2167 if (w == NULL)
2168 goto onError;
2169 x = PyObject_GetItem(mapping, w);
2170 Py_DECREF(w);
2171 if (x == NULL) {
2172 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2173 /* No mapping found: default to 1-1 mapping */
2174 PyErr_Clear();
2175 *p++ = ch;
2176 continue;
2177 }
2178 goto onError;
2179 }
2180
2181 /* Apply mapping */
2182 if (PyInt_Check(x))
2183 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2184 else if (x == Py_None) {
2185 /* undefined mapping */
2186 if (translate_error(&s, &p, errors,
2187 "character maps to <undefined>")) {
2188 Py_DECREF(x);
2189 goto onError;
2190 }
2191 }
2192 else if (PyUnicode_Check(x)) {
2193 if (PyUnicode_GET_SIZE(x) != 1) {
2194 /* 1-n mapping */
2195 PyErr_SetString(PyExc_NotImplementedError,
2196 "1-n mappings are currently not implemented");
2197 Py_DECREF(x);
2198 goto onError;
2199 }
2200 *p++ = *PyUnicode_AS_UNICODE(x);
2201 }
2202 else {
2203 /* wrong return value */
2204 PyErr_SetString(PyExc_TypeError,
2205 "translate mapping must return integer, None or unicode");
2206 Py_DECREF(x);
2207 goto onError;
2208 }
2209 Py_DECREF(x);
2210 }
2211 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002212 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2213 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002214
2215 done:
2216 return (PyObject *)v;
2217
2218 onError:
2219 Py_XDECREF(v);
2220 return NULL;
2221}
2222
2223PyObject *PyUnicode_Translate(PyObject *str,
2224 PyObject *mapping,
2225 const char *errors)
2226{
2227 PyObject *result;
2228
2229 str = PyUnicode_FromObject(str);
2230 if (str == NULL)
2231 goto onError;
2232 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2233 PyUnicode_GET_SIZE(str),
2234 mapping,
2235 errors);
2236 Py_DECREF(str);
2237 return result;
2238
2239 onError:
2240 Py_XDECREF(str);
2241 return NULL;
2242}
2243
Guido van Rossum9e896b32000-04-05 20:11:21 +00002244/* --- Decimal Encoder ---------------------------------------------------- */
2245
2246int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2247 int length,
2248 char *output,
2249 const char *errors)
2250{
2251 Py_UNICODE *p, *end;
2252
2253 if (output == NULL) {
2254 PyErr_BadArgument();
2255 return -1;
2256 }
2257
2258 p = s;
2259 end = s + length;
2260 while (p < end) {
2261 register Py_UNICODE ch = *p++;
2262 int decimal;
2263
2264 if (Py_UNICODE_ISSPACE(ch)) {
2265 *output++ = ' ';
2266 continue;
2267 }
2268 decimal = Py_UNICODE_TODECIMAL(ch);
2269 if (decimal >= 0) {
2270 *output++ = '0' + decimal;
2271 continue;
2272 }
Guido van Rossumba477042000-04-06 18:18:10 +00002273 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002274 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002275 continue;
2276 }
2277 /* All other characters are considered invalid */
2278 if (errors == NULL || strcmp(errors, "strict") == 0) {
2279 PyErr_SetString(PyExc_ValueError,
2280 "invalid decimal Unicode string");
2281 goto onError;
2282 }
2283 else if (strcmp(errors, "ignore") == 0)
2284 continue;
2285 else if (strcmp(errors, "replace") == 0) {
2286 *output++ = '?';
2287 continue;
2288 }
2289 }
2290 /* 0-terminate the output string */
2291 *output++ = '\0';
2292 return 0;
2293
2294 onError:
2295 return -1;
2296}
2297
Guido van Rossumd57fd912000-03-10 22:53:23 +00002298/* --- Helpers ------------------------------------------------------------ */
2299
2300static
2301int count(PyUnicodeObject *self,
2302 int start,
2303 int end,
2304 PyUnicodeObject *substring)
2305{
2306 int count = 0;
2307
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002308 if (start < 0)
2309 start += self->length;
2310 if (start < 0)
2311 start = 0;
2312 if (end > self->length)
2313 end = self->length;
2314 if (end < 0)
2315 end += self->length;
2316 if (end < 0)
2317 end = 0;
2318
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002319 if (substring->length == 0)
2320 return (end - start + 1);
2321
Guido van Rossumd57fd912000-03-10 22:53:23 +00002322 end -= substring->length;
2323
2324 while (start <= end)
2325 if (Py_UNICODE_MATCH(self, start, substring)) {
2326 count++;
2327 start += substring->length;
2328 } else
2329 start++;
2330
2331 return count;
2332}
2333
2334int PyUnicode_Count(PyObject *str,
2335 PyObject *substr,
2336 int start,
2337 int end)
2338{
2339 int result;
2340
2341 str = PyUnicode_FromObject(str);
2342 if (str == NULL)
2343 return -1;
2344 substr = PyUnicode_FromObject(substr);
2345 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002346 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002347 return -1;
2348 }
2349
2350 result = count((PyUnicodeObject *)str,
2351 start, end,
2352 (PyUnicodeObject *)substr);
2353
2354 Py_DECREF(str);
2355 Py_DECREF(substr);
2356 return result;
2357}
2358
2359static
2360int findstring(PyUnicodeObject *self,
2361 PyUnicodeObject *substring,
2362 int start,
2363 int end,
2364 int direction)
2365{
2366 if (start < 0)
2367 start += self->length;
2368 if (start < 0)
2369 start = 0;
2370
2371 if (substring->length == 0)
2372 return start;
2373
2374 if (end > self->length)
2375 end = self->length;
2376 if (end < 0)
2377 end += self->length;
2378 if (end < 0)
2379 end = 0;
2380
2381 end -= substring->length;
2382
2383 if (direction < 0) {
2384 for (; end >= start; end--)
2385 if (Py_UNICODE_MATCH(self, end, substring))
2386 return end;
2387 } else {
2388 for (; start <= end; start++)
2389 if (Py_UNICODE_MATCH(self, start, substring))
2390 return start;
2391 }
2392
2393 return -1;
2394}
2395
2396int PyUnicode_Find(PyObject *str,
2397 PyObject *substr,
2398 int start,
2399 int end,
2400 int direction)
2401{
2402 int result;
2403
2404 str = PyUnicode_FromObject(str);
2405 if (str == NULL)
2406 return -1;
2407 substr = PyUnicode_FromObject(substr);
2408 if (substr == NULL) {
2409 Py_DECREF(substr);
2410 return -1;
2411 }
2412
2413 result = findstring((PyUnicodeObject *)str,
2414 (PyUnicodeObject *)substr,
2415 start, end, direction);
2416 Py_DECREF(str);
2417 Py_DECREF(substr);
2418 return result;
2419}
2420
2421static
2422int tailmatch(PyUnicodeObject *self,
2423 PyUnicodeObject *substring,
2424 int start,
2425 int end,
2426 int direction)
2427{
2428 if (start < 0)
2429 start += self->length;
2430 if (start < 0)
2431 start = 0;
2432
2433 if (substring->length == 0)
2434 return 1;
2435
2436 if (end > self->length)
2437 end = self->length;
2438 if (end < 0)
2439 end += self->length;
2440 if (end < 0)
2441 end = 0;
2442
2443 end -= substring->length;
2444 if (end < start)
2445 return 0;
2446
2447 if (direction > 0) {
2448 if (Py_UNICODE_MATCH(self, end, substring))
2449 return 1;
2450 } else {
2451 if (Py_UNICODE_MATCH(self, start, substring))
2452 return 1;
2453 }
2454
2455 return 0;
2456}
2457
2458int PyUnicode_Tailmatch(PyObject *str,
2459 PyObject *substr,
2460 int start,
2461 int end,
2462 int direction)
2463{
2464 int result;
2465
2466 str = PyUnicode_FromObject(str);
2467 if (str == NULL)
2468 return -1;
2469 substr = PyUnicode_FromObject(substr);
2470 if (substr == NULL) {
2471 Py_DECREF(substr);
2472 return -1;
2473 }
2474
2475 result = tailmatch((PyUnicodeObject *)str,
2476 (PyUnicodeObject *)substr,
2477 start, end, direction);
2478 Py_DECREF(str);
2479 Py_DECREF(substr);
2480 return result;
2481}
2482
2483static
2484const Py_UNICODE *findchar(const Py_UNICODE *s,
2485 int size,
2486 Py_UNICODE ch)
2487{
2488 /* like wcschr, but doesn't stop at NULL characters */
2489
2490 while (size-- > 0) {
2491 if (*s == ch)
2492 return s;
2493 s++;
2494 }
2495
2496 return NULL;
2497}
2498
2499/* Apply fixfct filter to the Unicode object self and return a
2500 reference to the modified object */
2501
2502static
2503PyObject *fixup(PyUnicodeObject *self,
2504 int (*fixfct)(PyUnicodeObject *s))
2505{
2506
2507 PyUnicodeObject *u;
2508
2509 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2510 self->length);
2511 if (u == NULL)
2512 return NULL;
2513 if (!fixfct(u)) {
2514 /* fixfct should return TRUE if it modified the buffer. If
2515 FALSE, return a reference to the original buffer instead
2516 (to save space, not time) */
2517 Py_INCREF(self);
2518 Py_DECREF(u);
2519 return (PyObject*) self;
2520 }
2521 return (PyObject*) u;
2522}
2523
2524static
2525int fixupper(PyUnicodeObject *self)
2526{
2527 int len = self->length;
2528 Py_UNICODE *s = self->str;
2529 int status = 0;
2530
2531 while (len-- > 0) {
2532 register Py_UNICODE ch;
2533
2534 ch = Py_UNICODE_TOUPPER(*s);
2535 if (ch != *s) {
2536 status = 1;
2537 *s = ch;
2538 }
2539 s++;
2540 }
2541
2542 return status;
2543}
2544
2545static
2546int fixlower(PyUnicodeObject *self)
2547{
2548 int len = self->length;
2549 Py_UNICODE *s = self->str;
2550 int status = 0;
2551
2552 while (len-- > 0) {
2553 register Py_UNICODE ch;
2554
2555 ch = Py_UNICODE_TOLOWER(*s);
2556 if (ch != *s) {
2557 status = 1;
2558 *s = ch;
2559 }
2560 s++;
2561 }
2562
2563 return status;
2564}
2565
2566static
2567int fixswapcase(PyUnicodeObject *self)
2568{
2569 int len = self->length;
2570 Py_UNICODE *s = self->str;
2571 int status = 0;
2572
2573 while (len-- > 0) {
2574 if (Py_UNICODE_ISUPPER(*s)) {
2575 *s = Py_UNICODE_TOLOWER(*s);
2576 status = 1;
2577 } else if (Py_UNICODE_ISLOWER(*s)) {
2578 *s = Py_UNICODE_TOUPPER(*s);
2579 status = 1;
2580 }
2581 s++;
2582 }
2583
2584 return status;
2585}
2586
2587static
2588int fixcapitalize(PyUnicodeObject *self)
2589{
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002590 int len = self->length;
2591 Py_UNICODE *s = self->str;
2592 int status = 0;
2593
2594 if (len == 0)
2595 return 0;
2596 if (Py_UNICODE_ISLOWER(*s)) {
2597 *s = Py_UNICODE_TOUPPER(*s);
2598 status = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00002600 s++;
2601 while (--len > 0) {
2602 if (Py_UNICODE_ISUPPER(*s)) {
2603 *s = Py_UNICODE_TOLOWER(*s);
2604 status = 1;
2605 }
2606 s++;
2607 }
2608 return status;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002609}
2610
2611static
2612int fixtitle(PyUnicodeObject *self)
2613{
2614 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2615 register Py_UNICODE *e;
2616 int previous_is_cased;
2617
2618 /* Shortcut for single character strings */
2619 if (PyUnicode_GET_SIZE(self) == 1) {
2620 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2621 if (*p != ch) {
2622 *p = ch;
2623 return 1;
2624 }
2625 else
2626 return 0;
2627 }
2628
2629 e = p + PyUnicode_GET_SIZE(self);
2630 previous_is_cased = 0;
2631 for (; p < e; p++) {
2632 register const Py_UNICODE ch = *p;
2633
2634 if (previous_is_cased)
2635 *p = Py_UNICODE_TOLOWER(ch);
2636 else
2637 *p = Py_UNICODE_TOTITLE(ch);
2638
2639 if (Py_UNICODE_ISLOWER(ch) ||
2640 Py_UNICODE_ISUPPER(ch) ||
2641 Py_UNICODE_ISTITLE(ch))
2642 previous_is_cased = 1;
2643 else
2644 previous_is_cased = 0;
2645 }
2646 return 1;
2647}
2648
2649PyObject *PyUnicode_Join(PyObject *separator,
2650 PyObject *seq)
2651{
2652 Py_UNICODE *sep;
2653 int seplen;
2654 PyUnicodeObject *res = NULL;
2655 int reslen = 0;
2656 Py_UNICODE *p;
2657 int seqlen = 0;
2658 int sz = 100;
2659 int i;
2660
Jeremy Hylton03657cf2000-07-12 13:05:33 +00002661 seqlen = PySequence_Size(seq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002662 if (seqlen < 0 && PyErr_Occurred())
2663 return NULL;
2664
2665 if (separator == NULL) {
2666 Py_UNICODE blank = ' ';
2667 sep = &blank;
2668 seplen = 1;
2669 }
2670 else {
2671 separator = PyUnicode_FromObject(separator);
2672 if (separator == NULL)
2673 return NULL;
2674 sep = PyUnicode_AS_UNICODE(separator);
2675 seplen = PyUnicode_GET_SIZE(separator);
2676 }
2677
2678 res = _PyUnicode_New(sz);
2679 if (res == NULL)
2680 goto onError;
2681 p = PyUnicode_AS_UNICODE(res);
2682 reslen = 0;
2683
2684 for (i = 0; i < seqlen; i++) {
2685 int itemlen;
2686 PyObject *item;
2687
2688 item = PySequence_GetItem(seq, i);
2689 if (item == NULL)
2690 goto onError;
2691 if (!PyUnicode_Check(item)) {
2692 PyObject *v;
2693 v = PyUnicode_FromObject(item);
2694 Py_DECREF(item);
2695 item = v;
2696 if (item == NULL)
2697 goto onError;
2698 }
2699 itemlen = PyUnicode_GET_SIZE(item);
2700 while (reslen + itemlen + seplen >= sz) {
2701 if (_PyUnicode_Resize(res, sz*2))
2702 goto onError;
2703 sz *= 2;
2704 p = PyUnicode_AS_UNICODE(res) + reslen;
2705 }
2706 if (i > 0) {
2707 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2708 p += seplen;
2709 reslen += seplen;
2710 }
2711 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2712 p += itemlen;
2713 reslen += itemlen;
2714 Py_DECREF(item);
2715 }
2716 if (_PyUnicode_Resize(res, reslen))
2717 goto onError;
2718
2719 Py_XDECREF(separator);
2720 return (PyObject *)res;
2721
2722 onError:
2723 Py_XDECREF(separator);
2724 Py_DECREF(res);
2725 return NULL;
2726}
2727
2728static
2729PyUnicodeObject *pad(PyUnicodeObject *self,
2730 int left,
2731 int right,
2732 Py_UNICODE fill)
2733{
2734 PyUnicodeObject *u;
2735
2736 if (left < 0)
2737 left = 0;
2738 if (right < 0)
2739 right = 0;
2740
2741 if (left == 0 && right == 0) {
2742 Py_INCREF(self);
2743 return self;
2744 }
2745
2746 u = _PyUnicode_New(left + self->length + right);
2747 if (u) {
2748 if (left)
2749 Py_UNICODE_FILL(u->str, fill, left);
2750 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2751 if (right)
2752 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2753 }
2754
2755 return u;
2756}
2757
/* Append data[left:right] as a new Unicode object to `list`.

   Relies on the expanding function providing a `PyObject *str` scratch
   variable, a `PyObject *list` and an `onError` label, which is jumped to
   when creating or appending the slice fails.  Wrapped in do/while(0) so
   that the expansion is a single statement; use it as
   `SPLIT_APPEND(...);`. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2768
2769static
2770PyObject *split_whitespace(PyUnicodeObject *self,
2771 PyObject *list,
2772 int maxcount)
2773{
2774 register int i;
2775 register int j;
2776 int len = self->length;
2777 PyObject *str;
2778
2779 for (i = j = 0; i < len; ) {
2780 /* find a token */
2781 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2782 i++;
2783 j = i;
2784 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2785 i++;
2786 if (j < i) {
2787 if (maxcount-- <= 0)
2788 break;
2789 SPLIT_APPEND(self->str, j, i);
2790 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2791 i++;
2792 j = i;
2793 }
2794 }
2795 if (j < len) {
2796 SPLIT_APPEND(self->str, j, len);
2797 }
2798 return list;
2799
2800 onError:
2801 Py_DECREF(list);
2802 return NULL;
2803}
2804
2805PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002806 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002807{
2808 register int i;
2809 register int j;
2810 int len;
2811 PyObject *list;
2812 PyObject *str;
2813 Py_UNICODE *data;
2814
2815 string = PyUnicode_FromObject(string);
2816 if (string == NULL)
2817 return NULL;
2818 data = PyUnicode_AS_UNICODE(string);
2819 len = PyUnicode_GET_SIZE(string);
2820
Guido van Rossumd57fd912000-03-10 22:53:23 +00002821 list = PyList_New(0);
2822 if (!list)
2823 goto onError;
2824
2825 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002826 int eol;
2827
Guido van Rossumd57fd912000-03-10 22:53:23 +00002828 /* Find a line and append it */
2829 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2830 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002831
2832 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002833 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002834 if (i < len) {
2835 if (data[i] == '\r' && i + 1 < len &&
2836 data[i+1] == '\n')
2837 i += 2;
2838 else
2839 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002840 if (keepends)
2841 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002842 }
Guido van Rossum86662912000-04-11 15:38:46 +00002843 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002844 j = i;
2845 }
2846 if (j < len) {
2847 SPLIT_APPEND(data, j, len);
2848 }
2849
2850 Py_DECREF(string);
2851 return list;
2852
2853 onError:
2854 Py_DECREF(list);
2855 Py_DECREF(string);
2856 return NULL;
2857}
2858
2859static
2860PyObject *split_char(PyUnicodeObject *self,
2861 PyObject *list,
2862 Py_UNICODE ch,
2863 int maxcount)
2864{
2865 register int i;
2866 register int j;
2867 int len = self->length;
2868 PyObject *str;
2869
2870 for (i = j = 0; i < len; ) {
2871 if (self->str[i] == ch) {
2872 if (maxcount-- <= 0)
2873 break;
2874 SPLIT_APPEND(self->str, j, i);
2875 i = j = i + 1;
2876 } else
2877 i++;
2878 }
2879 if (j <= len) {
2880 SPLIT_APPEND(self->str, j, len);
2881 }
2882 return list;
2883
2884 onError:
2885 Py_DECREF(list);
2886 return NULL;
2887}
2888
2889static
2890PyObject *split_substring(PyUnicodeObject *self,
2891 PyObject *list,
2892 PyUnicodeObject *substring,
2893 int maxcount)
2894{
2895 register int i;
2896 register int j;
2897 int len = self->length;
2898 int sublen = substring->length;
2899 PyObject *str;
2900
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00002901 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002902 if (Py_UNICODE_MATCH(self, i, substring)) {
2903 if (maxcount-- <= 0)
2904 break;
2905 SPLIT_APPEND(self->str, j, i);
2906 i = j = i + sublen;
2907 } else
2908 i++;
2909 }
2910 if (j <= len) {
2911 SPLIT_APPEND(self->str, j, len);
2912 }
2913 return list;
2914
2915 onError:
2916 Py_DECREF(list);
2917 return NULL;
2918}
2919
2920#undef SPLIT_APPEND
2921
2922static
2923PyObject *split(PyUnicodeObject *self,
2924 PyUnicodeObject *substring,
2925 int maxcount)
2926{
2927 PyObject *list;
2928
2929 if (maxcount < 0)
2930 maxcount = INT_MAX;
2931
2932 list = PyList_New(0);
2933 if (!list)
2934 return NULL;
2935
2936 if (substring == NULL)
2937 return split_whitespace(self,list,maxcount);
2938
2939 else if (substring->length == 1)
2940 return split_char(self,list,substring->str[0],maxcount);
2941
2942 else if (substring->length == 0) {
2943 Py_DECREF(list);
2944 PyErr_SetString(PyExc_ValueError, "empty separator");
2945 return NULL;
2946 }
2947 else
2948 return split_substring(self,list,substring,maxcount);
2949}
2950
2951static
2952PyObject *strip(PyUnicodeObject *self,
2953 int left,
2954 int right)
2955{
2956 Py_UNICODE *p = self->str;
2957 int start = 0;
2958 int end = self->length;
2959
2960 if (left)
2961 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2962 start++;
2963
2964 if (right)
2965 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2966 end--;
2967
2968 if (start == 0 && end == self->length) {
2969 /* couldn't strip anything off, return original string */
2970 Py_INCREF(self);
2971 return (PyObject*) self;
2972 }
2973
2974 return (PyObject*) PyUnicode_FromUnicode(
2975 self->str + start,
2976 end - start
2977 );
2978}
2979
2980static
2981PyObject *replace(PyUnicodeObject *self,
2982 PyUnicodeObject *str1,
2983 PyUnicodeObject *str2,
2984 int maxcount)
2985{
2986 PyUnicodeObject *u;
2987
2988 if (maxcount < 0)
2989 maxcount = INT_MAX;
2990
2991 if (str1->length == 1 && str2->length == 1) {
2992 int i;
2993
2994 /* replace characters */
2995 if (!findchar(self->str, self->length, str1->str[0])) {
2996 /* nothing to replace, return original string */
2997 Py_INCREF(self);
2998 u = self;
2999 } else {
3000 Py_UNICODE u1 = str1->str[0];
3001 Py_UNICODE u2 = str2->str[0];
3002
3003 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3004 self->str,
3005 self->length
3006 );
3007 if (u)
3008 for (i = 0; i < u->length; i++)
3009 if (u->str[i] == u1) {
3010 if (--maxcount < 0)
3011 break;
3012 u->str[i] = u2;
3013 }
3014 }
3015
3016 } else {
3017 int n, i;
3018 Py_UNICODE *p;
3019
3020 /* replace strings */
3021 n = count(self, 0, self->length, str1);
3022 if (n > maxcount)
3023 n = maxcount;
3024 if (n == 0) {
3025 /* nothing to replace, return original string */
3026 Py_INCREF(self);
3027 u = self;
3028 } else {
3029 u = _PyUnicode_New(
3030 self->length + n * (str2->length - str1->length));
3031 if (u) {
3032 i = 0;
3033 p = u->str;
3034 while (i <= self->length - str1->length)
3035 if (Py_UNICODE_MATCH(self, i, str1)) {
3036 /* replace string segment */
3037 Py_UNICODE_COPY(p, str2->str, str2->length);
3038 p += str2->length;
3039 i += str1->length;
3040 if (--n <= 0) {
3041 /* copy remaining part */
3042 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3043 break;
3044 }
3045 } else
3046 *p++ = self->str[i++];
3047 }
3048 }
3049 }
3050
3051 return (PyObject *) u;
3052}
3053
3054/* --- Unicode Object Methods --------------------------------------------- */
3055
3056static char title__doc__[] =
3057"S.title() -> unicode\n\
3058\n\
3059Return a titlecased version of S, i.e. words start with title case\n\
3060characters, all remaining cased characters have lower case.";
3061
3062static PyObject*
3063unicode_title(PyUnicodeObject *self, PyObject *args)
3064{
3065 if (!PyArg_NoArgs(args))
3066 return NULL;
3067 return fixup(self, fixtitle);
3068}
3069
3070static char capitalize__doc__[] =
3071"S.capitalize() -> unicode\n\
3072\n\
3073Return a capitalized version of S, i.e. make the first character\n\
3074have upper case.";
3075
3076static PyObject*
3077unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3078{
3079 if (!PyArg_NoArgs(args))
3080 return NULL;
3081 return fixup(self, fixcapitalize);
3082}
3083
3084#if 0
3085static char capwords__doc__[] =
3086"S.capwords() -> unicode\n\
3087\n\
3088Apply .capitalize() to all words in S and return the result with\n\
3089normalized whitespace (all whitespace strings are replaced by ' ').";
3090
3091static PyObject*
3092unicode_capwords(PyUnicodeObject *self, PyObject *args)
3093{
3094 PyObject *list;
3095 PyObject *item;
3096 int i;
3097
3098 if (!PyArg_NoArgs(args))
3099 return NULL;
3100
3101 /* Split into words */
3102 list = split(self, NULL, -1);
3103 if (!list)
3104 return NULL;
3105
3106 /* Capitalize each word */
3107 for (i = 0; i < PyList_GET_SIZE(list); i++) {
3108 item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
3109 fixcapitalize);
3110 if (item == NULL)
3111 goto onError;
3112 Py_DECREF(PyList_GET_ITEM(list, i));
3113 PyList_SET_ITEM(list, i, item);
3114 }
3115
3116 /* Join the words to form a new string */
3117 item = PyUnicode_Join(NULL, list);
3118
3119onError:
3120 Py_DECREF(list);
3121 return (PyObject *)item;
3122}
3123#endif
3124
3125static char center__doc__[] =
3126"S.center(width) -> unicode\n\
3127\n\
3128Return S centered in a Unicode string of length width. Padding is done\n\
3129using spaces.";
3130
3131static PyObject *
3132unicode_center(PyUnicodeObject *self, PyObject *args)
3133{
3134 int marg, left;
3135 int width;
3136
3137 if (!PyArg_ParseTuple(args, "i:center", &width))
3138 return NULL;
3139
3140 if (self->length >= width) {
3141 Py_INCREF(self);
3142 return (PyObject*) self;
3143 }
3144
3145 marg = width - self->length;
3146 left = marg / 2 + (marg & width & 1);
3147
3148 return (PyObject*) pad(self, left, marg - left, ' ');
3149}
3150
Marc-André Lemburge5034372000-08-08 08:04:29 +00003151#if 0
3152
3153/* This code should go into some future Unicode collation support
3154 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003155 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003156
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003157/* speedy UTF-16 code point order comparison */
3158/* gleaned from: */
3159/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3160
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003161static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003162{
3163 0, 0, 0, 0, 0, 0, 0, 0,
3164 0, 0, 0, 0, 0, 0, 0, 0,
3165 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003166 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003167};
3168
Guido van Rossumd57fd912000-03-10 22:53:23 +00003169static int
3170unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3171{
3172 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003173
Guido van Rossumd57fd912000-03-10 22:53:23 +00003174 Py_UNICODE *s1 = str1->str;
3175 Py_UNICODE *s2 = str2->str;
3176
3177 len1 = str1->length;
3178 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003179
Guido van Rossumd57fd912000-03-10 22:53:23 +00003180 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003181 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003182 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003183
3184 c1 = *s1++;
3185 c2 = *s2++;
3186 if (c1 > (1<<11) * 26)
3187 c1 += utf16Fixup[c1>>11];
3188 if (c2 > (1<<11) * 26)
3189 c2 += utf16Fixup[c2>>11];
3190
3191 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003192 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003193 if (diff)
3194 return (diff < 0) ? -1 : (diff != 0);
3195 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003196 }
3197
3198 return (len1 < len2) ? -1 : (len1 != len2);
3199}
3200
Marc-André Lemburge5034372000-08-08 08:04:29 +00003201#else
3202
3203static int
3204unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3205{
3206 register int len1, len2;
3207
3208 Py_UNICODE *s1 = str1->str;
3209 Py_UNICODE *s2 = str2->str;
3210
3211 len1 = str1->length;
3212 len2 = str2->length;
3213
3214 while (len1 > 0 && len2 > 0) {
3215 register long diff;
3216
3217 diff = (long)*s1++ - (long)*s2++;
3218 if (diff)
3219 return (diff < 0) ? -1 : (diff != 0);
3220 len1--; len2--;
3221 }
3222
3223 return (len1 < len2) ? -1 : (len1 != len2);
3224}
3225
3226#endif
3227
Guido van Rossumd57fd912000-03-10 22:53:23 +00003228int PyUnicode_Compare(PyObject *left,
3229 PyObject *right)
3230{
3231 PyUnicodeObject *u = NULL, *v = NULL;
3232 int result;
3233
3234 /* Coerce the two arguments */
3235 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3236 if (u == NULL)
3237 goto onError;
3238 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3239 if (v == NULL)
3240 goto onError;
3241
Thomas Wouters7e474022000-07-16 12:04:32 +00003242 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243 if (v == u) {
3244 Py_DECREF(u);
3245 Py_DECREF(v);
3246 return 0;
3247 }
3248
3249 result = unicode_compare(u, v);
3250
3251 Py_DECREF(u);
3252 Py_DECREF(v);
3253 return result;
3254
3255onError:
3256 Py_XDECREF(u);
3257 Py_XDECREF(v);
3258 return -1;
3259}
3260
Guido van Rossum403d68b2000-03-13 15:55:09 +00003261int PyUnicode_Contains(PyObject *container,
3262 PyObject *element)
3263{
3264 PyUnicodeObject *u = NULL, *v = NULL;
3265 int result;
3266 register const Py_UNICODE *p, *e;
3267 register Py_UNICODE ch;
3268
3269 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003270 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003271 if (v == NULL) {
3272 PyErr_SetString(PyExc_TypeError,
3273 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003274 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003275 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003276 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3277 if (u == NULL) {
3278 Py_DECREF(v);
3279 goto onError;
3280 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003281
3282 /* Check v in u */
3283 if (PyUnicode_GET_SIZE(v) != 1) {
3284 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003285 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003286 goto onError;
3287 }
3288 ch = *PyUnicode_AS_UNICODE(v);
3289 p = PyUnicode_AS_UNICODE(u);
3290 e = p + PyUnicode_GET_SIZE(u);
3291 result = 0;
3292 while (p < e) {
3293 if (*p++ == ch) {
3294 result = 1;
3295 break;
3296 }
3297 }
3298
3299 Py_DECREF(u);
3300 Py_DECREF(v);
3301 return result;
3302
3303onError:
3304 Py_XDECREF(u);
3305 Py_XDECREF(v);
3306 return -1;
3307}
3308
Guido van Rossumd57fd912000-03-10 22:53:23 +00003309/* Concat to string or Unicode object giving a new Unicode object. */
3310
3311PyObject *PyUnicode_Concat(PyObject *left,
3312 PyObject *right)
3313{
3314 PyUnicodeObject *u = NULL, *v = NULL, *w;
3315
3316 /* Coerce the two arguments */
3317 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3318 if (u == NULL)
3319 goto onError;
3320 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3321 if (v == NULL)
3322 goto onError;
3323
3324 /* Shortcuts */
3325 if (v == unicode_empty) {
3326 Py_DECREF(v);
3327 return (PyObject *)u;
3328 }
3329 if (u == unicode_empty) {
3330 Py_DECREF(u);
3331 return (PyObject *)v;
3332 }
3333
3334 /* Concat the two Unicode strings */
3335 w = _PyUnicode_New(u->length + v->length);
3336 if (w == NULL)
3337 goto onError;
3338 Py_UNICODE_COPY(w->str, u->str, u->length);
3339 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3340
3341 Py_DECREF(u);
3342 Py_DECREF(v);
3343 return (PyObject *)w;
3344
3345onError:
3346 Py_XDECREF(u);
3347 Py_XDECREF(v);
3348 return NULL;
3349}
3350
3351static char count__doc__[] =
3352"S.count(sub[, start[, end]]) -> int\n\
3353\n\
3354Return the number of occurrences of substring sub in Unicode string\n\
3355S[start:end]. Optional arguments start and end are\n\
3356interpreted as in slice notation.";
3357
3358static PyObject *
3359unicode_count(PyUnicodeObject *self, PyObject *args)
3360{
3361 PyUnicodeObject *substring;
3362 int start = 0;
3363 int end = INT_MAX;
3364 PyObject *result;
3365
Guido van Rossumb8872e62000-05-09 14:14:27 +00003366 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3367 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003368 return NULL;
3369
3370 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3371 (PyObject *)substring);
3372 if (substring == NULL)
3373 return NULL;
3374
Guido van Rossumd57fd912000-03-10 22:53:23 +00003375 if (start < 0)
3376 start += self->length;
3377 if (start < 0)
3378 start = 0;
3379 if (end > self->length)
3380 end = self->length;
3381 if (end < 0)
3382 end += self->length;
3383 if (end < 0)
3384 end = 0;
3385
3386 result = PyInt_FromLong((long) count(self, start, end, substring));
3387
3388 Py_DECREF(substring);
3389 return result;
3390}
3391
3392static char encode__doc__[] =
3393"S.encode([encoding[,errors]]) -> string\n\
3394\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003395Return an encoded string version of S. Default encoding is the current\n\
3396default string encoding. errors may be given to set a different error\n\
3397handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3398a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003399
3400static PyObject *
3401unicode_encode(PyUnicodeObject *self, PyObject *args)
3402{
3403 char *encoding = NULL;
3404 char *errors = NULL;
3405 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3406 return NULL;
3407 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3408}
3409
3410static char expandtabs__doc__[] =
3411"S.expandtabs([tabsize]) -> unicode\n\
3412\n\
3413Return a copy of S where all tab characters are expanded using spaces.\n\
3414If tabsize is not given, a tab size of 8 characters is assumed.";
3415
3416static PyObject*
3417unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3418{
3419 Py_UNICODE *e;
3420 Py_UNICODE *p;
3421 Py_UNICODE *q;
3422 int i, j;
3423 PyUnicodeObject *u;
3424 int tabsize = 8;
3425
3426 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3427 return NULL;
3428
Thomas Wouters7e474022000-07-16 12:04:32 +00003429 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003430 i = j = 0;
3431 e = self->str + self->length;
3432 for (p = self->str; p < e; p++)
3433 if (*p == '\t') {
3434 if (tabsize > 0)
3435 j += tabsize - (j % tabsize);
3436 }
3437 else {
3438 j++;
3439 if (*p == '\n' || *p == '\r') {
3440 i += j;
3441 j = 0;
3442 }
3443 }
3444
3445 /* Second pass: create output string and fill it */
3446 u = _PyUnicode_New(i + j);
3447 if (!u)
3448 return NULL;
3449
3450 j = 0;
3451 q = u->str;
3452
3453 for (p = self->str; p < e; p++)
3454 if (*p == '\t') {
3455 if (tabsize > 0) {
3456 i = tabsize - (j % tabsize);
3457 j += i;
3458 while (i--)
3459 *q++ = ' ';
3460 }
3461 }
3462 else {
3463 j++;
3464 *q++ = *p;
3465 if (*p == '\n' || *p == '\r')
3466 j = 0;
3467 }
3468
3469 return (PyObject*) u;
3470}
3471
3472static char find__doc__[] =
3473"S.find(sub [,start [,end]]) -> int\n\
3474\n\
3475Return the lowest index in S where substring sub is found,\n\
3476such that sub is contained within s[start,end]. Optional\n\
3477arguments start and end are interpreted as in slice notation.\n\
3478\n\
3479Return -1 on failure.";
3480
3481static PyObject *
3482unicode_find(PyUnicodeObject *self, PyObject *args)
3483{
3484 PyUnicodeObject *substring;
3485 int start = 0;
3486 int end = INT_MAX;
3487 PyObject *result;
3488
Guido van Rossumb8872e62000-05-09 14:14:27 +00003489 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3490 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003491 return NULL;
3492 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3493 (PyObject *)substring);
3494 if (substring == NULL)
3495 return NULL;
3496
3497 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3498
3499 Py_DECREF(substring);
3500 return result;
3501}
3502
3503static PyObject *
3504unicode_getitem(PyUnicodeObject *self, int index)
3505{
3506 if (index < 0 || index >= self->length) {
3507 PyErr_SetString(PyExc_IndexError, "string index out of range");
3508 return NULL;
3509 }
3510
3511 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3512}
3513
3514static long
3515unicode_hash(PyUnicodeObject *self)
3516{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003517 /* Since Unicode objects compare equal to their ASCII string
3518 counterparts, they should use the individual character values
3519 as basis for their hash value. This is needed to assure that
3520 strings and Unicode objects behave in the same way as
3521 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003522
Fredrik Lundhdde61642000-07-10 18:27:47 +00003523 register int len;
3524 register Py_UNICODE *p;
3525 register long x;
3526
Guido van Rossumd57fd912000-03-10 22:53:23 +00003527 if (self->hash != -1)
3528 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003529 len = PyUnicode_GET_SIZE(self);
3530 p = PyUnicode_AS_UNICODE(self);
3531 x = *p << 7;
3532 while (--len >= 0)
3533 x = (1000003*x) ^ *p++;
3534 x ^= PyUnicode_GET_SIZE(self);
3535 if (x == -1)
3536 x = -2;
3537 self->hash = x;
3538 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003539}
3540
3541static char index__doc__[] =
3542"S.index(sub [,start [,end]]) -> int\n\
3543\n\
3544Like S.find() but raise ValueError when the substring is not found.";
3545
3546static PyObject *
3547unicode_index(PyUnicodeObject *self, PyObject *args)
3548{
3549 int result;
3550 PyUnicodeObject *substring;
3551 int start = 0;
3552 int end = INT_MAX;
3553
Guido van Rossumb8872e62000-05-09 14:14:27 +00003554 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3555 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003556 return NULL;
3557
3558 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3559 (PyObject *)substring);
3560 if (substring == NULL)
3561 return NULL;
3562
3563 result = findstring(self, substring, start, end, 1);
3564
3565 Py_DECREF(substring);
3566 if (result < 0) {
3567 PyErr_SetString(PyExc_ValueError, "substring not found");
3568 return NULL;
3569 }
3570 return PyInt_FromLong(result);
3571}
3572
3573static char islower__doc__[] =
3574"S.islower() -> int\n\
3575\n\
3576Return 1 if all cased characters in S are lowercase and there is\n\
3577at least one cased character in S, 0 otherwise.";
3578
3579static PyObject*
3580unicode_islower(PyUnicodeObject *self, PyObject *args)
3581{
3582 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3583 register const Py_UNICODE *e;
3584 int cased;
3585
3586 if (!PyArg_NoArgs(args))
3587 return NULL;
3588
3589 /* Shortcut for single character strings */
3590 if (PyUnicode_GET_SIZE(self) == 1)
3591 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3592
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003593 /* Special case for empty strings */
3594 if (PyString_GET_SIZE(self) == 0)
3595 return PyInt_FromLong(0);
3596
Guido van Rossumd57fd912000-03-10 22:53:23 +00003597 e = p + PyUnicode_GET_SIZE(self);
3598 cased = 0;
3599 for (; p < e; p++) {
3600 register const Py_UNICODE ch = *p;
3601
3602 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3603 return PyInt_FromLong(0);
3604 else if (!cased && Py_UNICODE_ISLOWER(ch))
3605 cased = 1;
3606 }
3607 return PyInt_FromLong(cased);
3608}
3609
3610static char isupper__doc__[] =
3611"S.isupper() -> int\n\
3612\n\
3613Return 1 if all cased characters in S are uppercase and there is\n\
3614at least one cased character in S, 0 otherwise.";
3615
3616static PyObject*
3617unicode_isupper(PyUnicodeObject *self, PyObject *args)
3618{
3619 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3620 register const Py_UNICODE *e;
3621 int cased;
3622
3623 if (!PyArg_NoArgs(args))
3624 return NULL;
3625
3626 /* Shortcut for single character strings */
3627 if (PyUnicode_GET_SIZE(self) == 1)
3628 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3629
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003630 /* Special case for empty strings */
3631 if (PyString_GET_SIZE(self) == 0)
3632 return PyInt_FromLong(0);
3633
Guido van Rossumd57fd912000-03-10 22:53:23 +00003634 e = p + PyUnicode_GET_SIZE(self);
3635 cased = 0;
3636 for (; p < e; p++) {
3637 register const Py_UNICODE ch = *p;
3638
3639 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3640 return PyInt_FromLong(0);
3641 else if (!cased && Py_UNICODE_ISUPPER(ch))
3642 cased = 1;
3643 }
3644 return PyInt_FromLong(cased);
3645}
3646
3647static char istitle__doc__[] =
3648"S.istitle() -> int\n\
3649\n\
3650Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3651may only follow uncased characters and lowercase characters only cased\n\
3652ones. Return 0 otherwise.";
3653
3654static PyObject*
3655unicode_istitle(PyUnicodeObject *self, PyObject *args)
3656{
3657 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3658 register const Py_UNICODE *e;
3659 int cased, previous_is_cased;
3660
3661 if (!PyArg_NoArgs(args))
3662 return NULL;
3663
3664 /* Shortcut for single character strings */
3665 if (PyUnicode_GET_SIZE(self) == 1)
3666 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3667 (Py_UNICODE_ISUPPER(*p) != 0));
3668
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003669 /* Special case for empty strings */
3670 if (PyString_GET_SIZE(self) == 0)
3671 return PyInt_FromLong(0);
3672
Guido van Rossumd57fd912000-03-10 22:53:23 +00003673 e = p + PyUnicode_GET_SIZE(self);
3674 cased = 0;
3675 previous_is_cased = 0;
3676 for (; p < e; p++) {
3677 register const Py_UNICODE ch = *p;
3678
3679 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3680 if (previous_is_cased)
3681 return PyInt_FromLong(0);
3682 previous_is_cased = 1;
3683 cased = 1;
3684 }
3685 else if (Py_UNICODE_ISLOWER(ch)) {
3686 if (!previous_is_cased)
3687 return PyInt_FromLong(0);
3688 previous_is_cased = 1;
3689 cased = 1;
3690 }
3691 else
3692 previous_is_cased = 0;
3693 }
3694 return PyInt_FromLong(cased);
3695}
3696
3697static char isspace__doc__[] =
3698"S.isspace() -> int\n\
3699\n\
3700Return 1 if there are only whitespace characters in S,\n\
37010 otherwise.";
3702
3703static PyObject*
3704unicode_isspace(PyUnicodeObject *self, PyObject *args)
3705{
3706 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3707 register const Py_UNICODE *e;
3708
3709 if (!PyArg_NoArgs(args))
3710 return NULL;
3711
3712 /* Shortcut for single character strings */
3713 if (PyUnicode_GET_SIZE(self) == 1 &&
3714 Py_UNICODE_ISSPACE(*p))
3715 return PyInt_FromLong(1);
3716
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003717 /* Special case for empty strings */
3718 if (PyString_GET_SIZE(self) == 0)
3719 return PyInt_FromLong(0);
3720
Guido van Rossumd57fd912000-03-10 22:53:23 +00003721 e = p + PyUnicode_GET_SIZE(self);
3722 for (; p < e; p++) {
3723 if (!Py_UNICODE_ISSPACE(*p))
3724 return PyInt_FromLong(0);
3725 }
3726 return PyInt_FromLong(1);
3727}
3728
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003729static char isalpha__doc__[] =
3730"S.isalpha() -> int\n\
3731\n\
3732Return 1 if all characters in S are alphabetic\n\
3733and there is at least one character in S, 0 otherwise.";
3734
3735static PyObject*
3736unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3737{
3738 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3739 register const Py_UNICODE *e;
3740
3741 if (!PyArg_NoArgs(args))
3742 return NULL;
3743
3744 /* Shortcut for single character strings */
3745 if (PyUnicode_GET_SIZE(self) == 1 &&
3746 Py_UNICODE_ISALPHA(*p))
3747 return PyInt_FromLong(1);
3748
3749 /* Special case for empty strings */
3750 if (PyString_GET_SIZE(self) == 0)
3751 return PyInt_FromLong(0);
3752
3753 e = p + PyUnicode_GET_SIZE(self);
3754 for (; p < e; p++) {
3755 if (!Py_UNICODE_ISALPHA(*p))
3756 return PyInt_FromLong(0);
3757 }
3758 return PyInt_FromLong(1);
3759}
3760
3761static char isalnum__doc__[] =
3762"S.isalnum() -> int\n\
3763\n\
3764Return 1 if all characters in S are alphanumeric\n\
3765and there is at least one character in S, 0 otherwise.";
3766
3767static PyObject*
3768unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3769{
3770 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3771 register const Py_UNICODE *e;
3772
3773 if (!PyArg_NoArgs(args))
3774 return NULL;
3775
3776 /* Shortcut for single character strings */
3777 if (PyUnicode_GET_SIZE(self) == 1 &&
3778 Py_UNICODE_ISALNUM(*p))
3779 return PyInt_FromLong(1);
3780
3781 /* Special case for empty strings */
3782 if (PyString_GET_SIZE(self) == 0)
3783 return PyInt_FromLong(0);
3784
3785 e = p + PyUnicode_GET_SIZE(self);
3786 for (; p < e; p++) {
3787 if (!Py_UNICODE_ISALNUM(*p))
3788 return PyInt_FromLong(0);
3789 }
3790 return PyInt_FromLong(1);
3791}
3792
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793static char isdecimal__doc__[] =
3794"S.isdecimal() -> int\n\
3795\n\
3796Return 1 if there are only decimal characters in S,\n\
37970 otherwise.";
3798
3799static PyObject*
3800unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3801{
3802 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3803 register const Py_UNICODE *e;
3804
3805 if (!PyArg_NoArgs(args))
3806 return NULL;
3807
3808 /* Shortcut for single character strings */
3809 if (PyUnicode_GET_SIZE(self) == 1 &&
3810 Py_UNICODE_ISDECIMAL(*p))
3811 return PyInt_FromLong(1);
3812
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003813 /* Special case for empty strings */
3814 if (PyString_GET_SIZE(self) == 0)
3815 return PyInt_FromLong(0);
3816
Guido van Rossumd57fd912000-03-10 22:53:23 +00003817 e = p + PyUnicode_GET_SIZE(self);
3818 for (; p < e; p++) {
3819 if (!Py_UNICODE_ISDECIMAL(*p))
3820 return PyInt_FromLong(0);
3821 }
3822 return PyInt_FromLong(1);
3823}
3824
3825static char isdigit__doc__[] =
3826"S.isdigit() -> int\n\
3827\n\
3828Return 1 if there are only digit characters in S,\n\
38290 otherwise.";
3830
3831static PyObject*
3832unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3833{
3834 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3835 register const Py_UNICODE *e;
3836
3837 if (!PyArg_NoArgs(args))
3838 return NULL;
3839
3840 /* Shortcut for single character strings */
3841 if (PyUnicode_GET_SIZE(self) == 1 &&
3842 Py_UNICODE_ISDIGIT(*p))
3843 return PyInt_FromLong(1);
3844
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003845 /* Special case for empty strings */
3846 if (PyString_GET_SIZE(self) == 0)
3847 return PyInt_FromLong(0);
3848
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849 e = p + PyUnicode_GET_SIZE(self);
3850 for (; p < e; p++) {
3851 if (!Py_UNICODE_ISDIGIT(*p))
3852 return PyInt_FromLong(0);
3853 }
3854 return PyInt_FromLong(1);
3855}
3856
3857static char isnumeric__doc__[] =
3858"S.isnumeric() -> int\n\
3859\n\
3860Return 1 if there are only numeric characters in S,\n\
38610 otherwise.";
3862
3863static PyObject*
3864unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3865{
3866 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3867 register const Py_UNICODE *e;
3868
3869 if (!PyArg_NoArgs(args))
3870 return NULL;
3871
3872 /* Shortcut for single character strings */
3873 if (PyUnicode_GET_SIZE(self) == 1 &&
3874 Py_UNICODE_ISNUMERIC(*p))
3875 return PyInt_FromLong(1);
3876
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003877 /* Special case for empty strings */
3878 if (PyString_GET_SIZE(self) == 0)
3879 return PyInt_FromLong(0);
3880
Guido van Rossumd57fd912000-03-10 22:53:23 +00003881 e = p + PyUnicode_GET_SIZE(self);
3882 for (; p < e; p++) {
3883 if (!Py_UNICODE_ISNUMERIC(*p))
3884 return PyInt_FromLong(0);
3885 }
3886 return PyInt_FromLong(1);
3887}
3888
3889static char join__doc__[] =
3890"S.join(sequence) -> unicode\n\
3891\n\
3892Return a string which is the concatenation of the strings in the\n\
3893sequence. The separator between elements is S.";
3894
3895static PyObject*
3896unicode_join(PyUnicodeObject *self, PyObject *args)
3897{
3898 PyObject *data;
3899 if (!PyArg_ParseTuple(args, "O:join", &data))
3900 return NULL;
3901
3902 return PyUnicode_Join((PyObject *)self, data);
3903}
3904
3905static int
3906unicode_length(PyUnicodeObject *self)
3907{
3908 return self->length;
3909}
3910
3911static char ljust__doc__[] =
3912"S.ljust(width) -> unicode\n\
3913\n\
3914Return S left justified in a Unicode string of length width. Padding is\n\
3915done using spaces.";
3916
3917static PyObject *
3918unicode_ljust(PyUnicodeObject *self, PyObject *args)
3919{
3920 int width;
3921 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3922 return NULL;
3923
3924 if (self->length >= width) {
3925 Py_INCREF(self);
3926 return (PyObject*) self;
3927 }
3928
3929 return (PyObject*) pad(self, 0, width - self->length, ' ');
3930}
3931
3932static char lower__doc__[] =
3933"S.lower() -> unicode\n\
3934\n\
3935Return a copy of the string S converted to lowercase.";
3936
3937static PyObject*
3938unicode_lower(PyUnicodeObject *self, PyObject *args)
3939{
3940 if (!PyArg_NoArgs(args))
3941 return NULL;
3942 return fixup(self, fixlower);
3943}
3944
3945static char lstrip__doc__[] =
3946"S.lstrip() -> unicode\n\
3947\n\
3948Return a copy of the string S with leading whitespace removed.";
3949
3950static PyObject *
3951unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3952{
3953 if (!PyArg_NoArgs(args))
3954 return NULL;
3955 return strip(self, 1, 0);
3956}
3957
3958static PyObject*
3959unicode_repeat(PyUnicodeObject *str, int len)
3960{
3961 PyUnicodeObject *u;
3962 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00003963 int nchars;
3964 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003965
3966 if (len < 0)
3967 len = 0;
3968
3969 if (len == 1) {
3970 /* no repeat, return original string */
3971 Py_INCREF(str);
3972 return (PyObject*) str;
3973 }
Tim Peters8f422462000-09-09 06:13:41 +00003974
3975 /* ensure # of chars needed doesn't overflow int and # of bytes
3976 * needed doesn't overflow size_t
3977 */
3978 nchars = len * str->length;
3979 if (len && nchars / len != str->length) {
3980 PyErr_SetString(PyExc_OverflowError,
3981 "repeated string is too long");
3982 return NULL;
3983 }
3984 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
3985 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
3986 PyErr_SetString(PyExc_OverflowError,
3987 "repeated string is too long");
3988 return NULL;
3989 }
3990 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991 if (!u)
3992 return NULL;
3993
3994 p = u->str;
3995
3996 while (len-- > 0) {
3997 Py_UNICODE_COPY(p, str->str, str->length);
3998 p += str->length;
3999 }
4000
4001 return (PyObject*) u;
4002}
4003
4004PyObject *PyUnicode_Replace(PyObject *obj,
4005 PyObject *subobj,
4006 PyObject *replobj,
4007 int maxcount)
4008{
4009 PyObject *self;
4010 PyObject *str1;
4011 PyObject *str2;
4012 PyObject *result;
4013
4014 self = PyUnicode_FromObject(obj);
4015 if (self == NULL)
4016 return NULL;
4017 str1 = PyUnicode_FromObject(subobj);
4018 if (str1 == NULL) {
4019 Py_DECREF(self);
4020 return NULL;
4021 }
4022 str2 = PyUnicode_FromObject(replobj);
4023 if (str2 == NULL) {
4024 Py_DECREF(self);
4025 Py_DECREF(str1);
4026 return NULL;
4027 }
4028 result = replace((PyUnicodeObject *)self,
4029 (PyUnicodeObject *)str1,
4030 (PyUnicodeObject *)str2,
4031 maxcount);
4032 Py_DECREF(self);
4033 Py_DECREF(str1);
4034 Py_DECREF(str2);
4035 return result;
4036}
4037
4038static char replace__doc__[] =
4039"S.replace (old, new[, maxsplit]) -> unicode\n\
4040\n\
4041Return a copy of S with all occurrences of substring\n\
4042old replaced by new. If the optional argument maxsplit is\n\
4043given, only the first maxsplit occurrences are replaced.";
4044
4045static PyObject*
4046unicode_replace(PyUnicodeObject *self, PyObject *args)
4047{
4048 PyUnicodeObject *str1;
4049 PyUnicodeObject *str2;
4050 int maxcount = -1;
4051 PyObject *result;
4052
4053 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4054 return NULL;
4055 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4056 if (str1 == NULL)
4057 return NULL;
4058 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4059 if (str2 == NULL)
4060 return NULL;
4061
4062 result = replace(self, str1, str2, maxcount);
4063
4064 Py_DECREF(str1);
4065 Py_DECREF(str2);
4066 return result;
4067}
4068
4069static
4070PyObject *unicode_repr(PyObject *unicode)
4071{
4072 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4073 PyUnicode_GET_SIZE(unicode),
4074 1);
4075}
4076
4077static char rfind__doc__[] =
4078"S.rfind(sub [,start [,end]]) -> int\n\
4079\n\
4080Return the highest index in S where substring sub is found,\n\
4081such that sub is contained within s[start,end]. Optional\n\
4082arguments start and end are interpreted as in slice notation.\n\
4083\n\
4084Return -1 on failure.";
4085
4086static PyObject *
4087unicode_rfind(PyUnicodeObject *self, PyObject *args)
4088{
4089 PyUnicodeObject *substring;
4090 int start = 0;
4091 int end = INT_MAX;
4092 PyObject *result;
4093
Guido van Rossumb8872e62000-05-09 14:14:27 +00004094 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4095 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004096 return NULL;
4097 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4098 (PyObject *)substring);
4099 if (substring == NULL)
4100 return NULL;
4101
4102 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4103
4104 Py_DECREF(substring);
4105 return result;
4106}
4107
4108static char rindex__doc__[] =
4109"S.rindex(sub [,start [,end]]) -> int\n\
4110\n\
4111Like S.rfind() but raise ValueError when the substring is not found.";
4112
4113static PyObject *
4114unicode_rindex(PyUnicodeObject *self, PyObject *args)
4115{
4116 int result;
4117 PyUnicodeObject *substring;
4118 int start = 0;
4119 int end = INT_MAX;
4120
Guido van Rossumb8872e62000-05-09 14:14:27 +00004121 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4122 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004123 return NULL;
4124 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4125 (PyObject *)substring);
4126 if (substring == NULL)
4127 return NULL;
4128
4129 result = findstring(self, substring, start, end, -1);
4130
4131 Py_DECREF(substring);
4132 if (result < 0) {
4133 PyErr_SetString(PyExc_ValueError, "substring not found");
4134 return NULL;
4135 }
4136 return PyInt_FromLong(result);
4137}
4138
4139static char rjust__doc__[] =
4140"S.rjust(width) -> unicode\n\
4141\n\
4142Return S right justified in a Unicode string of length width. Padding is\n\
4143done using spaces.";
4144
4145static PyObject *
4146unicode_rjust(PyUnicodeObject *self, PyObject *args)
4147{
4148 int width;
4149 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4150 return NULL;
4151
4152 if (self->length >= width) {
4153 Py_INCREF(self);
4154 return (PyObject*) self;
4155 }
4156
4157 return (PyObject*) pad(self, width - self->length, 0, ' ');
4158}
4159
4160static char rstrip__doc__[] =
4161"S.rstrip() -> unicode\n\
4162\n\
4163Return a copy of the string S with trailing whitespace removed.";
4164
4165static PyObject *
4166unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4167{
4168 if (!PyArg_NoArgs(args))
4169 return NULL;
4170 return strip(self, 0, 1);
4171}
4172
4173static PyObject*
4174unicode_slice(PyUnicodeObject *self, int start, int end)
4175{
4176 /* standard clamping */
4177 if (start < 0)
4178 start = 0;
4179 if (end < 0)
4180 end = 0;
4181 if (end > self->length)
4182 end = self->length;
4183 if (start == 0 && end == self->length) {
4184 /* full slice, return original string */
4185 Py_INCREF(self);
4186 return (PyObject*) self;
4187 }
4188 if (start > end)
4189 start = end;
4190 /* copy slice */
4191 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4192 end - start);
4193}
4194
4195PyObject *PyUnicode_Split(PyObject *s,
4196 PyObject *sep,
4197 int maxsplit)
4198{
4199 PyObject *result;
4200
4201 s = PyUnicode_FromObject(s);
4202 if (s == NULL)
4203 return NULL;
4204 if (sep != NULL) {
4205 sep = PyUnicode_FromObject(sep);
4206 if (sep == NULL) {
4207 Py_DECREF(s);
4208 return NULL;
4209 }
4210 }
4211
4212 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4213
4214 Py_DECREF(s);
4215 Py_XDECREF(sep);
4216 return result;
4217}
4218
4219static char split__doc__[] =
4220"S.split([sep [,maxsplit]]) -> list of strings\n\
4221\n\
4222Return a list of the words in S, using sep as the\n\
4223delimiter string. If maxsplit is given, at most maxsplit\n\
4224splits are done. If sep is not specified, any whitespace string\n\
4225is a separator.";
4226
4227static PyObject*
4228unicode_split(PyUnicodeObject *self, PyObject *args)
4229{
4230 PyObject *substring = Py_None;
4231 int maxcount = -1;
4232
4233 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4234 return NULL;
4235
4236 if (substring == Py_None)
4237 return split(self, NULL, maxcount);
4238 else if (PyUnicode_Check(substring))
4239 return split(self, (PyUnicodeObject *)substring, maxcount);
4240 else
4241 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4242}
4243
4244static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004245"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004246\n\
4247Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004248Line breaks are not included in the resulting list unless keepends\n\
4249is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004250
4251static PyObject*
4252unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4253{
Guido van Rossum86662912000-04-11 15:38:46 +00004254 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004255
Guido van Rossum86662912000-04-11 15:38:46 +00004256 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004257 return NULL;
4258
Guido van Rossum86662912000-04-11 15:38:46 +00004259 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004260}
4261
4262static
4263PyObject *unicode_str(PyUnicodeObject *self)
4264{
Fred Drakee4315f52000-05-09 19:53:39 +00004265 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004266}
4267
4268static char strip__doc__[] =
4269"S.strip() -> unicode\n\
4270\n\
4271Return a copy of S with leading and trailing whitespace removed.";
4272
4273static PyObject *
4274unicode_strip(PyUnicodeObject *self, PyObject *args)
4275{
4276 if (!PyArg_NoArgs(args))
4277 return NULL;
4278 return strip(self, 1, 1);
4279}
4280
4281static char swapcase__doc__[] =
4282"S.swapcase() -> unicode\n\
4283\n\
4284Return a copy of S with uppercase characters converted to lowercase\n\
4285and vice versa.";
4286
4287static PyObject*
4288unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4289{
4290 if (!PyArg_NoArgs(args))
4291 return NULL;
4292 return fixup(self, fixswapcase);
4293}
4294
4295static char translate__doc__[] =
4296"S.translate(table) -> unicode\n\
4297\n\
4298Return a copy of the string S, where all characters have been mapped\n\
4299through the given translation table, which must be a mapping of\n\
4300Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4301are left untouched. Characters mapped to None are deleted.";
4302
4303static PyObject*
4304unicode_translate(PyUnicodeObject *self, PyObject *args)
4305{
4306 PyObject *table;
4307
4308 if (!PyArg_ParseTuple(args, "O:translate", &table))
4309 return NULL;
4310 return PyUnicode_TranslateCharmap(self->str,
4311 self->length,
4312 table,
4313 "ignore");
4314}
4315
4316static char upper__doc__[] =
4317"S.upper() -> unicode\n\
4318\n\
4319Return a copy of S converted to uppercase.";
4320
4321static PyObject*
4322unicode_upper(PyUnicodeObject *self, PyObject *args)
4323{
4324 if (!PyArg_NoArgs(args))
4325 return NULL;
4326 return fixup(self, fixupper);
4327}
4328
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method wrapper for S.zfill(width) (currently compiled out).

   Returns self (with a new reference) when it is already at least
   `width` characters long.  Otherwise the string is left-padded with
   '0' via pad(), and a leading '+' or '-' of the original text is moved
   in front of the padding.  Returns NULL if argument parsing or the
   padding step fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() is not visible here; it is assumed to report failure by
       returning NULL, which must not be dereferenced below. */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original text. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4364
#if 0
/* Debugging aid (compiled out): accepts no arguments and returns the
   current value of unicode_freelist_size as a Python int. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args)) {
        return PyInt_FromLong(unicode_freelist_size);
    }
    return NULL;
}
#endif
4374
4375static char startswith__doc__[] =
4376"S.startswith(prefix[, start[, end]]) -> int\n\
4377\n\
4378Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4379optional start, test S beginning at that position. With optional end, stop\n\
4380comparing S at that position.";
4381
4382static PyObject *
4383unicode_startswith(PyUnicodeObject *self,
4384 PyObject *args)
4385{
4386 PyUnicodeObject *substring;
4387 int start = 0;
4388 int end = INT_MAX;
4389 PyObject *result;
4390
Guido van Rossumb8872e62000-05-09 14:14:27 +00004391 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4392 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393 return NULL;
4394 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4395 (PyObject *)substring);
4396 if (substring == NULL)
4397 return NULL;
4398
4399 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4400
4401 Py_DECREF(substring);
4402 return result;
4403}
4404
4405
4406static char endswith__doc__[] =
4407"S.endswith(suffix[, start[, end]]) -> int\n\
4408\n\
4409Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4410optional start, test S beginning at that position. With optional end, stop\n\
4411comparing S at that position.";
4412
4413static PyObject *
4414unicode_endswith(PyUnicodeObject *self,
4415 PyObject *args)
4416{
4417 PyUnicodeObject *substring;
4418 int start = 0;
4419 int end = INT_MAX;
4420 PyObject *result;
4421
Guido van Rossumb8872e62000-05-09 14:14:27 +00004422 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4423 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004424 return NULL;
4425 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4426 (PyObject *)substring);
4427 if (substring == NULL)
4428 return NULL;
4429
4430 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4431
4432 Py_DECREF(substring);
4433 return result;
4434}
4435
4436
4437static PyMethodDef unicode_methods[] = {
4438
4439 /* Order is according to common usage: often used methods should
4440 appear first, since lookup is done sequentially. */
4441
4442 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4443 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4444 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4445 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4446 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4447 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4448 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4449 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4450 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4451 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4452 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4453 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4454 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4455 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4456/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4457 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4458 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4459 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4460 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4461 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4462 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4463 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4464 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4465 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4466 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4467 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4468 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4469 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4470 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4471 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4472 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4473 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4474 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004475 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4476 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004477#if 0
4478 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4479 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4480#endif
4481
4482#if 0
4483 /* This one is just used for debugging the implementation. */
4484 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4485#endif
4486
4487 {NULL, NULL}
4488};
4489
4490static PyObject *
4491unicode_getattr(PyUnicodeObject *self, char *name)
4492{
4493 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4494}
4495
4496static PySequenceMethods unicode_as_sequence = {
4497 (inquiry) unicode_length, /* sq_length */
4498 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4499 (intargfunc) unicode_repeat, /* sq_repeat */
4500 (intargfunc) unicode_getitem, /* sq_item */
4501 (intintargfunc) unicode_slice, /* sq_slice */
4502 0, /* sq_ass_item */
4503 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004504 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004505};
4506
4507static int
4508unicode_buffer_getreadbuf(PyUnicodeObject *self,
4509 int index,
4510 const void **ptr)
4511{
4512 if (index != 0) {
4513 PyErr_SetString(PyExc_SystemError,
4514 "accessing non-existent unicode segment");
4515 return -1;
4516 }
4517 *ptr = (void *) self->str;
4518 return PyUnicode_GET_DATA_SIZE(self);
4519}
4520
4521static int
4522unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4523 const void **ptr)
4524{
4525 PyErr_SetString(PyExc_TypeError,
4526 "cannot use unicode as modifyable buffer");
4527 return -1;
4528}
4529
4530static int
4531unicode_buffer_getsegcount(PyUnicodeObject *self,
4532 int *lenp)
4533{
4534 if (lenp)
4535 *lenp = PyUnicode_GET_DATA_SIZE(self);
4536 return 1;
4537}
4538
4539static int
4540unicode_buffer_getcharbuf(PyUnicodeObject *self,
4541 int index,
4542 const void **ptr)
4543{
4544 PyObject *str;
4545
4546 if (index != 0) {
4547 PyErr_SetString(PyExc_SystemError,
4548 "accessing non-existent unicode segment");
4549 return -1;
4550 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004551 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004552 if (str == NULL)
4553 return -1;
4554 *ptr = (void *) PyString_AS_STRING(str);
4555 return PyString_GET_SIZE(str);
4556}
4557
4558/* Helpers for PyUnicode_Format() */
4559
4560static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004561getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004562{
4563 int argidx = *p_argidx;
4564 if (argidx < arglen) {
4565 (*p_argidx)++;
4566 if (arglen < 0)
4567 return args;
4568 else
4569 return PyTuple_GetItem(args, argidx);
4570 }
4571 PyErr_SetString(PyExc_TypeError,
4572 "not enough arguments for format string");
4573 return NULL;
4574}
4575
/* Conversion flag bits, one per flag character that PyUnicode_Format()
   recognizes after a '%'. */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
4581
4582static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004583int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004584{
4585 register int i;
4586 int len;
4587 va_list va;
4588 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004589 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004590
4591 /* First, format the string as char array, then expand to Py_UNICODE
4592 array. */
4593 charbuffer = (char *)buffer;
4594 len = vsprintf(charbuffer, format, va);
4595 for (i = len - 1; i >= 0; i--)
4596 buffer[i] = (Py_UNICODE) charbuffer[i];
4597
4598 va_end(va);
4599 return len;
4600}
4601
4602static int
4603formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004604 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004605 int flags,
4606 int prec,
4607 int type,
4608 PyObject *v)
4609{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004610 /* fmt = '%#.' + `prec` + `type`
4611 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004612 char fmt[20];
4613 double x;
4614
4615 x = PyFloat_AsDouble(v);
4616 if (x == -1.0 && PyErr_Occurred())
4617 return -1;
4618 if (prec < 0)
4619 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004620 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4621 type = 'g';
4622 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004623 /* worst case length calc to ensure no buffer overrun:
4624 fmt = %#.<prec>g
4625 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4626 for any double rep.)
4627 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4628 If prec=0 the effective precision is 1 (the leading digit is
4629 always given), therefore increase by one to 10+prec. */
4630 if (buflen <= (size_t)10 + (size_t)prec) {
4631 PyErr_SetString(PyExc_OverflowError,
4632 "formatted float is too long (precision too long?)");
4633 return -1;
4634 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004635 return usprintf(buf, fmt, x);
4636}
4637
Tim Peters38fd5b62000-09-21 05:43:11 +00004638static PyObject*
4639formatlong(PyObject *val, int flags, int prec, int type)
4640{
4641 char *buf;
4642 int i, len;
4643 PyObject *str; /* temporary string object. */
4644 PyUnicodeObject *result;
4645
4646 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4647 if (!str)
4648 return NULL;
4649 result = _PyUnicode_New(len);
4650 for (i = 0; i < len; i++)
4651 result->str[i] = buf[i];
4652 result->str[len] = 0;
4653 Py_DECREF(str);
4654 return (PyObject*)result;
4655}
4656
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657static int
4658formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004659 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004660 int flags,
4661 int prec,
4662 int type,
4663 PyObject *v)
4664{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004665 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004666 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4667 + 1 + 1 = 24*/
4668 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004669 long x;
4670
4671 x = PyInt_AsLong(v);
4672 if (x == -1 && PyErr_Occurred())
4673 return -1;
4674 if (prec < 0)
4675 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004676 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4677 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4678 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4679 PyErr_SetString(PyExc_OverflowError,
4680 "formatted integer is too long (precision too long?)");
4681 return -1;
4682 }
Tim Petersfff53252001-04-12 18:38:48 +00004683 /* When converting 0 under %#x or %#X, C leaves off the base marker,
4684 * but we want it (for consistency with other %#x conversions, and
4685 * for consistency with Python's hex() function).
4686 */
4687 if (x == 0 && (flags & F_ALT) && (type == 'x' || type == 'X'))
4688 sprintf(fmt, "0%c%%%s.%dl%c", type, "#", prec, type);
4689 else
4690 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004691 return usprintf(buf, fmt, x);
4692}
4693
4694static int
4695formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004696 size_t buflen,
4697 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004698{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004699 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004700 if (PyUnicode_Check(v)) {
4701 if (PyUnicode_GET_SIZE(v) != 1)
4702 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004703 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004704 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004705
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004706 else if (PyString_Check(v)) {
4707 if (PyString_GET_SIZE(v) != 1)
4708 goto onError;
4709 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4710 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004711
4712 else {
4713 /* Integer input truncated to a character */
4714 long x;
4715 x = PyInt_AsLong(v);
4716 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004717 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004718 buf[0] = (char) x;
4719 }
4720 buf[1] = '\0';
4721 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004722
4723 onError:
4724 PyErr_SetString(PyExc_TypeError,
4725 "%c requires int or char");
4726 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004727}
4728
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the capacity, in characters, of the scratch buffer
   that PyUnicode_Format() hands to the float, int and char formatting
   routines.

   XXX The value is a magic number.  Each formatting routine performs
   its own bounds check against it; allocating a buffer of the right
   size for every conversion might be a better solution. */
#define FORMATBUFLEN ((size_t)120)
4738
Guido van Rossumd57fd912000-03-10 22:53:23 +00004739PyObject *PyUnicode_Format(PyObject *format,
4740 PyObject *args)
4741{
4742 Py_UNICODE *fmt, *res;
4743 int fmtcnt, rescnt, reslen, arglen, argidx;
4744 int args_owned = 0;
4745 PyUnicodeObject *result = NULL;
4746 PyObject *dict = NULL;
4747 PyObject *uformat;
4748
4749 if (format == NULL || args == NULL) {
4750 PyErr_BadInternalCall();
4751 return NULL;
4752 }
4753 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004754 if (uformat == NULL)
4755 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756 fmt = PyUnicode_AS_UNICODE(uformat);
4757 fmtcnt = PyUnicode_GET_SIZE(uformat);
4758
4759 reslen = rescnt = fmtcnt + 100;
4760 result = _PyUnicode_New(reslen);
4761 if (result == NULL)
4762 goto onError;
4763 res = PyUnicode_AS_UNICODE(result);
4764
4765 if (PyTuple_Check(args)) {
4766 arglen = PyTuple_Size(args);
4767 argidx = 0;
4768 }
4769 else {
4770 arglen = -1;
4771 argidx = -2;
4772 }
4773 if (args->ob_type->tp_as_mapping)
4774 dict = args;
4775
4776 while (--fmtcnt >= 0) {
4777 if (*fmt != '%') {
4778 if (--rescnt < 0) {
4779 rescnt = fmtcnt + 100;
4780 reslen += rescnt;
4781 if (_PyUnicode_Resize(result, reslen) < 0)
4782 return NULL;
4783 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4784 --rescnt;
4785 }
4786 *res++ = *fmt++;
4787 }
4788 else {
4789 /* Got a format specifier */
4790 int flags = 0;
4791 int width = -1;
4792 int prec = -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004793 Py_UNICODE c = '\0';
4794 Py_UNICODE fill;
4795 PyObject *v = NULL;
4796 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004797 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004798 Py_UNICODE sign;
4799 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004800 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004801
4802 fmt++;
4803 if (*fmt == '(') {
4804 Py_UNICODE *keystart;
4805 int keylen;
4806 PyObject *key;
4807 int pcount = 1;
4808
4809 if (dict == NULL) {
4810 PyErr_SetString(PyExc_TypeError,
4811 "format requires a mapping");
4812 goto onError;
4813 }
4814 ++fmt;
4815 --fmtcnt;
4816 keystart = fmt;
4817 /* Skip over balanced parentheses */
4818 while (pcount > 0 && --fmtcnt >= 0) {
4819 if (*fmt == ')')
4820 --pcount;
4821 else if (*fmt == '(')
4822 ++pcount;
4823 fmt++;
4824 }
4825 keylen = fmt - keystart - 1;
4826 if (fmtcnt < 0 || pcount > 0) {
4827 PyErr_SetString(PyExc_ValueError,
4828 "incomplete format key");
4829 goto onError;
4830 }
Fred Drakee4315f52000-05-09 19:53:39 +00004831 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004832 then looked up since Python uses strings to hold
4833 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004834 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004835 key = PyUnicode_EncodeUTF8(keystart,
4836 keylen,
4837 NULL);
4838 if (key == NULL)
4839 goto onError;
4840 if (args_owned) {
4841 Py_DECREF(args);
4842 args_owned = 0;
4843 }
4844 args = PyObject_GetItem(dict, key);
4845 Py_DECREF(key);
4846 if (args == NULL) {
4847 goto onError;
4848 }
4849 args_owned = 1;
4850 arglen = -1;
4851 argidx = -2;
4852 }
4853 while (--fmtcnt >= 0) {
4854 switch (c = *fmt++) {
4855 case '-': flags |= F_LJUST; continue;
4856 case '+': flags |= F_SIGN; continue;
4857 case ' ': flags |= F_BLANK; continue;
4858 case '#': flags |= F_ALT; continue;
4859 case '0': flags |= F_ZERO; continue;
4860 }
4861 break;
4862 }
4863 if (c == '*') {
4864 v = getnextarg(args, arglen, &argidx);
4865 if (v == NULL)
4866 goto onError;
4867 if (!PyInt_Check(v)) {
4868 PyErr_SetString(PyExc_TypeError,
4869 "* wants int");
4870 goto onError;
4871 }
4872 width = PyInt_AsLong(v);
4873 if (width < 0) {
4874 flags |= F_LJUST;
4875 width = -width;
4876 }
4877 if (--fmtcnt >= 0)
4878 c = *fmt++;
4879 }
4880 else if (c >= '0' && c <= '9') {
4881 width = c - '0';
4882 while (--fmtcnt >= 0) {
4883 c = *fmt++;
4884 if (c < '0' || c > '9')
4885 break;
4886 if ((width*10) / 10 != width) {
4887 PyErr_SetString(PyExc_ValueError,
4888 "width too big");
4889 goto onError;
4890 }
4891 width = width*10 + (c - '0');
4892 }
4893 }
4894 if (c == '.') {
4895 prec = 0;
4896 if (--fmtcnt >= 0)
4897 c = *fmt++;
4898 if (c == '*') {
4899 v = getnextarg(args, arglen, &argidx);
4900 if (v == NULL)
4901 goto onError;
4902 if (!PyInt_Check(v)) {
4903 PyErr_SetString(PyExc_TypeError,
4904 "* wants int");
4905 goto onError;
4906 }
4907 prec = PyInt_AsLong(v);
4908 if (prec < 0)
4909 prec = 0;
4910 if (--fmtcnt >= 0)
4911 c = *fmt++;
4912 }
4913 else if (c >= '0' && c <= '9') {
4914 prec = c - '0';
4915 while (--fmtcnt >= 0) {
4916 c = Py_CHARMASK(*fmt++);
4917 if (c < '0' || c > '9')
4918 break;
4919 if ((prec*10) / 10 != prec) {
4920 PyErr_SetString(PyExc_ValueError,
4921 "prec too big");
4922 goto onError;
4923 }
4924 prec = prec*10 + (c - '0');
4925 }
4926 }
4927 } /* prec */
4928 if (fmtcnt >= 0) {
4929 if (c == 'h' || c == 'l' || c == 'L') {
Guido van Rossumd57fd912000-03-10 22:53:23 +00004930 if (--fmtcnt >= 0)
4931 c = *fmt++;
4932 }
4933 }
4934 if (fmtcnt < 0) {
4935 PyErr_SetString(PyExc_ValueError,
4936 "incomplete format");
4937 goto onError;
4938 }
4939 if (c != '%') {
4940 v = getnextarg(args, arglen, &argidx);
4941 if (v == NULL)
4942 goto onError;
4943 }
4944 sign = 0;
4945 fill = ' ';
4946 switch (c) {
4947
4948 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004949 pbuf = formatbuf;
4950 /* presume that buffer length is at least 1 */
4951 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952 len = 1;
4953 break;
4954
4955 case 's':
4956 case 'r':
4957 if (PyUnicode_Check(v) && c == 's') {
4958 temp = v;
4959 Py_INCREF(temp);
4960 }
4961 else {
4962 PyObject *unicode;
4963 if (c == 's')
4964 temp = PyObject_Str(v);
4965 else
4966 temp = PyObject_Repr(v);
4967 if (temp == NULL)
4968 goto onError;
4969 if (!PyString_Check(temp)) {
4970 /* XXX Note: this should never happen, since
4971 PyObject_Repr() and PyObject_Str() assure
4972 this */
4973 Py_DECREF(temp);
4974 PyErr_SetString(PyExc_TypeError,
4975 "%s argument has non-string str()");
4976 goto onError;
4977 }
Fred Drakee4315f52000-05-09 19:53:39 +00004978 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00004979 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00004980 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004981 "strict");
4982 Py_DECREF(temp);
4983 temp = unicode;
4984 if (temp == NULL)
4985 goto onError;
4986 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004987 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004988 len = PyUnicode_GET_SIZE(temp);
4989 if (prec >= 0 && len > prec)
4990 len = prec;
4991 break;
4992
4993 case 'i':
4994 case 'd':
4995 case 'u':
4996 case 'o':
4997 case 'x':
4998 case 'X':
4999 if (c == 'i')
5000 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005001 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005002 temp = formatlong(v, flags, prec, c);
5003 if (!temp)
5004 goto onError;
5005 pbuf = PyUnicode_AS_UNICODE(temp);
5006 len = PyUnicode_GET_SIZE(temp);
5007 /* unbounded ints can always produce
5008 a sign character! */
5009 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005010 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005011 else {
5012 pbuf = formatbuf;
5013 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5014 flags, prec, c, v);
5015 if (len < 0)
5016 goto onError;
5017 /* only d conversion is signed */
5018 sign = c == 'd';
5019 }
5020 if (flags & F_ZERO)
5021 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005022 break;
5023
5024 case 'e':
5025 case 'E':
5026 case 'f':
5027 case 'g':
5028 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005029 pbuf = formatbuf;
5030 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5031 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005032 if (len < 0)
5033 goto onError;
5034 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005035 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005036 fill = '0';
5037 break;
5038
5039 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005040 pbuf = formatbuf;
5041 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005042 if (len < 0)
5043 goto onError;
5044 break;
5045
5046 default:
5047 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005048 "unsupported format character '%c' (0x%x) "
5049 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005050 (31<=c && c<=126) ? c : '?',
5051 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005052 goto onError;
5053 }
5054 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005055 if (*pbuf == '-' || *pbuf == '+') {
5056 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057 len--;
5058 }
5059 else if (flags & F_SIGN)
5060 sign = '+';
5061 else if (flags & F_BLANK)
5062 sign = ' ';
5063 else
5064 sign = 0;
5065 }
5066 if (width < len)
5067 width = len;
5068 if (rescnt < width + (sign != 0)) {
5069 reslen -= rescnt;
5070 rescnt = width + fmtcnt + 100;
5071 reslen += rescnt;
5072 if (_PyUnicode_Resize(result, reslen) < 0)
5073 return NULL;
5074 res = PyUnicode_AS_UNICODE(result)
5075 + reslen - rescnt;
5076 }
5077 if (sign) {
5078 if (fill != ' ')
5079 *res++ = sign;
5080 rescnt--;
5081 if (width > len)
5082 width--;
5083 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005084 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5085 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005086 assert(pbuf[1] == c);
5087 if (fill != ' ') {
5088 *res++ = *pbuf++;
5089 *res++ = *pbuf++;
Tim Peters38fd5b62000-09-21 05:43:11 +00005090 }
Tim Petersfff53252001-04-12 18:38:48 +00005091 rescnt -= 2;
5092 width -= 2;
5093 if (width < 0)
5094 width = 0;
5095 len -= 2;
Tim Peters38fd5b62000-09-21 05:43:11 +00005096 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005097 if (width > len && !(flags & F_LJUST)) {
5098 do {
5099 --rescnt;
5100 *res++ = fill;
5101 } while (--width > len);
5102 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005103 if (fill == ' ') {
5104 if (sign)
5105 *res++ = sign;
Tim Petersfff53252001-04-12 18:38:48 +00005106 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005107 assert(pbuf[0] == '0');
Tim Petersfff53252001-04-12 18:38:48 +00005108 assert(pbuf[1] == c);
Tim Peters38fd5b62000-09-21 05:43:11 +00005109 *res++ = *pbuf++;
5110 *res++ = *pbuf++;
5111 }
5112 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005113 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005114 res += len;
5115 rescnt -= len;
5116 while (--width >= len) {
5117 --rescnt;
5118 *res++ = ' ';
5119 }
5120 if (dict && (argidx < arglen) && c != '%') {
5121 PyErr_SetString(PyExc_TypeError,
5122 "not all arguments converted");
5123 goto onError;
5124 }
5125 Py_XDECREF(temp);
5126 } /* '%' */
5127 } /* until end */
5128 if (argidx < arglen && !dict) {
5129 PyErr_SetString(PyExc_TypeError,
5130 "not all arguments converted");
5131 goto onError;
5132 }
5133
5134 if (args_owned) {
5135 Py_DECREF(args);
5136 }
5137 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005138 if (_PyUnicode_Resize(result, reslen - rescnt))
5139 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005140 return (PyObject *)result;
5141
5142 onError:
5143 Py_XDECREF(result);
5144 Py_DECREF(uformat);
5145 if (args_owned) {
5146 Py_DECREF(args);
5147 }
5148 return NULL;
5149}
5150
5151static PyBufferProcs unicode_as_buffer = {
5152 (getreadbufferproc) unicode_buffer_getreadbuf,
5153 (getwritebufferproc) unicode_buffer_getwritebuf,
5154 (getsegcountproc) unicode_buffer_getsegcount,
5155 (getcharbufferproc) unicode_buffer_getcharbuf,
5156};
5157
5158PyTypeObject PyUnicode_Type = {
5159 PyObject_HEAD_INIT(&PyType_Type)
5160 0, /* ob_size */
5161 "unicode", /* tp_name */
5162 sizeof(PyUnicodeObject), /* tp_size */
5163 0, /* tp_itemsize */
5164 /* Slots */
5165 (destructor)_PyUnicode_Free, /* tp_dealloc */
5166 0, /* tp_print */
5167 (getattrfunc)unicode_getattr, /* tp_getattr */
5168 0, /* tp_setattr */
5169 (cmpfunc) unicode_compare, /* tp_compare */
5170 (reprfunc) unicode_repr, /* tp_repr */
5171 0, /* tp_as_number */
5172 &unicode_as_sequence, /* tp_as_sequence */
5173 0, /* tp_as_mapping */
5174 (hashfunc) unicode_hash, /* tp_hash*/
5175 0, /* tp_call*/
5176 (reprfunc) unicode_str, /* tp_str */
5177 (getattrofunc) NULL, /* tp_getattro */
5178 (setattrofunc) NULL, /* tp_setattro */
5179 &unicode_as_buffer, /* tp_as_buffer */
5180 Py_TPFLAGS_DEFAULT, /* tp_flags */
5181};
5182
5183/* Initialize the Unicode implementation */
5184
Thomas Wouters78890102000-07-22 19:25:51 +00005185void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005186{
5187 /* Doublecheck the configuration... */
5188 if (sizeof(Py_UNICODE) != 2)
5189 Py_FatalError("Unicode configuration error: "
5190 "sizeof(Py_UNICODE) != 2 bytes");
5191
Fred Drakee4315f52000-05-09 19:53:39 +00005192 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005193 unicode_freelist = NULL;
5194 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005195 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005196 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005197}
5198
5199/* Finalize the Unicode implementation */
5200
5201void
Thomas Wouters78890102000-07-22 19:25:51 +00005202_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005203{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005204 PyUnicodeObject *u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005205
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005206 Py_XDECREF(unicode_empty);
5207 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005208
5209 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005210 PyUnicodeObject *v = u;
5211 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005212 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005213 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005214 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005215 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005216 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005217 unicode_freelist = NULL;
5218 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005219}