blob: 7a68dd40104d51b31583b5f30004ec3d9afa33ba [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "mymath.h"
68#include "unicodeobject.h"
69
70#if defined(HAVE_LIMITS_H)
71#include <limits.h>
72#else
73#define INT_MAX 2147483647
74#endif
75
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000076#ifdef MS_WIN32
77#include <windows.h>
78#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000079
Guido van Rossumd57fd912000-03-10 22:53:23 +000080/* Limit for the Unicode object free list */
81
82#define MAX_UNICODE_FREELIST_SIZE 1024
83
84/* Limit for the Unicode object free list stay alive optimization.
85
86 The implementation will keep allocated Unicode memory intact for
87 all objects on the free list having a size less than this
88 limit. This reduces malloc() overhead for small Unicode objects.
89
Barry Warsaw51ac5802000-03-20 16:36:48 +000090 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000091 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000092 malloc()-overhead) bytes of unused garbage.
93
94 Setting the limit to 0 effectively turns the feature off.
95
Guido van Rossumfd4b9572000-04-10 13:51:10 +000096 Note: This is an experimental feature ! If you get core dumps when
97 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
99*/
100
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000101#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +0000102
103/* Endianness switches; defaults to little endian */
104
105#ifdef WORDS_BIGENDIAN
106# define BYTEORDER_IS_BIG_ENDIAN
107#else
108# define BYTEORDER_IS_LITTLE_ENDIAN
109#endif
110
111/* --- Globals ------------------------------------------------------------ */
112
113/* The empty Unicode object */
114static PyUnicodeObject *unicode_empty = NULL;
115
116/* Free list for Unicode objects */
117static PyUnicodeObject *unicode_freelist = NULL;
118static int unicode_freelist_size = 0;
119
120/* --- Unicode Object ----------------------------------------------------- */
121
122static
123int _PyUnicode_Resize(register PyUnicodeObject *unicode,
124 int length)
125{
126 void *oldstr;
127
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000130 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000131
132 /* Resizing unicode_empty is not allowed. */
133 if (unicode == unicode_empty) {
134 PyErr_SetString(PyExc_SystemError,
135 "can't resize empty unicode object");
136 return -1;
137 }
138
139 /* We allocate one more byte to make sure the string is
140 Ux0000 terminated -- XXX is this needed ? */
141 oldstr = unicode->str;
142 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
143 if (!unicode->str) {
144 unicode->str = oldstr;
145 PyErr_NoMemory();
146 return -1;
147 }
148 unicode->str[length] = 0;
149 unicode->length = length;
150
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000151 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000152 /* Reset the object caches */
153 if (unicode->utf8str) {
154 Py_DECREF(unicode->utf8str);
155 unicode->utf8str = NULL;
156 }
157 unicode->hash = -1;
158
159 return 0;
160}
161
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000162int PyUnicode_Resize(PyObject **unicode,
163 int length)
164{
165 PyUnicodeObject *v;
166
167 if (unicode == NULL) {
168 PyErr_BadInternalCall();
169 return -1;
170 }
171 v = (PyUnicodeObject *)*unicode;
172 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
173 PyErr_BadInternalCall();
174 return -1;
175 }
176 return _PyUnicode_Resize(v, length);
177}
178
/* We allocate one more Py_UNICODE element (not just one byte) than
   requested to make sure the string is U+0000 terminated -- XXX is
   this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
186
187static
188PyUnicodeObject *_PyUnicode_New(int length)
189{
190 register PyUnicodeObject *unicode;
191
192 /* Optimization for empty strings */
193 if (length == 0 && unicode_empty != NULL) {
194 Py_INCREF(unicode_empty);
195 return unicode_empty;
196 }
197
198 /* Unicode freelist & memory allocation */
199 if (unicode_freelist) {
200 unicode = unicode_freelist;
201 unicode_freelist = *(PyUnicodeObject **)unicode_freelist;
202 unicode_freelist_size--;
203 unicode->ob_type = &PyUnicode_Type;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000204 _Py_NewReference((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000206 /* Keep-Alive optimization: we only upsize the buffer,
207 never downsize it. */
208 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000209 _PyUnicode_Resize(unicode, length)) {
210 free(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000211 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 }
213 }
214 else
215 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
216 }
217 else {
218 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
219 if (unicode == NULL)
220 return NULL;
221 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
222 }
223
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000224 if (!unicode->str) {
225 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000226 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000227 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228 unicode->str[length] = 0;
229 unicode->length = length;
230 unicode->hash = -1;
231 unicode->utf8str = NULL;
232 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000233
234 onError:
235 _Py_ForgetReference((PyObject *)unicode);
236 PyMem_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000237 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238}
239
240static
241void _PyUnicode_Free(register PyUnicodeObject *unicode)
242{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 free(unicode->str);
247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000250 if (unicode->utf8str) {
251 Py_DECREF(unicode->utf8str);
252 unicode->utf8str = NULL;
253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
260 free(unicode->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000261 Py_XDECREF(unicode->utf8str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 PyMem_DEL(unicode);
263 }
264}
265
266PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
267 int size)
268{
269 PyUnicodeObject *unicode;
270
271 unicode = _PyUnicode_New(size);
272 if (!unicode)
273 return NULL;
274
275 /* Copy the Unicode data into the new object */
276 if (u != NULL)
277 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
278
279 return (PyObject *)unicode;
280}
281
282#ifdef HAVE_WCHAR_H
283
284PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
285 int size)
286{
287 PyUnicodeObject *unicode;
288
289 if (w == NULL) {
290 PyErr_BadInternalCall();
291 return NULL;
292 }
293
294 unicode = _PyUnicode_New(size);
295 if (!unicode)
296 return NULL;
297
298 /* Copy the wchar_t data into the new object */
299#ifdef HAVE_USABLE_WCHAR_T
300 memcpy(unicode->str, w, size * sizeof(wchar_t));
301#else
302 {
303 register Py_UNICODE *u;
304 register int i;
305 u = PyUnicode_AS_UNICODE(unicode);
306 for (i = size; i >= 0; i--)
307 *u++ = *w++;
308 }
309#endif
310
311 return (PyObject *)unicode;
312}
313
314int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
315 register wchar_t *w,
316 int size)
317{
318 if (unicode == NULL) {
319 PyErr_BadInternalCall();
320 return -1;
321 }
322 if (size > PyUnicode_GET_SIZE(unicode))
323 size = PyUnicode_GET_SIZE(unicode);
324#ifdef HAVE_USABLE_WCHAR_T
325 memcpy(w, unicode->str, size * sizeof(wchar_t));
326#else
327 {
328 register Py_UNICODE *u;
329 register int i;
330 u = PyUnicode_AS_UNICODE(unicode);
331 for (i = size; i >= 0; i--)
332 *w++ = *u++;
333 }
334#endif
335
336 return size;
337}
338
339#endif
340
341PyObject *PyUnicode_FromObject(register PyObject *obj)
342{
343 const char *s;
344 int len;
345
346 if (obj == NULL) {
347 PyErr_BadInternalCall();
348 return NULL;
349 }
350 else if (PyUnicode_Check(obj)) {
351 Py_INCREF(obj);
352 return obj;
353 }
354 else if (PyString_Check(obj)) {
355 s = PyString_AS_STRING(obj);
356 len = PyString_GET_SIZE(obj);
357 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000358 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
359 /* Overwrite the error message with something more useful in
360 case of a TypeError. */
361 if (PyErr_ExceptionMatches(PyExc_TypeError))
362 PyErr_SetString(PyExc_TypeError,
363 "coercing to Unicode: need string or charbuffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000365 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 if (len == 0) {
367 Py_INCREF(unicode_empty);
368 return (PyObject *)unicode_empty;
369 }
370 return PyUnicode_DecodeUTF8(s, len, "strict");
371}
372
373PyObject *PyUnicode_Decode(const char *s,
374 int size,
375 const char *encoding,
376 const char *errors)
377{
378 PyObject *buffer = NULL, *unicode;
379
380 /* Shortcut for the default encoding UTF-8 */
381 if (encoding == NULL ||
382 (strcmp(encoding, "utf-8") == 0))
383 return PyUnicode_DecodeUTF8(s, size, errors);
384
385 /* Decode via the codec registry */
386 buffer = PyBuffer_FromMemory((void *)s, size);
387 if (buffer == NULL)
388 goto onError;
389 unicode = PyCodec_Decode(buffer, encoding, errors);
390 if (unicode == NULL)
391 goto onError;
392 if (!PyUnicode_Check(unicode)) {
393 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000394 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000395 unicode->ob_type->tp_name);
396 Py_DECREF(unicode);
397 goto onError;
398 }
399 Py_DECREF(buffer);
400 return unicode;
401
402 onError:
403 Py_XDECREF(buffer);
404 return NULL;
405}
406
407PyObject *PyUnicode_Encode(const Py_UNICODE *s,
408 int size,
409 const char *encoding,
410 const char *errors)
411{
412 PyObject *v, *unicode;
413
414 unicode = PyUnicode_FromUnicode(s, size);
415 if (unicode == NULL)
416 return NULL;
417 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
418 Py_DECREF(unicode);
419 return v;
420}
421
422PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
423 const char *encoding,
424 const char *errors)
425{
426 PyObject *v;
427
428 if (!PyUnicode_Check(unicode)) {
429 PyErr_BadArgument();
430 goto onError;
431 }
432 /* Shortcut for the default encoding UTF-8 */
433 if ((encoding == NULL ||
434 (strcmp(encoding, "utf-8") == 0)) &&
435 errors == NULL)
436 return PyUnicode_AsUTF8String(unicode);
437
438 /* Encode via the codec registry */
439 v = PyCodec_Encode(unicode, encoding, errors);
440 if (v == NULL)
441 goto onError;
442 /* XXX Should we really enforce this ? */
443 if (!PyString_Check(v)) {
444 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000445 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000446 v->ob_type->tp_name);
447 Py_DECREF(v);
448 goto onError;
449 }
450 return v;
451
452 onError:
453 return NULL;
454}
455
456Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
457{
458 if (!PyUnicode_Check(unicode)) {
459 PyErr_BadArgument();
460 goto onError;
461 }
462 return PyUnicode_AS_UNICODE(unicode);
463
464 onError:
465 return NULL;
466}
467
468int PyUnicode_GetSize(PyObject *unicode)
469{
470 if (!PyUnicode_Check(unicode)) {
471 PyErr_BadArgument();
472 goto onError;
473 }
474 return PyUnicode_GET_SIZE(unicode);
475
476 onError:
477 return -1;
478}
479
480/* --- UTF-8 Codec -------------------------------------------------------- */
481
/* Map a UTF-8 lead byte to the total length in bytes of the sequence
   it introduces.  Zero marks a byte that cannot start a sequence:
   0x80-0xBF, and 0xFE/0xFF.  See RFC 2279 for details.

   The table is only ever read, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00-0x7F: single byte (US-ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: illegal as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFD: four, five and six bytes; 0xFE, 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
503
504static
505int utf8_decoding_error(const char **source,
506 Py_UNICODE **dest,
507 const char *errors,
508 const char *details)
509{
510 if ((errors == NULL) ||
511 (strcmp(errors,"strict") == 0)) {
512 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000513 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 details);
515 return -1;
516 }
517 else if (strcmp(errors,"ignore") == 0) {
518 (*source)++;
519 return 0;
520 }
521 else if (strcmp(errors,"replace") == 0) {
522 (*source)++;
523 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
524 (*dest)++;
525 return 0;
526 }
527 else {
528 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000529 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000530 errors);
531 return -1;
532 }
533}
534
535#define UTF8_ERROR(details) do { \
536 if (utf8_decoding_error(&s, &p, errors, details)) \
537 goto onError; \
538 continue; \
539} while (0)
540
541PyObject *PyUnicode_DecodeUTF8(const char *s,
542 int size,
543 const char *errors)
544{
545 int n;
546 const char *e;
547 PyUnicodeObject *unicode;
548 Py_UNICODE *p;
549
550 /* Note: size will always be longer than the resulting Unicode
551 character count */
552 unicode = _PyUnicode_New(size);
553 if (!unicode)
554 return NULL;
555 if (size == 0)
556 return (PyObject *)unicode;
557
558 /* Unpack UTF-8 encoded data */
559 p = unicode->str;
560 e = s + size;
561
562 while (s < e) {
563 register Py_UNICODE ch = (unsigned char)*s;
564
565 if (ch < 0x80) {
566 *p++ = ch;
567 s++;
568 continue;
569 }
570
571 n = utf8_code_length[ch];
572
573 if (s + n > e)
574 UTF8_ERROR("unexpected end of data");
575
576 switch (n) {
577
578 case 0:
579 UTF8_ERROR("unexpected code byte");
580 break;
581
582 case 1:
583 UTF8_ERROR("internal error");
584 break;
585
586 case 2:
587 if ((s[1] & 0xc0) != 0x80)
588 UTF8_ERROR("invalid data");
589 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
590 if (ch < 0x80)
591 UTF8_ERROR("illegal encoding");
592 else
593 *p++ = ch;
594 break;
595
596 case 3:
597 if ((s[1] & 0xc0) != 0x80 ||
598 (s[2] & 0xc0) != 0x80)
599 UTF8_ERROR("invalid data");
600 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
601 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
602 UTF8_ERROR("illegal encoding");
603 else
604 *p++ = ch;
605 break;
606
607 default:
608 /* Other sizes are only needed for UCS-4 */
609 UTF8_ERROR("unsupported Unicode code range");
610 }
611 s += n;
612 }
613
614 /* Adjust length */
615 if (_PyUnicode_Resize(unicode, p - unicode->str))
616 goto onError;
617
618 return (PyObject *)unicode;
619
620onError:
621 Py_DECREF(unicode);
622 return NULL;
623}
624
625#undef UTF8_ERROR
626
627static
628int utf8_encoding_error(const Py_UNICODE **source,
629 char **dest,
630 const char *errors,
631 const char *details)
632{
633 if ((errors == NULL) ||
634 (strcmp(errors,"strict") == 0)) {
635 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000636 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637 details);
638 return -1;
639 }
640 else if (strcmp(errors,"ignore") == 0) {
641 return 0;
642 }
643 else if (strcmp(errors,"replace") == 0) {
644 **dest = '?';
645 (*dest)++;
646 return 0;
647 }
648 else {
649 PyErr_Format(PyExc_ValueError,
650 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000651 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000652 errors);
653 return -1;
654 }
655}
656
657PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
658 int size,
659 const char *errors)
660{
661 PyObject *v;
662 char *p;
663 char *q;
664
665 v = PyString_FromStringAndSize(NULL, 3 * size);
666 if (v == NULL)
667 return NULL;
668 if (size == 0)
669 goto done;
670
671 p = q = PyString_AS_STRING(v);
672 while (size-- > 0) {
673 Py_UNICODE ch = *s++;
674 if (ch < 0x80)
675 *p++ = (char) ch;
676 else if (ch < 0x0800) {
677 *p++ = 0xc0 | (ch >> 6);
678 *p++ = 0x80 | (ch & 0x3f);
679 } else if (0xD800 <= ch && ch <= 0xDFFF) {
680 /* These byte ranges are reserved for UTF-16 surrogate
681 bytes which the Python implementation currently does
682 not support. */
683 printf("code range problem: U+%04x\n", ch);
684 if (utf8_encoding_error(&s, &p, errors,
685 "unsupported code range"))
686 goto onError;
687 } else {
688 *p++ = 0xe0 | (ch >> 12);
689 *p++ = 0x80 | ((ch >> 6) & 0x3f);
690 *p++ = 0x80 | (ch & 0x3f);
691 }
692 }
693 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000694 if (_PyString_Resize(&v, p - q))
695 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000696
697 done:
698 return v;
699
700 onError:
701 Py_DECREF(v);
702 return NULL;
703}
704
705/* Return a Python string holding the UTF-8 encoded value of the
706 Unicode object.
707
708 The resulting string is cached in the Unicode object for subsequent
709 usage by this function. The cached version is needed to implement
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000710 the character buffer interface and will live (at least) as long as
711 the Unicode object itself.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000712
713 The refcount of the string is *not* incremented.
714
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000715 *** Exported for internal use by the interpreter only !!! ***
716
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717*/
718
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000719PyObject *_PyUnicode_AsUTF8String(PyObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000720 const char *errors)
721{
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000722 PyObject *v = ((PyUnicodeObject *)unicode)->utf8str;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000723
724 if (v)
725 return v;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000726 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
727 PyUnicode_GET_SIZE(unicode),
Guido van Rossumd57fd912000-03-10 22:53:23 +0000728 errors);
729 if (v && errors == NULL)
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000730 ((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000731 return v;
732}
733
734PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
735{
736 PyObject *str;
737
738 if (!PyUnicode_Check(unicode)) {
739 PyErr_BadArgument();
740 return NULL;
741 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000742 str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000743 if (str == NULL)
744 return NULL;
745 Py_INCREF(str);
746 return str;
747}
748
749/* --- UTF-16 Codec ------------------------------------------------------- */
750
751static
752int utf16_decoding_error(const Py_UNICODE **source,
753 Py_UNICODE **dest,
754 const char *errors,
755 const char *details)
756{
757 if ((errors == NULL) ||
758 (strcmp(errors,"strict") == 0)) {
759 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000760 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000761 details);
762 return -1;
763 }
764 else if (strcmp(errors,"ignore") == 0) {
765 return 0;
766 }
767 else if (strcmp(errors,"replace") == 0) {
768 if (dest) {
769 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
770 (*dest)++;
771 }
772 return 0;
773 }
774 else {
775 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000776 "UTF-16 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000777 errors);
778 return -1;
779 }
780}
781
782#define UTF16_ERROR(details) do { \
783 if (utf16_decoding_error(&q, &p, errors, details)) \
784 goto onError; \
785 continue; \
786} while(0)
787
788PyObject *PyUnicode_DecodeUTF16(const char *s,
789 int size,
790 const char *errors,
791 int *byteorder)
792{
793 PyUnicodeObject *unicode;
794 Py_UNICODE *p;
795 const Py_UNICODE *q, *e;
796 int bo = 0;
797
798 /* size should be an even number */
799 if (size % sizeof(Py_UNICODE) != 0) {
800 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
801 return NULL;
802 /* The remaining input chars are ignored if we fall through
803 here... */
804 }
805
806 /* Note: size will always be longer than the resulting Unicode
807 character count */
808 unicode = _PyUnicode_New(size);
809 if (!unicode)
810 return NULL;
811 if (size == 0)
812 return (PyObject *)unicode;
813
814 /* Unpack UTF-16 encoded data */
815 p = unicode->str;
816 q = (Py_UNICODE *)s;
817 e = q + (size / sizeof(Py_UNICODE));
818
819 if (byteorder)
820 bo = *byteorder;
821
822 while (q < e) {
823 register Py_UNICODE ch = *q++;
824
825 /* Check for BOM marks (U+FEFF) in the input and adjust
826 current byte order setting accordingly. Swap input
827 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
828 !) */
829#ifdef BYTEORDER_IS_LITTLE_ENDIAN
830 if (ch == 0xFEFF) {
831 bo = -1;
832 continue;
833 } else if (ch == 0xFFFE) {
834 bo = 1;
835 continue;
836 }
837 if (bo == 1)
838 ch = (ch >> 8) | (ch << 8);
839#else
840 if (ch == 0xFEFF) {
841 bo = 1;
842 continue;
843 } else if (ch == 0xFFFE) {
844 bo = -1;
845 continue;
846 }
847 if (bo == -1)
848 ch = (ch >> 8) | (ch << 8);
849#endif
850 if (ch < 0xD800 || ch > 0xDFFF) {
851 *p++ = ch;
852 continue;
853 }
854
855 /* UTF-16 code pair: */
856 if (q >= e)
857 UTF16_ERROR("unexpected end of data");
858 if (0xDC00 <= *q && *q <= 0xDFFF) {
859 q++;
860 if (0xD800 <= *q && *q <= 0xDBFF)
861 /* This is valid data (a UTF-16 surrogate pair), but
862 we are not able to store this information since our
863 Py_UNICODE type only has 16 bits... this might
864 change someday, even though it's unlikely. */
865 UTF16_ERROR("code pairs are not supported");
866 else
867 continue;
868 }
869 UTF16_ERROR("illegal encoding");
870 }
871
872 if (byteorder)
873 *byteorder = bo;
874
875 /* Adjust length */
876 if (_PyUnicode_Resize(unicode, p - unicode->str))
877 goto onError;
878
879 return (PyObject *)unicode;
880
881onError:
882 Py_DECREF(unicode);
883 return NULL;
884}
885
886#undef UTF16_ERROR
887
888PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
889 int size,
890 const char *errors,
891 int byteorder)
892{
893 PyObject *v;
894 Py_UNICODE *p;
895 char *q;
896
897 /* We don't create UTF-16 pairs... */
898 v = PyString_FromStringAndSize(NULL,
899 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
900 if (v == NULL)
901 return NULL;
902 if (size == 0)
903 goto done;
904
905 q = PyString_AS_STRING(v);
906 p = (Py_UNICODE *)q;
907
908 if (byteorder == 0)
909 *p++ = 0xFEFF;
910 if (byteorder == 0 ||
911#ifdef BYTEORDER_IS_LITTLE_ENDIAN
912 byteorder == -1
913#else
914 byteorder == 1
915#endif
916 )
917 memcpy(p, s, size * sizeof(Py_UNICODE));
918 else
919 while (size-- > 0) {
920 Py_UNICODE ch = *s++;
921 *p++ = (ch >> 8) | (ch << 8);
922 }
923 done:
924 return v;
925}
926
927PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
928{
929 if (!PyUnicode_Check(unicode)) {
930 PyErr_BadArgument();
931 return NULL;
932 }
933 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
934 PyUnicode_GET_SIZE(unicode),
935 NULL,
936 0);
937}
938
939/* --- Unicode Escape Codec ----------------------------------------------- */
940
941static
942int unicodeescape_decoding_error(const char **source,
943 unsigned int *x,
944 const char *errors,
945 const char *details)
946{
947 if ((errors == NULL) ||
948 (strcmp(errors,"strict") == 0)) {
949 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000950 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000951 details);
952 return -1;
953 }
954 else if (strcmp(errors,"ignore") == 0) {
955 return 0;
956 }
957 else if (strcmp(errors,"replace") == 0) {
958 *x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
959 return 0;
960 }
961 else {
962 PyErr_Format(PyExc_ValueError,
963 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000964 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000965 errors);
966 return -1;
967 }
968}
969
970PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
971 int size,
972 const char *errors)
973{
974 PyUnicodeObject *v;
975 Py_UNICODE *p = NULL, *buf = NULL;
976 const char *end;
977
978 /* Escaped strings will always be longer than the resulting
979 Unicode string, so we start with size here and then reduce the
980 length after conversion to the true value. */
981 v = _PyUnicode_New(size);
982 if (v == NULL)
983 goto onError;
984 if (size == 0)
985 return (PyObject *)v;
986 p = buf = PyUnicode_AS_UNICODE(v);
987 end = s + size;
988 while (s < end) {
989 unsigned char c;
990 unsigned int x;
991 int i;
992
993 /* Non-escape characters are interpreted as Unicode ordinals */
994 if (*s != '\\') {
995 *p++ = (unsigned char)*s++;
996 continue;
997 }
998
999 /* \ - Escapes */
1000 s++;
1001 switch (*s++) {
1002
1003 /* \x escapes */
1004 case '\n': break;
1005 case '\\': *p++ = '\\'; break;
1006 case '\'': *p++ = '\''; break;
1007 case '\"': *p++ = '\"'; break;
1008 case 'b': *p++ = '\b'; break;
1009 case 'f': *p++ = '\014'; break; /* FF */
1010 case 't': *p++ = '\t'; break;
1011 case 'n': *p++ = '\n'; break;
1012 case 'r': *p++ = '\r'; break;
1013 case 'v': *p++ = '\013'; break; /* VT */
1014 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1015
1016 /* \OOO (octal) escapes */
1017 case '0': case '1': case '2': case '3':
1018 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001019 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001020 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001021 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001022 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001023 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001025 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001026 break;
1027
1028 /* \xXXXX escape with 0-4 hex digits */
1029 case 'x':
1030 x = 0;
1031 c = (unsigned char)*s;
1032 if (isxdigit(c)) {
1033 do {
1034 x = (x<<4) & ~0xF;
1035 if ('0' <= c && c <= '9')
1036 x += c - '0';
1037 else if ('a' <= c && c <= 'f')
1038 x += 10 + c - 'a';
1039 else
1040 x += 10 + c - 'A';
1041 c = (unsigned char)*++s;
1042 } while (isxdigit(c));
1043 *p++ = x;
1044 } else {
1045 *p++ = '\\';
1046 *p++ = (unsigned char)s[-1];
1047 }
1048 break;
1049
1050 /* \uXXXX with 4 hex digits */
1051 case 'u':
1052 for (x = 0, i = 0; i < 4; i++) {
1053 c = (unsigned char)s[i];
1054 if (!isxdigit(c)) {
1055 if (unicodeescape_decoding_error(&s, &x, errors,
1056 "truncated \\uXXXX"))
1057 goto onError;
1058 i++;
1059 break;
1060 }
1061 x = (x<<4) & ~0xF;
1062 if (c >= '0' && c <= '9')
1063 x += c - '0';
1064 else if (c >= 'a' && c <= 'f')
1065 x += 10 + c - 'a';
1066 else
1067 x += 10 + c - 'A';
1068 }
1069 s += i;
1070 *p++ = x;
1071 break;
1072
1073 default:
1074 *p++ = '\\';
1075 *p++ = (unsigned char)s[-1];
1076 break;
1077 }
1078 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001079 if (_PyUnicode_Resize(v, (int)(p - buf)))
1080 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001081 return (PyObject *)v;
1082
1083 onError:
1084 Py_XDECREF(v);
1085 return NULL;
1086}
1087
1088/* Return a Unicode-Escape string version of the Unicode object.
1089
1090 If quotes is true, the string is enclosed in u"" or u'' quotes as
1091 appropriate.
1092
1093*/
1094
Barry Warsaw51ac5802000-03-20 16:36:48 +00001095static const Py_UNICODE *findchar(const Py_UNICODE *s,
1096 int size,
1097 Py_UNICODE ch);
1098
Guido van Rossumd57fd912000-03-10 22:53:23 +00001099static
1100PyObject *unicodeescape_string(const Py_UNICODE *s,
1101 int size,
1102 int quotes)
1103{
1104 PyObject *repr;
1105 char *p;
1106 char *q;
1107
1108 static const char *hexdigit = "0123456789ABCDEF";
1109
1110 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1111 if (repr == NULL)
1112 return NULL;
1113
1114 p = q = PyString_AS_STRING(repr);
1115
1116 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001117 *p++ = 'u';
1118 *p++ = (findchar(s, size, '\'') &&
1119 !findchar(s, size, '"')) ? '"' : '\'';
1120 }
1121 while (size-- > 0) {
1122 Py_UNICODE ch = *s++;
1123 /* Escape quotes */
1124 if (quotes && (ch == q[1] || ch == '\\')) {
1125 *p++ = '\\';
1126 *p++ = (char) ch;
1127 }
1128 /* Map 16-bit characters to '\uxxxx' */
1129 else if (ch >= 256) {
1130 *p++ = '\\';
1131 *p++ = 'u';
1132 *p++ = hexdigit[(ch >> 12) & 0xf];
1133 *p++ = hexdigit[(ch >> 8) & 0xf];
1134 *p++ = hexdigit[(ch >> 4) & 0xf];
1135 *p++ = hexdigit[ch & 15];
1136 }
1137 /* Map non-printable US ASCII to '\ooo' */
1138 else if (ch < ' ' || ch >= 128) {
1139 *p++ = '\\';
1140 *p++ = hexdigit[(ch >> 6) & 7];
1141 *p++ = hexdigit[(ch >> 3) & 7];
1142 *p++ = hexdigit[ch & 7];
1143 }
1144 /* Copy everything else as-is */
1145 else
1146 *p++ = (char) ch;
1147 }
1148 if (quotes)
1149 *p++ = q[1];
1150
1151 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001152 if (_PyString_Resize(&repr, p - q))
1153 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001154
1155 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001156
1157 onError:
1158 Py_DECREF(repr);
1159 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001160}
1161
1162PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1163 int size)
1164{
1165 return unicodeescape_string(s, size, 0);
1166}
1167
1168PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1169{
1170 if (!PyUnicode_Check(unicode)) {
1171 PyErr_BadArgument();
1172 return NULL;
1173 }
1174 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1175 PyUnicode_GET_SIZE(unicode));
1176}
1177
1178/* --- Raw Unicode Escape Codec ------------------------------------------- */
1179
1180PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1181 int size,
1182 const char *errors)
1183{
1184 PyUnicodeObject *v;
1185 Py_UNICODE *p, *buf;
1186 const char *end;
1187 const char *bs;
1188
1189 /* Escaped strings will always be longer than the resulting
1190 Unicode string, so we start with size here and then reduce the
1191 length after conversion to the true value. */
1192 v = _PyUnicode_New(size);
1193 if (v == NULL)
1194 goto onError;
1195 if (size == 0)
1196 return (PyObject *)v;
1197 p = buf = PyUnicode_AS_UNICODE(v);
1198 end = s + size;
1199 while (s < end) {
1200 unsigned char c;
1201 unsigned int x;
1202 int i;
1203
1204 /* Non-escape characters are interpreted as Unicode ordinals */
1205 if (*s != '\\') {
1206 *p++ = (unsigned char)*s++;
1207 continue;
1208 }
1209
1210 /* \u-escapes are only interpreted iff the number of leading
1211 backslashes if odd */
1212 bs = s;
1213 for (;s < end;) {
1214 if (*s != '\\')
1215 break;
1216 *p++ = (unsigned char)*s++;
1217 }
1218 if (((s - bs) & 1) == 0 ||
1219 s >= end ||
1220 *s != 'u') {
1221 continue;
1222 }
1223 p--;
1224 s++;
1225
1226 /* \uXXXX with 4 hex digits */
1227 for (x = 0, i = 0; i < 4; i++) {
1228 c = (unsigned char)s[i];
1229 if (!isxdigit(c)) {
1230 if (unicodeescape_decoding_error(&s, &x, errors,
1231 "truncated \\uXXXX"))
1232 goto onError;
1233 i++;
1234 break;
1235 }
1236 x = (x<<4) & ~0xF;
1237 if (c >= '0' && c <= '9')
1238 x += c - '0';
1239 else if (c >= 'a' && c <= 'f')
1240 x += 10 + c - 'a';
1241 else
1242 x += 10 + c - 'A';
1243 }
1244 s += i;
1245 *p++ = x;
1246 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001247 if (_PyUnicode_Resize(v, (int)(p - buf)))
1248 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001249 return (PyObject *)v;
1250
1251 onError:
1252 Py_XDECREF(v);
1253 return NULL;
1254}
1255
1256PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1257 int size)
1258{
1259 PyObject *repr;
1260 char *p;
1261 char *q;
1262
1263 static const char *hexdigit = "0123456789ABCDEF";
1264
1265 repr = PyString_FromStringAndSize(NULL, 6 * size);
1266 if (repr == NULL)
1267 return NULL;
1268
1269 p = q = PyString_AS_STRING(repr);
1270 while (size-- > 0) {
1271 Py_UNICODE ch = *s++;
1272 /* Map 16-bit characters to '\uxxxx' */
1273 if (ch >= 256) {
1274 *p++ = '\\';
1275 *p++ = 'u';
1276 *p++ = hexdigit[(ch >> 12) & 0xf];
1277 *p++ = hexdigit[(ch >> 8) & 0xf];
1278 *p++ = hexdigit[(ch >> 4) & 0xf];
1279 *p++ = hexdigit[ch & 15];
1280 }
1281 /* Copy everything else as-is */
1282 else
1283 *p++ = (char) ch;
1284 }
1285 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001286 if (_PyString_Resize(&repr, p - q))
1287 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001288
1289 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001290
1291 onError:
1292 Py_DECREF(repr);
1293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001294}
1295
1296PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1297{
1298 if (!PyUnicode_Check(unicode)) {
1299 PyErr_BadArgument();
1300 return NULL;
1301 }
1302 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1303 PyUnicode_GET_SIZE(unicode));
1304}
1305
1306/* --- Latin-1 Codec ------------------------------------------------------ */
1307
1308PyObject *PyUnicode_DecodeLatin1(const char *s,
1309 int size,
1310 const char *errors)
1311{
1312 PyUnicodeObject *v;
1313 Py_UNICODE *p;
1314
1315 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1316 v = _PyUnicode_New(size);
1317 if (v == NULL)
1318 goto onError;
1319 if (size == 0)
1320 return (PyObject *)v;
1321 p = PyUnicode_AS_UNICODE(v);
1322 while (size-- > 0)
1323 *p++ = (unsigned char)*s++;
1324 return (PyObject *)v;
1325
1326 onError:
1327 Py_XDECREF(v);
1328 return NULL;
1329}
1330
1331static
1332int latin1_encoding_error(const Py_UNICODE **source,
1333 char **dest,
1334 const char *errors,
1335 const char *details)
1336{
1337 if ((errors == NULL) ||
1338 (strcmp(errors,"strict") == 0)) {
1339 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001340 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001341 details);
1342 return -1;
1343 }
1344 else if (strcmp(errors,"ignore") == 0) {
1345 return 0;
1346 }
1347 else if (strcmp(errors,"replace") == 0) {
1348 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001349 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001350 return 0;
1351 }
1352 else {
1353 PyErr_Format(PyExc_ValueError,
1354 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001355 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001356 errors);
1357 return -1;
1358 }
1359}
1360
1361PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1362 int size,
1363 const char *errors)
1364{
1365 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001366 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001367 repr = PyString_FromStringAndSize(NULL, size);
1368 if (repr == NULL)
1369 return NULL;
1370
1371 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001372 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001373 while (size-- > 0) {
1374 Py_UNICODE ch = *p++;
1375 if (ch >= 256) {
1376 if (latin1_encoding_error(&p, &s, errors,
1377 "ordinal not in range(256)"))
1378 goto onError;
1379 }
1380 else
1381 *s++ = (char)ch;
1382 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001383 /* Resize if error handling skipped some characters */
1384 if (s - start < PyString_GET_SIZE(repr))
1385 if (_PyString_Resize(&repr, s - start))
1386 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001387 return repr;
1388
1389 onError:
1390 Py_DECREF(repr);
1391 return NULL;
1392}
1393
1394PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1395{
1396 if (!PyUnicode_Check(unicode)) {
1397 PyErr_BadArgument();
1398 return NULL;
1399 }
1400 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1401 PyUnicode_GET_SIZE(unicode),
1402 NULL);
1403}
1404
1405/* --- 7-bit ASCII Codec -------------------------------------------------- */
1406
1407static
1408int ascii_decoding_error(const char **source,
1409 Py_UNICODE **dest,
1410 const char *errors,
1411 const char *details)
1412{
1413 if ((errors == NULL) ||
1414 (strcmp(errors,"strict") == 0)) {
1415 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001416 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001417 details);
1418 return -1;
1419 }
1420 else if (strcmp(errors,"ignore") == 0) {
1421 return 0;
1422 }
1423 else if (strcmp(errors,"replace") == 0) {
1424 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1425 (*dest)++;
1426 return 0;
1427 }
1428 else {
1429 PyErr_Format(PyExc_ValueError,
1430 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001431 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001432 errors);
1433 return -1;
1434 }
1435}
1436
1437PyObject *PyUnicode_DecodeASCII(const char *s,
1438 int size,
1439 const char *errors)
1440{
1441 PyUnicodeObject *v;
1442 Py_UNICODE *p;
1443
1444 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1445 v = _PyUnicode_New(size);
1446 if (v == NULL)
1447 goto onError;
1448 if (size == 0)
1449 return (PyObject *)v;
1450 p = PyUnicode_AS_UNICODE(v);
1451 while (size-- > 0) {
1452 register unsigned char c;
1453
1454 c = (unsigned char)*s++;
1455 if (c < 128)
1456 *p++ = c;
1457 else if (ascii_decoding_error(&s, &p, errors,
1458 "ordinal not in range(128)"))
1459 goto onError;
1460 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001461 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1462 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1463 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001464 return (PyObject *)v;
1465
1466 onError:
1467 Py_XDECREF(v);
1468 return NULL;
1469}
1470
1471static
1472int ascii_encoding_error(const Py_UNICODE **source,
1473 char **dest,
1474 const char *errors,
1475 const char *details)
1476{
1477 if ((errors == NULL) ||
1478 (strcmp(errors,"strict") == 0)) {
1479 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001480 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001481 details);
1482 return -1;
1483 }
1484 else if (strcmp(errors,"ignore") == 0) {
1485 return 0;
1486 }
1487 else if (strcmp(errors,"replace") == 0) {
1488 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001489 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001490 return 0;
1491 }
1492 else {
1493 PyErr_Format(PyExc_ValueError,
1494 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001495 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001496 errors);
1497 return -1;
1498 }
1499}
1500
1501PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1502 int size,
1503 const char *errors)
1504{
1505 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001506 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001507 repr = PyString_FromStringAndSize(NULL, size);
1508 if (repr == NULL)
1509 return NULL;
1510
1511 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001512 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001513 while (size-- > 0) {
1514 Py_UNICODE ch = *p++;
1515 if (ch >= 128) {
1516 if (ascii_encoding_error(&p, &s, errors,
1517 "ordinal not in range(128)"))
1518 goto onError;
1519 }
1520 else
1521 *s++ = (char)ch;
1522 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001523 /* Resize if error handling skipped some characters */
1524 if (s - start < PyString_GET_SIZE(repr))
1525 if (_PyString_Resize(&repr, s - start))
1526 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001527 return repr;
1528
1529 onError:
1530 Py_DECREF(repr);
1531 return NULL;
1532}
1533
1534PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1535{
1536 if (!PyUnicode_Check(unicode)) {
1537 PyErr_BadArgument();
1538 return NULL;
1539 }
1540 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1541 PyUnicode_GET_SIZE(unicode),
1542 NULL);
1543}
1544
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001545#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001546
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001547/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001548
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001549PyObject *PyUnicode_DecodeMBCS(const char *s,
1550 int size,
1551 const char *errors)
1552{
1553 PyUnicodeObject *v;
1554 Py_UNICODE *p;
1555
1556 /* First get the size of the result */
1557 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum4e751c32000-05-03 12:27:22 +00001558 if (usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001559 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1560
1561 v = _PyUnicode_New(usize);
1562 if (v == NULL)
1563 return NULL;
1564 if (usize == 0)
1565 return (PyObject *)v;
1566 p = PyUnicode_AS_UNICODE(v);
1567 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1568 Py_DECREF(v);
1569 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1570 }
1571
1572 return (PyObject *)v;
1573}
1574
1575PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1576 int size,
1577 const char *errors)
1578{
1579 PyObject *repr;
1580 char *s;
1581
1582 /* First get the size of the result */
Guido van Rossum4e751c32000-05-03 12:27:22 +00001583 DWORD mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001584 if (mbcssize==0)
1585 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1586
1587 repr = PyString_FromStringAndSize(NULL, mbcssize);
1588 if (repr == NULL)
1589 return NULL;
1590 if (mbcssize==0)
1591 return repr;
1592
1593 /* Do the conversion */
1594 s = PyString_AS_STRING(repr);
1595 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1596 Py_DECREF(repr);
1597 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1598 }
1599 return repr;
1600}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001601
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001602#endif /* MS_WIN32 */
1603
Guido van Rossumd57fd912000-03-10 22:53:23 +00001604/* --- Character Mapping Codec -------------------------------------------- */
1605
1606static
1607int charmap_decoding_error(const char **source,
1608 Py_UNICODE **dest,
1609 const char *errors,
1610 const char *details)
1611{
1612 if ((errors == NULL) ||
1613 (strcmp(errors,"strict") == 0)) {
1614 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001615 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001616 details);
1617 return -1;
1618 }
1619 else if (strcmp(errors,"ignore") == 0) {
1620 return 0;
1621 }
1622 else if (strcmp(errors,"replace") == 0) {
1623 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1624 (*dest)++;
1625 return 0;
1626 }
1627 else {
1628 PyErr_Format(PyExc_ValueError,
1629 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001630 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001631 errors);
1632 return -1;
1633 }
1634}
1635
1636PyObject *PyUnicode_DecodeCharmap(const char *s,
1637 int size,
1638 PyObject *mapping,
1639 const char *errors)
1640{
1641 PyUnicodeObject *v;
1642 Py_UNICODE *p;
1643
1644 /* Default to Latin-1 */
1645 if (mapping == NULL)
1646 return PyUnicode_DecodeLatin1(s, size, errors);
1647
1648 v = _PyUnicode_New(size);
1649 if (v == NULL)
1650 goto onError;
1651 if (size == 0)
1652 return (PyObject *)v;
1653 p = PyUnicode_AS_UNICODE(v);
1654 while (size-- > 0) {
1655 unsigned char ch = *s++;
1656 PyObject *w, *x;
1657
1658 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1659 w = PyInt_FromLong((long)ch);
1660 if (w == NULL)
1661 goto onError;
1662 x = PyObject_GetItem(mapping, w);
1663 Py_DECREF(w);
1664 if (x == NULL) {
1665 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1666 /* No mapping found: default to Latin-1 mapping */
1667 PyErr_Clear();
1668 *p++ = (Py_UNICODE)ch;
1669 continue;
1670 }
1671 goto onError;
1672 }
1673
1674 /* Apply mapping */
1675 if (PyInt_Check(x)) {
1676 int value = PyInt_AS_LONG(x);
1677 if (value < 0 || value > 65535) {
1678 PyErr_SetString(PyExc_TypeError,
1679 "character mapping must be in range(65336)");
1680 Py_DECREF(x);
1681 goto onError;
1682 }
1683 *p++ = (Py_UNICODE)value;
1684 }
1685 else if (x == Py_None) {
1686 /* undefined mapping */
1687 if (charmap_decoding_error(&s, &p, errors,
1688 "character maps to <undefined>")) {
1689 Py_DECREF(x);
1690 goto onError;
1691 }
1692 }
1693 else if (PyUnicode_Check(x)) {
1694 if (PyUnicode_GET_SIZE(x) != 1) {
1695 /* 1-n mapping */
1696 PyErr_SetString(PyExc_NotImplementedError,
1697 "1-n mappings are currently not implemented");
1698 Py_DECREF(x);
1699 goto onError;
1700 }
1701 *p++ = *PyUnicode_AS_UNICODE(x);
1702 }
1703 else {
1704 /* wrong return value */
1705 PyErr_SetString(PyExc_TypeError,
1706 "character mapping must return integer, None or unicode");
1707 Py_DECREF(x);
1708 goto onError;
1709 }
1710 Py_DECREF(x);
1711 }
1712 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1713 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1714 goto onError;
1715 return (PyObject *)v;
1716
1717 onError:
1718 Py_XDECREF(v);
1719 return NULL;
1720}
1721
1722static
1723int charmap_encoding_error(const Py_UNICODE **source,
1724 char **dest,
1725 const char *errors,
1726 const char *details)
1727{
1728 if ((errors == NULL) ||
1729 (strcmp(errors,"strict") == 0)) {
1730 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001731 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732 details);
1733 return -1;
1734 }
1735 else if (strcmp(errors,"ignore") == 0) {
1736 return 0;
1737 }
1738 else if (strcmp(errors,"replace") == 0) {
1739 **dest = '?';
1740 (*dest)++;
1741 return 0;
1742 }
1743 else {
1744 PyErr_Format(PyExc_ValueError,
1745 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001746 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001747 errors);
1748 return -1;
1749 }
1750}
1751
1752PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
1753 int size,
1754 PyObject *mapping,
1755 const char *errors)
1756{
1757 PyObject *v;
1758 char *s;
1759
1760 /* Default to Latin-1 */
1761 if (mapping == NULL)
1762 return PyUnicode_EncodeLatin1(p, size, errors);
1763
1764 v = PyString_FromStringAndSize(NULL, size);
1765 if (v == NULL)
1766 return NULL;
1767 s = PyString_AS_STRING(v);
1768 while (size-- > 0) {
1769 Py_UNICODE ch = *p++;
1770 PyObject *w, *x;
1771
1772 /* Get mapping (Unicode ordinal -> string char, integer or None) */
1773 w = PyInt_FromLong((long)ch);
1774 if (w == NULL)
1775 goto onError;
1776 x = PyObject_GetItem(mapping, w);
1777 Py_DECREF(w);
1778 if (x == NULL) {
1779 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1780 /* No mapping found: default to Latin-1 mapping if possible */
1781 PyErr_Clear();
1782 if (ch < 256) {
1783 *s++ = (char)ch;
1784 continue;
1785 }
1786 else if (!charmap_encoding_error(&p, &s, errors,
1787 "missing character mapping"))
1788 continue;
1789 }
1790 goto onError;
1791 }
1792
1793 /* Apply mapping */
1794 if (PyInt_Check(x)) {
1795 int value = PyInt_AS_LONG(x);
1796 if (value < 0 || value > 255) {
1797 PyErr_SetString(PyExc_TypeError,
1798 "character mapping must be in range(256)");
1799 Py_DECREF(x);
1800 goto onError;
1801 }
1802 *s++ = (char)value;
1803 }
1804 else if (x == Py_None) {
1805 /* undefined mapping */
1806 if (charmap_encoding_error(&p, &s, errors,
1807 "character maps to <undefined>")) {
1808 Py_DECREF(x);
1809 goto onError;
1810 }
1811 }
1812 else if (PyString_Check(x)) {
1813 if (PyString_GET_SIZE(x) != 1) {
1814 /* 1-n mapping */
1815 PyErr_SetString(PyExc_NotImplementedError,
1816 "1-n mappings are currently not implemented");
1817 Py_DECREF(x);
1818 goto onError;
1819 }
1820 *s++ = *PyString_AS_STRING(x);
1821 }
1822 else {
1823 /* wrong return value */
1824 PyErr_SetString(PyExc_TypeError,
1825 "character mapping must return integer, None or unicode");
1826 Py_DECREF(x);
1827 goto onError;
1828 }
1829 Py_DECREF(x);
1830 }
1831 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
1832 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
1833 goto onError;
1834 return v;
1835
1836 onError:
1837 Py_DECREF(v);
1838 return NULL;
1839}
1840
1841PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
1842 PyObject *mapping)
1843{
1844 if (!PyUnicode_Check(unicode) || mapping == NULL) {
1845 PyErr_BadArgument();
1846 return NULL;
1847 }
1848 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
1849 PyUnicode_GET_SIZE(unicode),
1850 mapping,
1851 NULL);
1852}
1853
1854static
1855int translate_error(const Py_UNICODE **source,
1856 Py_UNICODE **dest,
1857 const char *errors,
1858 const char *details)
1859{
1860 if ((errors == NULL) ||
1861 (strcmp(errors,"strict") == 0)) {
1862 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001863 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001864 details);
1865 return -1;
1866 }
1867 else if (strcmp(errors,"ignore") == 0) {
1868 return 0;
1869 }
1870 else if (strcmp(errors,"replace") == 0) {
1871 **dest = '?';
1872 (*dest)++;
1873 return 0;
1874 }
1875 else {
1876 PyErr_Format(PyExc_ValueError,
1877 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001878 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001879 errors);
1880 return -1;
1881 }
1882}
1883
1884PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
1885 int size,
1886 PyObject *mapping,
1887 const char *errors)
1888{
1889 PyUnicodeObject *v;
1890 Py_UNICODE *p;
1891
1892 if (mapping == NULL) {
1893 PyErr_BadArgument();
1894 return NULL;
1895 }
1896
1897 /* Output will never be longer than input */
1898 v = _PyUnicode_New(size);
1899 if (v == NULL)
1900 goto onError;
1901 if (size == 0)
1902 goto done;
1903 p = PyUnicode_AS_UNICODE(v);
1904 while (size-- > 0) {
1905 Py_UNICODE ch = *s++;
1906 PyObject *w, *x;
1907
1908 /* Get mapping */
1909 w = PyInt_FromLong(ch);
1910 if (w == NULL)
1911 goto onError;
1912 x = PyObject_GetItem(mapping, w);
1913 Py_DECREF(w);
1914 if (x == NULL) {
1915 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1916 /* No mapping found: default to 1-1 mapping */
1917 PyErr_Clear();
1918 *p++ = ch;
1919 continue;
1920 }
1921 goto onError;
1922 }
1923
1924 /* Apply mapping */
1925 if (PyInt_Check(x))
1926 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
1927 else if (x == Py_None) {
1928 /* undefined mapping */
1929 if (translate_error(&s, &p, errors,
1930 "character maps to <undefined>")) {
1931 Py_DECREF(x);
1932 goto onError;
1933 }
1934 }
1935 else if (PyUnicode_Check(x)) {
1936 if (PyUnicode_GET_SIZE(x) != 1) {
1937 /* 1-n mapping */
1938 PyErr_SetString(PyExc_NotImplementedError,
1939 "1-n mappings are currently not implemented");
1940 Py_DECREF(x);
1941 goto onError;
1942 }
1943 *p++ = *PyUnicode_AS_UNICODE(x);
1944 }
1945 else {
1946 /* wrong return value */
1947 PyErr_SetString(PyExc_TypeError,
1948 "translate mapping must return integer, None or unicode");
1949 Py_DECREF(x);
1950 goto onError;
1951 }
1952 Py_DECREF(x);
1953 }
1954 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001955 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1956 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001957
1958 done:
1959 return (PyObject *)v;
1960
1961 onError:
1962 Py_XDECREF(v);
1963 return NULL;
1964}
1965
1966PyObject *PyUnicode_Translate(PyObject *str,
1967 PyObject *mapping,
1968 const char *errors)
1969{
1970 PyObject *result;
1971
1972 str = PyUnicode_FromObject(str);
1973 if (str == NULL)
1974 goto onError;
1975 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
1976 PyUnicode_GET_SIZE(str),
1977 mapping,
1978 errors);
1979 Py_DECREF(str);
1980 return result;
1981
1982 onError:
1983 Py_XDECREF(str);
1984 return NULL;
1985}
1986
Guido van Rossum9e896b32000-04-05 20:11:21 +00001987/* --- Decimal Encoder ---------------------------------------------------- */
1988
1989int PyUnicode_EncodeDecimal(Py_UNICODE *s,
1990 int length,
1991 char *output,
1992 const char *errors)
1993{
1994 Py_UNICODE *p, *end;
1995
1996 if (output == NULL) {
1997 PyErr_BadArgument();
1998 return -1;
1999 }
2000
2001 p = s;
2002 end = s + length;
2003 while (p < end) {
2004 register Py_UNICODE ch = *p++;
2005 int decimal;
2006
2007 if (Py_UNICODE_ISSPACE(ch)) {
2008 *output++ = ' ';
2009 continue;
2010 }
2011 decimal = Py_UNICODE_TODECIMAL(ch);
2012 if (decimal >= 0) {
2013 *output++ = '0' + decimal;
2014 continue;
2015 }
Guido van Rossumba477042000-04-06 18:18:10 +00002016 if (0 < ch && ch < 256) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002017 *output++ = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002018 continue;
2019 }
2020 /* All other characters are considered invalid */
2021 if (errors == NULL || strcmp(errors, "strict") == 0) {
2022 PyErr_SetString(PyExc_ValueError,
2023 "invalid decimal Unicode string");
2024 goto onError;
2025 }
2026 else if (strcmp(errors, "ignore") == 0)
2027 continue;
2028 else if (strcmp(errors, "replace") == 0) {
2029 *output++ = '?';
2030 continue;
2031 }
2032 }
2033 /* 0-terminate the output string */
2034 *output++ = '\0';
2035 return 0;
2036
2037 onError:
2038 return -1;
2039}
2040
Guido van Rossumd57fd912000-03-10 22:53:23 +00002041/* --- Helpers ------------------------------------------------------------ */
2042
2043static
2044int count(PyUnicodeObject *self,
2045 int start,
2046 int end,
2047 PyUnicodeObject *substring)
2048{
2049 int count = 0;
2050
2051 end -= substring->length;
2052
2053 while (start <= end)
2054 if (Py_UNICODE_MATCH(self, start, substring)) {
2055 count++;
2056 start += substring->length;
2057 } else
2058 start++;
2059
2060 return count;
2061}
2062
2063int PyUnicode_Count(PyObject *str,
2064 PyObject *substr,
2065 int start,
2066 int end)
2067{
2068 int result;
2069
2070 str = PyUnicode_FromObject(str);
2071 if (str == NULL)
2072 return -1;
2073 substr = PyUnicode_FromObject(substr);
2074 if (substr == NULL) {
2075 Py_DECREF(substr);
2076 return -1;
2077 }
2078
2079 result = count((PyUnicodeObject *)str,
2080 start, end,
2081 (PyUnicodeObject *)substr);
2082
2083 Py_DECREF(str);
2084 Py_DECREF(substr);
2085 return result;
2086}
2087
2088static
2089int findstring(PyUnicodeObject *self,
2090 PyUnicodeObject *substring,
2091 int start,
2092 int end,
2093 int direction)
2094{
2095 if (start < 0)
2096 start += self->length;
2097 if (start < 0)
2098 start = 0;
2099
2100 if (substring->length == 0)
2101 return start;
2102
2103 if (end > self->length)
2104 end = self->length;
2105 if (end < 0)
2106 end += self->length;
2107 if (end < 0)
2108 end = 0;
2109
2110 end -= substring->length;
2111
2112 if (direction < 0) {
2113 for (; end >= start; end--)
2114 if (Py_UNICODE_MATCH(self, end, substring))
2115 return end;
2116 } else {
2117 for (; start <= end; start++)
2118 if (Py_UNICODE_MATCH(self, start, substring))
2119 return start;
2120 }
2121
2122 return -1;
2123}
2124
2125int PyUnicode_Find(PyObject *str,
2126 PyObject *substr,
2127 int start,
2128 int end,
2129 int direction)
2130{
2131 int result;
2132
2133 str = PyUnicode_FromObject(str);
2134 if (str == NULL)
2135 return -1;
2136 substr = PyUnicode_FromObject(substr);
2137 if (substr == NULL) {
2138 Py_DECREF(substr);
2139 return -1;
2140 }
2141
2142 result = findstring((PyUnicodeObject *)str,
2143 (PyUnicodeObject *)substr,
2144 start, end, direction);
2145 Py_DECREF(str);
2146 Py_DECREF(substr);
2147 return result;
2148}
2149
2150static
2151int tailmatch(PyUnicodeObject *self,
2152 PyUnicodeObject *substring,
2153 int start,
2154 int end,
2155 int direction)
2156{
2157 if (start < 0)
2158 start += self->length;
2159 if (start < 0)
2160 start = 0;
2161
2162 if (substring->length == 0)
2163 return 1;
2164
2165 if (end > self->length)
2166 end = self->length;
2167 if (end < 0)
2168 end += self->length;
2169 if (end < 0)
2170 end = 0;
2171
2172 end -= substring->length;
2173 if (end < start)
2174 return 0;
2175
2176 if (direction > 0) {
2177 if (Py_UNICODE_MATCH(self, end, substring))
2178 return 1;
2179 } else {
2180 if (Py_UNICODE_MATCH(self, start, substring))
2181 return 1;
2182 }
2183
2184 return 0;
2185}
2186
2187int PyUnicode_Tailmatch(PyObject *str,
2188 PyObject *substr,
2189 int start,
2190 int end,
2191 int direction)
2192{
2193 int result;
2194
2195 str = PyUnicode_FromObject(str);
2196 if (str == NULL)
2197 return -1;
2198 substr = PyUnicode_FromObject(substr);
2199 if (substr == NULL) {
2200 Py_DECREF(substr);
2201 return -1;
2202 }
2203
2204 result = tailmatch((PyUnicodeObject *)str,
2205 (PyUnicodeObject *)substr,
2206 start, end, direction);
2207 Py_DECREF(str);
2208 Py_DECREF(substr);
2209 return result;
2210}
2211
2212static
2213const Py_UNICODE *findchar(const Py_UNICODE *s,
2214 int size,
2215 Py_UNICODE ch)
2216{
2217 /* like wcschr, but doesn't stop at NULL characters */
2218
2219 while (size-- > 0) {
2220 if (*s == ch)
2221 return s;
2222 s++;
2223 }
2224
2225 return NULL;
2226}
2227
2228/* Apply fixfct filter to the Unicode object self and return a
2229 reference to the modified object */
2230
2231static
2232PyObject *fixup(PyUnicodeObject *self,
2233 int (*fixfct)(PyUnicodeObject *s))
2234{
2235
2236 PyUnicodeObject *u;
2237
2238 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2239 self->length);
2240 if (u == NULL)
2241 return NULL;
2242 if (!fixfct(u)) {
2243 /* fixfct should return TRUE if it modified the buffer. If
2244 FALSE, return a reference to the original buffer instead
2245 (to save space, not time) */
2246 Py_INCREF(self);
2247 Py_DECREF(u);
2248 return (PyObject*) self;
2249 }
2250 return (PyObject*) u;
2251}
2252
2253static
2254int fixupper(PyUnicodeObject *self)
2255{
2256 int len = self->length;
2257 Py_UNICODE *s = self->str;
2258 int status = 0;
2259
2260 while (len-- > 0) {
2261 register Py_UNICODE ch;
2262
2263 ch = Py_UNICODE_TOUPPER(*s);
2264 if (ch != *s) {
2265 status = 1;
2266 *s = ch;
2267 }
2268 s++;
2269 }
2270
2271 return status;
2272}
2273
2274static
2275int fixlower(PyUnicodeObject *self)
2276{
2277 int len = self->length;
2278 Py_UNICODE *s = self->str;
2279 int status = 0;
2280
2281 while (len-- > 0) {
2282 register Py_UNICODE ch;
2283
2284 ch = Py_UNICODE_TOLOWER(*s);
2285 if (ch != *s) {
2286 status = 1;
2287 *s = ch;
2288 }
2289 s++;
2290 }
2291
2292 return status;
2293}
2294
2295static
2296int fixswapcase(PyUnicodeObject *self)
2297{
2298 int len = self->length;
2299 Py_UNICODE *s = self->str;
2300 int status = 0;
2301
2302 while (len-- > 0) {
2303 if (Py_UNICODE_ISUPPER(*s)) {
2304 *s = Py_UNICODE_TOLOWER(*s);
2305 status = 1;
2306 } else if (Py_UNICODE_ISLOWER(*s)) {
2307 *s = Py_UNICODE_TOUPPER(*s);
2308 status = 1;
2309 }
2310 s++;
2311 }
2312
2313 return status;
2314}
2315
2316static
2317int fixcapitalize(PyUnicodeObject *self)
2318{
2319 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2320 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2321 return 1;
2322 }
2323 return 0;
2324}
2325
2326static
2327int fixtitle(PyUnicodeObject *self)
2328{
2329 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2330 register Py_UNICODE *e;
2331 int previous_is_cased;
2332
2333 /* Shortcut for single character strings */
2334 if (PyUnicode_GET_SIZE(self) == 1) {
2335 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2336 if (*p != ch) {
2337 *p = ch;
2338 return 1;
2339 }
2340 else
2341 return 0;
2342 }
2343
2344 e = p + PyUnicode_GET_SIZE(self);
2345 previous_is_cased = 0;
2346 for (; p < e; p++) {
2347 register const Py_UNICODE ch = *p;
2348
2349 if (previous_is_cased)
2350 *p = Py_UNICODE_TOLOWER(ch);
2351 else
2352 *p = Py_UNICODE_TOTITLE(ch);
2353
2354 if (Py_UNICODE_ISLOWER(ch) ||
2355 Py_UNICODE_ISUPPER(ch) ||
2356 Py_UNICODE_ISTITLE(ch))
2357 previous_is_cased = 1;
2358 else
2359 previous_is_cased = 0;
2360 }
2361 return 1;
2362}
2363
2364PyObject *PyUnicode_Join(PyObject *separator,
2365 PyObject *seq)
2366{
2367 Py_UNICODE *sep;
2368 int seplen;
2369 PyUnicodeObject *res = NULL;
2370 int reslen = 0;
2371 Py_UNICODE *p;
2372 int seqlen = 0;
2373 int sz = 100;
2374 int i;
2375
2376 seqlen = PySequence_Length(seq);
2377 if (seqlen < 0 && PyErr_Occurred())
2378 return NULL;
2379
2380 if (separator == NULL) {
2381 Py_UNICODE blank = ' ';
2382 sep = &blank;
2383 seplen = 1;
2384 }
2385 else {
2386 separator = PyUnicode_FromObject(separator);
2387 if (separator == NULL)
2388 return NULL;
2389 sep = PyUnicode_AS_UNICODE(separator);
2390 seplen = PyUnicode_GET_SIZE(separator);
2391 }
2392
2393 res = _PyUnicode_New(sz);
2394 if (res == NULL)
2395 goto onError;
2396 p = PyUnicode_AS_UNICODE(res);
2397 reslen = 0;
2398
2399 for (i = 0; i < seqlen; i++) {
2400 int itemlen;
2401 PyObject *item;
2402
2403 item = PySequence_GetItem(seq, i);
2404 if (item == NULL)
2405 goto onError;
2406 if (!PyUnicode_Check(item)) {
2407 PyObject *v;
2408 v = PyUnicode_FromObject(item);
2409 Py_DECREF(item);
2410 item = v;
2411 if (item == NULL)
2412 goto onError;
2413 }
2414 itemlen = PyUnicode_GET_SIZE(item);
2415 while (reslen + itemlen + seplen >= sz) {
2416 if (_PyUnicode_Resize(res, sz*2))
2417 goto onError;
2418 sz *= 2;
2419 p = PyUnicode_AS_UNICODE(res) + reslen;
2420 }
2421 if (i > 0) {
2422 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2423 p += seplen;
2424 reslen += seplen;
2425 }
2426 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2427 p += itemlen;
2428 reslen += itemlen;
2429 Py_DECREF(item);
2430 }
2431 if (_PyUnicode_Resize(res, reslen))
2432 goto onError;
2433
2434 Py_XDECREF(separator);
2435 return (PyObject *)res;
2436
2437 onError:
2438 Py_XDECREF(separator);
2439 Py_DECREF(res);
2440 return NULL;
2441}
2442
2443static
2444PyUnicodeObject *pad(PyUnicodeObject *self,
2445 int left,
2446 int right,
2447 Py_UNICODE fill)
2448{
2449 PyUnicodeObject *u;
2450
2451 if (left < 0)
2452 left = 0;
2453 if (right < 0)
2454 right = 0;
2455
2456 if (left == 0 && right == 0) {
2457 Py_INCREF(self);
2458 return self;
2459 }
2460
2461 u = _PyUnicode_New(left + self->length + right);
2462 if (u) {
2463 if (left)
2464 Py_UNICODE_FILL(u->str, fill, left);
2465 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2466 if (right)
2467 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2468 }
2469
2470 return u;
2471}
2472
/* SPLIT_APPEND(data, left, right): create a Unicode object from the
   characters data[left] .. data[right-1] and append it to `list`.

   The macro is not hygienic: the expansion site must provide a
   `PyObject *str` variable, a `list` variable and an `onError` label.
   Control jumps to onError if creating the object or appending it
   fails.  The temporary reference held in str is released on both the
   success and the failure path of PyList_Append().

   It expands to several statements, so it must not be used as the
   unbraced body of an if/else or loop. */
#define SPLIT_APPEND(data, left, right) \
    str = PyUnicode_FromUnicode(data + left, right - left); \
    if (!str) \
        goto onError; \
    if (PyList_Append(list, str)) { \
        Py_DECREF(str); \
        goto onError; \
    } \
    else \
        Py_DECREF(str);
2483
2484static
2485PyObject *split_whitespace(PyUnicodeObject *self,
2486 PyObject *list,
2487 int maxcount)
2488{
2489 register int i;
2490 register int j;
2491 int len = self->length;
2492 PyObject *str;
2493
2494 for (i = j = 0; i < len; ) {
2495 /* find a token */
2496 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2497 i++;
2498 j = i;
2499 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2500 i++;
2501 if (j < i) {
2502 if (maxcount-- <= 0)
2503 break;
2504 SPLIT_APPEND(self->str, j, i);
2505 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2506 i++;
2507 j = i;
2508 }
2509 }
2510 if (j < len) {
2511 SPLIT_APPEND(self->str, j, len);
2512 }
2513 return list;
2514
2515 onError:
2516 Py_DECREF(list);
2517 return NULL;
2518}
2519
2520PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002521 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002522{
2523 register int i;
2524 register int j;
2525 int len;
2526 PyObject *list;
2527 PyObject *str;
2528 Py_UNICODE *data;
2529
2530 string = PyUnicode_FromObject(string);
2531 if (string == NULL)
2532 return NULL;
2533 data = PyUnicode_AS_UNICODE(string);
2534 len = PyUnicode_GET_SIZE(string);
2535
Guido van Rossumd57fd912000-03-10 22:53:23 +00002536 list = PyList_New(0);
2537 if (!list)
2538 goto onError;
2539
2540 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002541 int eol;
2542
Guido van Rossumd57fd912000-03-10 22:53:23 +00002543 /* Find a line and append it */
2544 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2545 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002546
2547 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002548 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002549 if (i < len) {
2550 if (data[i] == '\r' && i + 1 < len &&
2551 data[i+1] == '\n')
2552 i += 2;
2553 else
2554 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002555 if (keepends)
2556 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557 }
Guido van Rossum86662912000-04-11 15:38:46 +00002558 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002559 j = i;
2560 }
2561 if (j < len) {
2562 SPLIT_APPEND(data, j, len);
2563 }
2564
2565 Py_DECREF(string);
2566 return list;
2567
2568 onError:
2569 Py_DECREF(list);
2570 Py_DECREF(string);
2571 return NULL;
2572}
2573
2574static
2575PyObject *split_char(PyUnicodeObject *self,
2576 PyObject *list,
2577 Py_UNICODE ch,
2578 int maxcount)
2579{
2580 register int i;
2581 register int j;
2582 int len = self->length;
2583 PyObject *str;
2584
2585 for (i = j = 0; i < len; ) {
2586 if (self->str[i] == ch) {
2587 if (maxcount-- <= 0)
2588 break;
2589 SPLIT_APPEND(self->str, j, i);
2590 i = j = i + 1;
2591 } else
2592 i++;
2593 }
2594 if (j <= len) {
2595 SPLIT_APPEND(self->str, j, len);
2596 }
2597 return list;
2598
2599 onError:
2600 Py_DECREF(list);
2601 return NULL;
2602}
2603
2604static
2605PyObject *split_substring(PyUnicodeObject *self,
2606 PyObject *list,
2607 PyUnicodeObject *substring,
2608 int maxcount)
2609{
2610 register int i;
2611 register int j;
2612 int len = self->length;
2613 int sublen = substring->length;
2614 PyObject *str;
2615
2616 for (i = j = 0; i < len - sublen; ) {
2617 if (Py_UNICODE_MATCH(self, i, substring)) {
2618 if (maxcount-- <= 0)
2619 break;
2620 SPLIT_APPEND(self->str, j, i);
2621 i = j = i + sublen;
2622 } else
2623 i++;
2624 }
2625 if (j <= len) {
2626 SPLIT_APPEND(self->str, j, len);
2627 }
2628 return list;
2629
2630 onError:
2631 Py_DECREF(list);
2632 return NULL;
2633}
2634
2635#undef SPLIT_APPEND
2636
2637static
2638PyObject *split(PyUnicodeObject *self,
2639 PyUnicodeObject *substring,
2640 int maxcount)
2641{
2642 PyObject *list;
2643
2644 if (maxcount < 0)
2645 maxcount = INT_MAX;
2646
2647 list = PyList_New(0);
2648 if (!list)
2649 return NULL;
2650
2651 if (substring == NULL)
2652 return split_whitespace(self,list,maxcount);
2653
2654 else if (substring->length == 1)
2655 return split_char(self,list,substring->str[0],maxcount);
2656
2657 else if (substring->length == 0) {
2658 Py_DECREF(list);
2659 PyErr_SetString(PyExc_ValueError, "empty separator");
2660 return NULL;
2661 }
2662 else
2663 return split_substring(self,list,substring,maxcount);
2664}
2665
2666static
2667PyObject *strip(PyUnicodeObject *self,
2668 int left,
2669 int right)
2670{
2671 Py_UNICODE *p = self->str;
2672 int start = 0;
2673 int end = self->length;
2674
2675 if (left)
2676 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2677 start++;
2678
2679 if (right)
2680 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2681 end--;
2682
2683 if (start == 0 && end == self->length) {
2684 /* couldn't strip anything off, return original string */
2685 Py_INCREF(self);
2686 return (PyObject*) self;
2687 }
2688
2689 return (PyObject*) PyUnicode_FromUnicode(
2690 self->str + start,
2691 end - start
2692 );
2693}
2694
2695static
2696PyObject *replace(PyUnicodeObject *self,
2697 PyUnicodeObject *str1,
2698 PyUnicodeObject *str2,
2699 int maxcount)
2700{
2701 PyUnicodeObject *u;
2702
2703 if (maxcount < 0)
2704 maxcount = INT_MAX;
2705
2706 if (str1->length == 1 && str2->length == 1) {
2707 int i;
2708
2709 /* replace characters */
2710 if (!findchar(self->str, self->length, str1->str[0])) {
2711 /* nothing to replace, return original string */
2712 Py_INCREF(self);
2713 u = self;
2714 } else {
2715 Py_UNICODE u1 = str1->str[0];
2716 Py_UNICODE u2 = str2->str[0];
2717
2718 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
2719 self->str,
2720 self->length
2721 );
2722 if (u)
2723 for (i = 0; i < u->length; i++)
2724 if (u->str[i] == u1) {
2725 if (--maxcount < 0)
2726 break;
2727 u->str[i] = u2;
2728 }
2729 }
2730
2731 } else {
2732 int n, i;
2733 Py_UNICODE *p;
2734
2735 /* replace strings */
2736 n = count(self, 0, self->length, str1);
2737 if (n > maxcount)
2738 n = maxcount;
2739 if (n == 0) {
2740 /* nothing to replace, return original string */
2741 Py_INCREF(self);
2742 u = self;
2743 } else {
2744 u = _PyUnicode_New(
2745 self->length + n * (str2->length - str1->length));
2746 if (u) {
2747 i = 0;
2748 p = u->str;
2749 while (i <= self->length - str1->length)
2750 if (Py_UNICODE_MATCH(self, i, str1)) {
2751 /* replace string segment */
2752 Py_UNICODE_COPY(p, str2->str, str2->length);
2753 p += str2->length;
2754 i += str1->length;
2755 if (--n <= 0) {
2756 /* copy remaining part */
2757 Py_UNICODE_COPY(p, self->str+i, self->length-i);
2758 break;
2759 }
2760 } else
2761 *p++ = self->str[i++];
2762 }
2763 }
2764 }
2765
2766 return (PyObject *) u;
2767}
2768
2769/* --- Unicode Object Methods --------------------------------------------- */
2770
2771static char title__doc__[] =
2772"S.title() -> unicode\n\
2773\n\
2774Return a titlecased version of S, i.e. words start with title case\n\
2775characters, all remaining cased characters have lower case.";
2776
2777static PyObject*
2778unicode_title(PyUnicodeObject *self, PyObject *args)
2779{
2780 if (!PyArg_NoArgs(args))
2781 return NULL;
2782 return fixup(self, fixtitle);
2783}
2784
2785static char capitalize__doc__[] =
2786"S.capitalize() -> unicode\n\
2787\n\
2788Return a capitalized version of S, i.e. make the first character\n\
2789have upper case.";
2790
2791static PyObject*
2792unicode_capitalize(PyUnicodeObject *self, PyObject *args)
2793{
2794 if (!PyArg_NoArgs(args))
2795 return NULL;
2796 return fixup(self, fixcapitalize);
2797}
2798
#if 0
/* Everything up to the matching #endif is excluded from compilation. */
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords(): split self at whitespace, capitalize each word via
   fixup()/fixcapitalize, and join the words again with
   PyUnicode_Join(NULL, ...), i.e. with the default separator.

   Returns the joined string, or NULL on failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *list;
    PyObject *item;
    int i;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        /* item is NULL here, so the onError path below returns NULL */
        if (item == NULL)
            goto onError;
        /* Release the old word before storing the capitalized one in
           its slot */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

    /* Shared exit: reached on success as well as on failure */
onError:
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
2839
2840static char center__doc__[] =
2841"S.center(width) -> unicode\n\
2842\n\
2843Return S centered in a Unicode string of length width. Padding is done\n\
2844using spaces.";
2845
2846static PyObject *
2847unicode_center(PyUnicodeObject *self, PyObject *args)
2848{
2849 int marg, left;
2850 int width;
2851
2852 if (!PyArg_ParseTuple(args, "i:center", &width))
2853 return NULL;
2854
2855 if (self->length >= width) {
2856 Py_INCREF(self);
2857 return (PyObject*) self;
2858 }
2859
2860 marg = width - self->length;
2861 left = marg / 2 + (marg & width & 1);
2862
2863 return (PyObject*) pad(self, left, marg - left, ' ');
2864}
2865
2866static int
2867unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
2868{
2869 int len1, len2;
2870 Py_UNICODE *s1 = str1->str;
2871 Py_UNICODE *s2 = str2->str;
2872
2873 len1 = str1->length;
2874 len2 = str2->length;
2875
2876 while (len1 > 0 && len2 > 0) {
2877 int cmp = (*s1++) - (*s2++);
2878 if (cmp)
2879 /* This should make Christian happy! */
2880 return (cmp < 0) ? -1 : (cmp != 0);
2881 len1--, len2--;
2882 }
2883
2884 return (len1 < len2) ? -1 : (len1 != len2);
2885}
2886
2887int PyUnicode_Compare(PyObject *left,
2888 PyObject *right)
2889{
2890 PyUnicodeObject *u = NULL, *v = NULL;
2891 int result;
2892
2893 /* Coerce the two arguments */
2894 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2895 if (u == NULL)
2896 goto onError;
2897 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2898 if (v == NULL)
2899 goto onError;
2900
2901 /* Shortcut for emtpy or interned objects */
2902 if (v == u) {
2903 Py_DECREF(u);
2904 Py_DECREF(v);
2905 return 0;
2906 }
2907
2908 result = unicode_compare(u, v);
2909
2910 Py_DECREF(u);
2911 Py_DECREF(v);
2912 return result;
2913
2914onError:
2915 Py_XDECREF(u);
2916 Py_XDECREF(v);
2917 return -1;
2918}
2919
Guido van Rossum403d68b2000-03-13 15:55:09 +00002920int PyUnicode_Contains(PyObject *container,
2921 PyObject *element)
2922{
2923 PyUnicodeObject *u = NULL, *v = NULL;
2924 int result;
2925 register const Py_UNICODE *p, *e;
2926 register Py_UNICODE ch;
2927
2928 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00002929 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
2930 if (v == NULL)
2931 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002932 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
2933 if (u == NULL) {
2934 Py_DECREF(v);
2935 goto onError;
2936 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00002937
2938 /* Check v in u */
2939 if (PyUnicode_GET_SIZE(v) != 1) {
2940 PyErr_SetString(PyExc_TypeError,
2941 "string member test needs char left operand");
2942 goto onError;
2943 }
2944 ch = *PyUnicode_AS_UNICODE(v);
2945 p = PyUnicode_AS_UNICODE(u);
2946 e = p + PyUnicode_GET_SIZE(u);
2947 result = 0;
2948 while (p < e) {
2949 if (*p++ == ch) {
2950 result = 1;
2951 break;
2952 }
2953 }
2954
2955 Py_DECREF(u);
2956 Py_DECREF(v);
2957 return result;
2958
2959onError:
2960 Py_XDECREF(u);
2961 Py_XDECREF(v);
2962 return -1;
2963}
2964
Guido van Rossumd57fd912000-03-10 22:53:23 +00002965/* Concat to string or Unicode object giving a new Unicode object. */
2966
2967PyObject *PyUnicode_Concat(PyObject *left,
2968 PyObject *right)
2969{
2970 PyUnicodeObject *u = NULL, *v = NULL, *w;
2971
2972 /* Coerce the two arguments */
2973 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2974 if (u == NULL)
2975 goto onError;
2976 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2977 if (v == NULL)
2978 goto onError;
2979
2980 /* Shortcuts */
2981 if (v == unicode_empty) {
2982 Py_DECREF(v);
2983 return (PyObject *)u;
2984 }
2985 if (u == unicode_empty) {
2986 Py_DECREF(u);
2987 return (PyObject *)v;
2988 }
2989
2990 /* Concat the two Unicode strings */
2991 w = _PyUnicode_New(u->length + v->length);
2992 if (w == NULL)
2993 goto onError;
2994 Py_UNICODE_COPY(w->str, u->str, u->length);
2995 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
2996
2997 Py_DECREF(u);
2998 Py_DECREF(v);
2999 return (PyObject *)w;
3000
3001onError:
3002 Py_XDECREF(u);
3003 Py_XDECREF(v);
3004 return NULL;
3005}
3006
3007static char count__doc__[] =
3008"S.count(sub[, start[, end]]) -> int\n\
3009\n\
3010Return the number of occurrences of substring sub in Unicode string\n\
3011S[start:end]. Optional arguments start and end are\n\
3012interpreted as in slice notation.";
3013
3014static PyObject *
3015unicode_count(PyUnicodeObject *self, PyObject *args)
3016{
3017 PyUnicodeObject *substring;
3018 int start = 0;
3019 int end = INT_MAX;
3020 PyObject *result;
3021
3022 if (!PyArg_ParseTuple(args, "O|ii:count", &substring, &start, &end))
3023 return NULL;
3024
3025 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3026 (PyObject *)substring);
3027 if (substring == NULL)
3028 return NULL;
3029
3030 if (substring->length == 0) {
3031 Py_DECREF(substring);
3032 return PyInt_FromLong((long) 0);
3033 }
3034
3035 if (start < 0)
3036 start += self->length;
3037 if (start < 0)
3038 start = 0;
3039 if (end > self->length)
3040 end = self->length;
3041 if (end < 0)
3042 end += self->length;
3043 if (end < 0)
3044 end = 0;
3045
3046 result = PyInt_FromLong((long) count(self, start, end, substring));
3047
3048 Py_DECREF(substring);
3049 return result;
3050}
3051
3052static char encode__doc__[] =
3053"S.encode([encoding[,errors]]) -> string\n\
3054\n\
3055Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
3056errors may be given to set a different error handling scheme. Default\n\
3057is 'strict' meaning that encoding errors raise a ValueError. Other\n\
3058possible values are 'ignore' and 'replace'.";
3059
3060static PyObject *
3061unicode_encode(PyUnicodeObject *self, PyObject *args)
3062{
3063 char *encoding = NULL;
3064 char *errors = NULL;
3065 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3066 return NULL;
3067 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3068}
3069
3070static char expandtabs__doc__[] =
3071"S.expandtabs([tabsize]) -> unicode\n\
3072\n\
3073Return a copy of S where all tab characters are expanded using spaces.\n\
3074If tabsize is not given, a tab size of 8 characters is assumed.";
3075
3076static PyObject*
3077unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3078{
3079 Py_UNICODE *e;
3080 Py_UNICODE *p;
3081 Py_UNICODE *q;
3082 int i, j;
3083 PyUnicodeObject *u;
3084 int tabsize = 8;
3085
3086 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3087 return NULL;
3088
3089 /* First pass: determine size of ouput string */
3090 i = j = 0;
3091 e = self->str + self->length;
3092 for (p = self->str; p < e; p++)
3093 if (*p == '\t') {
3094 if (tabsize > 0)
3095 j += tabsize - (j % tabsize);
3096 }
3097 else {
3098 j++;
3099 if (*p == '\n' || *p == '\r') {
3100 i += j;
3101 j = 0;
3102 }
3103 }
3104
3105 /* Second pass: create output string and fill it */
3106 u = _PyUnicode_New(i + j);
3107 if (!u)
3108 return NULL;
3109
3110 j = 0;
3111 q = u->str;
3112
3113 for (p = self->str; p < e; p++)
3114 if (*p == '\t') {
3115 if (tabsize > 0) {
3116 i = tabsize - (j % tabsize);
3117 j += i;
3118 while (i--)
3119 *q++ = ' ';
3120 }
3121 }
3122 else {
3123 j++;
3124 *q++ = *p;
3125 if (*p == '\n' || *p == '\r')
3126 j = 0;
3127 }
3128
3129 return (PyObject*) u;
3130}
3131
3132static char find__doc__[] =
3133"S.find(sub [,start [,end]]) -> int\n\
3134\n\
3135Return the lowest index in S where substring sub is found,\n\
3136such that sub is contained within s[start,end]. Optional\n\
3137arguments start and end are interpreted as in slice notation.\n\
3138\n\
3139Return -1 on failure.";
3140
3141static PyObject *
3142unicode_find(PyUnicodeObject *self, PyObject *args)
3143{
3144 PyUnicodeObject *substring;
3145 int start = 0;
3146 int end = INT_MAX;
3147 PyObject *result;
3148
3149 if (!PyArg_ParseTuple(args, "O|ii:find", &substring, &start, &end))
3150 return NULL;
3151 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3152 (PyObject *)substring);
3153 if (substring == NULL)
3154 return NULL;
3155
3156 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3157
3158 Py_DECREF(substring);
3159 return result;
3160}
3161
3162static PyObject *
3163unicode_getitem(PyUnicodeObject *self, int index)
3164{
3165 if (index < 0 || index >= self->length) {
3166 PyErr_SetString(PyExc_IndexError, "string index out of range");
3167 return NULL;
3168 }
3169
3170 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3171}
3172
3173static long
3174unicode_hash(PyUnicodeObject *self)
3175{
3176 long hash;
3177 PyObject *utf8;
3178
3179 /* Since Unicode objects compare equal to their UTF-8 string
3180 counterparts, they should also use the UTF-8 strings as basis
3181 for their hash value. This is needed to assure that strings and
3182 Unicode objects behave in the same way as dictionary
3183 keys. Unfortunately, this costs some performance and also some
3184 memory if the cached UTF-8 representation is not used later
3185 on. */
3186 if (self->hash != -1)
3187 return self->hash;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00003188 utf8 = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003189 if (utf8 == NULL)
3190 return -1;
3191 hash = PyObject_Hash(utf8);
3192 if (hash == -1)
3193 return -1;
3194 self->hash = hash;
3195 return hash;
3196}
3197
3198static char index__doc__[] =
3199"S.index(sub [,start [,end]]) -> int\n\
3200\n\
3201Like S.find() but raise ValueError when the substring is not found.";
3202
3203static PyObject *
3204unicode_index(PyUnicodeObject *self, PyObject *args)
3205{
3206 int result;
3207 PyUnicodeObject *substring;
3208 int start = 0;
3209 int end = INT_MAX;
3210
3211 if (!PyArg_ParseTuple(args, "O|ii:index", &substring, &start, &end))
3212 return NULL;
3213
3214 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3215 (PyObject *)substring);
3216 if (substring == NULL)
3217 return NULL;
3218
3219 result = findstring(self, substring, start, end, 1);
3220
3221 Py_DECREF(substring);
3222 if (result < 0) {
3223 PyErr_SetString(PyExc_ValueError, "substring not found");
3224 return NULL;
3225 }
3226 return PyInt_FromLong(result);
3227}
3228
3229static char islower__doc__[] =
3230"S.islower() -> int\n\
3231\n\
3232Return 1 if all cased characters in S are lowercase and there is\n\
3233at least one cased character in S, 0 otherwise.";
3234
3235static PyObject*
3236unicode_islower(PyUnicodeObject *self, PyObject *args)
3237{
3238 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3239 register const Py_UNICODE *e;
3240 int cased;
3241
3242 if (!PyArg_NoArgs(args))
3243 return NULL;
3244
3245 /* Shortcut for single character strings */
3246 if (PyUnicode_GET_SIZE(self) == 1)
3247 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3248
3249 e = p + PyUnicode_GET_SIZE(self);
3250 cased = 0;
3251 for (; p < e; p++) {
3252 register const Py_UNICODE ch = *p;
3253
3254 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3255 return PyInt_FromLong(0);
3256 else if (!cased && Py_UNICODE_ISLOWER(ch))
3257 cased = 1;
3258 }
3259 return PyInt_FromLong(cased);
3260}
3261
3262static char isupper__doc__[] =
3263"S.isupper() -> int\n\
3264\n\
3265Return 1 if all cased characters in S are uppercase and there is\n\
3266at least one cased character in S, 0 otherwise.";
3267
3268static PyObject*
3269unicode_isupper(PyUnicodeObject *self, PyObject *args)
3270{
3271 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3272 register const Py_UNICODE *e;
3273 int cased;
3274
3275 if (!PyArg_NoArgs(args))
3276 return NULL;
3277
3278 /* Shortcut for single character strings */
3279 if (PyUnicode_GET_SIZE(self) == 1)
3280 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3281
3282 e = p + PyUnicode_GET_SIZE(self);
3283 cased = 0;
3284 for (; p < e; p++) {
3285 register const Py_UNICODE ch = *p;
3286
3287 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3288 return PyInt_FromLong(0);
3289 else if (!cased && Py_UNICODE_ISUPPER(ch))
3290 cased = 1;
3291 }
3292 return PyInt_FromLong(cased);
3293}
3294
3295static char istitle__doc__[] =
3296"S.istitle() -> int\n\
3297\n\
3298Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3299may only follow uncased characters and lowercase characters only cased\n\
3300ones. Return 0 otherwise.";
3301
3302static PyObject*
3303unicode_istitle(PyUnicodeObject *self, PyObject *args)
3304{
3305 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3306 register const Py_UNICODE *e;
3307 int cased, previous_is_cased;
3308
3309 if (!PyArg_NoArgs(args))
3310 return NULL;
3311
3312 /* Shortcut for single character strings */
3313 if (PyUnicode_GET_SIZE(self) == 1)
3314 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3315 (Py_UNICODE_ISUPPER(*p) != 0));
3316
3317 e = p + PyUnicode_GET_SIZE(self);
3318 cased = 0;
3319 previous_is_cased = 0;
3320 for (; p < e; p++) {
3321 register const Py_UNICODE ch = *p;
3322
3323 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3324 if (previous_is_cased)
3325 return PyInt_FromLong(0);
3326 previous_is_cased = 1;
3327 cased = 1;
3328 }
3329 else if (Py_UNICODE_ISLOWER(ch)) {
3330 if (!previous_is_cased)
3331 return PyInt_FromLong(0);
3332 previous_is_cased = 1;
3333 cased = 1;
3334 }
3335 else
3336 previous_is_cased = 0;
3337 }
3338 return PyInt_FromLong(cased);
3339}
3340
3341static char isspace__doc__[] =
3342"S.isspace() -> int\n\
3343\n\
3344Return 1 if there are only whitespace characters in S,\n\
33450 otherwise.";
3346
3347static PyObject*
3348unicode_isspace(PyUnicodeObject *self, PyObject *args)
3349{
3350 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3351 register const Py_UNICODE *e;
3352
3353 if (!PyArg_NoArgs(args))
3354 return NULL;
3355
3356 /* Shortcut for single character strings */
3357 if (PyUnicode_GET_SIZE(self) == 1 &&
3358 Py_UNICODE_ISSPACE(*p))
3359 return PyInt_FromLong(1);
3360
3361 e = p + PyUnicode_GET_SIZE(self);
3362 for (; p < e; p++) {
3363 if (!Py_UNICODE_ISSPACE(*p))
3364 return PyInt_FromLong(0);
3365 }
3366 return PyInt_FromLong(1);
3367}
3368
3369static char isdecimal__doc__[] =
3370"S.isdecimal() -> int\n\
3371\n\
3372Return 1 if there are only decimal characters in S,\n\
33730 otherwise.";
3374
3375static PyObject*
3376unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3377{
3378 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3379 register const Py_UNICODE *e;
3380
3381 if (!PyArg_NoArgs(args))
3382 return NULL;
3383
3384 /* Shortcut for single character strings */
3385 if (PyUnicode_GET_SIZE(self) == 1 &&
3386 Py_UNICODE_ISDECIMAL(*p))
3387 return PyInt_FromLong(1);
3388
3389 e = p + PyUnicode_GET_SIZE(self);
3390 for (; p < e; p++) {
3391 if (!Py_UNICODE_ISDECIMAL(*p))
3392 return PyInt_FromLong(0);
3393 }
3394 return PyInt_FromLong(1);
3395}
3396
3397static char isdigit__doc__[] =
3398"S.isdigit() -> int\n\
3399\n\
3400Return 1 if there are only digit characters in S,\n\
34010 otherwise.";
3402
3403static PyObject*
3404unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3405{
3406 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3407 register const Py_UNICODE *e;
3408
3409 if (!PyArg_NoArgs(args))
3410 return NULL;
3411
3412 /* Shortcut for single character strings */
3413 if (PyUnicode_GET_SIZE(self) == 1 &&
3414 Py_UNICODE_ISDIGIT(*p))
3415 return PyInt_FromLong(1);
3416
3417 e = p + PyUnicode_GET_SIZE(self);
3418 for (; p < e; p++) {
3419 if (!Py_UNICODE_ISDIGIT(*p))
3420 return PyInt_FromLong(0);
3421 }
3422 return PyInt_FromLong(1);
3423}
3424
3425static char isnumeric__doc__[] =
3426"S.isnumeric() -> int\n\
3427\n\
3428Return 1 if there are only numeric characters in S,\n\
34290 otherwise.";
3430
3431static PyObject*
3432unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3433{
3434 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3435 register const Py_UNICODE *e;
3436
3437 if (!PyArg_NoArgs(args))
3438 return NULL;
3439
3440 /* Shortcut for single character strings */
3441 if (PyUnicode_GET_SIZE(self) == 1 &&
3442 Py_UNICODE_ISNUMERIC(*p))
3443 return PyInt_FromLong(1);
3444
3445 e = p + PyUnicode_GET_SIZE(self);
3446 for (; p < e; p++) {
3447 if (!Py_UNICODE_ISNUMERIC(*p))
3448 return PyInt_FromLong(0);
3449 }
3450 return PyInt_FromLong(1);
3451}
3452
3453static char join__doc__[] =
3454"S.join(sequence) -> unicode\n\
3455\n\
3456Return a string which is the concatenation of the strings in the\n\
3457sequence. The separator between elements is S.";
3458
3459static PyObject*
3460unicode_join(PyUnicodeObject *self, PyObject *args)
3461{
3462 PyObject *data;
3463 if (!PyArg_ParseTuple(args, "O:join", &data))
3464 return NULL;
3465
3466 return PyUnicode_Join((PyObject *)self, data);
3467}
3468
3469static int
3470unicode_length(PyUnicodeObject *self)
3471{
3472 return self->length;
3473}
3474
3475static char ljust__doc__[] =
3476"S.ljust(width) -> unicode\n\
3477\n\
3478Return S left justified in a Unicode string of length width. Padding is\n\
3479done using spaces.";
3480
3481static PyObject *
3482unicode_ljust(PyUnicodeObject *self, PyObject *args)
3483{
3484 int width;
3485 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3486 return NULL;
3487
3488 if (self->length >= width) {
3489 Py_INCREF(self);
3490 return (PyObject*) self;
3491 }
3492
3493 return (PyObject*) pad(self, 0, width - self->length, ' ');
3494}
3495
3496static char lower__doc__[] =
3497"S.lower() -> unicode\n\
3498\n\
3499Return a copy of the string S converted to lowercase.";
3500
3501static PyObject*
3502unicode_lower(PyUnicodeObject *self, PyObject *args)
3503{
3504 if (!PyArg_NoArgs(args))
3505 return NULL;
3506 return fixup(self, fixlower);
3507}
3508
3509static char lstrip__doc__[] =
3510"S.lstrip() -> unicode\n\
3511\n\
3512Return a copy of the string S with leading whitespace removed.";
3513
3514static PyObject *
3515unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3516{
3517 if (!PyArg_NoArgs(args))
3518 return NULL;
3519 return strip(self, 1, 0);
3520}
3521
3522static PyObject*
3523unicode_repeat(PyUnicodeObject *str, int len)
3524{
3525 PyUnicodeObject *u;
3526 Py_UNICODE *p;
3527
3528 if (len < 0)
3529 len = 0;
3530
3531 if (len == 1) {
3532 /* no repeat, return original string */
3533 Py_INCREF(str);
3534 return (PyObject*) str;
3535 }
3536
3537 u = _PyUnicode_New(len * str->length);
3538 if (!u)
3539 return NULL;
3540
3541 p = u->str;
3542
3543 while (len-- > 0) {
3544 Py_UNICODE_COPY(p, str->str, str->length);
3545 p += str->length;
3546 }
3547
3548 return (PyObject*) u;
3549}
3550
3551PyObject *PyUnicode_Replace(PyObject *obj,
3552 PyObject *subobj,
3553 PyObject *replobj,
3554 int maxcount)
3555{
3556 PyObject *self;
3557 PyObject *str1;
3558 PyObject *str2;
3559 PyObject *result;
3560
3561 self = PyUnicode_FromObject(obj);
3562 if (self == NULL)
3563 return NULL;
3564 str1 = PyUnicode_FromObject(subobj);
3565 if (str1 == NULL) {
3566 Py_DECREF(self);
3567 return NULL;
3568 }
3569 str2 = PyUnicode_FromObject(replobj);
3570 if (str2 == NULL) {
3571 Py_DECREF(self);
3572 Py_DECREF(str1);
3573 return NULL;
3574 }
3575 result = replace((PyUnicodeObject *)self,
3576 (PyUnicodeObject *)str1,
3577 (PyUnicodeObject *)str2,
3578 maxcount);
3579 Py_DECREF(self);
3580 Py_DECREF(str1);
3581 Py_DECREF(str2);
3582 return result;
3583}
3584
3585static char replace__doc__[] =
3586"S.replace (old, new[, maxsplit]) -> unicode\n\
3587\n\
3588Return a copy of S with all occurrences of substring\n\
3589old replaced by new. If the optional argument maxsplit is\n\
3590given, only the first maxsplit occurrences are replaced.";
3591
3592static PyObject*
3593unicode_replace(PyUnicodeObject *self, PyObject *args)
3594{
3595 PyUnicodeObject *str1;
3596 PyUnicodeObject *str2;
3597 int maxcount = -1;
3598 PyObject *result;
3599
3600 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
3601 return NULL;
3602 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
3603 if (str1 == NULL)
3604 return NULL;
3605 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
3606 if (str2 == NULL)
3607 return NULL;
3608
3609 result = replace(self, str1, str2, maxcount);
3610
3611 Py_DECREF(str1);
3612 Py_DECREF(str2);
3613 return result;
3614}
3615
3616static
3617PyObject *unicode_repr(PyObject *unicode)
3618{
3619 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
3620 PyUnicode_GET_SIZE(unicode),
3621 1);
3622}
3623
3624static char rfind__doc__[] =
3625"S.rfind(sub [,start [,end]]) -> int\n\
3626\n\
3627Return the highest index in S where substring sub is found,\n\
3628such that sub is contained within s[start,end]. Optional\n\
3629arguments start and end are interpreted as in slice notation.\n\
3630\n\
3631Return -1 on failure.";
3632
3633static PyObject *
3634unicode_rfind(PyUnicodeObject *self, PyObject *args)
3635{
3636 PyUnicodeObject *substring;
3637 int start = 0;
3638 int end = INT_MAX;
3639 PyObject *result;
3640
3641 if (!PyArg_ParseTuple(args, "O|ii:rfind", &substring, &start, &end))
3642 return NULL;
3643 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3644 (PyObject *)substring);
3645 if (substring == NULL)
3646 return NULL;
3647
3648 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
3649
3650 Py_DECREF(substring);
3651 return result;
3652}
3653
3654static char rindex__doc__[] =
3655"S.rindex(sub [,start [,end]]) -> int\n\
3656\n\
3657Like S.rfind() but raise ValueError when the substring is not found.";
3658
3659static PyObject *
3660unicode_rindex(PyUnicodeObject *self, PyObject *args)
3661{
3662 int result;
3663 PyUnicodeObject *substring;
3664 int start = 0;
3665 int end = INT_MAX;
3666
3667 if (!PyArg_ParseTuple(args, "O|ii:rindex", &substring, &start, &end))
3668 return NULL;
3669 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3670 (PyObject *)substring);
3671 if (substring == NULL)
3672 return NULL;
3673
3674 result = findstring(self, substring, start, end, -1);
3675
3676 Py_DECREF(substring);
3677 if (result < 0) {
3678 PyErr_SetString(PyExc_ValueError, "substring not found");
3679 return NULL;
3680 }
3681 return PyInt_FromLong(result);
3682}
3683
3684static char rjust__doc__[] =
3685"S.rjust(width) -> unicode\n\
3686\n\
3687Return S right justified in a Unicode string of length width. Padding is\n\
3688done using spaces.";
3689
3690static PyObject *
3691unicode_rjust(PyUnicodeObject *self, PyObject *args)
3692{
3693 int width;
3694 if (!PyArg_ParseTuple(args, "i:rjust", &width))
3695 return NULL;
3696
3697 if (self->length >= width) {
3698 Py_INCREF(self);
3699 return (PyObject*) self;
3700 }
3701
3702 return (PyObject*) pad(self, width - self->length, 0, ' ');
3703}
3704
3705static char rstrip__doc__[] =
3706"S.rstrip() -> unicode\n\
3707\n\
3708Return a copy of the string S with trailing whitespace removed.";
3709
3710static PyObject *
3711unicode_rstrip(PyUnicodeObject *self, PyObject *args)
3712{
3713 if (!PyArg_NoArgs(args))
3714 return NULL;
3715 return strip(self, 0, 1);
3716}
3717
3718static PyObject*
3719unicode_slice(PyUnicodeObject *self, int start, int end)
3720{
3721 /* standard clamping */
3722 if (start < 0)
3723 start = 0;
3724 if (end < 0)
3725 end = 0;
3726 if (end > self->length)
3727 end = self->length;
3728 if (start == 0 && end == self->length) {
3729 /* full slice, return original string */
3730 Py_INCREF(self);
3731 return (PyObject*) self;
3732 }
3733 if (start > end)
3734 start = end;
3735 /* copy slice */
3736 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
3737 end - start);
3738}
3739
3740PyObject *PyUnicode_Split(PyObject *s,
3741 PyObject *sep,
3742 int maxsplit)
3743{
3744 PyObject *result;
3745
3746 s = PyUnicode_FromObject(s);
3747 if (s == NULL)
3748 return NULL;
3749 if (sep != NULL) {
3750 sep = PyUnicode_FromObject(sep);
3751 if (sep == NULL) {
3752 Py_DECREF(s);
3753 return NULL;
3754 }
3755 }
3756
3757 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
3758
3759 Py_DECREF(s);
3760 Py_XDECREF(sep);
3761 return result;
3762}
3763
3764static char split__doc__[] =
3765"S.split([sep [,maxsplit]]) -> list of strings\n\
3766\n\
3767Return a list of the words in S, using sep as the\n\
3768delimiter string. If maxsplit is given, at most maxsplit\n\
3769splits are done. If sep is not specified, any whitespace string\n\
3770is a separator.";
3771
3772static PyObject*
3773unicode_split(PyUnicodeObject *self, PyObject *args)
3774{
3775 PyObject *substring = Py_None;
3776 int maxcount = -1;
3777
3778 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
3779 return NULL;
3780
3781 if (substring == Py_None)
3782 return split(self, NULL, maxcount);
3783 else if (PyUnicode_Check(substring))
3784 return split(self, (PyUnicodeObject *)substring, maxcount);
3785 else
3786 return PyUnicode_Split((PyObject *)self, substring, maxcount);
3787}
3788
3789static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00003790"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00003791\n\
3792Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00003793Line breaks are not included in the resulting list unless keepends\n\
3794is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003795
3796static PyObject*
3797unicode_splitlines(PyUnicodeObject *self, PyObject *args)
3798{
Guido van Rossum86662912000-04-11 15:38:46 +00003799 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003800
Guido van Rossum86662912000-04-11 15:38:46 +00003801 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003802 return NULL;
3803
Guido van Rossum86662912000-04-11 15:38:46 +00003804 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003805}
3806
3807static
3808PyObject *unicode_str(PyUnicodeObject *self)
3809{
3810 return PyUnicode_AsUTF8String((PyObject *)self);
3811}
3812
3813static char strip__doc__[] =
3814"S.strip() -> unicode\n\
3815\n\
3816Return a copy of S with leading and trailing whitespace removed.";
3817
3818static PyObject *
3819unicode_strip(PyUnicodeObject *self, PyObject *args)
3820{
3821 if (!PyArg_NoArgs(args))
3822 return NULL;
3823 return strip(self, 1, 1);
3824}
3825
3826static char swapcase__doc__[] =
3827"S.swapcase() -> unicode\n\
3828\n\
3829Return a copy of S with uppercase characters converted to lowercase\n\
3830and vice versa.";
3831
3832static PyObject*
3833unicode_swapcase(PyUnicodeObject *self, PyObject *args)
3834{
3835 if (!PyArg_NoArgs(args))
3836 return NULL;
3837 return fixup(self, fixswapcase);
3838}
3839
3840static char translate__doc__[] =
3841"S.translate(table) -> unicode\n\
3842\n\
3843Return a copy of the string S, where all characters have been mapped\n\
3844through the given translation table, which must be a mapping of\n\
3845Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
3846are left untouched. Characters mapped to None are deleted.";
3847
3848static PyObject*
3849unicode_translate(PyUnicodeObject *self, PyObject *args)
3850{
3851 PyObject *table;
3852
3853 if (!PyArg_ParseTuple(args, "O:translate", &table))
3854 return NULL;
3855 return PyUnicode_TranslateCharmap(self->str,
3856 self->length,
3857 table,
3858 "ignore");
3859}
3860
3861static char upper__doc__[] =
3862"S.upper() -> unicode\n\
3863\n\
3864Return a copy of S converted to uppercase.";
3865
3866static PyObject*
3867unicode_upper(PyUnicodeObject *self, PyObject *args)
3868{
3869 if (!PyArg_NoArgs(args))
3870 return NULL;
3871 return fixup(self, fixupper);
3872}
3873
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method implementation of S.zfill(width) (currently compiled out).

   Strings already at least width long are returned unchanged with a
   new reference.  Otherwise the string is padded via pad() with '0'
   and, if the first original character is a sign, the sign is moved
   to the front.  Returns NULL if parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    if (u == NULL)
        /* pad() failed; do not touch u->str below */
        return NULL;

    /* u->str[fill] is where the original first character ended up */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
3909
#if 0
/* Debugging aid (compiled out): report unicode_freelist_size as a
   Python int.  Takes no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
3919
3920static char startswith__doc__[] =
3921"S.startswith(prefix[, start[, end]]) -> int\n\
3922\n\
3923Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
3924optional start, test S beginning at that position. With optional end, stop\n\
3925comparing S at that position.";
3926
3927static PyObject *
3928unicode_startswith(PyUnicodeObject *self,
3929 PyObject *args)
3930{
3931 PyUnicodeObject *substring;
3932 int start = 0;
3933 int end = INT_MAX;
3934 PyObject *result;
3935
3936 if (!PyArg_ParseTuple(args, "O|ii:startswith", &substring, &start, &end))
3937 return NULL;
3938 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3939 (PyObject *)substring);
3940 if (substring == NULL)
3941 return NULL;
3942
3943 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
3944
3945 Py_DECREF(substring);
3946 return result;
3947}
3948
3949
3950static char endswith__doc__[] =
3951"S.endswith(suffix[, start[, end]]) -> int\n\
3952\n\
3953Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
3954optional start, test S beginning at that position. With optional end, stop\n\
3955comparing S at that position.";
3956
3957static PyObject *
3958unicode_endswith(PyUnicodeObject *self,
3959 PyObject *args)
3960{
3961 PyUnicodeObject *substring;
3962 int start = 0;
3963 int end = INT_MAX;
3964 PyObject *result;
3965
3966 if (!PyArg_ParseTuple(args, "O|ii:endswith", &substring, &start, &end))
3967 return NULL;
3968 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3969 (PyObject *)substring);
3970 if (substring == NULL)
3971 return NULL;
3972
3973 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
3974
3975 Py_DECREF(substring);
3976 return result;
3977}
3978
3979
3980static PyMethodDef unicode_methods[] = {
3981
3982 /* Order is according to common usage: often used methods should
3983 appear first, since lookup is done sequentially. */
3984
3985 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
3986 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
3987 {"split", (PyCFunction) unicode_split, 1, split__doc__},
3988 {"join", (PyCFunction) unicode_join, 1, join__doc__},
3989 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
3990 {"title", (PyCFunction) unicode_title, 0, title__doc__},
3991 {"center", (PyCFunction) unicode_center, 1, center__doc__},
3992 {"count", (PyCFunction) unicode_count, 1, count__doc__},
3993 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
3994 {"find", (PyCFunction) unicode_find, 1, find__doc__},
3995 {"index", (PyCFunction) unicode_index, 1, index__doc__},
3996 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
3997 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
3998 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
3999/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4000 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4001 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4002 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4003 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4004 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4005 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4006 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4007 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4008 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4009 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4010 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4011 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4012 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4013 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4014 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4015 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4016 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4017 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
4018#if 0
4019 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4020 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4021#endif
4022
4023#if 0
4024 /* This one is just used for debugging the implementation. */
4025 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4026#endif
4027
4028 {NULL, NULL}
4029};
4030
4031static PyObject *
4032unicode_getattr(PyUnicodeObject *self, char *name)
4033{
4034 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4035}
4036
4037static PySequenceMethods unicode_as_sequence = {
4038 (inquiry) unicode_length, /* sq_length */
4039 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4040 (intargfunc) unicode_repeat, /* sq_repeat */
4041 (intargfunc) unicode_getitem, /* sq_item */
4042 (intintargfunc) unicode_slice, /* sq_slice */
4043 0, /* sq_ass_item */
4044 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004045 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004046};
4047
4048static int
4049unicode_buffer_getreadbuf(PyUnicodeObject *self,
4050 int index,
4051 const void **ptr)
4052{
4053 if (index != 0) {
4054 PyErr_SetString(PyExc_SystemError,
4055 "accessing non-existent unicode segment");
4056 return -1;
4057 }
4058 *ptr = (void *) self->str;
4059 return PyUnicode_GET_DATA_SIZE(self);
4060}
4061
4062static int
4063unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4064 const void **ptr)
4065{
4066 PyErr_SetString(PyExc_TypeError,
4067 "cannot use unicode as modifyable buffer");
4068 return -1;
4069}
4070
4071static int
4072unicode_buffer_getsegcount(PyUnicodeObject *self,
4073 int *lenp)
4074{
4075 if (lenp)
4076 *lenp = PyUnicode_GET_DATA_SIZE(self);
4077 return 1;
4078}
4079
4080static int
4081unicode_buffer_getcharbuf(PyUnicodeObject *self,
4082 int index,
4083 const void **ptr)
4084{
4085 PyObject *str;
4086
4087 if (index != 0) {
4088 PyErr_SetString(PyExc_SystemError,
4089 "accessing non-existent unicode segment");
4090 return -1;
4091 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +00004092 str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004093 if (str == NULL)
4094 return -1;
4095 *ptr = (void *) PyString_AS_STRING(str);
4096 return PyString_GET_SIZE(str);
4097}
4098
4099/* Helpers for PyUnicode_Format() */
4100
4101static PyObject *
4102getnextarg(args, arglen, p_argidx)
4103 PyObject *args;
4104int arglen;
4105int *p_argidx;
4106{
4107 int argidx = *p_argidx;
4108 if (argidx < arglen) {
4109 (*p_argidx)++;
4110 if (arglen < 0)
4111 return args;
4112 else
4113 return PyTuple_GetItem(args, argidx);
4114 }
4115 PyErr_SetString(PyExc_TypeError,
4116 "not enough arguments for format string");
4117 return NULL;
4118}
4119
/* Flag bits recorded while parsing a '%' conversion specifier; the
   trailing comment names the flag character that sets each one. */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
4125
4126static
4127#ifdef HAVE_STDARG_PROTOTYPES
4128int usprintf(register Py_UNICODE *buffer, char *format, ...)
4129#else
4130int usprintf(va_alist) va_dcl
4131#endif
4132{
4133 register int i;
4134 int len;
4135 va_list va;
4136 char *charbuffer;
4137#ifdef HAVE_STDARG_PROTOTYPES
4138 va_start(va, format);
4139#else
4140 Py_UNICODE *args;
4141 char *format;
4142
4143 va_start(va);
4144 buffer = va_arg(va, Py_UNICODE *);
4145 format = va_arg(va, char *);
4146#endif
4147
4148 /* First, format the string as char array, then expand to Py_UNICODE
4149 array. */
4150 charbuffer = (char *)buffer;
4151 len = vsprintf(charbuffer, format, va);
4152 for (i = len - 1; i >= 0; i--)
4153 buffer[i] = (Py_UNICODE) charbuffer[i];
4154
4155 va_end(va);
4156 return len;
4157}
4158
4159static int
4160formatfloat(Py_UNICODE *buf,
4161 int flags,
4162 int prec,
4163 int type,
4164 PyObject *v)
4165{
4166 char fmt[20];
4167 double x;
4168
4169 x = PyFloat_AsDouble(v);
4170 if (x == -1.0 && PyErr_Occurred())
4171 return -1;
4172 if (prec < 0)
4173 prec = 6;
4174 if (prec > 50)
4175 prec = 50; /* Arbitrary limitation */
4176 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4177 type = 'g';
4178 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4179 return usprintf(buf, fmt, x);
4180}
4181
4182static int
4183formatint(Py_UNICODE *buf,
4184 int flags,
4185 int prec,
4186 int type,
4187 PyObject *v)
4188{
4189 char fmt[20];
4190 long x;
4191
4192 x = PyInt_AsLong(v);
4193 if (x == -1 && PyErr_Occurred())
4194 return -1;
4195 if (prec < 0)
4196 prec = 1;
4197 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4198 return usprintf(buf, fmt, x);
4199}
4200
4201static int
4202formatchar(Py_UNICODE *buf,
4203 PyObject *v)
4204{
4205 if (PyUnicode_Check(v))
4206 buf[0] = PyUnicode_AS_UNICODE(v)[0];
4207
4208 else if (PyString_Check(v))
4209 buf[0] = (Py_UNICODE) PyString_AS_STRING(v)[0];
4210
4211 else {
4212 /* Integer input truncated to a character */
4213 long x;
4214 x = PyInt_AsLong(v);
4215 if (x == -1 && PyErr_Occurred())
4216 return -1;
4217 buf[0] = (char) x;
4218 }
4219 buf[1] = '\0';
4220 return 1;
4221}
4222
4223PyObject *PyUnicode_Format(PyObject *format,
4224 PyObject *args)
4225{
4226 Py_UNICODE *fmt, *res;
4227 int fmtcnt, rescnt, reslen, arglen, argidx;
4228 int args_owned = 0;
4229 PyUnicodeObject *result = NULL;
4230 PyObject *dict = NULL;
4231 PyObject *uformat;
4232
4233 if (format == NULL || args == NULL) {
4234 PyErr_BadInternalCall();
4235 return NULL;
4236 }
4237 uformat = PyUnicode_FromObject(format);
4238 fmt = PyUnicode_AS_UNICODE(uformat);
4239 fmtcnt = PyUnicode_GET_SIZE(uformat);
4240
4241 reslen = rescnt = fmtcnt + 100;
4242 result = _PyUnicode_New(reslen);
4243 if (result == NULL)
4244 goto onError;
4245 res = PyUnicode_AS_UNICODE(result);
4246
4247 if (PyTuple_Check(args)) {
4248 arglen = PyTuple_Size(args);
4249 argidx = 0;
4250 }
4251 else {
4252 arglen = -1;
4253 argidx = -2;
4254 }
4255 if (args->ob_type->tp_as_mapping)
4256 dict = args;
4257
4258 while (--fmtcnt >= 0) {
4259 if (*fmt != '%') {
4260 if (--rescnt < 0) {
4261 rescnt = fmtcnt + 100;
4262 reslen += rescnt;
4263 if (_PyUnicode_Resize(result, reslen) < 0)
4264 return NULL;
4265 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4266 --rescnt;
4267 }
4268 *res++ = *fmt++;
4269 }
4270 else {
4271 /* Got a format specifier */
4272 int flags = 0;
4273 int width = -1;
4274 int prec = -1;
4275 int size = 0;
4276 Py_UNICODE c = '\0';
4277 Py_UNICODE fill;
4278 PyObject *v = NULL;
4279 PyObject *temp = NULL;
4280 Py_UNICODE *buf;
4281 Py_UNICODE sign;
4282 int len;
4283 Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
4284
4285 fmt++;
4286 if (*fmt == '(') {
4287 Py_UNICODE *keystart;
4288 int keylen;
4289 PyObject *key;
4290 int pcount = 1;
4291
4292 if (dict == NULL) {
4293 PyErr_SetString(PyExc_TypeError,
4294 "format requires a mapping");
4295 goto onError;
4296 }
4297 ++fmt;
4298 --fmtcnt;
4299 keystart = fmt;
4300 /* Skip over balanced parentheses */
4301 while (pcount > 0 && --fmtcnt >= 0) {
4302 if (*fmt == ')')
4303 --pcount;
4304 else if (*fmt == '(')
4305 ++pcount;
4306 fmt++;
4307 }
4308 keylen = fmt - keystart - 1;
4309 if (fmtcnt < 0 || pcount > 0) {
4310 PyErr_SetString(PyExc_ValueError,
4311 "incomplete format key");
4312 goto onError;
4313 }
4314 /* keys are converted to strings (using UTF-8) and
4315 then looked up since Python uses strings to hold
4316 variables names etc. in its namespaces and we
4317 wouldn't want to break common idioms. The
4318 alternative would be using Unicode objects for the
4319 lookup but u"abc" and "abc" have different hash
4320 values (on purpose). */
4321 key = PyUnicode_EncodeUTF8(keystart,
4322 keylen,
4323 NULL);
4324 if (key == NULL)
4325 goto onError;
4326 if (args_owned) {
4327 Py_DECREF(args);
4328 args_owned = 0;
4329 }
4330 args = PyObject_GetItem(dict, key);
4331 Py_DECREF(key);
4332 if (args == NULL) {
4333 goto onError;
4334 }
4335 args_owned = 1;
4336 arglen = -1;
4337 argidx = -2;
4338 }
4339 while (--fmtcnt >= 0) {
4340 switch (c = *fmt++) {
4341 case '-': flags |= F_LJUST; continue;
4342 case '+': flags |= F_SIGN; continue;
4343 case ' ': flags |= F_BLANK; continue;
4344 case '#': flags |= F_ALT; continue;
4345 case '0': flags |= F_ZERO; continue;
4346 }
4347 break;
4348 }
4349 if (c == '*') {
4350 v = getnextarg(args, arglen, &argidx);
4351 if (v == NULL)
4352 goto onError;
4353 if (!PyInt_Check(v)) {
4354 PyErr_SetString(PyExc_TypeError,
4355 "* wants int");
4356 goto onError;
4357 }
4358 width = PyInt_AsLong(v);
4359 if (width < 0) {
4360 flags |= F_LJUST;
4361 width = -width;
4362 }
4363 if (--fmtcnt >= 0)
4364 c = *fmt++;
4365 }
4366 else if (c >= '0' && c <= '9') {
4367 width = c - '0';
4368 while (--fmtcnt >= 0) {
4369 c = *fmt++;
4370 if (c < '0' || c > '9')
4371 break;
4372 if ((width*10) / 10 != width) {
4373 PyErr_SetString(PyExc_ValueError,
4374 "width too big");
4375 goto onError;
4376 }
4377 width = width*10 + (c - '0');
4378 }
4379 }
4380 if (c == '.') {
4381 prec = 0;
4382 if (--fmtcnt >= 0)
4383 c = *fmt++;
4384 if (c == '*') {
4385 v = getnextarg(args, arglen, &argidx);
4386 if (v == NULL)
4387 goto onError;
4388 if (!PyInt_Check(v)) {
4389 PyErr_SetString(PyExc_TypeError,
4390 "* wants int");
4391 goto onError;
4392 }
4393 prec = PyInt_AsLong(v);
4394 if (prec < 0)
4395 prec = 0;
4396 if (--fmtcnt >= 0)
4397 c = *fmt++;
4398 }
4399 else if (c >= '0' && c <= '9') {
4400 prec = c - '0';
4401 while (--fmtcnt >= 0) {
4402 c = Py_CHARMASK(*fmt++);
4403 if (c < '0' || c > '9')
4404 break;
4405 if ((prec*10) / 10 != prec) {
4406 PyErr_SetString(PyExc_ValueError,
4407 "prec too big");
4408 goto onError;
4409 }
4410 prec = prec*10 + (c - '0');
4411 }
4412 }
4413 } /* prec */
4414 if (fmtcnt >= 0) {
4415 if (c == 'h' || c == 'l' || c == 'L') {
4416 size = c;
4417 if (--fmtcnt >= 0)
4418 c = *fmt++;
4419 }
4420 }
4421 if (fmtcnt < 0) {
4422 PyErr_SetString(PyExc_ValueError,
4423 "incomplete format");
4424 goto onError;
4425 }
4426 if (c != '%') {
4427 v = getnextarg(args, arglen, &argidx);
4428 if (v == NULL)
4429 goto onError;
4430 }
4431 sign = 0;
4432 fill = ' ';
4433 switch (c) {
4434
4435 case '%':
4436 buf = tmpbuf;
4437 buf[0] = '%';
4438 len = 1;
4439 break;
4440
4441 case 's':
4442 case 'r':
4443 if (PyUnicode_Check(v) && c == 's') {
4444 temp = v;
4445 Py_INCREF(temp);
4446 }
4447 else {
4448 PyObject *unicode;
4449 if (c == 's')
4450 temp = PyObject_Str(v);
4451 else
4452 temp = PyObject_Repr(v);
4453 if (temp == NULL)
4454 goto onError;
4455 if (!PyString_Check(temp)) {
4456 /* XXX Note: this should never happen, since
4457 PyObject_Repr() and PyObject_Str() assure
4458 this */
4459 Py_DECREF(temp);
4460 PyErr_SetString(PyExc_TypeError,
4461 "%s argument has non-string str()");
4462 goto onError;
4463 }
4464 unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
4465 PyString_GET_SIZE(temp),
4466 "strict");
4467 Py_DECREF(temp);
4468 temp = unicode;
4469 if (temp == NULL)
4470 goto onError;
4471 }
4472 buf = PyUnicode_AS_UNICODE(temp);
4473 len = PyUnicode_GET_SIZE(temp);
4474 if (prec >= 0 && len > prec)
4475 len = prec;
4476 break;
4477
4478 case 'i':
4479 case 'd':
4480 case 'u':
4481 case 'o':
4482 case 'x':
4483 case 'X':
4484 if (c == 'i')
4485 c = 'd';
4486 buf = tmpbuf;
4487 len = formatint(buf, flags, prec, c, v);
4488 if (len < 0)
4489 goto onError;
4490 sign = (c == 'd');
4491 if (flags & F_ZERO) {
4492 fill = '0';
4493 if ((flags&F_ALT) &&
4494 (c == 'x' || c == 'X') &&
4495 buf[0] == '0' && buf[1] == c) {
4496 *res++ = *buf++;
4497 *res++ = *buf++;
4498 rescnt -= 2;
4499 len -= 2;
4500 width -= 2;
4501 if (width < 0)
4502 width = 0;
4503 }
4504 }
4505 break;
4506
4507 case 'e':
4508 case 'E':
4509 case 'f':
4510 case 'g':
4511 case 'G':
4512 buf = tmpbuf;
4513 len = formatfloat(buf, flags, prec, c, v);
4514 if (len < 0)
4515 goto onError;
4516 sign = 1;
4517 if (flags&F_ZERO)
4518 fill = '0';
4519 break;
4520
4521 case 'c':
4522 buf = tmpbuf;
4523 len = formatchar(buf, v);
4524 if (len < 0)
4525 goto onError;
4526 break;
4527
4528 default:
4529 PyErr_Format(PyExc_ValueError,
4530 "unsupported format character '%c' (0x%x)",
4531 c, c);
4532 goto onError;
4533 }
4534 if (sign) {
4535 if (*buf == '-' || *buf == '+') {
4536 sign = *buf++;
4537 len--;
4538 }
4539 else if (flags & F_SIGN)
4540 sign = '+';
4541 else if (flags & F_BLANK)
4542 sign = ' ';
4543 else
4544 sign = 0;
4545 }
4546 if (width < len)
4547 width = len;
4548 if (rescnt < width + (sign != 0)) {
4549 reslen -= rescnt;
4550 rescnt = width + fmtcnt + 100;
4551 reslen += rescnt;
4552 if (_PyUnicode_Resize(result, reslen) < 0)
4553 return NULL;
4554 res = PyUnicode_AS_UNICODE(result)
4555 + reslen - rescnt;
4556 }
4557 if (sign) {
4558 if (fill != ' ')
4559 *res++ = sign;
4560 rescnt--;
4561 if (width > len)
4562 width--;
4563 }
4564 if (width > len && !(flags & F_LJUST)) {
4565 do {
4566 --rescnt;
4567 *res++ = fill;
4568 } while (--width > len);
4569 }
4570 if (sign && fill == ' ')
4571 *res++ = sign;
4572 memcpy(res, buf, len * sizeof(Py_UNICODE));
4573 res += len;
4574 rescnt -= len;
4575 while (--width >= len) {
4576 --rescnt;
4577 *res++ = ' ';
4578 }
4579 if (dict && (argidx < arglen) && c != '%') {
4580 PyErr_SetString(PyExc_TypeError,
4581 "not all arguments converted");
4582 goto onError;
4583 }
4584 Py_XDECREF(temp);
4585 } /* '%' */
4586 } /* until end */
4587 if (argidx < arglen && !dict) {
4588 PyErr_SetString(PyExc_TypeError,
4589 "not all arguments converted");
4590 goto onError;
4591 }
4592
4593 if (args_owned) {
4594 Py_DECREF(args);
4595 }
4596 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004597 if (_PyUnicode_Resize(result, reslen - rescnt))
4598 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004599 return (PyObject *)result;
4600
4601 onError:
4602 Py_XDECREF(result);
4603 Py_DECREF(uformat);
4604 if (args_owned) {
4605 Py_DECREF(args);
4606 }
4607 return NULL;
4608}
4609
4610static PyBufferProcs unicode_as_buffer = {
4611 (getreadbufferproc) unicode_buffer_getreadbuf,
4612 (getwritebufferproc) unicode_buffer_getwritebuf,
4613 (getsegcountproc) unicode_buffer_getsegcount,
4614 (getcharbufferproc) unicode_buffer_getcharbuf,
4615};
4616
4617PyTypeObject PyUnicode_Type = {
4618 PyObject_HEAD_INIT(&PyType_Type)
4619 0, /* ob_size */
4620 "unicode", /* tp_name */
4621 sizeof(PyUnicodeObject), /* tp_size */
4622 0, /* tp_itemsize */
4623 /* Slots */
4624 (destructor)_PyUnicode_Free, /* tp_dealloc */
4625 0, /* tp_print */
4626 (getattrfunc)unicode_getattr, /* tp_getattr */
4627 0, /* tp_setattr */
4628 (cmpfunc) unicode_compare, /* tp_compare */
4629 (reprfunc) unicode_repr, /* tp_repr */
4630 0, /* tp_as_number */
4631 &unicode_as_sequence, /* tp_as_sequence */
4632 0, /* tp_as_mapping */
4633 (hashfunc) unicode_hash, /* tp_hash*/
4634 0, /* tp_call*/
4635 (reprfunc) unicode_str, /* tp_str */
4636 (getattrofunc) NULL, /* tp_getattro */
4637 (setattrofunc) NULL, /* tp_setattro */
4638 &unicode_as_buffer, /* tp_as_buffer */
4639 Py_TPFLAGS_DEFAULT, /* tp_flags */
4640};
4641
4642/* Initialize the Unicode implementation */
4643
4644void _PyUnicode_Init()
4645{
4646 /* Doublecheck the configuration... */
4647 if (sizeof(Py_UNICODE) != 2)
4648 Py_FatalError("Unicode configuration error: "
4649 "sizeof(Py_UNICODE) != 2 bytes");
4650
4651 unicode_empty = _PyUnicode_New(0);
4652}
4653
4654/* Finalize the Unicode implementation */
4655
4656void
4657_PyUnicode_Fini()
4658{
4659 PyUnicodeObject *u = unicode_freelist;
4660
4661 while (u != NULL) {
4662 PyUnicodeObject *v = u;
4663 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004664 if (v->str)
4665 free(v->str);
4666 Py_XDECREF(v->utf8str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667 free(v);
4668 }
4669 Py_XDECREF(unicode_empty);
4670}