blob: b9223eb5d8a17081c6df0ca963a4130b63762e36 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "mymath.h"
68#include "unicodeobject.h"
69
70#if defined(HAVE_LIMITS_H)
71#include <limits.h>
72#else
73#define INT_MAX 2147483647
74#endif
75
76/* Limit for the Unicode object free list */
77
78#define MAX_UNICODE_FREELIST_SIZE 1024
79
80/* Limit for the Unicode object free list stay alive optimization.
81
82 The implementation will keep allocated Unicode memory intact for
83 all objects on the free list having a size less than this
84 limit. This reduces malloc() overhead for small Unicode objects.
85
86 At worse this will result in MAX_UNICODE_FREELIST_SIZE *
87 (sizeof(PyUnicodeObject) + STAYALIVE_SIZE_LIMIT +
88 malloc()-overhead) bytes of unused garbage.
89
90 Setting the limit to 0 effectively turns the feature off.
91
92 XXX The feature is currently turned off because there are
93 apparently some lingering bugs in its implementation which I
94 haven't yet been able to sort out.
95
96*/
97
98#define STAYALIVE_SIZE_LIMIT 0
99
100/* Endianness switches; defaults to little endian */
101
102#ifdef WORDS_BIGENDIAN
103# define BYTEORDER_IS_BIG_ENDIAN
104#else
105# define BYTEORDER_IS_LITTLE_ENDIAN
106#endif
107
108/* --- Globals ------------------------------------------------------------ */
109
110/* The empty Unicode object */
111static PyUnicodeObject *unicode_empty = NULL;
112
113/* Free list for Unicode objects */
114static PyUnicodeObject *unicode_freelist = NULL;
115static int unicode_freelist_size = 0;
116
117/* --- Unicode Object ----------------------------------------------------- */
118
119static
120int _PyUnicode_Resize(register PyUnicodeObject *unicode,
121 int length)
122{
123 void *oldstr;
124
125 /* Shortcut if there's nothing to do. */
126 if (unicode->length == length)
127 return 0;
128
129 /* Resizing unicode_empty is not allowed. */
130 if (unicode == unicode_empty) {
131 PyErr_SetString(PyExc_SystemError,
132 "can't resize empty unicode object");
133 return -1;
134 }
135
136 /* We allocate one more byte to make sure the string is
137 Ux0000 terminated -- XXX is this needed ? */
138 oldstr = unicode->str;
139 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
140 if (!unicode->str) {
141 unicode->str = oldstr;
142 PyErr_NoMemory();
143 return -1;
144 }
145 unicode->str[length] = 0;
146 unicode->length = length;
147
148 /* Reset the object caches */
149 if (unicode->utf8str) {
150 Py_DECREF(unicode->utf8str);
151 unicode->utf8str = NULL;
152 }
153 unicode->hash = -1;
154
155 return 0;
156}
157
158/* We allocate one more byte to make sure the string is
159 Ux0000 terminated -- XXX is this needed ?
160
161 XXX This allocator could further be enhanced by assuring that the
162 free list never reduces its size below 1.
163
164*/
165
166static
167PyUnicodeObject *_PyUnicode_New(int length)
168{
169 register PyUnicodeObject *unicode;
170
171 /* Optimization for empty strings */
172 if (length == 0 && unicode_empty != NULL) {
173 Py_INCREF(unicode_empty);
174 return unicode_empty;
175 }
176
177 /* Unicode freelist & memory allocation */
178 if (unicode_freelist) {
179 unicode = unicode_freelist;
180 unicode_freelist = *(PyUnicodeObject **)unicode_freelist;
181 unicode_freelist_size--;
182 unicode->ob_type = &PyUnicode_Type;
183 _Py_NewReference(unicode);
184 if (unicode->str) {
185 if (unicode->length < length &&
186 _PyUnicode_Resize(unicode, length)) {
187 free(unicode->str);
188 PyMem_DEL(unicode);
189 return NULL;
190 }
191 }
192 else
193 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
194 }
195 else {
196 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
197 if (unicode == NULL)
198 return NULL;
199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
200 }
201
202 if (!unicode->str) {
203 PyMem_DEL(unicode);
204 PyErr_NoMemory();
205 return NULL;
206 }
207 unicode->str[length] = 0;
208 unicode->length = length;
209 unicode->hash = -1;
210 unicode->utf8str = NULL;
211 return unicode;
212}
213
214static
215void _PyUnicode_Free(register PyUnicodeObject *unicode)
216{
217 Py_XDECREF(unicode->utf8str);
218 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
219 if (unicode->length >= STAYALIVE_SIZE_LIMIT) {
220 free(unicode->str);
221 unicode->str = NULL;
222 unicode->length = 0;
223 }
224 *(PyUnicodeObject **)unicode = unicode_freelist;
225 unicode_freelist = unicode;
226 unicode_freelist_size++;
227 _Py_ForgetReference(unicode);
228 }
229 else {
230 free(unicode->str);
231 PyMem_DEL(unicode);
232 }
233}
234
235PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
236 int size)
237{
238 PyUnicodeObject *unicode;
239
240 unicode = _PyUnicode_New(size);
241 if (!unicode)
242 return NULL;
243
244 /* Copy the Unicode data into the new object */
245 if (u != NULL)
246 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
247
248 return (PyObject *)unicode;
249}
250
251#ifdef HAVE_WCHAR_H
252
253PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
254 int size)
255{
256 PyUnicodeObject *unicode;
257
258 if (w == NULL) {
259 PyErr_BadInternalCall();
260 return NULL;
261 }
262
263 unicode = _PyUnicode_New(size);
264 if (!unicode)
265 return NULL;
266
267 /* Copy the wchar_t data into the new object */
268#ifdef HAVE_USABLE_WCHAR_T
269 memcpy(unicode->str, w, size * sizeof(wchar_t));
270#else
271 {
272 register Py_UNICODE *u;
273 register int i;
274 u = PyUnicode_AS_UNICODE(unicode);
275 for (i = size; i >= 0; i--)
276 *u++ = *w++;
277 }
278#endif
279
280 return (PyObject *)unicode;
281}
282
283int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
284 register wchar_t *w,
285 int size)
286{
287 if (unicode == NULL) {
288 PyErr_BadInternalCall();
289 return -1;
290 }
291 if (size > PyUnicode_GET_SIZE(unicode))
292 size = PyUnicode_GET_SIZE(unicode);
293#ifdef HAVE_USABLE_WCHAR_T
294 memcpy(w, unicode->str, size * sizeof(wchar_t));
295#else
296 {
297 register Py_UNICODE *u;
298 register int i;
299 u = PyUnicode_AS_UNICODE(unicode);
300 for (i = size; i >= 0; i--)
301 *w++ = *u++;
302 }
303#endif
304
305 return size;
306}
307
308#endif
309
310PyObject *PyUnicode_FromObject(register PyObject *obj)
311{
312 const char *s;
313 int len;
314
315 if (obj == NULL) {
316 PyErr_BadInternalCall();
317 return NULL;
318 }
319 else if (PyUnicode_Check(obj)) {
320 Py_INCREF(obj);
321 return obj;
322 }
323 else if (PyString_Check(obj)) {
324 s = PyString_AS_STRING(obj);
325 len = PyString_GET_SIZE(obj);
326 }
327 else if (PyObject_AsCharBuffer(obj, &s, &len))
328 return NULL;
329 if (len == 0) {
330 Py_INCREF(unicode_empty);
331 return (PyObject *)unicode_empty;
332 }
333 return PyUnicode_DecodeUTF8(s, len, "strict");
334}
335
336PyObject *PyUnicode_Decode(const char *s,
337 int size,
338 const char *encoding,
339 const char *errors)
340{
341 PyObject *buffer = NULL, *unicode;
342
343 /* Shortcut for the default encoding UTF-8 */
344 if (encoding == NULL ||
345 (strcmp(encoding, "utf-8") == 0))
346 return PyUnicode_DecodeUTF8(s, size, errors);
347
348 /* Decode via the codec registry */
349 buffer = PyBuffer_FromMemory((void *)s, size);
350 if (buffer == NULL)
351 goto onError;
352 unicode = PyCodec_Decode(buffer, encoding, errors);
353 if (unicode == NULL)
354 goto onError;
355 if (!PyUnicode_Check(unicode)) {
356 PyErr_Format(PyExc_TypeError,
357 "decoder did not return an unicode object (type=%s)",
358 unicode->ob_type->tp_name);
359 Py_DECREF(unicode);
360 goto onError;
361 }
362 Py_DECREF(buffer);
363 return unicode;
364
365 onError:
366 Py_XDECREF(buffer);
367 return NULL;
368}
369
370PyObject *PyUnicode_Encode(const Py_UNICODE *s,
371 int size,
372 const char *encoding,
373 const char *errors)
374{
375 PyObject *v, *unicode;
376
377 unicode = PyUnicode_FromUnicode(s, size);
378 if (unicode == NULL)
379 return NULL;
380 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
381 Py_DECREF(unicode);
382 return v;
383}
384
385PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
386 const char *encoding,
387 const char *errors)
388{
389 PyObject *v;
390
391 if (!PyUnicode_Check(unicode)) {
392 PyErr_BadArgument();
393 goto onError;
394 }
395 /* Shortcut for the default encoding UTF-8 */
396 if ((encoding == NULL ||
397 (strcmp(encoding, "utf-8") == 0)) &&
398 errors == NULL)
399 return PyUnicode_AsUTF8String(unicode);
400
401 /* Encode via the codec registry */
402 v = PyCodec_Encode(unicode, encoding, errors);
403 if (v == NULL)
404 goto onError;
405 /* XXX Should we really enforce this ? */
406 if (!PyString_Check(v)) {
407 PyErr_Format(PyExc_TypeError,
408 "encoder did not return a string object (type=%s)",
409 v->ob_type->tp_name);
410 Py_DECREF(v);
411 goto onError;
412 }
413 return v;
414
415 onError:
416 return NULL;
417}
418
419Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
420{
421 if (!PyUnicode_Check(unicode)) {
422 PyErr_BadArgument();
423 goto onError;
424 }
425 return PyUnicode_AS_UNICODE(unicode);
426
427 onError:
428 return NULL;
429}
430
431int PyUnicode_GetSize(PyObject *unicode)
432{
433 if (!PyUnicode_Check(unicode)) {
434 PyErr_BadArgument();
435 goto onError;
436 }
437 return PyUnicode_GET_SIZE(unicode);
438
439 onError:
440 return -1;
441}
442
443/* --- UTF-8 Codec -------------------------------------------------------- */
444
/* Map a UTF-8 lead byte to the length of the sequence it starts.
   Zero marks a byte that is not a legal lead byte.  See RFC 2279 for
   details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0x80 - 0xBF: not a legal lead byte */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xC0 - 0xDF: two byte sequences */
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
    /* 0xE0 - 0xEF: three byte sequences */
    3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,
    /* 0xF0 - 0xFF: four, five and six byte sequences; 0xFE and 0xFF
       are illegal */
    4,4,4,4,4,4,4,4, 5,5,5,5,6,6,0,0
};
466
467static
468int utf8_decoding_error(const char **source,
469 Py_UNICODE **dest,
470 const char *errors,
471 const char *details)
472{
473 if ((errors == NULL) ||
474 (strcmp(errors,"strict") == 0)) {
475 PyErr_Format(PyExc_UnicodeError,
476 "UTF-8 decoding error: %s",
477 details);
478 return -1;
479 }
480 else if (strcmp(errors,"ignore") == 0) {
481 (*source)++;
482 return 0;
483 }
484 else if (strcmp(errors,"replace") == 0) {
485 (*source)++;
486 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
487 (*dest)++;
488 return 0;
489 }
490 else {
491 PyErr_Format(PyExc_ValueError,
492 "UTF-8 decoding error; unkown error handling code: %s",
493 errors);
494 return -1;
495 }
496}
497
498#define UTF8_ERROR(details) do { \
499 if (utf8_decoding_error(&s, &p, errors, details)) \
500 goto onError; \
501 continue; \
502} while (0)
503
504PyObject *PyUnicode_DecodeUTF8(const char *s,
505 int size,
506 const char *errors)
507{
508 int n;
509 const char *e;
510 PyUnicodeObject *unicode;
511 Py_UNICODE *p;
512
513 /* Note: size will always be longer than the resulting Unicode
514 character count */
515 unicode = _PyUnicode_New(size);
516 if (!unicode)
517 return NULL;
518 if (size == 0)
519 return (PyObject *)unicode;
520
521 /* Unpack UTF-8 encoded data */
522 p = unicode->str;
523 e = s + size;
524
525 while (s < e) {
526 register Py_UNICODE ch = (unsigned char)*s;
527
528 if (ch < 0x80) {
529 *p++ = ch;
530 s++;
531 continue;
532 }
533
534 n = utf8_code_length[ch];
535
536 if (s + n > e)
537 UTF8_ERROR("unexpected end of data");
538
539 switch (n) {
540
541 case 0:
542 UTF8_ERROR("unexpected code byte");
543 break;
544
545 case 1:
546 UTF8_ERROR("internal error");
547 break;
548
549 case 2:
550 if ((s[1] & 0xc0) != 0x80)
551 UTF8_ERROR("invalid data");
552 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
553 if (ch < 0x80)
554 UTF8_ERROR("illegal encoding");
555 else
556 *p++ = ch;
557 break;
558
559 case 3:
560 if ((s[1] & 0xc0) != 0x80 ||
561 (s[2] & 0xc0) != 0x80)
562 UTF8_ERROR("invalid data");
563 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
564 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
565 UTF8_ERROR("illegal encoding");
566 else
567 *p++ = ch;
568 break;
569
570 default:
571 /* Other sizes are only needed for UCS-4 */
572 UTF8_ERROR("unsupported Unicode code range");
573 }
574 s += n;
575 }
576
577 /* Adjust length */
578 if (_PyUnicode_Resize(unicode, p - unicode->str))
579 goto onError;
580
581 return (PyObject *)unicode;
582
583onError:
584 Py_DECREF(unicode);
585 return NULL;
586}
587
588#undef UTF8_ERROR
589
590static
591int utf8_encoding_error(const Py_UNICODE **source,
592 char **dest,
593 const char *errors,
594 const char *details)
595{
596 if ((errors == NULL) ||
597 (strcmp(errors,"strict") == 0)) {
598 PyErr_Format(PyExc_UnicodeError,
599 "UTF-8 encoding error: %s",
600 details);
601 return -1;
602 }
603 else if (strcmp(errors,"ignore") == 0) {
604 return 0;
605 }
606 else if (strcmp(errors,"replace") == 0) {
607 **dest = '?';
608 (*dest)++;
609 return 0;
610 }
611 else {
612 PyErr_Format(PyExc_ValueError,
613 "UTF-8 encoding error; "
614 "unkown error handling code: %s",
615 errors);
616 return -1;
617 }
618}
619
620PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
621 int size,
622 const char *errors)
623{
624 PyObject *v;
625 char *p;
626 char *q;
627
628 v = PyString_FromStringAndSize(NULL, 3 * size);
629 if (v == NULL)
630 return NULL;
631 if (size == 0)
632 goto done;
633
634 p = q = PyString_AS_STRING(v);
635 while (size-- > 0) {
636 Py_UNICODE ch = *s++;
637 if (ch < 0x80)
638 *p++ = (char) ch;
639 else if (ch < 0x0800) {
640 *p++ = 0xc0 | (ch >> 6);
641 *p++ = 0x80 | (ch & 0x3f);
642 } else if (0xD800 <= ch && ch <= 0xDFFF) {
643 /* These byte ranges are reserved for UTF-16 surrogate
644 bytes which the Python implementation currently does
645 not support. */
646 printf("code range problem: U+%04x\n", ch);
647 if (utf8_encoding_error(&s, &p, errors,
648 "unsupported code range"))
649 goto onError;
650 } else {
651 *p++ = 0xe0 | (ch >> 12);
652 *p++ = 0x80 | ((ch >> 6) & 0x3f);
653 *p++ = 0x80 | (ch & 0x3f);
654 }
655 }
656 *p = '\0';
657 _PyString_Resize(&v, p - q);
658
659 done:
660 return v;
661
662 onError:
663 Py_DECREF(v);
664 return NULL;
665}
666
667/* Return a Python string holding the UTF-8 encoded value of the
668 Unicode object.
669
670 The resulting string is cached in the Unicode object for subsequent
671 usage by this function. The cached version is needed to implement
672 the character buffer interface.
673
674 The refcount of the string is *not* incremented.
675
676*/
677
678static
679PyObject *utf8_string(PyUnicodeObject *self,
680 const char *errors)
681{
682 PyObject *v = self->utf8str;
683
684 if (v)
685 return v;
686 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(self),
687 PyUnicode_GET_SIZE(self),
688 errors);
689 if (v && errors == NULL)
690 self->utf8str = v;
691 return v;
692}
693
694PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
695{
696 PyObject *str;
697
698 if (!PyUnicode_Check(unicode)) {
699 PyErr_BadArgument();
700 return NULL;
701 }
702 str = utf8_string((PyUnicodeObject *)unicode, NULL);
703 if (str == NULL)
704 return NULL;
705 Py_INCREF(str);
706 return str;
707}
708
709/* --- UTF-16 Codec ------------------------------------------------------- */
710
711static
712int utf16_decoding_error(const Py_UNICODE **source,
713 Py_UNICODE **dest,
714 const char *errors,
715 const char *details)
716{
717 if ((errors == NULL) ||
718 (strcmp(errors,"strict") == 0)) {
719 PyErr_Format(PyExc_UnicodeError,
720 "UTF-16 decoding error: %s",
721 details);
722 return -1;
723 }
724 else if (strcmp(errors,"ignore") == 0) {
725 return 0;
726 }
727 else if (strcmp(errors,"replace") == 0) {
728 if (dest) {
729 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
730 (*dest)++;
731 }
732 return 0;
733 }
734 else {
735 PyErr_Format(PyExc_ValueError,
736 "UTF-16 decoding error; unkown error handling code: %s",
737 errors);
738 return -1;
739 }
740}
741
742#define UTF16_ERROR(details) do { \
743 if (utf16_decoding_error(&q, &p, errors, details)) \
744 goto onError; \
745 continue; \
746} while(0)
747
748PyObject *PyUnicode_DecodeUTF16(const char *s,
749 int size,
750 const char *errors,
751 int *byteorder)
752{
753 PyUnicodeObject *unicode;
754 Py_UNICODE *p;
755 const Py_UNICODE *q, *e;
756 int bo = 0;
757
758 /* size should be an even number */
759 if (size % sizeof(Py_UNICODE) != 0) {
760 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
761 return NULL;
762 /* The remaining input chars are ignored if we fall through
763 here... */
764 }
765
766 /* Note: size will always be longer than the resulting Unicode
767 character count */
768 unicode = _PyUnicode_New(size);
769 if (!unicode)
770 return NULL;
771 if (size == 0)
772 return (PyObject *)unicode;
773
774 /* Unpack UTF-16 encoded data */
775 p = unicode->str;
776 q = (Py_UNICODE *)s;
777 e = q + (size / sizeof(Py_UNICODE));
778
779 if (byteorder)
780 bo = *byteorder;
781
782 while (q < e) {
783 register Py_UNICODE ch = *q++;
784
785 /* Check for BOM marks (U+FEFF) in the input and adjust
786 current byte order setting accordingly. Swap input
787 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
788 !) */
789#ifdef BYTEORDER_IS_LITTLE_ENDIAN
790 if (ch == 0xFEFF) {
791 bo = -1;
792 continue;
793 } else if (ch == 0xFFFE) {
794 bo = 1;
795 continue;
796 }
797 if (bo == 1)
798 ch = (ch >> 8) | (ch << 8);
799#else
800 if (ch == 0xFEFF) {
801 bo = 1;
802 continue;
803 } else if (ch == 0xFFFE) {
804 bo = -1;
805 continue;
806 }
807 if (bo == -1)
808 ch = (ch >> 8) | (ch << 8);
809#endif
810 if (ch < 0xD800 || ch > 0xDFFF) {
811 *p++ = ch;
812 continue;
813 }
814
815 /* UTF-16 code pair: */
816 if (q >= e)
817 UTF16_ERROR("unexpected end of data");
818 if (0xDC00 <= *q && *q <= 0xDFFF) {
819 q++;
820 if (0xD800 <= *q && *q <= 0xDBFF)
821 /* This is valid data (a UTF-16 surrogate pair), but
822 we are not able to store this information since our
823 Py_UNICODE type only has 16 bits... this might
824 change someday, even though it's unlikely. */
825 UTF16_ERROR("code pairs are not supported");
826 else
827 continue;
828 }
829 UTF16_ERROR("illegal encoding");
830 }
831
832 if (byteorder)
833 *byteorder = bo;
834
835 /* Adjust length */
836 if (_PyUnicode_Resize(unicode, p - unicode->str))
837 goto onError;
838
839 return (PyObject *)unicode;
840
841onError:
842 Py_DECREF(unicode);
843 return NULL;
844}
845
846#undef UTF16_ERROR
847
848PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
849 int size,
850 const char *errors,
851 int byteorder)
852{
853 PyObject *v;
854 Py_UNICODE *p;
855 char *q;
856
857 /* We don't create UTF-16 pairs... */
858 v = PyString_FromStringAndSize(NULL,
859 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
860 if (v == NULL)
861 return NULL;
862 if (size == 0)
863 goto done;
864
865 q = PyString_AS_STRING(v);
866 p = (Py_UNICODE *)q;
867
868 if (byteorder == 0)
869 *p++ = 0xFEFF;
870 if (byteorder == 0 ||
871#ifdef BYTEORDER_IS_LITTLE_ENDIAN
872 byteorder == -1
873#else
874 byteorder == 1
875#endif
876 )
877 memcpy(p, s, size * sizeof(Py_UNICODE));
878 else
879 while (size-- > 0) {
880 Py_UNICODE ch = *s++;
881 *p++ = (ch >> 8) | (ch << 8);
882 }
883 done:
884 return v;
885}
886
887PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
888{
889 if (!PyUnicode_Check(unicode)) {
890 PyErr_BadArgument();
891 return NULL;
892 }
893 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
894 PyUnicode_GET_SIZE(unicode),
895 NULL,
896 0);
897}
898
899/* --- Unicode Escape Codec ----------------------------------------------- */
900
901static
902int unicodeescape_decoding_error(const char **source,
903 unsigned int *x,
904 const char *errors,
905 const char *details)
906{
907 if ((errors == NULL) ||
908 (strcmp(errors,"strict") == 0)) {
909 PyErr_Format(PyExc_UnicodeError,
910 "Unicode-Escape decoding error: %s",
911 details);
912 return -1;
913 }
914 else if (strcmp(errors,"ignore") == 0) {
915 return 0;
916 }
917 else if (strcmp(errors,"replace") == 0) {
918 *x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
919 return 0;
920 }
921 else {
922 PyErr_Format(PyExc_ValueError,
923 "Unicode-Escape decoding error; "
924 "unkown error handling code: %s",
925 errors);
926 return -1;
927 }
928}
929
930PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
931 int size,
932 const char *errors)
933{
934 PyUnicodeObject *v;
935 Py_UNICODE *p = NULL, *buf = NULL;
936 const char *end;
937
938 /* Escaped strings will always be longer than the resulting
939 Unicode string, so we start with size here and then reduce the
940 length after conversion to the true value. */
941 v = _PyUnicode_New(size);
942 if (v == NULL)
943 goto onError;
944 if (size == 0)
945 return (PyObject *)v;
946 p = buf = PyUnicode_AS_UNICODE(v);
947 end = s + size;
948 while (s < end) {
949 unsigned char c;
950 unsigned int x;
951 int i;
952
953 /* Non-escape characters are interpreted as Unicode ordinals */
954 if (*s != '\\') {
955 *p++ = (unsigned char)*s++;
956 continue;
957 }
958
959 /* \ - Escapes */
960 s++;
961 switch (*s++) {
962
963 /* \x escapes */
964 case '\n': break;
965 case '\\': *p++ = '\\'; break;
966 case '\'': *p++ = '\''; break;
967 case '\"': *p++ = '\"'; break;
968 case 'b': *p++ = '\b'; break;
969 case 'f': *p++ = '\014'; break; /* FF */
970 case 't': *p++ = '\t'; break;
971 case 'n': *p++ = '\n'; break;
972 case 'r': *p++ = '\r'; break;
973 case 'v': *p++ = '\013'; break; /* VT */
974 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
975
976 /* \OOO (octal) escapes */
977 case '0': case '1': case '2': case '3':
978 case '4': case '5': case '6': case '7':
979 c = s[-1] - '0';
980 if ('0' <= *s && *s <= '7') {
981 c = (c<<3) + *s++ - '0';
982 if ('0' <= *s && *s <= '7')
983 c = (c<<3) + *s++ - '0';
984 }
985 *p++ = c;
986 break;
987
988 /* \xXXXX escape with 0-4 hex digits */
989 case 'x':
990 x = 0;
991 c = (unsigned char)*s;
992 if (isxdigit(c)) {
993 do {
994 x = (x<<4) & ~0xF;
995 if ('0' <= c && c <= '9')
996 x += c - '0';
997 else if ('a' <= c && c <= 'f')
998 x += 10 + c - 'a';
999 else
1000 x += 10 + c - 'A';
1001 c = (unsigned char)*++s;
1002 } while (isxdigit(c));
1003 *p++ = x;
1004 } else {
1005 *p++ = '\\';
1006 *p++ = (unsigned char)s[-1];
1007 }
1008 break;
1009
1010 /* \uXXXX with 4 hex digits */
1011 case 'u':
1012 for (x = 0, i = 0; i < 4; i++) {
1013 c = (unsigned char)s[i];
1014 if (!isxdigit(c)) {
1015 if (unicodeescape_decoding_error(&s, &x, errors,
1016 "truncated \\uXXXX"))
1017 goto onError;
1018 i++;
1019 break;
1020 }
1021 x = (x<<4) & ~0xF;
1022 if (c >= '0' && c <= '9')
1023 x += c - '0';
1024 else if (c >= 'a' && c <= 'f')
1025 x += 10 + c - 'a';
1026 else
1027 x += 10 + c - 'A';
1028 }
1029 s += i;
1030 *p++ = x;
1031 break;
1032
1033 default:
1034 *p++ = '\\';
1035 *p++ = (unsigned char)s[-1];
1036 break;
1037 }
1038 }
1039 _PyUnicode_Resize(v, (int)(p - buf));
1040 return (PyObject *)v;
1041
1042 onError:
1043 Py_XDECREF(v);
1044 return NULL;
1045}
1046
1047/* Return a Unicode-Escape string version of the Unicode object.
1048
1049 If quotes is true, the string is enclosed in u"" or u'' quotes as
1050 appropriate.
1051
1052*/
1053
1054static
1055PyObject *unicodeescape_string(const Py_UNICODE *s,
1056 int size,
1057 int quotes)
1058{
1059 PyObject *repr;
1060 char *p;
1061 char *q;
1062
1063 static const char *hexdigit = "0123456789ABCDEF";
1064
1065 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1066 if (repr == NULL)
1067 return NULL;
1068
1069 p = q = PyString_AS_STRING(repr);
1070
1071 if (quotes) {
1072 static const Py_UNICODE *findchar(const Py_UNICODE *s,
1073 int size,
1074 Py_UNICODE ch);
1075 *p++ = 'u';
1076 *p++ = (findchar(s, size, '\'') &&
1077 !findchar(s, size, '"')) ? '"' : '\'';
1078 }
1079 while (size-- > 0) {
1080 Py_UNICODE ch = *s++;
1081 /* Escape quotes */
1082 if (quotes && (ch == q[1] || ch == '\\')) {
1083 *p++ = '\\';
1084 *p++ = (char) ch;
1085 }
1086 /* Map 16-bit characters to '\uxxxx' */
1087 else if (ch >= 256) {
1088 *p++ = '\\';
1089 *p++ = 'u';
1090 *p++ = hexdigit[(ch >> 12) & 0xf];
1091 *p++ = hexdigit[(ch >> 8) & 0xf];
1092 *p++ = hexdigit[(ch >> 4) & 0xf];
1093 *p++ = hexdigit[ch & 15];
1094 }
1095 /* Map non-printable US ASCII to '\ooo' */
1096 else if (ch < ' ' || ch >= 128) {
1097 *p++ = '\\';
1098 *p++ = hexdigit[(ch >> 6) & 7];
1099 *p++ = hexdigit[(ch >> 3) & 7];
1100 *p++ = hexdigit[ch & 7];
1101 }
1102 /* Copy everything else as-is */
1103 else
1104 *p++ = (char) ch;
1105 }
1106 if (quotes)
1107 *p++ = q[1];
1108
1109 *p = '\0';
1110 _PyString_Resize(&repr, p - q);
1111
1112 return repr;
1113}
1114
1115PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1116 int size)
1117{
1118 return unicodeescape_string(s, size, 0);
1119}
1120
1121PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1122{
1123 if (!PyUnicode_Check(unicode)) {
1124 PyErr_BadArgument();
1125 return NULL;
1126 }
1127 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1128 PyUnicode_GET_SIZE(unicode));
1129}
1130
1131/* --- Raw Unicode Escape Codec ------------------------------------------- */
1132
1133PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1134 int size,
1135 const char *errors)
1136{
1137 PyUnicodeObject *v;
1138 Py_UNICODE *p, *buf;
1139 const char *end;
1140 const char *bs;
1141
1142 /* Escaped strings will always be longer than the resulting
1143 Unicode string, so we start with size here and then reduce the
1144 length after conversion to the true value. */
1145 v = _PyUnicode_New(size);
1146 if (v == NULL)
1147 goto onError;
1148 if (size == 0)
1149 return (PyObject *)v;
1150 p = buf = PyUnicode_AS_UNICODE(v);
1151 end = s + size;
1152 while (s < end) {
1153 unsigned char c;
1154 unsigned int x;
1155 int i;
1156
1157 /* Non-escape characters are interpreted as Unicode ordinals */
1158 if (*s != '\\') {
1159 *p++ = (unsigned char)*s++;
1160 continue;
1161 }
1162
1163 /* \u-escapes are only interpreted iff the number of leading
1164 backslashes if odd */
1165 bs = s;
1166 for (;s < end;) {
1167 if (*s != '\\')
1168 break;
1169 *p++ = (unsigned char)*s++;
1170 }
1171 if (((s - bs) & 1) == 0 ||
1172 s >= end ||
1173 *s != 'u') {
1174 continue;
1175 }
1176 p--;
1177 s++;
1178
1179 /* \uXXXX with 4 hex digits */
1180 for (x = 0, i = 0; i < 4; i++) {
1181 c = (unsigned char)s[i];
1182 if (!isxdigit(c)) {
1183 if (unicodeescape_decoding_error(&s, &x, errors,
1184 "truncated \\uXXXX"))
1185 goto onError;
1186 i++;
1187 break;
1188 }
1189 x = (x<<4) & ~0xF;
1190 if (c >= '0' && c <= '9')
1191 x += c - '0';
1192 else if (c >= 'a' && c <= 'f')
1193 x += 10 + c - 'a';
1194 else
1195 x += 10 + c - 'A';
1196 }
1197 s += i;
1198 *p++ = x;
1199 }
1200 _PyUnicode_Resize(v, (int)(p - buf));
1201 return (PyObject *)v;
1202
1203 onError:
1204 Py_XDECREF(v);
1205 return NULL;
1206}
1207
1208PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1209 int size)
1210{
1211 PyObject *repr;
1212 char *p;
1213 char *q;
1214
1215 static const char *hexdigit = "0123456789ABCDEF";
1216
1217 repr = PyString_FromStringAndSize(NULL, 6 * size);
1218 if (repr == NULL)
1219 return NULL;
1220
1221 p = q = PyString_AS_STRING(repr);
1222 while (size-- > 0) {
1223 Py_UNICODE ch = *s++;
1224 /* Map 16-bit characters to '\uxxxx' */
1225 if (ch >= 256) {
1226 *p++ = '\\';
1227 *p++ = 'u';
1228 *p++ = hexdigit[(ch >> 12) & 0xf];
1229 *p++ = hexdigit[(ch >> 8) & 0xf];
1230 *p++ = hexdigit[(ch >> 4) & 0xf];
1231 *p++ = hexdigit[ch & 15];
1232 }
1233 /* Copy everything else as-is */
1234 else
1235 *p++ = (char) ch;
1236 }
1237 *p = '\0';
1238 _PyString_Resize(&repr, p - q);
1239
1240 return repr;
1241}
1242
1243PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1244{
1245 if (!PyUnicode_Check(unicode)) {
1246 PyErr_BadArgument();
1247 return NULL;
1248 }
1249 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1250 PyUnicode_GET_SIZE(unicode));
1251}
1252
1253/* --- Latin-1 Codec ------------------------------------------------------ */
1254
1255PyObject *PyUnicode_DecodeLatin1(const char *s,
1256 int size,
1257 const char *errors)
1258{
1259 PyUnicodeObject *v;
1260 Py_UNICODE *p;
1261
1262 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1263 v = _PyUnicode_New(size);
1264 if (v == NULL)
1265 goto onError;
1266 if (size == 0)
1267 return (PyObject *)v;
1268 p = PyUnicode_AS_UNICODE(v);
1269 while (size-- > 0)
1270 *p++ = (unsigned char)*s++;
1271 return (PyObject *)v;
1272
1273 onError:
1274 Py_XDECREF(v);
1275 return NULL;
1276}
1277
1278static
1279int latin1_encoding_error(const Py_UNICODE **source,
1280 char **dest,
1281 const char *errors,
1282 const char *details)
1283{
1284 if ((errors == NULL) ||
1285 (strcmp(errors,"strict") == 0)) {
1286 PyErr_Format(PyExc_UnicodeError,
1287 "Latin-1 encoding error: %s",
1288 details);
1289 return -1;
1290 }
1291 else if (strcmp(errors,"ignore") == 0) {
1292 return 0;
1293 }
1294 else if (strcmp(errors,"replace") == 0) {
1295 **dest = '?';
1296 return 0;
1297 }
1298 else {
1299 PyErr_Format(PyExc_ValueError,
1300 "Latin-1 encoding error; "
1301 "unkown error handling code: %s",
1302 errors);
1303 return -1;
1304 }
1305}
1306
1307PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1308 int size,
1309 const char *errors)
1310{
1311 PyObject *repr;
1312 char *s;
1313 repr = PyString_FromStringAndSize(NULL, size);
1314 if (repr == NULL)
1315 return NULL;
1316
1317 s = PyString_AS_STRING(repr);
1318 while (size-- > 0) {
1319 Py_UNICODE ch = *p++;
1320 if (ch >= 256) {
1321 if (latin1_encoding_error(&p, &s, errors,
1322 "ordinal not in range(256)"))
1323 goto onError;
1324 }
1325 else
1326 *s++ = (char)ch;
1327 }
1328 return repr;
1329
1330 onError:
1331 Py_DECREF(repr);
1332 return NULL;
1333}
1334
1335PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1336{
1337 if (!PyUnicode_Check(unicode)) {
1338 PyErr_BadArgument();
1339 return NULL;
1340 }
1341 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1342 PyUnicode_GET_SIZE(unicode),
1343 NULL);
1344}
1345
1346/* --- 7-bit ASCII Codec -------------------------------------------------- */
1347
1348static
1349int ascii_decoding_error(const char **source,
1350 Py_UNICODE **dest,
1351 const char *errors,
1352 const char *details)
1353{
1354 if ((errors == NULL) ||
1355 (strcmp(errors,"strict") == 0)) {
1356 PyErr_Format(PyExc_UnicodeError,
1357 "ASCII decoding error: %s",
1358 details);
1359 return -1;
1360 }
1361 else if (strcmp(errors,"ignore") == 0) {
1362 return 0;
1363 }
1364 else if (strcmp(errors,"replace") == 0) {
1365 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1366 (*dest)++;
1367 return 0;
1368 }
1369 else {
1370 PyErr_Format(PyExc_ValueError,
1371 "ASCII decoding error; "
1372 "unkown error handling code: %s",
1373 errors);
1374 return -1;
1375 }
1376}
1377
1378PyObject *PyUnicode_DecodeASCII(const char *s,
1379 int size,
1380 const char *errors)
1381{
1382 PyUnicodeObject *v;
1383 Py_UNICODE *p;
1384
1385 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1386 v = _PyUnicode_New(size);
1387 if (v == NULL)
1388 goto onError;
1389 if (size == 0)
1390 return (PyObject *)v;
1391 p = PyUnicode_AS_UNICODE(v);
1392 while (size-- > 0) {
1393 register unsigned char c;
1394
1395 c = (unsigned char)*s++;
1396 if (c < 128)
1397 *p++ = c;
1398 else if (ascii_decoding_error(&s, &p, errors,
1399 "ordinal not in range(128)"))
1400 goto onError;
1401 }
1402 if (p - PyUnicode_AS_UNICODE(v) < size)
1403 _PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));
1404 return (PyObject *)v;
1405
1406 onError:
1407 Py_XDECREF(v);
1408 return NULL;
1409}
1410
1411static
1412int ascii_encoding_error(const Py_UNICODE **source,
1413 char **dest,
1414 const char *errors,
1415 const char *details)
1416{
1417 if ((errors == NULL) ||
1418 (strcmp(errors,"strict") == 0)) {
1419 PyErr_Format(PyExc_UnicodeError,
1420 "ASCII encoding error: %s",
1421 details);
1422 return -1;
1423 }
1424 else if (strcmp(errors,"ignore") == 0) {
1425 return 0;
1426 }
1427 else if (strcmp(errors,"replace") == 0) {
1428 **dest = '?';
1429 return 0;
1430 }
1431 else {
1432 PyErr_Format(PyExc_ValueError,
1433 "ASCII encoding error; "
1434 "unkown error handling code: %s",
1435 errors);
1436 return -1;
1437 }
1438}
1439
1440PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1441 int size,
1442 const char *errors)
1443{
1444 PyObject *repr;
1445 char *s;
1446 repr = PyString_FromStringAndSize(NULL, size);
1447 if (repr == NULL)
1448 return NULL;
1449
1450 s = PyString_AS_STRING(repr);
1451 while (size-- > 0) {
1452 Py_UNICODE ch = *p++;
1453 if (ch >= 128) {
1454 if (ascii_encoding_error(&p, &s, errors,
1455 "ordinal not in range(128)"))
1456 goto onError;
1457 }
1458 else
1459 *s++ = (char)ch;
1460 }
1461 return repr;
1462
1463 onError:
1464 Py_DECREF(repr);
1465 return NULL;
1466}
1467
1468PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1469{
1470 if (!PyUnicode_Check(unicode)) {
1471 PyErr_BadArgument();
1472 return NULL;
1473 }
1474 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1475 PyUnicode_GET_SIZE(unicode),
1476 NULL);
1477}
1478
1479/* --- Character Mapping Codec -------------------------------------------- */
1480
1481static
1482int charmap_decoding_error(const char **source,
1483 Py_UNICODE **dest,
1484 const char *errors,
1485 const char *details)
1486{
1487 if ((errors == NULL) ||
1488 (strcmp(errors,"strict") == 0)) {
1489 PyErr_Format(PyExc_UnicodeError,
1490 "charmap decoding error: %s",
1491 details);
1492 return -1;
1493 }
1494 else if (strcmp(errors,"ignore") == 0) {
1495 return 0;
1496 }
1497 else if (strcmp(errors,"replace") == 0) {
1498 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1499 (*dest)++;
1500 return 0;
1501 }
1502 else {
1503 PyErr_Format(PyExc_ValueError,
1504 "charmap decoding error; "
1505 "unkown error handling code: %s",
1506 errors);
1507 return -1;
1508 }
1509}
1510
1511PyObject *PyUnicode_DecodeCharmap(const char *s,
1512 int size,
1513 PyObject *mapping,
1514 const char *errors)
1515{
1516 PyUnicodeObject *v;
1517 Py_UNICODE *p;
1518
1519 /* Default to Latin-1 */
1520 if (mapping == NULL)
1521 return PyUnicode_DecodeLatin1(s, size, errors);
1522
1523 v = _PyUnicode_New(size);
1524 if (v == NULL)
1525 goto onError;
1526 if (size == 0)
1527 return (PyObject *)v;
1528 p = PyUnicode_AS_UNICODE(v);
1529 while (size-- > 0) {
1530 unsigned char ch = *s++;
1531 PyObject *w, *x;
1532
1533 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1534 w = PyInt_FromLong((long)ch);
1535 if (w == NULL)
1536 goto onError;
1537 x = PyObject_GetItem(mapping, w);
1538 Py_DECREF(w);
1539 if (x == NULL) {
1540 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1541 /* No mapping found: default to Latin-1 mapping */
1542 PyErr_Clear();
1543 *p++ = (Py_UNICODE)ch;
1544 continue;
1545 }
1546 goto onError;
1547 }
1548
1549 /* Apply mapping */
1550 if (PyInt_Check(x)) {
1551 int value = PyInt_AS_LONG(x);
1552 if (value < 0 || value > 65535) {
1553 PyErr_SetString(PyExc_TypeError,
1554 "character mapping must be in range(65336)");
1555 Py_DECREF(x);
1556 goto onError;
1557 }
1558 *p++ = (Py_UNICODE)value;
1559 }
1560 else if (x == Py_None) {
1561 /* undefined mapping */
1562 if (charmap_decoding_error(&s, &p, errors,
1563 "character maps to <undefined>")) {
1564 Py_DECREF(x);
1565 goto onError;
1566 }
1567 }
1568 else if (PyUnicode_Check(x)) {
1569 if (PyUnicode_GET_SIZE(x) != 1) {
1570 /* 1-n mapping */
1571 PyErr_SetString(PyExc_NotImplementedError,
1572 "1-n mappings are currently not implemented");
1573 Py_DECREF(x);
1574 goto onError;
1575 }
1576 *p++ = *PyUnicode_AS_UNICODE(x);
1577 }
1578 else {
1579 /* wrong return value */
1580 PyErr_SetString(PyExc_TypeError,
1581 "character mapping must return integer, None or unicode");
1582 Py_DECREF(x);
1583 goto onError;
1584 }
1585 Py_DECREF(x);
1586 }
1587 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1588 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1589 goto onError;
1590 return (PyObject *)v;
1591
1592 onError:
1593 Py_XDECREF(v);
1594 return NULL;
1595}
1596
1597static
1598int charmap_encoding_error(const Py_UNICODE **source,
1599 char **dest,
1600 const char *errors,
1601 const char *details)
1602{
1603 if ((errors == NULL) ||
1604 (strcmp(errors,"strict") == 0)) {
1605 PyErr_Format(PyExc_UnicodeError,
1606 "charmap encoding error: %s",
1607 details);
1608 return -1;
1609 }
1610 else if (strcmp(errors,"ignore") == 0) {
1611 return 0;
1612 }
1613 else if (strcmp(errors,"replace") == 0) {
1614 **dest = '?';
1615 (*dest)++;
1616 return 0;
1617 }
1618 else {
1619 PyErr_Format(PyExc_ValueError,
1620 "charmap encoding error; "
1621 "unkown error handling code: %s",
1622 errors);
1623 return -1;
1624 }
1625}
1626
1627PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
1628 int size,
1629 PyObject *mapping,
1630 const char *errors)
1631{
1632 PyObject *v;
1633 char *s;
1634
1635 /* Default to Latin-1 */
1636 if (mapping == NULL)
1637 return PyUnicode_EncodeLatin1(p, size, errors);
1638
1639 v = PyString_FromStringAndSize(NULL, size);
1640 if (v == NULL)
1641 return NULL;
1642 s = PyString_AS_STRING(v);
1643 while (size-- > 0) {
1644 Py_UNICODE ch = *p++;
1645 PyObject *w, *x;
1646
1647 /* Get mapping (Unicode ordinal -> string char, integer or None) */
1648 w = PyInt_FromLong((long)ch);
1649 if (w == NULL)
1650 goto onError;
1651 x = PyObject_GetItem(mapping, w);
1652 Py_DECREF(w);
1653 if (x == NULL) {
1654 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1655 /* No mapping found: default to Latin-1 mapping if possible */
1656 PyErr_Clear();
1657 if (ch < 256) {
1658 *s++ = (char)ch;
1659 continue;
1660 }
1661 else if (!charmap_encoding_error(&p, &s, errors,
1662 "missing character mapping"))
1663 continue;
1664 }
1665 goto onError;
1666 }
1667
1668 /* Apply mapping */
1669 if (PyInt_Check(x)) {
1670 int value = PyInt_AS_LONG(x);
1671 if (value < 0 || value > 255) {
1672 PyErr_SetString(PyExc_TypeError,
1673 "character mapping must be in range(256)");
1674 Py_DECREF(x);
1675 goto onError;
1676 }
1677 *s++ = (char)value;
1678 }
1679 else if (x == Py_None) {
1680 /* undefined mapping */
1681 if (charmap_encoding_error(&p, &s, errors,
1682 "character maps to <undefined>")) {
1683 Py_DECREF(x);
1684 goto onError;
1685 }
1686 }
1687 else if (PyString_Check(x)) {
1688 if (PyString_GET_SIZE(x) != 1) {
1689 /* 1-n mapping */
1690 PyErr_SetString(PyExc_NotImplementedError,
1691 "1-n mappings are currently not implemented");
1692 Py_DECREF(x);
1693 goto onError;
1694 }
1695 *s++ = *PyString_AS_STRING(x);
1696 }
1697 else {
1698 /* wrong return value */
1699 PyErr_SetString(PyExc_TypeError,
1700 "character mapping must return integer, None or unicode");
1701 Py_DECREF(x);
1702 goto onError;
1703 }
1704 Py_DECREF(x);
1705 }
1706 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
1707 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
1708 goto onError;
1709 return v;
1710
1711 onError:
1712 Py_DECREF(v);
1713 return NULL;
1714}
1715
1716PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
1717 PyObject *mapping)
1718{
1719 if (!PyUnicode_Check(unicode) || mapping == NULL) {
1720 PyErr_BadArgument();
1721 return NULL;
1722 }
1723 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
1724 PyUnicode_GET_SIZE(unicode),
1725 mapping,
1726 NULL);
1727}
1728
1729static
1730int translate_error(const Py_UNICODE **source,
1731 Py_UNICODE **dest,
1732 const char *errors,
1733 const char *details)
1734{
1735 if ((errors == NULL) ||
1736 (strcmp(errors,"strict") == 0)) {
1737 PyErr_Format(PyExc_UnicodeError,
1738 "translate error: %s",
1739 details);
1740 return -1;
1741 }
1742 else if (strcmp(errors,"ignore") == 0) {
1743 return 0;
1744 }
1745 else if (strcmp(errors,"replace") == 0) {
1746 **dest = '?';
1747 (*dest)++;
1748 return 0;
1749 }
1750 else {
1751 PyErr_Format(PyExc_ValueError,
1752 "translate error; "
1753 "unkown error handling code: %s",
1754 errors);
1755 return -1;
1756 }
1757}
1758
1759PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
1760 int size,
1761 PyObject *mapping,
1762 const char *errors)
1763{
1764 PyUnicodeObject *v;
1765 Py_UNICODE *p;
1766
1767 if (mapping == NULL) {
1768 PyErr_BadArgument();
1769 return NULL;
1770 }
1771
1772 /* Output will never be longer than input */
1773 v = _PyUnicode_New(size);
1774 if (v == NULL)
1775 goto onError;
1776 if (size == 0)
1777 goto done;
1778 p = PyUnicode_AS_UNICODE(v);
1779 while (size-- > 0) {
1780 Py_UNICODE ch = *s++;
1781 PyObject *w, *x;
1782
1783 /* Get mapping */
1784 w = PyInt_FromLong(ch);
1785 if (w == NULL)
1786 goto onError;
1787 x = PyObject_GetItem(mapping, w);
1788 Py_DECREF(w);
1789 if (x == NULL) {
1790 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1791 /* No mapping found: default to 1-1 mapping */
1792 PyErr_Clear();
1793 *p++ = ch;
1794 continue;
1795 }
1796 goto onError;
1797 }
1798
1799 /* Apply mapping */
1800 if (PyInt_Check(x))
1801 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
1802 else if (x == Py_None) {
1803 /* undefined mapping */
1804 if (translate_error(&s, &p, errors,
1805 "character maps to <undefined>")) {
1806 Py_DECREF(x);
1807 goto onError;
1808 }
1809 }
1810 else if (PyUnicode_Check(x)) {
1811 if (PyUnicode_GET_SIZE(x) != 1) {
1812 /* 1-n mapping */
1813 PyErr_SetString(PyExc_NotImplementedError,
1814 "1-n mappings are currently not implemented");
1815 Py_DECREF(x);
1816 goto onError;
1817 }
1818 *p++ = *PyUnicode_AS_UNICODE(x);
1819 }
1820 else {
1821 /* wrong return value */
1822 PyErr_SetString(PyExc_TypeError,
1823 "translate mapping must return integer, None or unicode");
1824 Py_DECREF(x);
1825 goto onError;
1826 }
1827 Py_DECREF(x);
1828 }
1829 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1830 _PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));
1831
1832 done:
1833 return (PyObject *)v;
1834
1835 onError:
1836 Py_XDECREF(v);
1837 return NULL;
1838}
1839
1840PyObject *PyUnicode_Translate(PyObject *str,
1841 PyObject *mapping,
1842 const char *errors)
1843{
1844 PyObject *result;
1845
1846 str = PyUnicode_FromObject(str);
1847 if (str == NULL)
1848 goto onError;
1849 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
1850 PyUnicode_GET_SIZE(str),
1851 mapping,
1852 errors);
1853 Py_DECREF(str);
1854 return result;
1855
1856 onError:
1857 Py_XDECREF(str);
1858 return NULL;
1859}
1860
1861/* --- Helpers ------------------------------------------------------------ */
1862
1863static
1864int count(PyUnicodeObject *self,
1865 int start,
1866 int end,
1867 PyUnicodeObject *substring)
1868{
1869 int count = 0;
1870
1871 end -= substring->length;
1872
1873 while (start <= end)
1874 if (Py_UNICODE_MATCH(self, start, substring)) {
1875 count++;
1876 start += substring->length;
1877 } else
1878 start++;
1879
1880 return count;
1881}
1882
1883int PyUnicode_Count(PyObject *str,
1884 PyObject *substr,
1885 int start,
1886 int end)
1887{
1888 int result;
1889
1890 str = PyUnicode_FromObject(str);
1891 if (str == NULL)
1892 return -1;
1893 substr = PyUnicode_FromObject(substr);
1894 if (substr == NULL) {
1895 Py_DECREF(substr);
1896 return -1;
1897 }
1898
1899 result = count((PyUnicodeObject *)str,
1900 start, end,
1901 (PyUnicodeObject *)substr);
1902
1903 Py_DECREF(str);
1904 Py_DECREF(substr);
1905 return result;
1906}
1907
1908static
1909int findstring(PyUnicodeObject *self,
1910 PyUnicodeObject *substring,
1911 int start,
1912 int end,
1913 int direction)
1914{
1915 if (start < 0)
1916 start += self->length;
1917 if (start < 0)
1918 start = 0;
1919
1920 if (substring->length == 0)
1921 return start;
1922
1923 if (end > self->length)
1924 end = self->length;
1925 if (end < 0)
1926 end += self->length;
1927 if (end < 0)
1928 end = 0;
1929
1930 end -= substring->length;
1931
1932 if (direction < 0) {
1933 for (; end >= start; end--)
1934 if (Py_UNICODE_MATCH(self, end, substring))
1935 return end;
1936 } else {
1937 for (; start <= end; start++)
1938 if (Py_UNICODE_MATCH(self, start, substring))
1939 return start;
1940 }
1941
1942 return -1;
1943}
1944
1945int PyUnicode_Find(PyObject *str,
1946 PyObject *substr,
1947 int start,
1948 int end,
1949 int direction)
1950{
1951 int result;
1952
1953 str = PyUnicode_FromObject(str);
1954 if (str == NULL)
1955 return -1;
1956 substr = PyUnicode_FromObject(substr);
1957 if (substr == NULL) {
1958 Py_DECREF(substr);
1959 return -1;
1960 }
1961
1962 result = findstring((PyUnicodeObject *)str,
1963 (PyUnicodeObject *)substr,
1964 start, end, direction);
1965 Py_DECREF(str);
1966 Py_DECREF(substr);
1967 return result;
1968}
1969
1970static
1971int tailmatch(PyUnicodeObject *self,
1972 PyUnicodeObject *substring,
1973 int start,
1974 int end,
1975 int direction)
1976{
1977 if (start < 0)
1978 start += self->length;
1979 if (start < 0)
1980 start = 0;
1981
1982 if (substring->length == 0)
1983 return 1;
1984
1985 if (end > self->length)
1986 end = self->length;
1987 if (end < 0)
1988 end += self->length;
1989 if (end < 0)
1990 end = 0;
1991
1992 end -= substring->length;
1993 if (end < start)
1994 return 0;
1995
1996 if (direction > 0) {
1997 if (Py_UNICODE_MATCH(self, end, substring))
1998 return 1;
1999 } else {
2000 if (Py_UNICODE_MATCH(self, start, substring))
2001 return 1;
2002 }
2003
2004 return 0;
2005}
2006
2007int PyUnicode_Tailmatch(PyObject *str,
2008 PyObject *substr,
2009 int start,
2010 int end,
2011 int direction)
2012{
2013 int result;
2014
2015 str = PyUnicode_FromObject(str);
2016 if (str == NULL)
2017 return -1;
2018 substr = PyUnicode_FromObject(substr);
2019 if (substr == NULL) {
2020 Py_DECREF(substr);
2021 return -1;
2022 }
2023
2024 result = tailmatch((PyUnicodeObject *)str,
2025 (PyUnicodeObject *)substr,
2026 start, end, direction);
2027 Py_DECREF(str);
2028 Py_DECREF(substr);
2029 return result;
2030}
2031
2032static
2033const Py_UNICODE *findchar(const Py_UNICODE *s,
2034 int size,
2035 Py_UNICODE ch)
2036{
2037 /* like wcschr, but doesn't stop at NULL characters */
2038
2039 while (size-- > 0) {
2040 if (*s == ch)
2041 return s;
2042 s++;
2043 }
2044
2045 return NULL;
2046}
2047
2048/* Apply fixfct filter to the Unicode object self and return a
2049 reference to the modified object */
2050
2051static
2052PyObject *fixup(PyUnicodeObject *self,
2053 int (*fixfct)(PyUnicodeObject *s))
2054{
2055
2056 PyUnicodeObject *u;
2057
2058 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2059 self->length);
2060 if (u == NULL)
2061 return NULL;
2062 if (!fixfct(u)) {
2063 /* fixfct should return TRUE if it modified the buffer. If
2064 FALSE, return a reference to the original buffer instead
2065 (to save space, not time) */
2066 Py_INCREF(self);
2067 Py_DECREF(u);
2068 return (PyObject*) self;
2069 }
2070 return (PyObject*) u;
2071}
2072
2073static
2074int fixupper(PyUnicodeObject *self)
2075{
2076 int len = self->length;
2077 Py_UNICODE *s = self->str;
2078 int status = 0;
2079
2080 while (len-- > 0) {
2081 register Py_UNICODE ch;
2082
2083 ch = Py_UNICODE_TOUPPER(*s);
2084 if (ch != *s) {
2085 status = 1;
2086 *s = ch;
2087 }
2088 s++;
2089 }
2090
2091 return status;
2092}
2093
2094static
2095int fixlower(PyUnicodeObject *self)
2096{
2097 int len = self->length;
2098 Py_UNICODE *s = self->str;
2099 int status = 0;
2100
2101 while (len-- > 0) {
2102 register Py_UNICODE ch;
2103
2104 ch = Py_UNICODE_TOLOWER(*s);
2105 if (ch != *s) {
2106 status = 1;
2107 *s = ch;
2108 }
2109 s++;
2110 }
2111
2112 return status;
2113}
2114
2115static
2116int fixswapcase(PyUnicodeObject *self)
2117{
2118 int len = self->length;
2119 Py_UNICODE *s = self->str;
2120 int status = 0;
2121
2122 while (len-- > 0) {
2123 if (Py_UNICODE_ISUPPER(*s)) {
2124 *s = Py_UNICODE_TOLOWER(*s);
2125 status = 1;
2126 } else if (Py_UNICODE_ISLOWER(*s)) {
2127 *s = Py_UNICODE_TOUPPER(*s);
2128 status = 1;
2129 }
2130 s++;
2131 }
2132
2133 return status;
2134}
2135
2136static
2137int fixcapitalize(PyUnicodeObject *self)
2138{
2139 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2140 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2141 return 1;
2142 }
2143 return 0;
2144}
2145
2146static
2147int fixtitle(PyUnicodeObject *self)
2148{
2149 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2150 register Py_UNICODE *e;
2151 int previous_is_cased;
2152
2153 /* Shortcut for single character strings */
2154 if (PyUnicode_GET_SIZE(self) == 1) {
2155 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2156 if (*p != ch) {
2157 *p = ch;
2158 return 1;
2159 }
2160 else
2161 return 0;
2162 }
2163
2164 e = p + PyUnicode_GET_SIZE(self);
2165 previous_is_cased = 0;
2166 for (; p < e; p++) {
2167 register const Py_UNICODE ch = *p;
2168
2169 if (previous_is_cased)
2170 *p = Py_UNICODE_TOLOWER(ch);
2171 else
2172 *p = Py_UNICODE_TOTITLE(ch);
2173
2174 if (Py_UNICODE_ISLOWER(ch) ||
2175 Py_UNICODE_ISUPPER(ch) ||
2176 Py_UNICODE_ISTITLE(ch))
2177 previous_is_cased = 1;
2178 else
2179 previous_is_cased = 0;
2180 }
2181 return 1;
2182}
2183
2184PyObject *PyUnicode_Join(PyObject *separator,
2185 PyObject *seq)
2186{
2187 Py_UNICODE *sep;
2188 int seplen;
2189 PyUnicodeObject *res = NULL;
2190 int reslen = 0;
2191 Py_UNICODE *p;
2192 int seqlen = 0;
2193 int sz = 100;
2194 int i;
2195
2196 seqlen = PySequence_Length(seq);
2197 if (seqlen < 0 && PyErr_Occurred())
2198 return NULL;
2199
2200 if (separator == NULL) {
2201 Py_UNICODE blank = ' ';
2202 sep = &blank;
2203 seplen = 1;
2204 }
2205 else {
2206 separator = PyUnicode_FromObject(separator);
2207 if (separator == NULL)
2208 return NULL;
2209 sep = PyUnicode_AS_UNICODE(separator);
2210 seplen = PyUnicode_GET_SIZE(separator);
2211 }
2212
2213 res = _PyUnicode_New(sz);
2214 if (res == NULL)
2215 goto onError;
2216 p = PyUnicode_AS_UNICODE(res);
2217 reslen = 0;
2218
2219 for (i = 0; i < seqlen; i++) {
2220 int itemlen;
2221 PyObject *item;
2222
2223 item = PySequence_GetItem(seq, i);
2224 if (item == NULL)
2225 goto onError;
2226 if (!PyUnicode_Check(item)) {
2227 PyObject *v;
2228 v = PyUnicode_FromObject(item);
2229 Py_DECREF(item);
2230 item = v;
2231 if (item == NULL)
2232 goto onError;
2233 }
2234 itemlen = PyUnicode_GET_SIZE(item);
2235 while (reslen + itemlen + seplen >= sz) {
2236 if (_PyUnicode_Resize(res, sz*2))
2237 goto onError;
2238 sz *= 2;
2239 p = PyUnicode_AS_UNICODE(res) + reslen;
2240 }
2241 if (i > 0) {
2242 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2243 p += seplen;
2244 reslen += seplen;
2245 }
2246 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2247 p += itemlen;
2248 reslen += itemlen;
2249 Py_DECREF(item);
2250 }
2251 if (_PyUnicode_Resize(res, reslen))
2252 goto onError;
2253
2254 Py_XDECREF(separator);
2255 return (PyObject *)res;
2256
2257 onError:
2258 Py_XDECREF(separator);
2259 Py_DECREF(res);
2260 return NULL;
2261}
2262
2263static
2264PyUnicodeObject *pad(PyUnicodeObject *self,
2265 int left,
2266 int right,
2267 Py_UNICODE fill)
2268{
2269 PyUnicodeObject *u;
2270
2271 if (left < 0)
2272 left = 0;
2273 if (right < 0)
2274 right = 0;
2275
2276 if (left == 0 && right == 0) {
2277 Py_INCREF(self);
2278 return self;
2279 }
2280
2281 u = _PyUnicode_New(left + self->length + right);
2282 if (u) {
2283 if (left)
2284 Py_UNICODE_FILL(u->str, fill, left);
2285 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2286 if (right)
2287 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2288 }
2289
2290 return u;
2291}
2292
/* Append the slice data[left:right] to `list` as a new Unicode
   object.

   The expanding function must provide a `PyObject *str` scratch
   variable, a `list` variable and an `onError` label; control jumps
   to `onError` if the slice cannot be created or appended.  The
   temporary reference held in `str` is released on both paths.

   Wrapped in do { } while (0) so that the macro behaves as a single
   statement and must be followed by a semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2303
2304static
2305PyObject *split_whitespace(PyUnicodeObject *self,
2306 PyObject *list,
2307 int maxcount)
2308{
2309 register int i;
2310 register int j;
2311 int len = self->length;
2312 PyObject *str;
2313
2314 for (i = j = 0; i < len; ) {
2315 /* find a token */
2316 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2317 i++;
2318 j = i;
2319 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2320 i++;
2321 if (j < i) {
2322 if (maxcount-- <= 0)
2323 break;
2324 SPLIT_APPEND(self->str, j, i);
2325 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2326 i++;
2327 j = i;
2328 }
2329 }
2330 if (j < len) {
2331 SPLIT_APPEND(self->str, j, len);
2332 }
2333 return list;
2334
2335 onError:
2336 Py_DECREF(list);
2337 return NULL;
2338}
2339
2340PyObject *PyUnicode_Splitlines(PyObject *string,
2341 int maxcount)
2342{
2343 register int i;
2344 register int j;
2345 int len;
2346 PyObject *list;
2347 PyObject *str;
2348 Py_UNICODE *data;
2349
2350 string = PyUnicode_FromObject(string);
2351 if (string == NULL)
2352 return NULL;
2353 data = PyUnicode_AS_UNICODE(string);
2354 len = PyUnicode_GET_SIZE(string);
2355
2356 if (maxcount < 0)
2357 maxcount = INT_MAX;
2358
2359 list = PyList_New(0);
2360 if (!list)
2361 goto onError;
2362
2363 for (i = j = 0; i < len; ) {
2364 /* Find a line and append it */
2365 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2366 i++;
2367 if (maxcount-- <= 0)
2368 break;
2369 SPLIT_APPEND(data, j, i);
2370
2371 /* Skip the line break reading CRLF as one line break */
2372 if (i < len) {
2373 if (data[i] == '\r' && i + 1 < len &&
2374 data[i+1] == '\n')
2375 i += 2;
2376 else
2377 i++;
2378 }
2379 j = i;
2380 }
2381 if (j < len) {
2382 SPLIT_APPEND(data, j, len);
2383 }
2384
2385 Py_DECREF(string);
2386 return list;
2387
2388 onError:
2389 Py_DECREF(list);
2390 Py_DECREF(string);
2391 return NULL;
2392}
2393
2394static
2395PyObject *split_char(PyUnicodeObject *self,
2396 PyObject *list,
2397 Py_UNICODE ch,
2398 int maxcount)
2399{
2400 register int i;
2401 register int j;
2402 int len = self->length;
2403 PyObject *str;
2404
2405 for (i = j = 0; i < len; ) {
2406 if (self->str[i] == ch) {
2407 if (maxcount-- <= 0)
2408 break;
2409 SPLIT_APPEND(self->str, j, i);
2410 i = j = i + 1;
2411 } else
2412 i++;
2413 }
2414 if (j <= len) {
2415 SPLIT_APPEND(self->str, j, len);
2416 }
2417 return list;
2418
2419 onError:
2420 Py_DECREF(list);
2421 return NULL;
2422}
2423
2424static
2425PyObject *split_substring(PyUnicodeObject *self,
2426 PyObject *list,
2427 PyUnicodeObject *substring,
2428 int maxcount)
2429{
2430 register int i;
2431 register int j;
2432 int len = self->length;
2433 int sublen = substring->length;
2434 PyObject *str;
2435
2436 for (i = j = 0; i < len - sublen; ) {
2437 if (Py_UNICODE_MATCH(self, i, substring)) {
2438 if (maxcount-- <= 0)
2439 break;
2440 SPLIT_APPEND(self->str, j, i);
2441 i = j = i + sublen;
2442 } else
2443 i++;
2444 }
2445 if (j <= len) {
2446 SPLIT_APPEND(self->str, j, len);
2447 }
2448 return list;
2449
2450 onError:
2451 Py_DECREF(list);
2452 return NULL;
2453}
2454
2455#undef SPLIT_APPEND
2456
2457static
2458PyObject *split(PyUnicodeObject *self,
2459 PyUnicodeObject *substring,
2460 int maxcount)
2461{
2462 PyObject *list;
2463
2464 if (maxcount < 0)
2465 maxcount = INT_MAX;
2466
2467 list = PyList_New(0);
2468 if (!list)
2469 return NULL;
2470
2471 if (substring == NULL)
2472 return split_whitespace(self,list,maxcount);
2473
2474 else if (substring->length == 1)
2475 return split_char(self,list,substring->str[0],maxcount);
2476
2477 else if (substring->length == 0) {
2478 Py_DECREF(list);
2479 PyErr_SetString(PyExc_ValueError, "empty separator");
2480 return NULL;
2481 }
2482 else
2483 return split_substring(self,list,substring,maxcount);
2484}
2485
2486static
2487PyObject *strip(PyUnicodeObject *self,
2488 int left,
2489 int right)
2490{
2491 Py_UNICODE *p = self->str;
2492 int start = 0;
2493 int end = self->length;
2494
2495 if (left)
2496 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2497 start++;
2498
2499 if (right)
2500 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2501 end--;
2502
2503 if (start == 0 && end == self->length) {
2504 /* couldn't strip anything off, return original string */
2505 Py_INCREF(self);
2506 return (PyObject*) self;
2507 }
2508
2509 return (PyObject*) PyUnicode_FromUnicode(
2510 self->str + start,
2511 end - start
2512 );
2513}
2514
2515static
2516PyObject *replace(PyUnicodeObject *self,
2517 PyUnicodeObject *str1,
2518 PyUnicodeObject *str2,
2519 int maxcount)
2520{
2521 PyUnicodeObject *u;
2522
2523 if (maxcount < 0)
2524 maxcount = INT_MAX;
2525
2526 if (str1->length == 1 && str2->length == 1) {
2527 int i;
2528
2529 /* replace characters */
2530 if (!findchar(self->str, self->length, str1->str[0])) {
2531 /* nothing to replace, return original string */
2532 Py_INCREF(self);
2533 u = self;
2534 } else {
2535 Py_UNICODE u1 = str1->str[0];
2536 Py_UNICODE u2 = str2->str[0];
2537
2538 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
2539 self->str,
2540 self->length
2541 );
2542 if (u)
2543 for (i = 0; i < u->length; i++)
2544 if (u->str[i] == u1) {
2545 if (--maxcount < 0)
2546 break;
2547 u->str[i] = u2;
2548 }
2549 }
2550
2551 } else {
2552 int n, i;
2553 Py_UNICODE *p;
2554
2555 /* replace strings */
2556 n = count(self, 0, self->length, str1);
2557 if (n > maxcount)
2558 n = maxcount;
2559 if (n == 0) {
2560 /* nothing to replace, return original string */
2561 Py_INCREF(self);
2562 u = self;
2563 } else {
2564 u = _PyUnicode_New(
2565 self->length + n * (str2->length - str1->length));
2566 if (u) {
2567 i = 0;
2568 p = u->str;
2569 while (i <= self->length - str1->length)
2570 if (Py_UNICODE_MATCH(self, i, str1)) {
2571 /* replace string segment */
2572 Py_UNICODE_COPY(p, str2->str, str2->length);
2573 p += str2->length;
2574 i += str1->length;
2575 if (--n <= 0) {
2576 /* copy remaining part */
2577 Py_UNICODE_COPY(p, self->str+i, self->length-i);
2578 break;
2579 }
2580 } else
2581 *p++ = self->str[i++];
2582 }
2583 }
2584 }
2585
2586 return (PyObject *) u;
2587}
2588
2589/* --- Unicode Object Methods --------------------------------------------- */
2590
2591static char title__doc__[] =
2592"S.title() -> unicode\n\
2593\n\
2594Return a titlecased version of S, i.e. words start with title case\n\
2595characters, all remaining cased characters have lower case.";
2596
2597static PyObject*
2598unicode_title(PyUnicodeObject *self, PyObject *args)
2599{
2600 if (!PyArg_NoArgs(args))
2601 return NULL;
2602 return fixup(self, fixtitle);
2603}
2604
2605static char capitalize__doc__[] =
2606"S.capitalize() -> unicode\n\
2607\n\
2608Return a capitalized version of S, i.e. make the first character\n\
2609have upper case.";
2610
2611static PyObject*
2612unicode_capitalize(PyUnicodeObject *self, PyObject *args)
2613{
2614 if (!PyArg_NoArgs(args))
2615 return NULL;
2616 return fixup(self, fixcapitalize);
2617}
2618
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* Disabled method implementation: split at whitespace, capitalize
   each word through fixup()/fixcapitalize and join the words again
   with the default separator of PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result = NULL;
    int i;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *word = PyList_GET_ITEM(words, i);
        PyObject *capitalized = fixup((PyUnicodeObject *)word,
                                      fixcapitalize);

        if (capitalized == NULL)
            goto done;
        /* Replace the list slot, releasing the old entry */
        Py_DECREF(word);
        PyList_SET_ITEM(words, i, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
2659
2660static char center__doc__[] =
2661"S.center(width) -> unicode\n\
2662\n\
2663Return S centered in a Unicode string of length width. Padding is done\n\
2664using spaces.";
2665
2666static PyObject *
2667unicode_center(PyUnicodeObject *self, PyObject *args)
2668{
2669 int marg, left;
2670 int width;
2671
2672 if (!PyArg_ParseTuple(args, "i:center", &width))
2673 return NULL;
2674
2675 if (self->length >= width) {
2676 Py_INCREF(self);
2677 return (PyObject*) self;
2678 }
2679
2680 marg = width - self->length;
2681 left = marg / 2 + (marg & width & 1);
2682
2683 return (PyObject*) pad(self, left, marg - left, ' ');
2684}
2685
2686static int
2687unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
2688{
2689 int len1, len2;
2690 Py_UNICODE *s1 = str1->str;
2691 Py_UNICODE *s2 = str2->str;
2692
2693 len1 = str1->length;
2694 len2 = str2->length;
2695
2696 while (len1 > 0 && len2 > 0) {
2697 int cmp = (*s1++) - (*s2++);
2698 if (cmp)
2699 /* This should make Christian happy! */
2700 return (cmp < 0) ? -1 : (cmp != 0);
2701 len1--, len2--;
2702 }
2703
2704 return (len1 < len2) ? -1 : (len1 != len2);
2705}
2706
2707int PyUnicode_Compare(PyObject *left,
2708 PyObject *right)
2709{
2710 PyUnicodeObject *u = NULL, *v = NULL;
2711 int result;
2712
2713 /* Coerce the two arguments */
2714 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2715 if (u == NULL)
2716 goto onError;
2717 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2718 if (v == NULL)
2719 goto onError;
2720
2721 /* Shortcut for emtpy or interned objects */
2722 if (v == u) {
2723 Py_DECREF(u);
2724 Py_DECREF(v);
2725 return 0;
2726 }
2727
2728 result = unicode_compare(u, v);
2729
2730 Py_DECREF(u);
2731 Py_DECREF(v);
2732 return result;
2733
2734onError:
2735 Py_XDECREF(u);
2736 Py_XDECREF(v);
2737 return -1;
2738}
2739
2740/* Concat to string or Unicode object giving a new Unicode object. */
2741
2742PyObject *PyUnicode_Concat(PyObject *left,
2743 PyObject *right)
2744{
2745 PyUnicodeObject *u = NULL, *v = NULL, *w;
2746
2747 /* Coerce the two arguments */
2748 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2749 if (u == NULL)
2750 goto onError;
2751 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2752 if (v == NULL)
2753 goto onError;
2754
2755 /* Shortcuts */
2756 if (v == unicode_empty) {
2757 Py_DECREF(v);
2758 return (PyObject *)u;
2759 }
2760 if (u == unicode_empty) {
2761 Py_DECREF(u);
2762 return (PyObject *)v;
2763 }
2764
2765 /* Concat the two Unicode strings */
2766 w = _PyUnicode_New(u->length + v->length);
2767 if (w == NULL)
2768 goto onError;
2769 Py_UNICODE_COPY(w->str, u->str, u->length);
2770 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
2771
2772 Py_DECREF(u);
2773 Py_DECREF(v);
2774 return (PyObject *)w;
2775
2776onError:
2777 Py_XDECREF(u);
2778 Py_XDECREF(v);
2779 return NULL;
2780}
2781
2782static char count__doc__[] =
2783"S.count(sub[, start[, end]]) -> int\n\
2784\n\
2785Return the number of occurrences of substring sub in Unicode string\n\
2786S[start:end]. Optional arguments start and end are\n\
2787interpreted as in slice notation.";
2788
2789static PyObject *
2790unicode_count(PyUnicodeObject *self, PyObject *args)
2791{
2792 PyUnicodeObject *substring;
2793 int start = 0;
2794 int end = INT_MAX;
2795 PyObject *result;
2796
2797 if (!PyArg_ParseTuple(args, "O|ii:count", &substring, &start, &end))
2798 return NULL;
2799
2800 substring = (PyUnicodeObject *)PyUnicode_FromObject(
2801 (PyObject *)substring);
2802 if (substring == NULL)
2803 return NULL;
2804
2805 if (substring->length == 0) {
2806 Py_DECREF(substring);
2807 return PyInt_FromLong((long) 0);
2808 }
2809
2810 if (start < 0)
2811 start += self->length;
2812 if (start < 0)
2813 start = 0;
2814 if (end > self->length)
2815 end = self->length;
2816 if (end < 0)
2817 end += self->length;
2818 if (end < 0)
2819 end = 0;
2820
2821 result = PyInt_FromLong((long) count(self, start, end, substring));
2822
2823 Py_DECREF(substring);
2824 return result;
2825}
2826
2827static char encode__doc__[] =
2828"S.encode([encoding[,errors]]) -> string\n\
2829\n\
2830Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
2831errors may be given to set a different error handling scheme. Default\n\
2832is 'strict' meaning that encoding errors raise a ValueError. Other\n\
2833possible values are 'ignore' and 'replace'.";
2834
2835static PyObject *
2836unicode_encode(PyUnicodeObject *self, PyObject *args)
2837{
2838 char *encoding = NULL;
2839 char *errors = NULL;
2840 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
2841 return NULL;
2842 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
2843}
2844
2845static char expandtabs__doc__[] =
2846"S.expandtabs([tabsize]) -> unicode\n\
2847\n\
2848Return a copy of S where all tab characters are expanded using spaces.\n\
2849If tabsize is not given, a tab size of 8 characters is assumed.";
2850
2851static PyObject*
2852unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
2853{
2854 Py_UNICODE *e;
2855 Py_UNICODE *p;
2856 Py_UNICODE *q;
2857 int i, j;
2858 PyUnicodeObject *u;
2859 int tabsize = 8;
2860
2861 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
2862 return NULL;
2863
2864 /* First pass: determine size of ouput string */
2865 i = j = 0;
2866 e = self->str + self->length;
2867 for (p = self->str; p < e; p++)
2868 if (*p == '\t') {
2869 if (tabsize > 0)
2870 j += tabsize - (j % tabsize);
2871 }
2872 else {
2873 j++;
2874 if (*p == '\n' || *p == '\r') {
2875 i += j;
2876 j = 0;
2877 }
2878 }
2879
2880 /* Second pass: create output string and fill it */
2881 u = _PyUnicode_New(i + j);
2882 if (!u)
2883 return NULL;
2884
2885 j = 0;
2886 q = u->str;
2887
2888 for (p = self->str; p < e; p++)
2889 if (*p == '\t') {
2890 if (tabsize > 0) {
2891 i = tabsize - (j % tabsize);
2892 j += i;
2893 while (i--)
2894 *q++ = ' ';
2895 }
2896 }
2897 else {
2898 j++;
2899 *q++ = *p;
2900 if (*p == '\n' || *p == '\r')
2901 j = 0;
2902 }
2903
2904 return (PyObject*) u;
2905}
2906
/* Docstring for S.find().  The slice is written S[start:end] so that it
   really is slice notation, as the text itself promises. */
static char find__doc__[] =
"S.find(sub [,start [,end]]) -> int\n\
\n\
Return the lowest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.";
2915
2916static PyObject *
2917unicode_find(PyUnicodeObject *self, PyObject *args)
2918{
2919 PyUnicodeObject *substring;
2920 int start = 0;
2921 int end = INT_MAX;
2922 PyObject *result;
2923
2924 if (!PyArg_ParseTuple(args, "O|ii:find", &substring, &start, &end))
2925 return NULL;
2926 substring = (PyUnicodeObject *)PyUnicode_FromObject(
2927 (PyObject *)substring);
2928 if (substring == NULL)
2929 return NULL;
2930
2931 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
2932
2933 Py_DECREF(substring);
2934 return result;
2935}
2936
2937static PyObject *
2938unicode_getitem(PyUnicodeObject *self, int index)
2939{
2940 if (index < 0 || index >= self->length) {
2941 PyErr_SetString(PyExc_IndexError, "string index out of range");
2942 return NULL;
2943 }
2944
2945 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
2946}
2947
2948static long
2949unicode_hash(PyUnicodeObject *self)
2950{
2951 long hash;
2952 PyObject *utf8;
2953
2954 /* Since Unicode objects compare equal to their UTF-8 string
2955 counterparts, they should also use the UTF-8 strings as basis
2956 for their hash value. This is needed to assure that strings and
2957 Unicode objects behave in the same way as dictionary
2958 keys. Unfortunately, this costs some performance and also some
2959 memory if the cached UTF-8 representation is not used later
2960 on. */
2961 if (self->hash != -1)
2962 return self->hash;
2963 utf8 = utf8_string(self, NULL);
2964 if (utf8 == NULL)
2965 return -1;
2966 hash = PyObject_Hash(utf8);
2967 if (hash == -1)
2968 return -1;
2969 self->hash = hash;
2970 return hash;
2971}
2972
2973static char index__doc__[] =
2974"S.index(sub [,start [,end]]) -> int\n\
2975\n\
2976Like S.find() but raise ValueError when the substring is not found.";
2977
2978static PyObject *
2979unicode_index(PyUnicodeObject *self, PyObject *args)
2980{
2981 int result;
2982 PyUnicodeObject *substring;
2983 int start = 0;
2984 int end = INT_MAX;
2985
2986 if (!PyArg_ParseTuple(args, "O|ii:index", &substring, &start, &end))
2987 return NULL;
2988
2989 substring = (PyUnicodeObject *)PyUnicode_FromObject(
2990 (PyObject *)substring);
2991 if (substring == NULL)
2992 return NULL;
2993
2994 result = findstring(self, substring, start, end, 1);
2995
2996 Py_DECREF(substring);
2997 if (result < 0) {
2998 PyErr_SetString(PyExc_ValueError, "substring not found");
2999 return NULL;
3000 }
3001 return PyInt_FromLong(result);
3002}
3003
3004static char islower__doc__[] =
3005"S.islower() -> int\n\
3006\n\
3007Return 1 if all cased characters in S are lowercase and there is\n\
3008at least one cased character in S, 0 otherwise.";
3009
3010static PyObject*
3011unicode_islower(PyUnicodeObject *self, PyObject *args)
3012{
3013 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3014 register const Py_UNICODE *e;
3015 int cased;
3016
3017 if (!PyArg_NoArgs(args))
3018 return NULL;
3019
3020 /* Shortcut for single character strings */
3021 if (PyUnicode_GET_SIZE(self) == 1)
3022 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3023
3024 e = p + PyUnicode_GET_SIZE(self);
3025 cased = 0;
3026 for (; p < e; p++) {
3027 register const Py_UNICODE ch = *p;
3028
3029 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3030 return PyInt_FromLong(0);
3031 else if (!cased && Py_UNICODE_ISLOWER(ch))
3032 cased = 1;
3033 }
3034 return PyInt_FromLong(cased);
3035}
3036
3037static char isupper__doc__[] =
3038"S.isupper() -> int\n\
3039\n\
3040Return 1 if all cased characters in S are uppercase and there is\n\
3041at least one cased character in S, 0 otherwise.";
3042
3043static PyObject*
3044unicode_isupper(PyUnicodeObject *self, PyObject *args)
3045{
3046 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3047 register const Py_UNICODE *e;
3048 int cased;
3049
3050 if (!PyArg_NoArgs(args))
3051 return NULL;
3052
3053 /* Shortcut for single character strings */
3054 if (PyUnicode_GET_SIZE(self) == 1)
3055 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3056
3057 e = p + PyUnicode_GET_SIZE(self);
3058 cased = 0;
3059 for (; p < e; p++) {
3060 register const Py_UNICODE ch = *p;
3061
3062 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3063 return PyInt_FromLong(0);
3064 else if (!cased && Py_UNICODE_ISUPPER(ch))
3065 cased = 1;
3066 }
3067 return PyInt_FromLong(cased);
3068}
3069
3070static char istitle__doc__[] =
3071"S.istitle() -> int\n\
3072\n\
3073Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3074may only follow uncased characters and lowercase characters only cased\n\
3075ones. Return 0 otherwise.";
3076
3077static PyObject*
3078unicode_istitle(PyUnicodeObject *self, PyObject *args)
3079{
3080 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3081 register const Py_UNICODE *e;
3082 int cased, previous_is_cased;
3083
3084 if (!PyArg_NoArgs(args))
3085 return NULL;
3086
3087 /* Shortcut for single character strings */
3088 if (PyUnicode_GET_SIZE(self) == 1)
3089 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3090 (Py_UNICODE_ISUPPER(*p) != 0));
3091
3092 e = p + PyUnicode_GET_SIZE(self);
3093 cased = 0;
3094 previous_is_cased = 0;
3095 for (; p < e; p++) {
3096 register const Py_UNICODE ch = *p;
3097
3098 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3099 if (previous_is_cased)
3100 return PyInt_FromLong(0);
3101 previous_is_cased = 1;
3102 cased = 1;
3103 }
3104 else if (Py_UNICODE_ISLOWER(ch)) {
3105 if (!previous_is_cased)
3106 return PyInt_FromLong(0);
3107 previous_is_cased = 1;
3108 cased = 1;
3109 }
3110 else
3111 previous_is_cased = 0;
3112 }
3113 return PyInt_FromLong(cased);
3114}
3115
3116static char isspace__doc__[] =
3117"S.isspace() -> int\n\
3118\n\
3119Return 1 if there are only whitespace characters in S,\n\
31200 otherwise.";
3121
3122static PyObject*
3123unicode_isspace(PyUnicodeObject *self, PyObject *args)
3124{
3125 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3126 register const Py_UNICODE *e;
3127
3128 if (!PyArg_NoArgs(args))
3129 return NULL;
3130
3131 /* Shortcut for single character strings */
3132 if (PyUnicode_GET_SIZE(self) == 1 &&
3133 Py_UNICODE_ISSPACE(*p))
3134 return PyInt_FromLong(1);
3135
3136 e = p + PyUnicode_GET_SIZE(self);
3137 for (; p < e; p++) {
3138 if (!Py_UNICODE_ISSPACE(*p))
3139 return PyInt_FromLong(0);
3140 }
3141 return PyInt_FromLong(1);
3142}
3143
3144static char isdecimal__doc__[] =
3145"S.isdecimal() -> int\n\
3146\n\
3147Return 1 if there are only decimal characters in S,\n\
31480 otherwise.";
3149
3150static PyObject*
3151unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3152{
3153 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3154 register const Py_UNICODE *e;
3155
3156 if (!PyArg_NoArgs(args))
3157 return NULL;
3158
3159 /* Shortcut for single character strings */
3160 if (PyUnicode_GET_SIZE(self) == 1 &&
3161 Py_UNICODE_ISDECIMAL(*p))
3162 return PyInt_FromLong(1);
3163
3164 e = p + PyUnicode_GET_SIZE(self);
3165 for (; p < e; p++) {
3166 if (!Py_UNICODE_ISDECIMAL(*p))
3167 return PyInt_FromLong(0);
3168 }
3169 return PyInt_FromLong(1);
3170}
3171
3172static char isdigit__doc__[] =
3173"S.isdigit() -> int\n\
3174\n\
3175Return 1 if there are only digit characters in S,\n\
31760 otherwise.";
3177
3178static PyObject*
3179unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3180{
3181 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3182 register const Py_UNICODE *e;
3183
3184 if (!PyArg_NoArgs(args))
3185 return NULL;
3186
3187 /* Shortcut for single character strings */
3188 if (PyUnicode_GET_SIZE(self) == 1 &&
3189 Py_UNICODE_ISDIGIT(*p))
3190 return PyInt_FromLong(1);
3191
3192 e = p + PyUnicode_GET_SIZE(self);
3193 for (; p < e; p++) {
3194 if (!Py_UNICODE_ISDIGIT(*p))
3195 return PyInt_FromLong(0);
3196 }
3197 return PyInt_FromLong(1);
3198}
3199
3200static char isnumeric__doc__[] =
3201"S.isnumeric() -> int\n\
3202\n\
3203Return 1 if there are only numeric characters in S,\n\
32040 otherwise.";
3205
3206static PyObject*
3207unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3208{
3209 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3210 register const Py_UNICODE *e;
3211
3212 if (!PyArg_NoArgs(args))
3213 return NULL;
3214
3215 /* Shortcut for single character strings */
3216 if (PyUnicode_GET_SIZE(self) == 1 &&
3217 Py_UNICODE_ISNUMERIC(*p))
3218 return PyInt_FromLong(1);
3219
3220 e = p + PyUnicode_GET_SIZE(self);
3221 for (; p < e; p++) {
3222 if (!Py_UNICODE_ISNUMERIC(*p))
3223 return PyInt_FromLong(0);
3224 }
3225 return PyInt_FromLong(1);
3226}
3227
3228static char join__doc__[] =
3229"S.join(sequence) -> unicode\n\
3230\n\
3231Return a string which is the concatenation of the strings in the\n\
3232sequence. The separator between elements is S.";
3233
3234static PyObject*
3235unicode_join(PyUnicodeObject *self, PyObject *args)
3236{
3237 PyObject *data;
3238 if (!PyArg_ParseTuple(args, "O:join", &data))
3239 return NULL;
3240
3241 return PyUnicode_Join((PyObject *)self, data);
3242}
3243
3244static int
3245unicode_length(PyUnicodeObject *self)
3246{
3247 return self->length;
3248}
3249
3250static char ljust__doc__[] =
3251"S.ljust(width) -> unicode\n\
3252\n\
3253Return S left justified in a Unicode string of length width. Padding is\n\
3254done using spaces.";
3255
3256static PyObject *
3257unicode_ljust(PyUnicodeObject *self, PyObject *args)
3258{
3259 int width;
3260 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3261 return NULL;
3262
3263 if (self->length >= width) {
3264 Py_INCREF(self);
3265 return (PyObject*) self;
3266 }
3267
3268 return (PyObject*) pad(self, 0, width - self->length, ' ');
3269}
3270
3271static char lower__doc__[] =
3272"S.lower() -> unicode\n\
3273\n\
3274Return a copy of the string S converted to lowercase.";
3275
3276static PyObject*
3277unicode_lower(PyUnicodeObject *self, PyObject *args)
3278{
3279 if (!PyArg_NoArgs(args))
3280 return NULL;
3281 return fixup(self, fixlower);
3282}
3283
3284static char lstrip__doc__[] =
3285"S.lstrip() -> unicode\n\
3286\n\
3287Return a copy of the string S with leading whitespace removed.";
3288
3289static PyObject *
3290unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3291{
3292 if (!PyArg_NoArgs(args))
3293 return NULL;
3294 return strip(self, 1, 0);
3295}
3296
3297static PyObject*
3298unicode_repeat(PyUnicodeObject *str, int len)
3299{
3300 PyUnicodeObject *u;
3301 Py_UNICODE *p;
3302
3303 if (len < 0)
3304 len = 0;
3305
3306 if (len == 1) {
3307 /* no repeat, return original string */
3308 Py_INCREF(str);
3309 return (PyObject*) str;
3310 }
3311
3312 u = _PyUnicode_New(len * str->length);
3313 if (!u)
3314 return NULL;
3315
3316 p = u->str;
3317
3318 while (len-- > 0) {
3319 Py_UNICODE_COPY(p, str->str, str->length);
3320 p += str->length;
3321 }
3322
3323 return (PyObject*) u;
3324}
3325
3326PyObject *PyUnicode_Replace(PyObject *obj,
3327 PyObject *subobj,
3328 PyObject *replobj,
3329 int maxcount)
3330{
3331 PyObject *self;
3332 PyObject *str1;
3333 PyObject *str2;
3334 PyObject *result;
3335
3336 self = PyUnicode_FromObject(obj);
3337 if (self == NULL)
3338 return NULL;
3339 str1 = PyUnicode_FromObject(subobj);
3340 if (str1 == NULL) {
3341 Py_DECREF(self);
3342 return NULL;
3343 }
3344 str2 = PyUnicode_FromObject(replobj);
3345 if (str2 == NULL) {
3346 Py_DECREF(self);
3347 Py_DECREF(str1);
3348 return NULL;
3349 }
3350 result = replace((PyUnicodeObject *)self,
3351 (PyUnicodeObject *)str1,
3352 (PyUnicodeObject *)str2,
3353 maxcount);
3354 Py_DECREF(self);
3355 Py_DECREF(str1);
3356 Py_DECREF(str2);
3357 return result;
3358}
3359
/* Docstring for S.replace().  The optional third argument limits the
   number of replacements, hence "maxcount" (nothing is split here). */
static char replace__doc__[] =
"S.replace(old, new[, maxcount]) -> unicode\n\
\n\
Return a copy of S with all occurrences of substring\n\
old replaced by new. If the optional argument maxcount is\n\
given, only the first maxcount occurrences are replaced.";
3366
3367static PyObject*
3368unicode_replace(PyUnicodeObject *self, PyObject *args)
3369{
3370 PyUnicodeObject *str1;
3371 PyUnicodeObject *str2;
3372 int maxcount = -1;
3373 PyObject *result;
3374
3375 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
3376 return NULL;
3377 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
3378 if (str1 == NULL)
3379 return NULL;
3380 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
3381 if (str2 == NULL)
3382 return NULL;
3383
3384 result = replace(self, str1, str2, maxcount);
3385
3386 Py_DECREF(str1);
3387 Py_DECREF(str2);
3388 return result;
3389}
3390
3391static
3392PyObject *unicode_repr(PyObject *unicode)
3393{
3394 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
3395 PyUnicode_GET_SIZE(unicode),
3396 1);
3397}
3398
/* Docstring for S.rfind().  The slice is written S[start:end] so that
   it really is slice notation, as the text itself promises. */
static char rfind__doc__[] =
"S.rfind(sub [,start [,end]]) -> int\n\
\n\
Return the highest index in S where substring sub is found,\n\
such that sub is contained within S[start:end]. Optional\n\
arguments start and end are interpreted as in slice notation.\n\
\n\
Return -1 on failure.";
3407
3408static PyObject *
3409unicode_rfind(PyUnicodeObject *self, PyObject *args)
3410{
3411 PyUnicodeObject *substring;
3412 int start = 0;
3413 int end = INT_MAX;
3414 PyObject *result;
3415
3416 if (!PyArg_ParseTuple(args, "O|ii:rfind", &substring, &start, &end))
3417 return NULL;
3418 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3419 (PyObject *)substring);
3420 if (substring == NULL)
3421 return NULL;
3422
3423 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
3424
3425 Py_DECREF(substring);
3426 return result;
3427}
3428
3429static char rindex__doc__[] =
3430"S.rindex(sub [,start [,end]]) -> int\n\
3431\n\
3432Like S.rfind() but raise ValueError when the substring is not found.";
3433
3434static PyObject *
3435unicode_rindex(PyUnicodeObject *self, PyObject *args)
3436{
3437 int result;
3438 PyUnicodeObject *substring;
3439 int start = 0;
3440 int end = INT_MAX;
3441
3442 if (!PyArg_ParseTuple(args, "O|ii:rindex", &substring, &start, &end))
3443 return NULL;
3444 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3445 (PyObject *)substring);
3446 if (substring == NULL)
3447 return NULL;
3448
3449 result = findstring(self, substring, start, end, -1);
3450
3451 Py_DECREF(substring);
3452 if (result < 0) {
3453 PyErr_SetString(PyExc_ValueError, "substring not found");
3454 return NULL;
3455 }
3456 return PyInt_FromLong(result);
3457}
3458
3459static char rjust__doc__[] =
3460"S.rjust(width) -> unicode\n\
3461\n\
3462Return S right justified in a Unicode string of length width. Padding is\n\
3463done using spaces.";
3464
3465static PyObject *
3466unicode_rjust(PyUnicodeObject *self, PyObject *args)
3467{
3468 int width;
3469 if (!PyArg_ParseTuple(args, "i:rjust", &width))
3470 return NULL;
3471
3472 if (self->length >= width) {
3473 Py_INCREF(self);
3474 return (PyObject*) self;
3475 }
3476
3477 return (PyObject*) pad(self, width - self->length, 0, ' ');
3478}
3479
3480static char rstrip__doc__[] =
3481"S.rstrip() -> unicode\n\
3482\n\
3483Return a copy of the string S with trailing whitespace removed.";
3484
3485static PyObject *
3486unicode_rstrip(PyUnicodeObject *self, PyObject *args)
3487{
3488 if (!PyArg_NoArgs(args))
3489 return NULL;
3490 return strip(self, 0, 1);
3491}
3492
3493static PyObject*
3494unicode_slice(PyUnicodeObject *self, int start, int end)
3495{
3496 /* standard clamping */
3497 if (start < 0)
3498 start = 0;
3499 if (end < 0)
3500 end = 0;
3501 if (end > self->length)
3502 end = self->length;
3503 if (start == 0 && end == self->length) {
3504 /* full slice, return original string */
3505 Py_INCREF(self);
3506 return (PyObject*) self;
3507 }
3508 if (start > end)
3509 start = end;
3510 /* copy slice */
3511 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
3512 end - start);
3513}
3514
3515PyObject *PyUnicode_Split(PyObject *s,
3516 PyObject *sep,
3517 int maxsplit)
3518{
3519 PyObject *result;
3520
3521 s = PyUnicode_FromObject(s);
3522 if (s == NULL)
3523 return NULL;
3524 if (sep != NULL) {
3525 sep = PyUnicode_FromObject(sep);
3526 if (sep == NULL) {
3527 Py_DECREF(s);
3528 return NULL;
3529 }
3530 }
3531
3532 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
3533
3534 Py_DECREF(s);
3535 Py_XDECREF(sep);
3536 return result;
3537}
3538
3539static char split__doc__[] =
3540"S.split([sep [,maxsplit]]) -> list of strings\n\
3541\n\
3542Return a list of the words in S, using sep as the\n\
3543delimiter string. If maxsplit is given, at most maxsplit\n\
3544splits are done. If sep is not specified, any whitespace string\n\
3545is a separator.";
3546
3547static PyObject*
3548unicode_split(PyUnicodeObject *self, PyObject *args)
3549{
3550 PyObject *substring = Py_None;
3551 int maxcount = -1;
3552
3553 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
3554 return NULL;
3555
3556 if (substring == Py_None)
3557 return split(self, NULL, maxcount);
3558 else if (PyUnicode_Check(substring))
3559 return split(self, (PyUnicodeObject *)substring, maxcount);
3560 else
3561 return PyUnicode_Split((PyObject *)self, substring, maxcount);
3562}
3563
/* Docstring for S.splitlines(): brackets in the signature are balanced
   and the sentence about maxsplit names what is limited (splits). */
static char splitlines__doc__[] =
"S.splitlines([maxsplit]) -> list of strings\n\
\n\
Return a list of the lines in S, breaking at line boundaries.\n\
If maxsplit is given, at most maxsplit splits are done. Line breaks\n\
are not included in the resulting list.";
3570
3571static PyObject*
3572unicode_splitlines(PyUnicodeObject *self, PyObject *args)
3573{
3574 int maxcount = -1;
3575
3576 if (!PyArg_ParseTuple(args, "|i:splitlines", &maxcount))
3577 return NULL;
3578
3579 return PyUnicode_Splitlines((PyObject *)self, maxcount);
3580}
3581
3582static
3583PyObject *unicode_str(PyUnicodeObject *self)
3584{
3585 return PyUnicode_AsUTF8String((PyObject *)self);
3586}
3587
3588static char strip__doc__[] =
3589"S.strip() -> unicode\n\
3590\n\
3591Return a copy of S with leading and trailing whitespace removed.";
3592
3593static PyObject *
3594unicode_strip(PyUnicodeObject *self, PyObject *args)
3595{
3596 if (!PyArg_NoArgs(args))
3597 return NULL;
3598 return strip(self, 1, 1);
3599}
3600
3601static char swapcase__doc__[] =
3602"S.swapcase() -> unicode\n\
3603\n\
3604Return a copy of S with uppercase characters converted to lowercase\n\
3605and vice versa.";
3606
3607static PyObject*
3608unicode_swapcase(PyUnicodeObject *self, PyObject *args)
3609{
3610 if (!PyArg_NoArgs(args))
3611 return NULL;
3612 return fixup(self, fixswapcase);
3613}
3614
3615static char translate__doc__[] =
3616"S.translate(table) -> unicode\n\
3617\n\
3618Return a copy of the string S, where all characters have been mapped\n\
3619through the given translation table, which must be a mapping of\n\
3620Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
3621are left untouched. Characters mapped to None are deleted.";
3622
3623static PyObject*
3624unicode_translate(PyUnicodeObject *self, PyObject *args)
3625{
3626 PyObject *table;
3627
3628 if (!PyArg_ParseTuple(args, "O:translate", &table))
3629 return NULL;
3630 return PyUnicode_TranslateCharmap(self->str,
3631 self->length,
3632 table,
3633 "ignore");
3634}
3635
3636static char upper__doc__[] =
3637"S.upper() -> unicode\n\
3638\n\
3639Return a copy of S converted to uppercase.";
3640
3641static PyObject*
3642unicode_upper(PyUnicodeObject *self, PyObject *args)
3643{
3644 if (!PyArg_NoArgs(args))
3645 return NULL;
3646 return fixup(self, fixupper);
3647}
3648
3649#if 0
3650static char zfill__doc__[] =
3651"S.zfill(width) -> unicode\n\
3652\n\
3653Pad a numeric string x with zeros on the left, to fill a field\n\
3654of the specified width. The string x is never truncated.";
3655
3656static PyObject *
3657unicode_zfill(PyUnicodeObject *self, PyObject *args)
3658{
3659 int fill;
3660 PyUnicodeObject *u;
3661
3662 int width;
3663 if (!PyArg_ParseTuple(args, "i:zfill", &width))
3664 return NULL;
3665
3666 if (self->length >= width) {
3667 Py_INCREF(self);
3668 return (PyObject*) self;
3669 }
3670
3671 fill = width - self->length;
3672
3673 u = pad(self, fill, 0, '0');
3674
3675 if (u->str[fill] == '+' || u->str[fill] == '-') {
3676 /* move sign to beginning of string */
3677 u->str[0] = u->str[fill];
3678 u->str[fill] = '0';
3679 }
3680
3681 return (PyObject*) u;
3682}
3683#endif
3684
3685#if 0
3686static PyObject*
3687unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
3688{
3689 if (!PyArg_NoArgs(args))
3690 return NULL;
3691 return PyInt_FromLong(unicode_freelist_size);
3692}
3693#endif
3694
3695static char startswith__doc__[] =
3696"S.startswith(prefix[, start[, end]]) -> int\n\
3697\n\
3698Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
3699optional start, test S beginning at that position. With optional end, stop\n\
3700comparing S at that position.";
3701
3702static PyObject *
3703unicode_startswith(PyUnicodeObject *self,
3704 PyObject *args)
3705{
3706 PyUnicodeObject *substring;
3707 int start = 0;
3708 int end = INT_MAX;
3709 PyObject *result;
3710
3711 if (!PyArg_ParseTuple(args, "O|ii:startswith", &substring, &start, &end))
3712 return NULL;
3713 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3714 (PyObject *)substring);
3715 if (substring == NULL)
3716 return NULL;
3717
3718 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
3719
3720 Py_DECREF(substring);
3721 return result;
3722}
3723
3724
3725static char endswith__doc__[] =
3726"S.endswith(suffix[, start[, end]]) -> int\n\
3727\n\
3728Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
3729optional start, test S beginning at that position. With optional end, stop\n\
3730comparing S at that position.";
3731
3732static PyObject *
3733unicode_endswith(PyUnicodeObject *self,
3734 PyObject *args)
3735{
3736 PyUnicodeObject *substring;
3737 int start = 0;
3738 int end = INT_MAX;
3739 PyObject *result;
3740
3741 if (!PyArg_ParseTuple(args, "O|ii:endswith", &substring, &start, &end))
3742 return NULL;
3743 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3744 (PyObject *)substring);
3745 if (substring == NULL)
3746 return NULL;
3747
3748 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
3749
3750 Py_DECREF(substring);
3751 return result;
3752}
3753
3754
3755static PyMethodDef unicode_methods[] = {
3756
3757 /* Order is according to common usage: often used methods should
3758 appear first, since lookup is done sequentially. */
3759
3760 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
3761 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
3762 {"split", (PyCFunction) unicode_split, 1, split__doc__},
3763 {"join", (PyCFunction) unicode_join, 1, join__doc__},
3764 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
3765 {"title", (PyCFunction) unicode_title, 0, title__doc__},
3766 {"center", (PyCFunction) unicode_center, 1, center__doc__},
3767 {"count", (PyCFunction) unicode_count, 1, count__doc__},
3768 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
3769 {"find", (PyCFunction) unicode_find, 1, find__doc__},
3770 {"index", (PyCFunction) unicode_index, 1, index__doc__},
3771 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
3772 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
3773 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
3774/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
3775 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
3776 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
3777 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
3778 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
3779 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
3780 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
3781 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
3782 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
3783 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
3784 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
3785 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
3786 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
3787 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
3788 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
3789 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
3790 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
3791 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
3792 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
3793#if 0
3794 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
3795 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
3796#endif
3797
3798#if 0
3799 /* This one is just used for debugging the implementation. */
3800 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
3801#endif
3802
3803 {NULL, NULL}
3804};
3805
3806static PyObject *
3807unicode_getattr(PyUnicodeObject *self, char *name)
3808{
3809 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
3810}
3811
3812static PySequenceMethods unicode_as_sequence = {
3813 (inquiry) unicode_length, /* sq_length */
3814 (binaryfunc) PyUnicode_Concat, /* sq_concat */
3815 (intargfunc) unicode_repeat, /* sq_repeat */
3816 (intargfunc) unicode_getitem, /* sq_item */
3817 (intintargfunc) unicode_slice, /* sq_slice */
3818 0, /* sq_ass_item */
3819 0, /* sq_ass_slice */
3820};
3821
3822static int
3823unicode_buffer_getreadbuf(PyUnicodeObject *self,
3824 int index,
3825 const void **ptr)
3826{
3827 if (index != 0) {
3828 PyErr_SetString(PyExc_SystemError,
3829 "accessing non-existent unicode segment");
3830 return -1;
3831 }
3832 *ptr = (void *) self->str;
3833 return PyUnicode_GET_DATA_SIZE(self);
3834}
3835
3836static int
3837unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
3838 const void **ptr)
3839{
3840 PyErr_SetString(PyExc_TypeError,
3841 "cannot use unicode as modifyable buffer");
3842 return -1;
3843}
3844
3845static int
3846unicode_buffer_getsegcount(PyUnicodeObject *self,
3847 int *lenp)
3848{
3849 if (lenp)
3850 *lenp = PyUnicode_GET_DATA_SIZE(self);
3851 return 1;
3852}
3853
3854static int
3855unicode_buffer_getcharbuf(PyUnicodeObject *self,
3856 int index,
3857 const void **ptr)
3858{
3859 PyObject *str;
3860
3861 if (index != 0) {
3862 PyErr_SetString(PyExc_SystemError,
3863 "accessing non-existent unicode segment");
3864 return -1;
3865 }
3866 str = utf8_string(self, NULL);
3867 if (str == NULL)
3868 return -1;
3869 *ptr = (void *) PyString_AS_STRING(str);
3870 return PyString_GET_SIZE(str);
3871}
3872
3873/* Helpers for PyUnicode_Format() */
3874
3875static PyObject *
3876getnextarg(args, arglen, p_argidx)
3877 PyObject *args;
3878int arglen;
3879int *p_argidx;
3880{
3881 int argidx = *p_argidx;
3882 if (argidx < arglen) {
3883 (*p_argidx)++;
3884 if (arglen < 0)
3885 return args;
3886 else
3887 return PyTuple_GetItem(args, argidx);
3888 }
3889 PyErr_SetString(PyExc_TypeError,
3890 "not enough arguments for format string");
3891 return NULL;
3892}
3893
/* Bit flags collected while parsing the flag characters of a format
   specifier ('-', '+', ' ', '#', '0'). */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
3899
3900static
3901#ifdef HAVE_STDARG_PROTOTYPES
3902int usprintf(register Py_UNICODE *buffer, char *format, ...)
3903#else
3904int usprintf(va_alist) va_dcl
3905#endif
3906{
3907 register int i;
3908 int len;
3909 va_list va;
3910 char *charbuffer;
3911#ifdef HAVE_STDARG_PROTOTYPES
3912 va_start(va, format);
3913#else
3914 Py_UNICODE *args;
3915 char *format;
3916
3917 va_start(va);
3918 buffer = va_arg(va, Py_UNICODE *);
3919 format = va_arg(va, char *);
3920#endif
3921
3922 /* First, format the string as char array, then expand to Py_UNICODE
3923 array. */
3924 charbuffer = (char *)buffer;
3925 len = vsprintf(charbuffer, format, va);
3926 for (i = len - 1; i >= 0; i--)
3927 buffer[i] = (Py_UNICODE) charbuffer[i];
3928
3929 va_end(va);
3930 return len;
3931}
3932
3933static int
3934formatfloat(Py_UNICODE *buf,
3935 int flags,
3936 int prec,
3937 int type,
3938 PyObject *v)
3939{
3940 char fmt[20];
3941 double x;
3942
3943 x = PyFloat_AsDouble(v);
3944 if (x == -1.0 && PyErr_Occurred())
3945 return -1;
3946 if (prec < 0)
3947 prec = 6;
3948 if (prec > 50)
3949 prec = 50; /* Arbitrary limitation */
3950 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
3951 type = 'g';
3952 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
3953 return usprintf(buf, fmt, x);
3954}
3955
3956static int
3957formatint(Py_UNICODE *buf,
3958 int flags,
3959 int prec,
3960 int type,
3961 PyObject *v)
3962{
3963 char fmt[20];
3964 long x;
3965
3966 x = PyInt_AsLong(v);
3967 if (x == -1 && PyErr_Occurred())
3968 return -1;
3969 if (prec < 0)
3970 prec = 1;
3971 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
3972 return usprintf(buf, fmt, x);
3973}
3974
3975static int
3976formatchar(Py_UNICODE *buf,
3977 PyObject *v)
3978{
3979 if (PyUnicode_Check(v))
3980 buf[0] = PyUnicode_AS_UNICODE(v)[0];
3981
3982 else if (PyString_Check(v))
3983 buf[0] = (Py_UNICODE) PyString_AS_STRING(v)[0];
3984
3985 else {
3986 /* Integer input truncated to a character */
3987 long x;
3988 x = PyInt_AsLong(v);
3989 if (x == -1 && PyErr_Occurred())
3990 return -1;
3991 buf[0] = (char) x;
3992 }
3993 buf[1] = '\0';
3994 return 1;
3995}
3996
3997PyObject *PyUnicode_Format(PyObject *format,
3998 PyObject *args)
3999{
4000 Py_UNICODE *fmt, *res;
4001 int fmtcnt, rescnt, reslen, arglen, argidx;
4002 int args_owned = 0;
4003 PyUnicodeObject *result = NULL;
4004 PyObject *dict = NULL;
4005 PyObject *uformat;
4006
4007 if (format == NULL || args == NULL) {
4008 PyErr_BadInternalCall();
4009 return NULL;
4010 }
4011 uformat = PyUnicode_FromObject(format);
4012 fmt = PyUnicode_AS_UNICODE(uformat);
4013 fmtcnt = PyUnicode_GET_SIZE(uformat);
4014
4015 reslen = rescnt = fmtcnt + 100;
4016 result = _PyUnicode_New(reslen);
4017 if (result == NULL)
4018 goto onError;
4019 res = PyUnicode_AS_UNICODE(result);
4020
4021 if (PyTuple_Check(args)) {
4022 arglen = PyTuple_Size(args);
4023 argidx = 0;
4024 }
4025 else {
4026 arglen = -1;
4027 argidx = -2;
4028 }
4029 if (args->ob_type->tp_as_mapping)
4030 dict = args;
4031
4032 while (--fmtcnt >= 0) {
4033 if (*fmt != '%') {
4034 if (--rescnt < 0) {
4035 rescnt = fmtcnt + 100;
4036 reslen += rescnt;
4037 if (_PyUnicode_Resize(result, reslen) < 0)
4038 return NULL;
4039 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4040 --rescnt;
4041 }
4042 *res++ = *fmt++;
4043 }
4044 else {
4045 /* Got a format specifier */
4046 int flags = 0;
4047 int width = -1;
4048 int prec = -1;
4049 int size = 0;
4050 Py_UNICODE c = '\0';
4051 Py_UNICODE fill;
4052 PyObject *v = NULL;
4053 PyObject *temp = NULL;
4054 Py_UNICODE *buf;
4055 Py_UNICODE sign;
4056 int len;
4057 Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
4058
4059 fmt++;
4060 if (*fmt == '(') {
4061 Py_UNICODE *keystart;
4062 int keylen;
4063 PyObject *key;
4064 int pcount = 1;
4065
4066 if (dict == NULL) {
4067 PyErr_SetString(PyExc_TypeError,
4068 "format requires a mapping");
4069 goto onError;
4070 }
4071 ++fmt;
4072 --fmtcnt;
4073 keystart = fmt;
4074 /* Skip over balanced parentheses */
4075 while (pcount > 0 && --fmtcnt >= 0) {
4076 if (*fmt == ')')
4077 --pcount;
4078 else if (*fmt == '(')
4079 ++pcount;
4080 fmt++;
4081 }
4082 keylen = fmt - keystart - 1;
4083 if (fmtcnt < 0 || pcount > 0) {
4084 PyErr_SetString(PyExc_ValueError,
4085 "incomplete format key");
4086 goto onError;
4087 }
4088 /* keys are converted to strings (using UTF-8) and
4089 then looked up since Python uses strings to hold
4090 variables names etc. in its namespaces and we
4091 wouldn't want to break common idioms. The
4092 alternative would be using Unicode objects for the
4093 lookup but u"abc" and "abc" have different hash
4094 values (on purpose). */
4095 key = PyUnicode_EncodeUTF8(keystart,
4096 keylen,
4097 NULL);
4098 if (key == NULL)
4099 goto onError;
4100 if (args_owned) {
4101 Py_DECREF(args);
4102 args_owned = 0;
4103 }
4104 args = PyObject_GetItem(dict, key);
4105 Py_DECREF(key);
4106 if (args == NULL) {
4107 goto onError;
4108 }
4109 args_owned = 1;
4110 arglen = -1;
4111 argidx = -2;
4112 }
4113 while (--fmtcnt >= 0) {
4114 switch (c = *fmt++) {
4115 case '-': flags |= F_LJUST; continue;
4116 case '+': flags |= F_SIGN; continue;
4117 case ' ': flags |= F_BLANK; continue;
4118 case '#': flags |= F_ALT; continue;
4119 case '0': flags |= F_ZERO; continue;
4120 }
4121 break;
4122 }
4123 if (c == '*') {
4124 v = getnextarg(args, arglen, &argidx);
4125 if (v == NULL)
4126 goto onError;
4127 if (!PyInt_Check(v)) {
4128 PyErr_SetString(PyExc_TypeError,
4129 "* wants int");
4130 goto onError;
4131 }
4132 width = PyInt_AsLong(v);
4133 if (width < 0) {
4134 flags |= F_LJUST;
4135 width = -width;
4136 }
4137 if (--fmtcnt >= 0)
4138 c = *fmt++;
4139 }
4140 else if (c >= '0' && c <= '9') {
4141 width = c - '0';
4142 while (--fmtcnt >= 0) {
4143 c = *fmt++;
4144 if (c < '0' || c > '9')
4145 break;
4146 if ((width*10) / 10 != width) {
4147 PyErr_SetString(PyExc_ValueError,
4148 "width too big");
4149 goto onError;
4150 }
4151 width = width*10 + (c - '0');
4152 }
4153 }
4154 if (c == '.') {
4155 prec = 0;
4156 if (--fmtcnt >= 0)
4157 c = *fmt++;
4158 if (c == '*') {
4159 v = getnextarg(args, arglen, &argidx);
4160 if (v == NULL)
4161 goto onError;
4162 if (!PyInt_Check(v)) {
4163 PyErr_SetString(PyExc_TypeError,
4164 "* wants int");
4165 goto onError;
4166 }
4167 prec = PyInt_AsLong(v);
4168 if (prec < 0)
4169 prec = 0;
4170 if (--fmtcnt >= 0)
4171 c = *fmt++;
4172 }
4173 else if (c >= '0' && c <= '9') {
4174 prec = c - '0';
4175 while (--fmtcnt >= 0) {
4176 c = Py_CHARMASK(*fmt++);
4177 if (c < '0' || c > '9')
4178 break;
4179 if ((prec*10) / 10 != prec) {
4180 PyErr_SetString(PyExc_ValueError,
4181 "prec too big");
4182 goto onError;
4183 }
4184 prec = prec*10 + (c - '0');
4185 }
4186 }
4187 } /* prec */
4188 if (fmtcnt >= 0) {
4189 if (c == 'h' || c == 'l' || c == 'L') {
4190 size = c;
4191 if (--fmtcnt >= 0)
4192 c = *fmt++;
4193 }
4194 }
4195 if (fmtcnt < 0) {
4196 PyErr_SetString(PyExc_ValueError,
4197 "incomplete format");
4198 goto onError;
4199 }
4200 if (c != '%') {
4201 v = getnextarg(args, arglen, &argidx);
4202 if (v == NULL)
4203 goto onError;
4204 }
4205 sign = 0;
4206 fill = ' ';
4207 switch (c) {
4208
4209 case '%':
4210 buf = tmpbuf;
4211 buf[0] = '%';
4212 len = 1;
4213 break;
4214
4215 case 's':
4216 case 'r':
4217 if (PyUnicode_Check(v) && c == 's') {
4218 temp = v;
4219 Py_INCREF(temp);
4220 }
4221 else {
4222 PyObject *unicode;
4223 if (c == 's')
4224 temp = PyObject_Str(v);
4225 else
4226 temp = PyObject_Repr(v);
4227 if (temp == NULL)
4228 goto onError;
4229 if (!PyString_Check(temp)) {
4230 /* XXX Note: this should never happen, since
4231 PyObject_Repr() and PyObject_Str() assure
4232 this */
4233 Py_DECREF(temp);
4234 PyErr_SetString(PyExc_TypeError,
4235 "%s argument has non-string str()");
4236 goto onError;
4237 }
4238 unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
4239 PyString_GET_SIZE(temp),
4240 "strict");
4241 Py_DECREF(temp);
4242 temp = unicode;
4243 if (temp == NULL)
4244 goto onError;
4245 }
4246 buf = PyUnicode_AS_UNICODE(temp);
4247 len = PyUnicode_GET_SIZE(temp);
4248 if (prec >= 0 && len > prec)
4249 len = prec;
4250 break;
4251
4252 case 'i':
4253 case 'd':
4254 case 'u':
4255 case 'o':
4256 case 'x':
4257 case 'X':
4258 if (c == 'i')
4259 c = 'd';
4260 buf = tmpbuf;
4261 len = formatint(buf, flags, prec, c, v);
4262 if (len < 0)
4263 goto onError;
4264 sign = (c == 'd');
4265 if (flags & F_ZERO) {
4266 fill = '0';
4267 if ((flags&F_ALT) &&
4268 (c == 'x' || c == 'X') &&
4269 buf[0] == '0' && buf[1] == c) {
4270 *res++ = *buf++;
4271 *res++ = *buf++;
4272 rescnt -= 2;
4273 len -= 2;
4274 width -= 2;
4275 if (width < 0)
4276 width = 0;
4277 }
4278 }
4279 break;
4280
4281 case 'e':
4282 case 'E':
4283 case 'f':
4284 case 'g':
4285 case 'G':
4286 buf = tmpbuf;
4287 len = formatfloat(buf, flags, prec, c, v);
4288 if (len < 0)
4289 goto onError;
4290 sign = 1;
4291 if (flags&F_ZERO)
4292 fill = '0';
4293 break;
4294
4295 case 'c':
4296 buf = tmpbuf;
4297 len = formatchar(buf, v);
4298 if (len < 0)
4299 goto onError;
4300 break;
4301
4302 default:
4303 PyErr_Format(PyExc_ValueError,
4304 "unsupported format character '%c' (0x%x)",
4305 c, c);
4306 goto onError;
4307 }
4308 if (sign) {
4309 if (*buf == '-' || *buf == '+') {
4310 sign = *buf++;
4311 len--;
4312 }
4313 else if (flags & F_SIGN)
4314 sign = '+';
4315 else if (flags & F_BLANK)
4316 sign = ' ';
4317 else
4318 sign = 0;
4319 }
4320 if (width < len)
4321 width = len;
4322 if (rescnt < width + (sign != 0)) {
4323 reslen -= rescnt;
4324 rescnt = width + fmtcnt + 100;
4325 reslen += rescnt;
4326 if (_PyUnicode_Resize(result, reslen) < 0)
4327 return NULL;
4328 res = PyUnicode_AS_UNICODE(result)
4329 + reslen - rescnt;
4330 }
4331 if (sign) {
4332 if (fill != ' ')
4333 *res++ = sign;
4334 rescnt--;
4335 if (width > len)
4336 width--;
4337 }
4338 if (width > len && !(flags & F_LJUST)) {
4339 do {
4340 --rescnt;
4341 *res++ = fill;
4342 } while (--width > len);
4343 }
4344 if (sign && fill == ' ')
4345 *res++ = sign;
4346 memcpy(res, buf, len * sizeof(Py_UNICODE));
4347 res += len;
4348 rescnt -= len;
4349 while (--width >= len) {
4350 --rescnt;
4351 *res++ = ' ';
4352 }
4353 if (dict && (argidx < arglen) && c != '%') {
4354 PyErr_SetString(PyExc_TypeError,
4355 "not all arguments converted");
4356 goto onError;
4357 }
4358 Py_XDECREF(temp);
4359 } /* '%' */
4360 } /* until end */
4361 if (argidx < arglen && !dict) {
4362 PyErr_SetString(PyExc_TypeError,
4363 "not all arguments converted");
4364 goto onError;
4365 }
4366
4367 if (args_owned) {
4368 Py_DECREF(args);
4369 }
4370 Py_DECREF(uformat);
4371 _PyUnicode_Resize(result, reslen - rescnt);
4372 return (PyObject *)result;
4373
4374 onError:
4375 Py_XDECREF(result);
4376 Py_DECREF(uformat);
4377 if (args_owned) {
4378 Py_DECREF(args);
4379 }
4380 return NULL;
4381}
4382
4383static PyBufferProcs unicode_as_buffer = {
4384 (getreadbufferproc) unicode_buffer_getreadbuf,
4385 (getwritebufferproc) unicode_buffer_getwritebuf,
4386 (getsegcountproc) unicode_buffer_getsegcount,
4387 (getcharbufferproc) unicode_buffer_getcharbuf,
4388};
4389
4390PyTypeObject PyUnicode_Type = {
4391 PyObject_HEAD_INIT(&PyType_Type)
4392 0, /* ob_size */
4393 "unicode", /* tp_name */
4394 sizeof(PyUnicodeObject), /* tp_size */
4395 0, /* tp_itemsize */
4396 /* Slots */
4397 (destructor)_PyUnicode_Free, /* tp_dealloc */
4398 0, /* tp_print */
4399 (getattrfunc)unicode_getattr, /* tp_getattr */
4400 0, /* tp_setattr */
4401 (cmpfunc) unicode_compare, /* tp_compare */
4402 (reprfunc) unicode_repr, /* tp_repr */
4403 0, /* tp_as_number */
4404 &unicode_as_sequence, /* tp_as_sequence */
4405 0, /* tp_as_mapping */
4406 (hashfunc) unicode_hash, /* tp_hash*/
4407 0, /* tp_call*/
4408 (reprfunc) unicode_str, /* tp_str */
4409 (getattrofunc) NULL, /* tp_getattro */
4410 (setattrofunc) NULL, /* tp_setattro */
4411 &unicode_as_buffer, /* tp_as_buffer */
4412 Py_TPFLAGS_DEFAULT, /* tp_flags */
4413};
4414
4415/* Initialize the Unicode implementation */
4416
4417void _PyUnicode_Init()
4418{
4419 /* Doublecheck the configuration... */
4420 if (sizeof(Py_UNICODE) != 2)
4421 Py_FatalError("Unicode configuration error: "
4422 "sizeof(Py_UNICODE) != 2 bytes");
4423
4424 unicode_empty = _PyUnicode_New(0);
4425}
4426
4427/* Finalize the Unicode implementation */
4428
4429void
4430_PyUnicode_Fini()
4431{
4432 PyUnicodeObject *u = unicode_freelist;
4433
4434 while (u != NULL) {
4435 PyUnicodeObject *v = u;
4436 u = *(PyUnicodeObject **)u;
4437 free(v);
4438 }
4439 Py_XDECREF(unicode_empty);
4440}