/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "mymath.h"
68#include "unicodeobject.h"
69
70#if defined(HAVE_LIMITS_H)
71#include <limits.h>
72#else
73#define INT_MAX 2147483647
74#endif
75
76/* Limit for the Unicode object free list */
77
78#define MAX_UNICODE_FREELIST_SIZE 1024
79
/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + STAYALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   XXX The feature is currently turned off because there are
   apparently some lingering bugs in its implementation which I
   haven't yet been able to sort out.

*/
97
98#define STAYALIVE_SIZE_LIMIT 0
99
100/* Endianness switches; defaults to little endian */
101
102#ifdef WORDS_BIGENDIAN
103# define BYTEORDER_IS_BIG_ENDIAN
104#else
105# define BYTEORDER_IS_LITTLE_ENDIAN
106#endif
107
108/* --- Globals ------------------------------------------------------------ */
109
110/* The empty Unicode object */
111static PyUnicodeObject *unicode_empty = NULL;
112
113/* Free list for Unicode objects */
114static PyUnicodeObject *unicode_freelist = NULL;
115static int unicode_freelist_size = 0;
116
117/* --- Unicode Object ----------------------------------------------------- */
118
119static
120int _PyUnicode_Resize(register PyUnicodeObject *unicode,
121 int length)
122{
123 void *oldstr;
124
125 /* Shortcut if there's nothing to do. */
126 if (unicode->length == length)
127 return 0;
128
129 /* Resizing unicode_empty is not allowed. */
130 if (unicode == unicode_empty) {
131 PyErr_SetString(PyExc_SystemError,
132 "can't resize empty unicode object");
133 return -1;
134 }
135
136 /* We allocate one more byte to make sure the string is
137 Ux0000 terminated -- XXX is this needed ? */
138 oldstr = unicode->str;
139 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
140 if (!unicode->str) {
141 unicode->str = oldstr;
142 PyErr_NoMemory();
143 return -1;
144 }
145 unicode->str[length] = 0;
146 unicode->length = length;
147
148 /* Reset the object caches */
149 if (unicode->utf8str) {
150 Py_DECREF(unicode->utf8str);
151 unicode->utf8str = NULL;
152 }
153 unicode->hash = -1;
154
155 return 0;
156}
157
158/* We allocate one more byte to make sure the string is
159 Ux0000 terminated -- XXX is this needed ?
160
161 XXX This allocator could further be enhanced by assuring that the
162 free list never reduces its size below 1.
163
164*/
165
166static
167PyUnicodeObject *_PyUnicode_New(int length)
168{
169 register PyUnicodeObject *unicode;
170
171 /* Optimization for empty strings */
172 if (length == 0 && unicode_empty != NULL) {
173 Py_INCREF(unicode_empty);
174 return unicode_empty;
175 }
176
177 /* Unicode freelist & memory allocation */
178 if (unicode_freelist) {
179 unicode = unicode_freelist;
180 unicode_freelist = *(PyUnicodeObject **)unicode_freelist;
181 unicode_freelist_size--;
182 unicode->ob_type = &PyUnicode_Type;
183 _Py_NewReference(unicode);
184 if (unicode->str) {
185 if (unicode->length < length &&
186 _PyUnicode_Resize(unicode, length)) {
187 free(unicode->str);
188 PyMem_DEL(unicode);
189 return NULL;
190 }
191 }
192 else
193 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
194 }
195 else {
196 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
197 if (unicode == NULL)
198 return NULL;
199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
200 }
201
202 if (!unicode->str) {
203 PyMem_DEL(unicode);
204 PyErr_NoMemory();
205 return NULL;
206 }
207 unicode->str[length] = 0;
208 unicode->length = length;
209 unicode->hash = -1;
210 unicode->utf8str = NULL;
211 return unicode;
212}
213
214static
215void _PyUnicode_Free(register PyUnicodeObject *unicode)
216{
217 Py_XDECREF(unicode->utf8str);
218 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
219 if (unicode->length >= STAYALIVE_SIZE_LIMIT) {
220 free(unicode->str);
221 unicode->str = NULL;
222 unicode->length = 0;
223 }
224 *(PyUnicodeObject **)unicode = unicode_freelist;
225 unicode_freelist = unicode;
226 unicode_freelist_size++;
227 _Py_ForgetReference(unicode);
228 }
229 else {
230 free(unicode->str);
231 PyMem_DEL(unicode);
232 }
233}
234
235PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
236 int size)
237{
238 PyUnicodeObject *unicode;
239
240 unicode = _PyUnicode_New(size);
241 if (!unicode)
242 return NULL;
243
244 /* Copy the Unicode data into the new object */
245 if (u != NULL)
246 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
247
248 return (PyObject *)unicode;
249}
250
251#ifdef HAVE_WCHAR_H
252
253PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
254 int size)
255{
256 PyUnicodeObject *unicode;
257
258 if (w == NULL) {
259 PyErr_BadInternalCall();
260 return NULL;
261 }
262
263 unicode = _PyUnicode_New(size);
264 if (!unicode)
265 return NULL;
266
267 /* Copy the wchar_t data into the new object */
268#ifdef HAVE_USABLE_WCHAR_T
269 memcpy(unicode->str, w, size * sizeof(wchar_t));
270#else
271 {
272 register Py_UNICODE *u;
273 register int i;
274 u = PyUnicode_AS_UNICODE(unicode);
275 for (i = size; i >= 0; i--)
276 *u++ = *w++;
277 }
278#endif
279
280 return (PyObject *)unicode;
281}
282
283int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
284 register wchar_t *w,
285 int size)
286{
287 if (unicode == NULL) {
288 PyErr_BadInternalCall();
289 return -1;
290 }
291 if (size > PyUnicode_GET_SIZE(unicode))
292 size = PyUnicode_GET_SIZE(unicode);
293#ifdef HAVE_USABLE_WCHAR_T
294 memcpy(w, unicode->str, size * sizeof(wchar_t));
295#else
296 {
297 register Py_UNICODE *u;
298 register int i;
299 u = PyUnicode_AS_UNICODE(unicode);
300 for (i = size; i >= 0; i--)
301 *w++ = *u++;
302 }
303#endif
304
305 return size;
306}
307
308#endif
309
310PyObject *PyUnicode_FromObject(register PyObject *obj)
311{
312 const char *s;
313 int len;
314
315 if (obj == NULL) {
316 PyErr_BadInternalCall();
317 return NULL;
318 }
319 else if (PyUnicode_Check(obj)) {
320 Py_INCREF(obj);
321 return obj;
322 }
323 else if (PyString_Check(obj)) {
324 s = PyString_AS_STRING(obj);
325 len = PyString_GET_SIZE(obj);
326 }
327 else if (PyObject_AsCharBuffer(obj, &s, &len))
328 return NULL;
329 if (len == 0) {
330 Py_INCREF(unicode_empty);
331 return (PyObject *)unicode_empty;
332 }
333 return PyUnicode_DecodeUTF8(s, len, "strict");
334}
335
336PyObject *PyUnicode_Decode(const char *s,
337 int size,
338 const char *encoding,
339 const char *errors)
340{
341 PyObject *buffer = NULL, *unicode;
342
343 /* Shortcut for the default encoding UTF-8 */
344 if (encoding == NULL ||
345 (strcmp(encoding, "utf-8") == 0))
346 return PyUnicode_DecodeUTF8(s, size, errors);
347
348 /* Decode via the codec registry */
349 buffer = PyBuffer_FromMemory((void *)s, size);
350 if (buffer == NULL)
351 goto onError;
352 unicode = PyCodec_Decode(buffer, encoding, errors);
353 if (unicode == NULL)
354 goto onError;
355 if (!PyUnicode_Check(unicode)) {
356 PyErr_Format(PyExc_TypeError,
357 "decoder did not return an unicode object (type=%s)",
358 unicode->ob_type->tp_name);
359 Py_DECREF(unicode);
360 goto onError;
361 }
362 Py_DECREF(buffer);
363 return unicode;
364
365 onError:
366 Py_XDECREF(buffer);
367 return NULL;
368}
369
370PyObject *PyUnicode_Encode(const Py_UNICODE *s,
371 int size,
372 const char *encoding,
373 const char *errors)
374{
375 PyObject *v, *unicode;
376
377 unicode = PyUnicode_FromUnicode(s, size);
378 if (unicode == NULL)
379 return NULL;
380 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
381 Py_DECREF(unicode);
382 return v;
383}
384
385PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
386 const char *encoding,
387 const char *errors)
388{
389 PyObject *v;
390
391 if (!PyUnicode_Check(unicode)) {
392 PyErr_BadArgument();
393 goto onError;
394 }
395 /* Shortcut for the default encoding UTF-8 */
396 if ((encoding == NULL ||
397 (strcmp(encoding, "utf-8") == 0)) &&
398 errors == NULL)
399 return PyUnicode_AsUTF8String(unicode);
400
401 /* Encode via the codec registry */
402 v = PyCodec_Encode(unicode, encoding, errors);
403 if (v == NULL)
404 goto onError;
405 /* XXX Should we really enforce this ? */
406 if (!PyString_Check(v)) {
407 PyErr_Format(PyExc_TypeError,
408 "encoder did not return a string object (type=%s)",
409 v->ob_type->tp_name);
410 Py_DECREF(v);
411 goto onError;
412 }
413 return v;
414
415 onError:
416 return NULL;
417}
418
419Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
420{
421 if (!PyUnicode_Check(unicode)) {
422 PyErr_BadArgument();
423 goto onError;
424 }
425 return PyUnicode_AS_UNICODE(unicode);
426
427 onError:
428 return NULL;
429}
430
431int PyUnicode_GetSize(PyObject *unicode)
432{
433 if (!PyUnicode_Check(unicode)) {
434 PyErr_BadArgument();
435 goto onError;
436 }
437 return PyUnicode_GET_SIZE(unicode);
438
439 onError:
440 return -1;
441}
442
443/* --- UTF-8 Codec -------------------------------------------------------- */
444
/* Map a UTF-8 encoded prefix byte to the length of the sequence it
   starts.  Zero means the byte is not a legal prefix.  See RFC 2279
   for details.  The table is only ever read, hence const. */

static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six byte sequences; 0xFE and 0xFF
       are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
466
467static
468int utf8_decoding_error(const char **source,
469 Py_UNICODE **dest,
470 const char *errors,
471 const char *details)
472{
473 if ((errors == NULL) ||
474 (strcmp(errors,"strict") == 0)) {
475 PyErr_Format(PyExc_UnicodeError,
476 "UTF-8 decoding error: %s",
477 details);
478 return -1;
479 }
480 else if (strcmp(errors,"ignore") == 0) {
481 (*source)++;
482 return 0;
483 }
484 else if (strcmp(errors,"replace") == 0) {
485 (*source)++;
486 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
487 (*dest)++;
488 return 0;
489 }
490 else {
491 PyErr_Format(PyExc_ValueError,
492 "UTF-8 decoding error; unkown error handling code: %s",
493 errors);
494 return -1;
495 }
496}
497
498#define UTF8_ERROR(details) do { \
499 if (utf8_decoding_error(&s, &p, errors, details)) \
500 goto onError; \
501 continue; \
502} while (0)
503
504PyObject *PyUnicode_DecodeUTF8(const char *s,
505 int size,
506 const char *errors)
507{
508 int n;
509 const char *e;
510 PyUnicodeObject *unicode;
511 Py_UNICODE *p;
512
513 /* Note: size will always be longer than the resulting Unicode
514 character count */
515 unicode = _PyUnicode_New(size);
516 if (!unicode)
517 return NULL;
518 if (size == 0)
519 return (PyObject *)unicode;
520
521 /* Unpack UTF-8 encoded data */
522 p = unicode->str;
523 e = s + size;
524
525 while (s < e) {
526 register Py_UNICODE ch = (unsigned char)*s;
527
528 if (ch < 0x80) {
529 *p++ = ch;
530 s++;
531 continue;
532 }
533
534 n = utf8_code_length[ch];
535
536 if (s + n > e)
537 UTF8_ERROR("unexpected end of data");
538
539 switch (n) {
540
541 case 0:
542 UTF8_ERROR("unexpected code byte");
543 break;
544
545 case 1:
546 UTF8_ERROR("internal error");
547 break;
548
549 case 2:
550 if ((s[1] & 0xc0) != 0x80)
551 UTF8_ERROR("invalid data");
552 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
553 if (ch < 0x80)
554 UTF8_ERROR("illegal encoding");
555 else
556 *p++ = ch;
557 break;
558
559 case 3:
560 if ((s[1] & 0xc0) != 0x80 ||
561 (s[2] & 0xc0) != 0x80)
562 UTF8_ERROR("invalid data");
563 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
564 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
565 UTF8_ERROR("illegal encoding");
566 else
567 *p++ = ch;
568 break;
569
570 default:
571 /* Other sizes are only needed for UCS-4 */
572 UTF8_ERROR("unsupported Unicode code range");
573 }
574 s += n;
575 }
576
577 /* Adjust length */
578 if (_PyUnicode_Resize(unicode, p - unicode->str))
579 goto onError;
580
581 return (PyObject *)unicode;
582
583onError:
584 Py_DECREF(unicode);
585 return NULL;
586}
587
588#undef UTF8_ERROR
589
590static
591int utf8_encoding_error(const Py_UNICODE **source,
592 char **dest,
593 const char *errors,
594 const char *details)
595{
596 if ((errors == NULL) ||
597 (strcmp(errors,"strict") == 0)) {
598 PyErr_Format(PyExc_UnicodeError,
599 "UTF-8 encoding error: %s",
600 details);
601 return -1;
602 }
603 else if (strcmp(errors,"ignore") == 0) {
604 return 0;
605 }
606 else if (strcmp(errors,"replace") == 0) {
607 **dest = '?';
608 (*dest)++;
609 return 0;
610 }
611 else {
612 PyErr_Format(PyExc_ValueError,
613 "UTF-8 encoding error; "
614 "unkown error handling code: %s",
615 errors);
616 return -1;
617 }
618}
619
620PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
621 int size,
622 const char *errors)
623{
624 PyObject *v;
625 char *p;
626 char *q;
627
628 v = PyString_FromStringAndSize(NULL, 3 * size);
629 if (v == NULL)
630 return NULL;
631 if (size == 0)
632 goto done;
633
634 p = q = PyString_AS_STRING(v);
635 while (size-- > 0) {
636 Py_UNICODE ch = *s++;
637 if (ch < 0x80)
638 *p++ = (char) ch;
639 else if (ch < 0x0800) {
640 *p++ = 0xc0 | (ch >> 6);
641 *p++ = 0x80 | (ch & 0x3f);
642 } else if (0xD800 <= ch && ch <= 0xDFFF) {
643 /* These byte ranges are reserved for UTF-16 surrogate
644 bytes which the Python implementation currently does
645 not support. */
646 printf("code range problem: U+%04x\n", ch);
647 if (utf8_encoding_error(&s, &p, errors,
648 "unsupported code range"))
649 goto onError;
650 } else {
651 *p++ = 0xe0 | (ch >> 12);
652 *p++ = 0x80 | ((ch >> 6) & 0x3f);
653 *p++ = 0x80 | (ch & 0x3f);
654 }
655 }
656 *p = '\0';
657 _PyString_Resize(&v, p - q);
658
659 done:
660 return v;
661
662 onError:
663 Py_DECREF(v);
664 return NULL;
665}
666
667/* Return a Python string holding the UTF-8 encoded value of the
668 Unicode object.
669
670 The resulting string is cached in the Unicode object for subsequent
671 usage by this function. The cached version is needed to implement
672 the character buffer interface.
673
674 The refcount of the string is *not* incremented.
675
676*/
677
678static
679PyObject *utf8_string(PyUnicodeObject *self,
680 const char *errors)
681{
682 PyObject *v = self->utf8str;
683
684 if (v)
685 return v;
686 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(self),
687 PyUnicode_GET_SIZE(self),
688 errors);
689 if (v && errors == NULL)
690 self->utf8str = v;
691 return v;
692}
693
694PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
695{
696 PyObject *str;
697
698 if (!PyUnicode_Check(unicode)) {
699 PyErr_BadArgument();
700 return NULL;
701 }
702 str = utf8_string((PyUnicodeObject *)unicode, NULL);
703 if (str == NULL)
704 return NULL;
705 Py_INCREF(str);
706 return str;
707}
708
709/* --- UTF-16 Codec ------------------------------------------------------- */
710
711static
712int utf16_decoding_error(const Py_UNICODE **source,
713 Py_UNICODE **dest,
714 const char *errors,
715 const char *details)
716{
717 if ((errors == NULL) ||
718 (strcmp(errors,"strict") == 0)) {
719 PyErr_Format(PyExc_UnicodeError,
720 "UTF-16 decoding error: %s",
721 details);
722 return -1;
723 }
724 else if (strcmp(errors,"ignore") == 0) {
725 return 0;
726 }
727 else if (strcmp(errors,"replace") == 0) {
728 if (dest) {
729 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
730 (*dest)++;
731 }
732 return 0;
733 }
734 else {
735 PyErr_Format(PyExc_ValueError,
736 "UTF-16 decoding error; unkown error handling code: %s",
737 errors);
738 return -1;
739 }
740}
741
742#define UTF16_ERROR(details) do { \
743 if (utf16_decoding_error(&q, &p, errors, details)) \
744 goto onError; \
745 continue; \
746} while(0)
747
748PyObject *PyUnicode_DecodeUTF16(const char *s,
749 int size,
750 const char *errors,
751 int *byteorder)
752{
753 PyUnicodeObject *unicode;
754 Py_UNICODE *p;
755 const Py_UNICODE *q, *e;
756 int bo = 0;
757
758 /* size should be an even number */
759 if (size % sizeof(Py_UNICODE) != 0) {
760 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
761 return NULL;
762 /* The remaining input chars are ignored if we fall through
763 here... */
764 }
765
766 /* Note: size will always be longer than the resulting Unicode
767 character count */
768 unicode = _PyUnicode_New(size);
769 if (!unicode)
770 return NULL;
771 if (size == 0)
772 return (PyObject *)unicode;
773
774 /* Unpack UTF-16 encoded data */
775 p = unicode->str;
776 q = (Py_UNICODE *)s;
777 e = q + (size / sizeof(Py_UNICODE));
778
779 if (byteorder)
780 bo = *byteorder;
781
782 while (q < e) {
783 register Py_UNICODE ch = *q++;
784
785 /* Check for BOM marks (U+FEFF) in the input and adjust
786 current byte order setting accordingly. Swap input
787 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
788 !) */
789#ifdef BYTEORDER_IS_LITTLE_ENDIAN
790 if (ch == 0xFEFF) {
791 bo = -1;
792 continue;
793 } else if (ch == 0xFFFE) {
794 bo = 1;
795 continue;
796 }
797 if (bo == 1)
798 ch = (ch >> 8) | (ch << 8);
799#else
800 if (ch == 0xFEFF) {
801 bo = 1;
802 continue;
803 } else if (ch == 0xFFFE) {
804 bo = -1;
805 continue;
806 }
807 if (bo == -1)
808 ch = (ch >> 8) | (ch << 8);
809#endif
810 if (ch < 0xD800 || ch > 0xDFFF) {
811 *p++ = ch;
812 continue;
813 }
814
815 /* UTF-16 code pair: */
816 if (q >= e)
817 UTF16_ERROR("unexpected end of data");
818 if (0xDC00 <= *q && *q <= 0xDFFF) {
819 q++;
820 if (0xD800 <= *q && *q <= 0xDBFF)
821 /* This is valid data (a UTF-16 surrogate pair), but
822 we are not able to store this information since our
823 Py_UNICODE type only has 16 bits... this might
824 change someday, even though it's unlikely. */
825 UTF16_ERROR("code pairs are not supported");
826 else
827 continue;
828 }
829 UTF16_ERROR("illegal encoding");
830 }
831
832 if (byteorder)
833 *byteorder = bo;
834
835 /* Adjust length */
836 if (_PyUnicode_Resize(unicode, p - unicode->str))
837 goto onError;
838
839 return (PyObject *)unicode;
840
841onError:
842 Py_DECREF(unicode);
843 return NULL;
844}
845
846#undef UTF16_ERROR
847
848PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
849 int size,
850 const char *errors,
851 int byteorder)
852{
853 PyObject *v;
854 Py_UNICODE *p;
855 char *q;
856
857 /* We don't create UTF-16 pairs... */
858 v = PyString_FromStringAndSize(NULL,
859 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
860 if (v == NULL)
861 return NULL;
862 if (size == 0)
863 goto done;
864
865 q = PyString_AS_STRING(v);
866 p = (Py_UNICODE *)q;
867
868 if (byteorder == 0)
869 *p++ = 0xFEFF;
870 if (byteorder == 0 ||
871#ifdef BYTEORDER_IS_LITTLE_ENDIAN
872 byteorder == -1
873#else
874 byteorder == 1
875#endif
876 )
877 memcpy(p, s, size * sizeof(Py_UNICODE));
878 else
879 while (size-- > 0) {
880 Py_UNICODE ch = *s++;
881 *p++ = (ch >> 8) | (ch << 8);
882 }
883 done:
884 return v;
885}
886
887PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
888{
889 if (!PyUnicode_Check(unicode)) {
890 PyErr_BadArgument();
891 return NULL;
892 }
893 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
894 PyUnicode_GET_SIZE(unicode),
895 NULL,
896 0);
897}
898
899/* --- Unicode Escape Codec ----------------------------------------------- */
900
901static
902int unicodeescape_decoding_error(const char **source,
903 unsigned int *x,
904 const char *errors,
905 const char *details)
906{
907 if ((errors == NULL) ||
908 (strcmp(errors,"strict") == 0)) {
909 PyErr_Format(PyExc_UnicodeError,
910 "Unicode-Escape decoding error: %s",
911 details);
912 return -1;
913 }
914 else if (strcmp(errors,"ignore") == 0) {
915 return 0;
916 }
917 else if (strcmp(errors,"replace") == 0) {
918 *x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
919 return 0;
920 }
921 else {
922 PyErr_Format(PyExc_ValueError,
923 "Unicode-Escape decoding error; "
924 "unkown error handling code: %s",
925 errors);
926 return -1;
927 }
928}
929
930PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
931 int size,
932 const char *errors)
933{
934 PyUnicodeObject *v;
935 Py_UNICODE *p = NULL, *buf = NULL;
936 const char *end;
937
938 /* Escaped strings will always be longer than the resulting
939 Unicode string, so we start with size here and then reduce the
940 length after conversion to the true value. */
941 v = _PyUnicode_New(size);
942 if (v == NULL)
943 goto onError;
944 if (size == 0)
945 return (PyObject *)v;
946 p = buf = PyUnicode_AS_UNICODE(v);
947 end = s + size;
948 while (s < end) {
949 unsigned char c;
950 unsigned int x;
951 int i;
952
953 /* Non-escape characters are interpreted as Unicode ordinals */
954 if (*s != '\\') {
955 *p++ = (unsigned char)*s++;
956 continue;
957 }
958
959 /* \ - Escapes */
960 s++;
961 switch (*s++) {
962
963 /* \x escapes */
964 case '\n': break;
965 case '\\': *p++ = '\\'; break;
966 case '\'': *p++ = '\''; break;
967 case '\"': *p++ = '\"'; break;
968 case 'b': *p++ = '\b'; break;
969 case 'f': *p++ = '\014'; break; /* FF */
970 case 't': *p++ = '\t'; break;
971 case 'n': *p++ = '\n'; break;
972 case 'r': *p++ = '\r'; break;
973 case 'v': *p++ = '\013'; break; /* VT */
974 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
975
976 /* \OOO (octal) escapes */
977 case '0': case '1': case '2': case '3':
978 case '4': case '5': case '6': case '7':
979 c = s[-1] - '0';
980 if ('0' <= *s && *s <= '7') {
981 c = (c<<3) + *s++ - '0';
982 if ('0' <= *s && *s <= '7')
983 c = (c<<3) + *s++ - '0';
984 }
985 *p++ = c;
986 break;
987
988 /* \xXXXX escape with 0-4 hex digits */
989 case 'x':
990 x = 0;
991 c = (unsigned char)*s;
992 if (isxdigit(c)) {
993 do {
994 x = (x<<4) & ~0xF;
995 if ('0' <= c && c <= '9')
996 x += c - '0';
997 else if ('a' <= c && c <= 'f')
998 x += 10 + c - 'a';
999 else
1000 x += 10 + c - 'A';
1001 c = (unsigned char)*++s;
1002 } while (isxdigit(c));
1003 *p++ = x;
1004 } else {
1005 *p++ = '\\';
1006 *p++ = (unsigned char)s[-1];
1007 }
1008 break;
1009
1010 /* \uXXXX with 4 hex digits */
1011 case 'u':
1012 for (x = 0, i = 0; i < 4; i++) {
1013 c = (unsigned char)s[i];
1014 if (!isxdigit(c)) {
1015 if (unicodeescape_decoding_error(&s, &x, errors,
1016 "truncated \\uXXXX"))
1017 goto onError;
1018 i++;
1019 break;
1020 }
1021 x = (x<<4) & ~0xF;
1022 if (c >= '0' && c <= '9')
1023 x += c - '0';
1024 else if (c >= 'a' && c <= 'f')
1025 x += 10 + c - 'a';
1026 else
1027 x += 10 + c - 'A';
1028 }
1029 s += i;
1030 *p++ = x;
1031 break;
1032
1033 default:
1034 *p++ = '\\';
1035 *p++ = (unsigned char)s[-1];
1036 break;
1037 }
1038 }
1039 _PyUnicode_Resize(v, (int)(p - buf));
1040 return (PyObject *)v;
1041
1042 onError:
1043 Py_XDECREF(v);
1044 return NULL;
1045}
1046
1047/* Return a Unicode-Escape string version of the Unicode object.
1048
1049 If quotes is true, the string is enclosed in u"" or u'' quotes as
1050 appropriate.
1051
1052*/
1053
1054static
1055PyObject *unicodeescape_string(const Py_UNICODE *s,
1056 int size,
1057 int quotes)
1058{
1059 PyObject *repr;
1060 char *p;
1061 char *q;
1062
1063 static const char *hexdigit = "0123456789ABCDEF";
1064
1065 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1066 if (repr == NULL)
1067 return NULL;
1068
1069 p = q = PyString_AS_STRING(repr);
1070
1071 if (quotes) {
1072 static const Py_UNICODE *findchar(const Py_UNICODE *s,
1073 int size,
1074 Py_UNICODE ch);
1075 *p++ = 'u';
1076 *p++ = (findchar(s, size, '\'') &&
1077 !findchar(s, size, '"')) ? '"' : '\'';
1078 }
1079 while (size-- > 0) {
1080 Py_UNICODE ch = *s++;
1081 /* Escape quotes */
1082 if (quotes && (ch == q[1] || ch == '\\')) {
1083 *p++ = '\\';
1084 *p++ = (char) ch;
1085 }
1086 /* Map 16-bit characters to '\uxxxx' */
1087 else if (ch >= 256) {
1088 *p++ = '\\';
1089 *p++ = 'u';
1090 *p++ = hexdigit[(ch >> 12) & 0xf];
1091 *p++ = hexdigit[(ch >> 8) & 0xf];
1092 *p++ = hexdigit[(ch >> 4) & 0xf];
1093 *p++ = hexdigit[ch & 15];
1094 }
1095 /* Map non-printable US ASCII to '\ooo' */
1096 else if (ch < ' ' || ch >= 128) {
1097 *p++ = '\\';
1098 *p++ = hexdigit[(ch >> 6) & 7];
1099 *p++ = hexdigit[(ch >> 3) & 7];
1100 *p++ = hexdigit[ch & 7];
1101 }
1102 /* Copy everything else as-is */
1103 else
1104 *p++ = (char) ch;
1105 }
1106 if (quotes)
1107 *p++ = q[1];
1108
1109 *p = '\0';
1110 _PyString_Resize(&repr, p - q);
1111
1112 return repr;
1113}
1114
1115PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1116 int size)
1117{
1118 return unicodeescape_string(s, size, 0);
1119}
1120
1121PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1122{
1123 if (!PyUnicode_Check(unicode)) {
1124 PyErr_BadArgument();
1125 return NULL;
1126 }
1127 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1128 PyUnicode_GET_SIZE(unicode));
1129}
1130
1131/* --- Raw Unicode Escape Codec ------------------------------------------- */
1132
1133PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1134 int size,
1135 const char *errors)
1136{
1137 PyUnicodeObject *v;
1138 Py_UNICODE *p, *buf;
1139 const char *end;
1140 const char *bs;
1141
1142 /* Escaped strings will always be longer than the resulting
1143 Unicode string, so we start with size here and then reduce the
1144 length after conversion to the true value. */
1145 v = _PyUnicode_New(size);
1146 if (v == NULL)
1147 goto onError;
1148 if (size == 0)
1149 return (PyObject *)v;
1150 p = buf = PyUnicode_AS_UNICODE(v);
1151 end = s + size;
1152 while (s < end) {
1153 unsigned char c;
1154 unsigned int x;
1155 int i;
1156
1157 /* Non-escape characters are interpreted as Unicode ordinals */
1158 if (*s != '\\') {
1159 *p++ = (unsigned char)*s++;
1160 continue;
1161 }
1162
1163 /* \u-escapes are only interpreted iff the number of leading
1164 backslashes if odd */
1165 bs = s;
1166 for (;s < end;) {
1167 if (*s != '\\')
1168 break;
1169 *p++ = (unsigned char)*s++;
1170 }
1171 if (((s - bs) & 1) == 0 ||
1172 s >= end ||
1173 *s != 'u') {
1174 continue;
1175 }
1176 p--;
1177 s++;
1178
1179 /* \uXXXX with 4 hex digits */
1180 for (x = 0, i = 0; i < 4; i++) {
1181 c = (unsigned char)s[i];
1182 if (!isxdigit(c)) {
1183 if (unicodeescape_decoding_error(&s, &x, errors,
1184 "truncated \\uXXXX"))
1185 goto onError;
1186 i++;
1187 break;
1188 }
1189 x = (x<<4) & ~0xF;
1190 if (c >= '0' && c <= '9')
1191 x += c - '0';
1192 else if (c >= 'a' && c <= 'f')
1193 x += 10 + c - 'a';
1194 else
1195 x += 10 + c - 'A';
1196 }
1197 s += i;
1198 *p++ = x;
1199 }
1200 _PyUnicode_Resize(v, (int)(p - buf));
1201 return (PyObject *)v;
1202
1203 onError:
1204 Py_XDECREF(v);
1205 return NULL;
1206}
1207
1208PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1209 int size)
1210{
1211 PyObject *repr;
1212 char *p;
1213 char *q;
1214
1215 static const char *hexdigit = "0123456789ABCDEF";
1216
1217 repr = PyString_FromStringAndSize(NULL, 6 * size);
1218 if (repr == NULL)
1219 return NULL;
1220
1221 p = q = PyString_AS_STRING(repr);
1222 while (size-- > 0) {
1223 Py_UNICODE ch = *s++;
1224 /* Map 16-bit characters to '\uxxxx' */
1225 if (ch >= 256) {
1226 *p++ = '\\';
1227 *p++ = 'u';
1228 *p++ = hexdigit[(ch >> 12) & 0xf];
1229 *p++ = hexdigit[(ch >> 8) & 0xf];
1230 *p++ = hexdigit[(ch >> 4) & 0xf];
1231 *p++ = hexdigit[ch & 15];
1232 }
1233 /* Copy everything else as-is */
1234 else
1235 *p++ = (char) ch;
1236 }
1237 *p = '\0';
1238 _PyString_Resize(&repr, p - q);
1239
1240 return repr;
1241}
1242
1243PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1244{
1245 if (!PyUnicode_Check(unicode)) {
1246 PyErr_BadArgument();
1247 return NULL;
1248 }
1249 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1250 PyUnicode_GET_SIZE(unicode));
1251}
1252
1253/* --- Latin-1 Codec ------------------------------------------------------ */
1254
1255PyObject *PyUnicode_DecodeLatin1(const char *s,
1256 int size,
1257 const char *errors)
1258{
1259 PyUnicodeObject *v;
1260 Py_UNICODE *p;
1261
1262 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1263 v = _PyUnicode_New(size);
1264 if (v == NULL)
1265 goto onError;
1266 if (size == 0)
1267 return (PyObject *)v;
1268 p = PyUnicode_AS_UNICODE(v);
1269 while (size-- > 0)
1270 *p++ = (unsigned char)*s++;
1271 return (PyObject *)v;
1272
1273 onError:
1274 Py_XDECREF(v);
1275 return NULL;
1276}
1277
1278static
1279int latin1_encoding_error(const Py_UNICODE **source,
1280 char **dest,
1281 const char *errors,
1282 const char *details)
1283{
1284 if ((errors == NULL) ||
1285 (strcmp(errors,"strict") == 0)) {
1286 PyErr_Format(PyExc_UnicodeError,
1287 "Latin-1 encoding error: %s",
1288 details);
1289 return -1;
1290 }
1291 else if (strcmp(errors,"ignore") == 0) {
1292 return 0;
1293 }
1294 else if (strcmp(errors,"replace") == 0) {
1295 **dest = '?';
1296 return 0;
1297 }
1298 else {
1299 PyErr_Format(PyExc_ValueError,
1300 "Latin-1 encoding error; "
1301 "unkown error handling code: %s",
1302 errors);
1303 return -1;
1304 }
1305}
1306
1307PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1308 int size,
1309 const char *errors)
1310{
1311 PyObject *repr;
1312 char *s;
1313 repr = PyString_FromStringAndSize(NULL, size);
1314 if (repr == NULL)
1315 return NULL;
1316
1317 s = PyString_AS_STRING(repr);
1318 while (size-- > 0) {
1319 Py_UNICODE ch = *p++;
1320 if (ch >= 256) {
1321 if (latin1_encoding_error(&p, &s, errors,
1322 "ordinal not in range(256)"))
1323 goto onError;
1324 }
1325 else
1326 *s++ = (char)ch;
1327 }
1328 return repr;
1329
1330 onError:
1331 Py_DECREF(repr);
1332 return NULL;
1333}
1334
1335PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1336{
1337 if (!PyUnicode_Check(unicode)) {
1338 PyErr_BadArgument();
1339 return NULL;
1340 }
1341 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1342 PyUnicode_GET_SIZE(unicode),
1343 NULL);
1344}
1345
1346/* --- 7-bit ASCII Codec -------------------------------------------------- */
1347
1348static
1349int ascii_decoding_error(const char **source,
1350 Py_UNICODE **dest,
1351 const char *errors,
1352 const char *details)
1353{
1354 if ((errors == NULL) ||
1355 (strcmp(errors,"strict") == 0)) {
1356 PyErr_Format(PyExc_UnicodeError,
1357 "ASCII decoding error: %s",
1358 details);
1359 return -1;
1360 }
1361 else if (strcmp(errors,"ignore") == 0) {
1362 return 0;
1363 }
1364 else if (strcmp(errors,"replace") == 0) {
1365 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1366 (*dest)++;
1367 return 0;
1368 }
1369 else {
1370 PyErr_Format(PyExc_ValueError,
1371 "ASCII decoding error; "
1372 "unkown error handling code: %s",
1373 errors);
1374 return -1;
1375 }
1376}
1377
1378PyObject *PyUnicode_DecodeASCII(const char *s,
1379 int size,
1380 const char *errors)
1381{
1382 PyUnicodeObject *v;
1383 Py_UNICODE *p;
1384
1385 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1386 v = _PyUnicode_New(size);
1387 if (v == NULL)
1388 goto onError;
1389 if (size == 0)
1390 return (PyObject *)v;
1391 p = PyUnicode_AS_UNICODE(v);
1392 while (size-- > 0) {
1393 register unsigned char c;
1394
1395 c = (unsigned char)*s++;
1396 if (c < 128)
1397 *p++ = c;
1398 else if (ascii_decoding_error(&s, &p, errors,
1399 "ordinal not in range(128)"))
1400 goto onError;
1401 }
1402 if (p - PyUnicode_AS_UNICODE(v) < size)
1403 _PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));
1404 return (PyObject *)v;
1405
1406 onError:
1407 Py_XDECREF(v);
1408 return NULL;
1409}
1410
1411static
1412int ascii_encoding_error(const Py_UNICODE **source,
1413 char **dest,
1414 const char *errors,
1415 const char *details)
1416{
1417 if ((errors == NULL) ||
1418 (strcmp(errors,"strict") == 0)) {
1419 PyErr_Format(PyExc_UnicodeError,
1420 "ASCII encoding error: %s",
1421 details);
1422 return -1;
1423 }
1424 else if (strcmp(errors,"ignore") == 0) {
1425 return 0;
1426 }
1427 else if (strcmp(errors,"replace") == 0) {
1428 **dest = '?';
1429 return 0;
1430 }
1431 else {
1432 PyErr_Format(PyExc_ValueError,
1433 "ASCII encoding error; "
1434 "unkown error handling code: %s",
1435 errors);
1436 return -1;
1437 }
1438}
1439
1440PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1441 int size,
1442 const char *errors)
1443{
1444 PyObject *repr;
1445 char *s;
1446 repr = PyString_FromStringAndSize(NULL, size);
1447 if (repr == NULL)
1448 return NULL;
1449
1450 s = PyString_AS_STRING(repr);
1451 while (size-- > 0) {
1452 Py_UNICODE ch = *p++;
1453 if (ch >= 128) {
1454 if (ascii_encoding_error(&p, &s, errors,
1455 "ordinal not in range(128)"))
1456 goto onError;
1457 }
1458 else
1459 *s++ = (char)ch;
1460 }
1461 return repr;
1462
1463 onError:
1464 Py_DECREF(repr);
1465 return NULL;
1466}
1467
1468PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1469{
1470 if (!PyUnicode_Check(unicode)) {
1471 PyErr_BadArgument();
1472 return NULL;
1473 }
1474 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1475 PyUnicode_GET_SIZE(unicode),
1476 NULL);
1477}
1478
1479/* --- Character Mapping Codec -------------------------------------------- */
1480
1481static
1482int charmap_decoding_error(const char **source,
1483 Py_UNICODE **dest,
1484 const char *errors,
1485 const char *details)
1486{
1487 if ((errors == NULL) ||
1488 (strcmp(errors,"strict") == 0)) {
1489 PyErr_Format(PyExc_UnicodeError,
1490 "charmap decoding error: %s",
1491 details);
1492 return -1;
1493 }
1494 else if (strcmp(errors,"ignore") == 0) {
1495 return 0;
1496 }
1497 else if (strcmp(errors,"replace") == 0) {
1498 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1499 (*dest)++;
1500 return 0;
1501 }
1502 else {
1503 PyErr_Format(PyExc_ValueError,
1504 "charmap decoding error; "
1505 "unkown error handling code: %s",
1506 errors);
1507 return -1;
1508 }
1509}
1510
1511PyObject *PyUnicode_DecodeCharmap(const char *s,
1512 int size,
1513 PyObject *mapping,
1514 const char *errors)
1515{
1516 PyUnicodeObject *v;
1517 Py_UNICODE *p;
1518
1519 /* Default to Latin-1 */
1520 if (mapping == NULL)
1521 return PyUnicode_DecodeLatin1(s, size, errors);
1522
1523 v = _PyUnicode_New(size);
1524 if (v == NULL)
1525 goto onError;
1526 if (size == 0)
1527 return (PyObject *)v;
1528 p = PyUnicode_AS_UNICODE(v);
1529 while (size-- > 0) {
1530 unsigned char ch = *s++;
1531 PyObject *w, *x;
1532
1533 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1534 w = PyInt_FromLong((long)ch);
1535 if (w == NULL)
1536 goto onError;
1537 x = PyObject_GetItem(mapping, w);
1538 Py_DECREF(w);
1539 if (x == NULL) {
1540 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1541 /* No mapping found: default to Latin-1 mapping */
1542 PyErr_Clear();
1543 *p++ = (Py_UNICODE)ch;
1544 continue;
1545 }
1546 goto onError;
1547 }
1548
1549 /* Apply mapping */
1550 if (PyInt_Check(x)) {
1551 int value = PyInt_AS_LONG(x);
1552 if (value < 0 || value > 65535) {
1553 PyErr_SetString(PyExc_TypeError,
1554 "character mapping must be in range(65336)");
1555 Py_DECREF(x);
1556 goto onError;
1557 }
1558 *p++ = (Py_UNICODE)value;
1559 }
1560 else if (x == Py_None) {
1561 /* undefined mapping */
1562 if (charmap_decoding_error(&s, &p, errors,
1563 "character maps to <undefined>")) {
1564 Py_DECREF(x);
1565 goto onError;
1566 }
1567 }
1568 else if (PyUnicode_Check(x)) {
1569 if (PyUnicode_GET_SIZE(x) != 1) {
1570 /* 1-n mapping */
1571 PyErr_SetString(PyExc_NotImplementedError,
1572 "1-n mappings are currently not implemented");
1573 Py_DECREF(x);
1574 goto onError;
1575 }
1576 *p++ = *PyUnicode_AS_UNICODE(x);
1577 }
1578 else {
1579 /* wrong return value */
1580 PyErr_SetString(PyExc_TypeError,
1581 "character mapping must return integer, None or unicode");
1582 Py_DECREF(x);
1583 goto onError;
1584 }
1585 Py_DECREF(x);
1586 }
1587 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1588 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1589 goto onError;
1590 return (PyObject *)v;
1591
1592 onError:
1593 Py_XDECREF(v);
1594 return NULL;
1595}
1596
1597static
1598int charmap_encoding_error(const Py_UNICODE **source,
1599 char **dest,
1600 const char *errors,
1601 const char *details)
1602{
1603 if ((errors == NULL) ||
1604 (strcmp(errors,"strict") == 0)) {
1605 PyErr_Format(PyExc_UnicodeError,
1606 "charmap encoding error: %s",
1607 details);
1608 return -1;
1609 }
1610 else if (strcmp(errors,"ignore") == 0) {
1611 return 0;
1612 }
1613 else if (strcmp(errors,"replace") == 0) {
1614 **dest = '?';
1615 (*dest)++;
1616 return 0;
1617 }
1618 else {
1619 PyErr_Format(PyExc_ValueError,
1620 "charmap encoding error; "
1621 "unkown error handling code: %s",
1622 errors);
1623 return -1;
1624 }
1625}
1626
1627PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
1628 int size,
1629 PyObject *mapping,
1630 const char *errors)
1631{
1632 PyObject *v;
1633 char *s;
1634
1635 /* Default to Latin-1 */
1636 if (mapping == NULL)
1637 return PyUnicode_EncodeLatin1(p, size, errors);
1638
1639 v = PyString_FromStringAndSize(NULL, size);
1640 if (v == NULL)
1641 return NULL;
1642 s = PyString_AS_STRING(v);
1643 while (size-- > 0) {
1644 Py_UNICODE ch = *p++;
1645 PyObject *w, *x;
1646
1647 /* Get mapping (Unicode ordinal -> string char, integer or None) */
1648 w = PyInt_FromLong((long)ch);
1649 if (w == NULL)
1650 goto onError;
1651 x = PyObject_GetItem(mapping, w);
1652 Py_DECREF(w);
1653 if (x == NULL) {
1654 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1655 /* No mapping found: default to Latin-1 mapping if possible */
1656 PyErr_Clear();
1657 if (ch < 256) {
1658 *s++ = (char)ch;
1659 continue;
1660 }
1661 else if (!charmap_encoding_error(&p, &s, errors,
1662 "missing character mapping"))
1663 continue;
1664 }
1665 goto onError;
1666 }
1667
1668 /* Apply mapping */
1669 if (PyInt_Check(x)) {
1670 int value = PyInt_AS_LONG(x);
1671 if (value < 0 || value > 255) {
1672 PyErr_SetString(PyExc_TypeError,
1673 "character mapping must be in range(256)");
1674 Py_DECREF(x);
1675 goto onError;
1676 }
1677 *s++ = (char)value;
1678 }
1679 else if (x == Py_None) {
1680 /* undefined mapping */
1681 if (charmap_encoding_error(&p, &s, errors,
1682 "character maps to <undefined>")) {
1683 Py_DECREF(x);
1684 goto onError;
1685 }
1686 }
1687 else if (PyString_Check(x)) {
1688 if (PyString_GET_SIZE(x) != 1) {
1689 /* 1-n mapping */
1690 PyErr_SetString(PyExc_NotImplementedError,
1691 "1-n mappings are currently not implemented");
1692 Py_DECREF(x);
1693 goto onError;
1694 }
1695 *s++ = *PyString_AS_STRING(x);
1696 }
1697 else {
1698 /* wrong return value */
1699 PyErr_SetString(PyExc_TypeError,
1700 "character mapping must return integer, None or unicode");
1701 Py_DECREF(x);
1702 goto onError;
1703 }
1704 Py_DECREF(x);
1705 }
1706 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
1707 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
1708 goto onError;
1709 return v;
1710
1711 onError:
1712 Py_DECREF(v);
1713 return NULL;
1714}
1715
1716PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
1717 PyObject *mapping)
1718{
1719 if (!PyUnicode_Check(unicode) || mapping == NULL) {
1720 PyErr_BadArgument();
1721 return NULL;
1722 }
1723 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
1724 PyUnicode_GET_SIZE(unicode),
1725 mapping,
1726 NULL);
1727}
1728
1729static
1730int translate_error(const Py_UNICODE **source,
1731 Py_UNICODE **dest,
1732 const char *errors,
1733 const char *details)
1734{
1735 if ((errors == NULL) ||
1736 (strcmp(errors,"strict") == 0)) {
1737 PyErr_Format(PyExc_UnicodeError,
1738 "translate error: %s",
1739 details);
1740 return -1;
1741 }
1742 else if (strcmp(errors,"ignore") == 0) {
1743 return 0;
1744 }
1745 else if (strcmp(errors,"replace") == 0) {
1746 **dest = '?';
1747 (*dest)++;
1748 return 0;
1749 }
1750 else {
1751 PyErr_Format(PyExc_ValueError,
1752 "translate error; "
1753 "unkown error handling code: %s",
1754 errors);
1755 return -1;
1756 }
1757}
1758
1759PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
1760 int size,
1761 PyObject *mapping,
1762 const char *errors)
1763{
1764 PyUnicodeObject *v;
1765 Py_UNICODE *p;
1766
1767 if (mapping == NULL) {
1768 PyErr_BadArgument();
1769 return NULL;
1770 }
1771
1772 /* Output will never be longer than input */
1773 v = _PyUnicode_New(size);
1774 if (v == NULL)
1775 goto onError;
1776 if (size == 0)
1777 goto done;
1778 p = PyUnicode_AS_UNICODE(v);
1779 while (size-- > 0) {
1780 Py_UNICODE ch = *s++;
1781 PyObject *w, *x;
1782
1783 /* Get mapping */
1784 w = PyInt_FromLong(ch);
1785 if (w == NULL)
1786 goto onError;
1787 x = PyObject_GetItem(mapping, w);
1788 Py_DECREF(w);
1789 if (x == NULL) {
1790 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1791 /* No mapping found: default to 1-1 mapping */
1792 PyErr_Clear();
1793 *p++ = ch;
1794 continue;
1795 }
1796 goto onError;
1797 }
1798
1799 /* Apply mapping */
1800 if (PyInt_Check(x))
1801 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
1802 else if (x == Py_None) {
1803 /* undefined mapping */
1804 if (translate_error(&s, &p, errors,
1805 "character maps to <undefined>")) {
1806 Py_DECREF(x);
1807 goto onError;
1808 }
1809 }
1810 else if (PyUnicode_Check(x)) {
1811 if (PyUnicode_GET_SIZE(x) != 1) {
1812 /* 1-n mapping */
1813 PyErr_SetString(PyExc_NotImplementedError,
1814 "1-n mappings are currently not implemented");
1815 Py_DECREF(x);
1816 goto onError;
1817 }
1818 *p++ = *PyUnicode_AS_UNICODE(x);
1819 }
1820 else {
1821 /* wrong return value */
1822 PyErr_SetString(PyExc_TypeError,
1823 "translate mapping must return integer, None or unicode");
1824 Py_DECREF(x);
1825 goto onError;
1826 }
1827 Py_DECREF(x);
1828 }
1829 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1830 _PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));
1831
1832 done:
1833 return (PyObject *)v;
1834
1835 onError:
1836 Py_XDECREF(v);
1837 return NULL;
1838}
1839
1840PyObject *PyUnicode_Translate(PyObject *str,
1841 PyObject *mapping,
1842 const char *errors)
1843{
1844 PyObject *result;
1845
1846 str = PyUnicode_FromObject(str);
1847 if (str == NULL)
1848 goto onError;
1849 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
1850 PyUnicode_GET_SIZE(str),
1851 mapping,
1852 errors);
1853 Py_DECREF(str);
1854 return result;
1855
1856 onError:
1857 Py_XDECREF(str);
1858 return NULL;
1859}
1860
1861/* --- Helpers ------------------------------------------------------------ */
1862
1863static
1864int count(PyUnicodeObject *self,
1865 int start,
1866 int end,
1867 PyUnicodeObject *substring)
1868{
1869 int count = 0;
1870
1871 end -= substring->length;
1872
1873 while (start <= end)
1874 if (Py_UNICODE_MATCH(self, start, substring)) {
1875 count++;
1876 start += substring->length;
1877 } else
1878 start++;
1879
1880 return count;
1881}
1882
1883int PyUnicode_Count(PyObject *str,
1884 PyObject *substr,
1885 int start,
1886 int end)
1887{
1888 int result;
1889
1890 str = PyUnicode_FromObject(str);
1891 if (str == NULL)
1892 return -1;
1893 substr = PyUnicode_FromObject(substr);
1894 if (substr == NULL) {
1895 Py_DECREF(substr);
1896 return -1;
1897 }
1898
1899 result = count((PyUnicodeObject *)str,
1900 start, end,
1901 (PyUnicodeObject *)substr);
1902
1903 Py_DECREF(str);
1904 Py_DECREF(substr);
1905 return result;
1906}
1907
1908static
1909int findstring(PyUnicodeObject *self,
1910 PyUnicodeObject *substring,
1911 int start,
1912 int end,
1913 int direction)
1914{
1915 if (start < 0)
1916 start += self->length;
1917 if (start < 0)
1918 start = 0;
1919
1920 if (substring->length == 0)
1921 return start;
1922
1923 if (end > self->length)
1924 end = self->length;
1925 if (end < 0)
1926 end += self->length;
1927 if (end < 0)
1928 end = 0;
1929
1930 end -= substring->length;
1931
1932 if (direction < 0) {
1933 for (; end >= start; end--)
1934 if (Py_UNICODE_MATCH(self, end, substring))
1935 return end;
1936 } else {
1937 for (; start <= end; start++)
1938 if (Py_UNICODE_MATCH(self, start, substring))
1939 return start;
1940 }
1941
1942 return -1;
1943}
1944
1945int PyUnicode_Find(PyObject *str,
1946 PyObject *substr,
1947 int start,
1948 int end,
1949 int direction)
1950{
1951 int result;
1952
1953 str = PyUnicode_FromObject(str);
1954 if (str == NULL)
1955 return -1;
1956 substr = PyUnicode_FromObject(substr);
1957 if (substr == NULL) {
1958 Py_DECREF(substr);
1959 return -1;
1960 }
1961
1962 result = findstring((PyUnicodeObject *)str,
1963 (PyUnicodeObject *)substr,
1964 start, end, direction);
1965 Py_DECREF(str);
1966 Py_DECREF(substr);
1967 return result;
1968}
1969
1970static
1971int tailmatch(PyUnicodeObject *self,
1972 PyUnicodeObject *substring,
1973 int start,
1974 int end,
1975 int direction)
1976{
1977 if (start < 0)
1978 start += self->length;
1979 if (start < 0)
1980 start = 0;
1981
1982 if (substring->length == 0)
1983 return 1;
1984
1985 if (end > self->length)
1986 end = self->length;
1987 if (end < 0)
1988 end += self->length;
1989 if (end < 0)
1990 end = 0;
1991
1992 end -= substring->length;
1993 if (end < start)
1994 return 0;
1995
1996 if (direction > 0) {
1997 if (Py_UNICODE_MATCH(self, end, substring))
1998 return 1;
1999 } else {
2000 if (Py_UNICODE_MATCH(self, start, substring))
2001 return 1;
2002 }
2003
2004 return 0;
2005}
2006
2007int PyUnicode_Tailmatch(PyObject *str,
2008 PyObject *substr,
2009 int start,
2010 int end,
2011 int direction)
2012{
2013 int result;
2014
2015 str = PyUnicode_FromObject(str);
2016 if (str == NULL)
2017 return -1;
2018 substr = PyUnicode_FromObject(substr);
2019 if (substr == NULL) {
2020 Py_DECREF(substr);
2021 return -1;
2022 }
2023
2024 result = tailmatch((PyUnicodeObject *)str,
2025 (PyUnicodeObject *)substr,
2026 start, end, direction);
2027 Py_DECREF(str);
2028 Py_DECREF(substr);
2029 return result;
2030}
2031
2032static
2033const Py_UNICODE *findchar(const Py_UNICODE *s,
2034 int size,
2035 Py_UNICODE ch)
2036{
2037 /* like wcschr, but doesn't stop at NULL characters */
2038
2039 while (size-- > 0) {
2040 if (*s == ch)
2041 return s;
2042 s++;
2043 }
2044
2045 return NULL;
2046}
2047
2048/* Apply fixfct filter to the Unicode object self and return a
2049 reference to the modified object */
2050
2051static
2052PyObject *fixup(PyUnicodeObject *self,
2053 int (*fixfct)(PyUnicodeObject *s))
2054{
2055
2056 PyUnicodeObject *u;
2057
2058 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2059 self->length);
2060 if (u == NULL)
2061 return NULL;
2062 if (!fixfct(u)) {
2063 /* fixfct should return TRUE if it modified the buffer. If
2064 FALSE, return a reference to the original buffer instead
2065 (to save space, not time) */
2066 Py_INCREF(self);
2067 Py_DECREF(u);
2068 return (PyObject*) self;
2069 }
2070 return (PyObject*) u;
2071}
2072
2073static
2074int fixupper(PyUnicodeObject *self)
2075{
2076 int len = self->length;
2077 Py_UNICODE *s = self->str;
2078 int status = 0;
2079
2080 while (len-- > 0) {
2081 register Py_UNICODE ch;
2082
2083 ch = Py_UNICODE_TOUPPER(*s);
2084 if (ch != *s) {
2085 status = 1;
2086 *s = ch;
2087 }
2088 s++;
2089 }
2090
2091 return status;
2092}
2093
2094static
2095int fixlower(PyUnicodeObject *self)
2096{
2097 int len = self->length;
2098 Py_UNICODE *s = self->str;
2099 int status = 0;
2100
2101 while (len-- > 0) {
2102 register Py_UNICODE ch;
2103
2104 ch = Py_UNICODE_TOLOWER(*s);
2105 if (ch != *s) {
2106 status = 1;
2107 *s = ch;
2108 }
2109 s++;
2110 }
2111
2112 return status;
2113}
2114
2115static
2116int fixswapcase(PyUnicodeObject *self)
2117{
2118 int len = self->length;
2119 Py_UNICODE *s = self->str;
2120 int status = 0;
2121
2122 while (len-- > 0) {
2123 if (Py_UNICODE_ISUPPER(*s)) {
2124 *s = Py_UNICODE_TOLOWER(*s);
2125 status = 1;
2126 } else if (Py_UNICODE_ISLOWER(*s)) {
2127 *s = Py_UNICODE_TOUPPER(*s);
2128 status = 1;
2129 }
2130 s++;
2131 }
2132
2133 return status;
2134}
2135
2136static
2137int fixcapitalize(PyUnicodeObject *self)
2138{
2139 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2140 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2141 return 1;
2142 }
2143 return 0;
2144}
2145
2146static
2147int fixtitle(PyUnicodeObject *self)
2148{
2149 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2150 register Py_UNICODE *e;
2151 int previous_is_cased;
2152
2153 /* Shortcut for single character strings */
2154 if (PyUnicode_GET_SIZE(self) == 1) {
2155 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2156 if (*p != ch) {
2157 *p = ch;
2158 return 1;
2159 }
2160 else
2161 return 0;
2162 }
2163
2164 e = p + PyUnicode_GET_SIZE(self);
2165 previous_is_cased = 0;
2166 for (; p < e; p++) {
2167 register const Py_UNICODE ch = *p;
2168
2169 if (previous_is_cased)
2170 *p = Py_UNICODE_TOLOWER(ch);
2171 else
2172 *p = Py_UNICODE_TOTITLE(ch);
2173
2174 if (Py_UNICODE_ISLOWER(ch) ||
2175 Py_UNICODE_ISUPPER(ch) ||
2176 Py_UNICODE_ISTITLE(ch))
2177 previous_is_cased = 1;
2178 else
2179 previous_is_cased = 0;
2180 }
2181 return 1;
2182}
2183
2184PyObject *PyUnicode_Join(PyObject *separator,
2185 PyObject *seq)
2186{
2187 Py_UNICODE *sep;
2188 int seplen;
2189 PyUnicodeObject *res = NULL;
2190 int reslen = 0;
2191 Py_UNICODE *p;
2192 int seqlen = 0;
2193 int sz = 100;
2194 int i;
2195
2196 seqlen = PySequence_Length(seq);
2197 if (seqlen < 0 && PyErr_Occurred())
2198 return NULL;
2199
2200 if (separator == NULL) {
2201 Py_UNICODE blank = ' ';
2202 sep = &blank;
2203 seplen = 1;
2204 }
2205 else {
2206 separator = PyUnicode_FromObject(separator);
2207 if (separator == NULL)
2208 return NULL;
2209 sep = PyUnicode_AS_UNICODE(separator);
2210 seplen = PyUnicode_GET_SIZE(separator);
2211 }
2212
2213 res = _PyUnicode_New(sz);
2214 if (res == NULL)
2215 goto onError;
2216 p = PyUnicode_AS_UNICODE(res);
2217 reslen = 0;
2218
2219 for (i = 0; i < seqlen; i++) {
2220 int itemlen;
2221 PyObject *item;
2222
2223 item = PySequence_GetItem(seq, i);
2224 if (item == NULL)
2225 goto onError;
2226 if (!PyUnicode_Check(item)) {
2227 PyObject *v;
2228 v = PyUnicode_FromObject(item);
2229 Py_DECREF(item);
2230 item = v;
2231 if (item == NULL)
2232 goto onError;
2233 }
2234 itemlen = PyUnicode_GET_SIZE(item);
2235 while (reslen + itemlen + seplen >= sz) {
2236 if (_PyUnicode_Resize(res, sz*2))
2237 goto onError;
2238 sz *= 2;
2239 p = PyUnicode_AS_UNICODE(res) + reslen;
2240 }
2241 if (i > 0) {
2242 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2243 p += seplen;
2244 reslen += seplen;
2245 }
2246 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2247 p += itemlen;
2248 reslen += itemlen;
2249 Py_DECREF(item);
2250 }
2251 if (_PyUnicode_Resize(res, reslen))
2252 goto onError;
2253
2254 Py_XDECREF(separator);
2255 return (PyObject *)res;
2256
2257 onError:
2258 Py_XDECREF(separator);
2259 Py_DECREF(res);
2260 return NULL;
2261}
2262
2263static
2264PyUnicodeObject *pad(PyUnicodeObject *self,
2265 int left,
2266 int right,
2267 Py_UNICODE fill)
2268{
2269 PyUnicodeObject *u;
2270
2271 if (left < 0)
2272 left = 0;
2273 if (right < 0)
2274 right = 0;
2275
2276 if (left == 0 && right == 0) {
2277 Py_INCREF(self);
2278 return self;
2279 }
2280
2281 u = _PyUnicode_New(left + self->length + right);
2282 if (u) {
2283 if (left)
2284 Py_UNICODE_FILL(u->str, fill, left);
2285 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2286 if (right)
2287 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2288 }
2289
2290 return u;
2291}
2292
/* Append the slice data[left:right] to list as a new Unicode object.

   Expects the expanding function to provide a PyObject *str scratch
   variable, a PyObject *list, and an onError label, which is jumped to
   when the slice cannot be created or appended.  Wrapped in
   do { } while (0) so the expansion is a single statement; the
   arguments are parenthesized so expressions can be passed safely. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2303
2304static
2305PyObject *split_whitespace(PyUnicodeObject *self,
2306 PyObject *list,
2307 int maxcount)
2308{
2309 register int i;
2310 register int j;
2311 int len = self->length;
2312 PyObject *str;
2313
2314 for (i = j = 0; i < len; ) {
2315 /* find a token */
2316 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2317 i++;
2318 j = i;
2319 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2320 i++;
2321 if (j < i) {
2322 if (maxcount-- <= 0)
2323 break;
2324 SPLIT_APPEND(self->str, j, i);
2325 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2326 i++;
2327 j = i;
2328 }
2329 }
2330 if (j < len) {
2331 SPLIT_APPEND(self->str, j, len);
2332 }
2333 return list;
2334
2335 onError:
2336 Py_DECREF(list);
2337 return NULL;
2338}
2339
2340PyObject *PyUnicode_Splitlines(PyObject *string,
2341 int maxcount)
2342{
2343 register int i;
2344 register int j;
2345 int len;
2346 PyObject *list;
2347 PyObject *str;
2348 Py_UNICODE *data;
2349
2350 string = PyUnicode_FromObject(string);
2351 if (string == NULL)
2352 return NULL;
2353 data = PyUnicode_AS_UNICODE(string);
2354 len = PyUnicode_GET_SIZE(string);
2355
2356 if (maxcount < 0)
2357 maxcount = INT_MAX;
2358
2359 list = PyList_New(0);
2360 if (!list)
2361 goto onError;
2362
2363 for (i = j = 0; i < len; ) {
2364 /* Find a line and append it */
2365 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2366 i++;
2367 if (maxcount-- <= 0)
2368 break;
2369 SPLIT_APPEND(data, j, i);
2370
2371 /* Skip the line break reading CRLF as one line break */
2372 if (i < len) {
2373 if (data[i] == '\r' && i + 1 < len &&
2374 data[i+1] == '\n')
2375 i += 2;
2376 else
2377 i++;
2378 }
2379 j = i;
2380 }
2381 if (j < len) {
2382 SPLIT_APPEND(data, j, len);
2383 }
2384
2385 Py_DECREF(string);
2386 return list;
2387
2388 onError:
2389 Py_DECREF(list);
2390 Py_DECREF(string);
2391 return NULL;
2392}
2393
2394static
2395PyObject *split_char(PyUnicodeObject *self,
2396 PyObject *list,
2397 Py_UNICODE ch,
2398 int maxcount)
2399{
2400 register int i;
2401 register int j;
2402 int len = self->length;
2403 PyObject *str;
2404
2405 for (i = j = 0; i < len; ) {
2406 if (self->str[i] == ch) {
2407 if (maxcount-- <= 0)
2408 break;
2409 SPLIT_APPEND(self->str, j, i);
2410 i = j = i + 1;
2411 } else
2412 i++;
2413 }
2414 if (j <= len) {
2415 SPLIT_APPEND(self->str, j, len);
2416 }
2417 return list;
2418
2419 onError:
2420 Py_DECREF(list);
2421 return NULL;
2422}
2423
2424static
2425PyObject *split_substring(PyUnicodeObject *self,
2426 PyObject *list,
2427 PyUnicodeObject *substring,
2428 int maxcount)
2429{
2430 register int i;
2431 register int j;
2432 int len = self->length;
2433 int sublen = substring->length;
2434 PyObject *str;
2435
2436 for (i = j = 0; i < len - sublen; ) {
2437 if (Py_UNICODE_MATCH(self, i, substring)) {
2438 if (maxcount-- <= 0)
2439 break;
2440 SPLIT_APPEND(self->str, j, i);
2441 i = j = i + sublen;
2442 } else
2443 i++;
2444 }
2445 if (j <= len) {
2446 SPLIT_APPEND(self->str, j, len);
2447 }
2448 return list;
2449
2450 onError:
2451 Py_DECREF(list);
2452 return NULL;
2453}
2454
2455#undef SPLIT_APPEND
2456
2457static
2458PyObject *split(PyUnicodeObject *self,
2459 PyUnicodeObject *substring,
2460 int maxcount)
2461{
2462 PyObject *list;
2463
2464 if (maxcount < 0)
2465 maxcount = INT_MAX;
2466
2467 list = PyList_New(0);
2468 if (!list)
2469 return NULL;
2470
2471 if (substring == NULL)
2472 return split_whitespace(self,list,maxcount);
2473
2474 else if (substring->length == 1)
2475 return split_char(self,list,substring->str[0],maxcount);
2476
2477 else if (substring->length == 0) {
2478 Py_DECREF(list);
2479 PyErr_SetString(PyExc_ValueError, "empty separator");
2480 return NULL;
2481 }
2482 else
2483 return split_substring(self,list,substring,maxcount);
2484}
2485
2486static
2487PyObject *strip(PyUnicodeObject *self,
2488 int left,
2489 int right)
2490{
2491 Py_UNICODE *p = self->str;
2492 int start = 0;
2493 int end = self->length;
2494
2495 if (left)
2496 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2497 start++;
2498
2499 if (right)
2500 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2501 end--;
2502
2503 if (start == 0 && end == self->length) {
2504 /* couldn't strip anything off, return original string */
2505 Py_INCREF(self);
2506 return (PyObject*) self;
2507 }
2508
2509 return (PyObject*) PyUnicode_FromUnicode(
2510 self->str + start,
2511 end - start
2512 );
2513}
2514
2515static
2516PyObject *replace(PyUnicodeObject *self,
2517 PyUnicodeObject *str1,
2518 PyUnicodeObject *str2,
2519 int maxcount)
2520{
2521 PyUnicodeObject *u;
2522
2523 if (maxcount < 0)
2524 maxcount = INT_MAX;
2525
2526 if (str1->length == 1 && str2->length == 1) {
2527 int i;
2528
2529 /* replace characters */
2530 if (!findchar(self->str, self->length, str1->str[0])) {
2531 /* nothing to replace, return original string */
2532 Py_INCREF(self);
2533 u = self;
2534 } else {
2535 Py_UNICODE u1 = str1->str[0];
2536 Py_UNICODE u2 = str2->str[0];
2537
2538 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
2539 self->str,
2540 self->length
2541 );
2542 if (u)
2543 for (i = 0; i < u->length; i++)
2544 if (u->str[i] == u1) {
2545 if (--maxcount < 0)
2546 break;
2547 u->str[i] = u2;
2548 }
2549 }
2550
2551 } else {
2552 int n, i;
2553 Py_UNICODE *p;
2554
2555 /* replace strings */
2556 n = count(self, 0, self->length, str1);
2557 if (n > maxcount)
2558 n = maxcount;
2559 if (n == 0) {
2560 /* nothing to replace, return original string */
2561 Py_INCREF(self);
2562 u = self;
2563 } else {
2564 u = _PyUnicode_New(
2565 self->length + n * (str2->length - str1->length));
2566 if (u) {
2567 i = 0;
2568 p = u->str;
2569 while (i <= self->length - str1->length)
2570 if (Py_UNICODE_MATCH(self, i, str1)) {
2571 /* replace string segment */
2572 Py_UNICODE_COPY(p, str2->str, str2->length);
2573 p += str2->length;
2574 i += str1->length;
2575 if (--n <= 0) {
2576 /* copy remaining part */
2577 Py_UNICODE_COPY(p, self->str+i, self->length-i);
2578 break;
2579 }
2580 } else
2581 *p++ = self->str[i++];
2582 }
2583 }
2584 }
2585
2586 return (PyObject *) u;
2587}
2588
2589/* --- Unicode Object Methods --------------------------------------------- */
2590
2591static char title__doc__[] =
2592"S.title() -> unicode\n\
2593\n\
2594Return a titlecased version of S, i.e. words start with title case\n\
2595characters, all remaining cased characters have lower case.";
2596
2597static PyObject*
2598unicode_title(PyUnicodeObject *self, PyObject *args)
2599{
2600 if (!PyArg_NoArgs(args))
2601 return NULL;
2602 return fixup(self, fixtitle);
2603}
2604
2605static char capitalize__doc__[] =
2606"S.capitalize() -> unicode\n\
2607\n\
2608Return a capitalized version of S, i.e. make the first character\n\
2609have upper case.";
2610
2611static PyObject*
2612unicode_capitalize(PyUnicodeObject *self, PyObject *args)
2613{
2614 if (!PyArg_NoArgs(args))
2615 return NULL;
2616 return fixup(self, fixcapitalize);
2617}
2618
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method S.capwords() (currently compiled out): split at whitespace,
   capitalize each word with fixup()/fixcapitalize, and join the words
   again with PyUnicode_Join() using its default separator. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result = NULL;
    int i;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *word = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                               fixcapitalize);
        if (word == NULL)
            goto done;
        /* Swap the capitalized word in for the original list item. */
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, word);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
2659
2660static char center__doc__[] =
2661"S.center(width) -> unicode\n\
2662\n\
2663Return S centered in a Unicode string of length width. Padding is done\n\
2664using spaces.";
2665
2666static PyObject *
2667unicode_center(PyUnicodeObject *self, PyObject *args)
2668{
2669 int marg, left;
2670 int width;
2671
2672 if (!PyArg_ParseTuple(args, "i:center", &width))
2673 return NULL;
2674
2675 if (self->length >= width) {
2676 Py_INCREF(self);
2677 return (PyObject*) self;
2678 }
2679
2680 marg = width - self->length;
2681 left = marg / 2 + (marg & width & 1);
2682
2683 return (PyObject*) pad(self, left, marg - left, ' ');
2684}
2685
2686static int
2687unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
2688{
2689 int len1, len2;
2690 Py_UNICODE *s1 = str1->str;
2691 Py_UNICODE *s2 = str2->str;
2692
2693 len1 = str1->length;
2694 len2 = str2->length;
2695
2696 while (len1 > 0 && len2 > 0) {
2697 int cmp = (*s1++) - (*s2++);
2698 if (cmp)
2699 /* This should make Christian happy! */
2700 return (cmp < 0) ? -1 : (cmp != 0);
2701 len1--, len2--;
2702 }
2703
2704 return (len1 < len2) ? -1 : (len1 != len2);
2705}
2706
2707int PyUnicode_Compare(PyObject *left,
2708 PyObject *right)
2709{
2710 PyUnicodeObject *u = NULL, *v = NULL;
2711 int result;
2712
2713 /* Coerce the two arguments */
2714 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2715 if (u == NULL)
2716 goto onError;
2717 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2718 if (v == NULL)
2719 goto onError;
2720
2721 /* Shortcut for emtpy or interned objects */
2722 if (v == u) {
2723 Py_DECREF(u);
2724 Py_DECREF(v);
2725 return 0;
2726 }
2727
2728 result = unicode_compare(u, v);
2729
2730 Py_DECREF(u);
2731 Py_DECREF(v);
2732 return result;
2733
2734onError:
2735 Py_XDECREF(u);
2736 Py_XDECREF(v);
2737 return -1;
2738}
2739
Guido van Rossum403d68b2000-03-13 15:55:09 +00002740int PyUnicode_Contains(PyObject *container,
2741 PyObject *element)
2742{
2743 PyUnicodeObject *u = NULL, *v = NULL;
2744 int result;
2745 register const Py_UNICODE *p, *e;
2746 register Py_UNICODE ch;
2747
2748 /* Coerce the two arguments */
2749 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
2750 if (u == NULL)
2751 goto onError;
2752 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
2753 if (v == NULL)
2754 goto onError;
2755
2756 /* Check v in u */
2757 if (PyUnicode_GET_SIZE(v) != 1) {
2758 PyErr_SetString(PyExc_TypeError,
2759 "string member test needs char left operand");
2760 goto onError;
2761 }
2762 ch = *PyUnicode_AS_UNICODE(v);
2763 p = PyUnicode_AS_UNICODE(u);
2764 e = p + PyUnicode_GET_SIZE(u);
2765 result = 0;
2766 while (p < e) {
2767 if (*p++ == ch) {
2768 result = 1;
2769 break;
2770 }
2771 }
2772
2773 Py_DECREF(u);
2774 Py_DECREF(v);
2775 return result;
2776
2777onError:
2778 Py_XDECREF(u);
2779 Py_XDECREF(v);
2780 return -1;
2781}
2782
Guido van Rossumd57fd912000-03-10 22:53:23 +00002783/* Concat to string or Unicode object giving a new Unicode object. */
2784
2785PyObject *PyUnicode_Concat(PyObject *left,
2786 PyObject *right)
2787{
2788 PyUnicodeObject *u = NULL, *v = NULL, *w;
2789
2790 /* Coerce the two arguments */
2791 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2792 if (u == NULL)
2793 goto onError;
2794 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2795 if (v == NULL)
2796 goto onError;
2797
2798 /* Shortcuts */
2799 if (v == unicode_empty) {
2800 Py_DECREF(v);
2801 return (PyObject *)u;
2802 }
2803 if (u == unicode_empty) {
2804 Py_DECREF(u);
2805 return (PyObject *)v;
2806 }
2807
2808 /* Concat the two Unicode strings */
2809 w = _PyUnicode_New(u->length + v->length);
2810 if (w == NULL)
2811 goto onError;
2812 Py_UNICODE_COPY(w->str, u->str, u->length);
2813 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
2814
2815 Py_DECREF(u);
2816 Py_DECREF(v);
2817 return (PyObject *)w;
2818
2819onError:
2820 Py_XDECREF(u);
2821 Py_XDECREF(v);
2822 return NULL;
2823}
2824
2825static char count__doc__[] =
2826"S.count(sub[, start[, end]]) -> int\n\
2827\n\
2828Return the number of occurrences of substring sub in Unicode string\n\
2829S[start:end]. Optional arguments start and end are\n\
2830interpreted as in slice notation.";
2831
2832static PyObject *
2833unicode_count(PyUnicodeObject *self, PyObject *args)
2834{
2835 PyUnicodeObject *substring;
2836 int start = 0;
2837 int end = INT_MAX;
2838 PyObject *result;
2839
2840 if (!PyArg_ParseTuple(args, "O|ii:count", &substring, &start, &end))
2841 return NULL;
2842
2843 substring = (PyUnicodeObject *)PyUnicode_FromObject(
2844 (PyObject *)substring);
2845 if (substring == NULL)
2846 return NULL;
2847
2848 if (substring->length == 0) {
2849 Py_DECREF(substring);
2850 return PyInt_FromLong((long) 0);
2851 }
2852
2853 if (start < 0)
2854 start += self->length;
2855 if (start < 0)
2856 start = 0;
2857 if (end > self->length)
2858 end = self->length;
2859 if (end < 0)
2860 end += self->length;
2861 if (end < 0)
2862 end = 0;
2863
2864 result = PyInt_FromLong((long) count(self, start, end, substring));
2865
2866 Py_DECREF(substring);
2867 return result;
2868}
2869
2870static char encode__doc__[] =
2871"S.encode([encoding[,errors]]) -> string\n\
2872\n\
2873Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
2874errors may be given to set a different error handling scheme. Default\n\
2875is 'strict' meaning that encoding errors raise a ValueError. Other\n\
2876possible values are 'ignore' and 'replace'.";
2877
2878static PyObject *
2879unicode_encode(PyUnicodeObject *self, PyObject *args)
2880{
2881 char *encoding = NULL;
2882 char *errors = NULL;
2883 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
2884 return NULL;
2885 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
2886}
2887
2888static char expandtabs__doc__[] =
2889"S.expandtabs([tabsize]) -> unicode\n\
2890\n\
2891Return a copy of S where all tab characters are expanded using spaces.\n\
2892If tabsize is not given, a tab size of 8 characters is assumed.";
2893
2894static PyObject*
2895unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
2896{
2897 Py_UNICODE *e;
2898 Py_UNICODE *p;
2899 Py_UNICODE *q;
2900 int i, j;
2901 PyUnicodeObject *u;
2902 int tabsize = 8;
2903
2904 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
2905 return NULL;
2906
2907 /* First pass: determine size of ouput string */
2908 i = j = 0;
2909 e = self->str + self->length;
2910 for (p = self->str; p < e; p++)
2911 if (*p == '\t') {
2912 if (tabsize > 0)
2913 j += tabsize - (j % tabsize);
2914 }
2915 else {
2916 j++;
2917 if (*p == '\n' || *p == '\r') {
2918 i += j;
2919 j = 0;
2920 }
2921 }
2922
2923 /* Second pass: create output string and fill it */
2924 u = _PyUnicode_New(i + j);
2925 if (!u)
2926 return NULL;
2927
2928 j = 0;
2929 q = u->str;
2930
2931 for (p = self->str; p < e; p++)
2932 if (*p == '\t') {
2933 if (tabsize > 0) {
2934 i = tabsize - (j % tabsize);
2935 j += i;
2936 while (i--)
2937 *q++ = ' ';
2938 }
2939 }
2940 else {
2941 j++;
2942 *q++ = *p;
2943 if (*p == '\n' || *p == '\r')
2944 j = 0;
2945 }
2946
2947 return (PyObject*) u;
2948}
2949
2950static char find__doc__[] =
2951"S.find(sub [,start [,end]]) -> int\n\
2952\n\
2953Return the lowest index in S where substring sub is found,\n\
2954such that sub is contained within s[start,end]. Optional\n\
2955arguments start and end are interpreted as in slice notation.\n\
2956\n\
2957Return -1 on failure.";
2958
2959static PyObject *
2960unicode_find(PyUnicodeObject *self, PyObject *args)
2961{
2962 PyUnicodeObject *substring;
2963 int start = 0;
2964 int end = INT_MAX;
2965 PyObject *result;
2966
2967 if (!PyArg_ParseTuple(args, "O|ii:find", &substring, &start, &end))
2968 return NULL;
2969 substring = (PyUnicodeObject *)PyUnicode_FromObject(
2970 (PyObject *)substring);
2971 if (substring == NULL)
2972 return NULL;
2973
2974 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
2975
2976 Py_DECREF(substring);
2977 return result;
2978}
2979
2980static PyObject *
2981unicode_getitem(PyUnicodeObject *self, int index)
2982{
2983 if (index < 0 || index >= self->length) {
2984 PyErr_SetString(PyExc_IndexError, "string index out of range");
2985 return NULL;
2986 }
2987
2988 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
2989}
2990
2991static long
2992unicode_hash(PyUnicodeObject *self)
2993{
2994 long hash;
2995 PyObject *utf8;
2996
2997 /* Since Unicode objects compare equal to their UTF-8 string
2998 counterparts, they should also use the UTF-8 strings as basis
2999 for their hash value. This is needed to assure that strings and
3000 Unicode objects behave in the same way as dictionary
3001 keys. Unfortunately, this costs some performance and also some
3002 memory if the cached UTF-8 representation is not used later
3003 on. */
3004 if (self->hash != -1)
3005 return self->hash;
3006 utf8 = utf8_string(self, NULL);
3007 if (utf8 == NULL)
3008 return -1;
3009 hash = PyObject_Hash(utf8);
3010 if (hash == -1)
3011 return -1;
3012 self->hash = hash;
3013 return hash;
3014}
3015
3016static char index__doc__[] =
3017"S.index(sub [,start [,end]]) -> int\n\
3018\n\
3019Like S.find() but raise ValueError when the substring is not found.";
3020
3021static PyObject *
3022unicode_index(PyUnicodeObject *self, PyObject *args)
3023{
3024 int result;
3025 PyUnicodeObject *substring;
3026 int start = 0;
3027 int end = INT_MAX;
3028
3029 if (!PyArg_ParseTuple(args, "O|ii:index", &substring, &start, &end))
3030 return NULL;
3031
3032 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3033 (PyObject *)substring);
3034 if (substring == NULL)
3035 return NULL;
3036
3037 result = findstring(self, substring, start, end, 1);
3038
3039 Py_DECREF(substring);
3040 if (result < 0) {
3041 PyErr_SetString(PyExc_ValueError, "substring not found");
3042 return NULL;
3043 }
3044 return PyInt_FromLong(result);
3045}
3046
3047static char islower__doc__[] =
3048"S.islower() -> int\n\
3049\n\
3050Return 1 if all cased characters in S are lowercase and there is\n\
3051at least one cased character in S, 0 otherwise.";
3052
3053static PyObject*
3054unicode_islower(PyUnicodeObject *self, PyObject *args)
3055{
3056 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3057 register const Py_UNICODE *e;
3058 int cased;
3059
3060 if (!PyArg_NoArgs(args))
3061 return NULL;
3062
3063 /* Shortcut for single character strings */
3064 if (PyUnicode_GET_SIZE(self) == 1)
3065 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3066
3067 e = p + PyUnicode_GET_SIZE(self);
3068 cased = 0;
3069 for (; p < e; p++) {
3070 register const Py_UNICODE ch = *p;
3071
3072 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3073 return PyInt_FromLong(0);
3074 else if (!cased && Py_UNICODE_ISLOWER(ch))
3075 cased = 1;
3076 }
3077 return PyInt_FromLong(cased);
3078}
3079
3080static char isupper__doc__[] =
3081"S.isupper() -> int\n\
3082\n\
3083Return 1 if all cased characters in S are uppercase and there is\n\
3084at least one cased character in S, 0 otherwise.";
3085
3086static PyObject*
3087unicode_isupper(PyUnicodeObject *self, PyObject *args)
3088{
3089 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3090 register const Py_UNICODE *e;
3091 int cased;
3092
3093 if (!PyArg_NoArgs(args))
3094 return NULL;
3095
3096 /* Shortcut for single character strings */
3097 if (PyUnicode_GET_SIZE(self) == 1)
3098 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3099
3100 e = p + PyUnicode_GET_SIZE(self);
3101 cased = 0;
3102 for (; p < e; p++) {
3103 register const Py_UNICODE ch = *p;
3104
3105 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3106 return PyInt_FromLong(0);
3107 else if (!cased && Py_UNICODE_ISUPPER(ch))
3108 cased = 1;
3109 }
3110 return PyInt_FromLong(cased);
3111}
3112
3113static char istitle__doc__[] =
3114"S.istitle() -> int\n\
3115\n\
3116Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3117may only follow uncased characters and lowercase characters only cased\n\
3118ones. Return 0 otherwise.";
3119
3120static PyObject*
3121unicode_istitle(PyUnicodeObject *self, PyObject *args)
3122{
3123 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3124 register const Py_UNICODE *e;
3125 int cased, previous_is_cased;
3126
3127 if (!PyArg_NoArgs(args))
3128 return NULL;
3129
3130 /* Shortcut for single character strings */
3131 if (PyUnicode_GET_SIZE(self) == 1)
3132 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3133 (Py_UNICODE_ISUPPER(*p) != 0));
3134
3135 e = p + PyUnicode_GET_SIZE(self);
3136 cased = 0;
3137 previous_is_cased = 0;
3138 for (; p < e; p++) {
3139 register const Py_UNICODE ch = *p;
3140
3141 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3142 if (previous_is_cased)
3143 return PyInt_FromLong(0);
3144 previous_is_cased = 1;
3145 cased = 1;
3146 }
3147 else if (Py_UNICODE_ISLOWER(ch)) {
3148 if (!previous_is_cased)
3149 return PyInt_FromLong(0);
3150 previous_is_cased = 1;
3151 cased = 1;
3152 }
3153 else
3154 previous_is_cased = 0;
3155 }
3156 return PyInt_FromLong(cased);
3157}
3158
3159static char isspace__doc__[] =
3160"S.isspace() -> int\n\
3161\n\
3162Return 1 if there are only whitespace characters in S,\n\
31630 otherwise.";
3164
3165static PyObject*
3166unicode_isspace(PyUnicodeObject *self, PyObject *args)
3167{
3168 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3169 register const Py_UNICODE *e;
3170
3171 if (!PyArg_NoArgs(args))
3172 return NULL;
3173
3174 /* Shortcut for single character strings */
3175 if (PyUnicode_GET_SIZE(self) == 1 &&
3176 Py_UNICODE_ISSPACE(*p))
3177 return PyInt_FromLong(1);
3178
3179 e = p + PyUnicode_GET_SIZE(self);
3180 for (; p < e; p++) {
3181 if (!Py_UNICODE_ISSPACE(*p))
3182 return PyInt_FromLong(0);
3183 }
3184 return PyInt_FromLong(1);
3185}
3186
3187static char isdecimal__doc__[] =
3188"S.isdecimal() -> int\n\
3189\n\
3190Return 1 if there are only decimal characters in S,\n\
31910 otherwise.";
3192
3193static PyObject*
3194unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3195{
3196 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3197 register const Py_UNICODE *e;
3198
3199 if (!PyArg_NoArgs(args))
3200 return NULL;
3201
3202 /* Shortcut for single character strings */
3203 if (PyUnicode_GET_SIZE(self) == 1 &&
3204 Py_UNICODE_ISDECIMAL(*p))
3205 return PyInt_FromLong(1);
3206
3207 e = p + PyUnicode_GET_SIZE(self);
3208 for (; p < e; p++) {
3209 if (!Py_UNICODE_ISDECIMAL(*p))
3210 return PyInt_FromLong(0);
3211 }
3212 return PyInt_FromLong(1);
3213}
3214
3215static char isdigit__doc__[] =
3216"S.isdigit() -> int\n\
3217\n\
3218Return 1 if there are only digit characters in S,\n\
32190 otherwise.";
3220
3221static PyObject*
3222unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3223{
3224 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3225 register const Py_UNICODE *e;
3226
3227 if (!PyArg_NoArgs(args))
3228 return NULL;
3229
3230 /* Shortcut for single character strings */
3231 if (PyUnicode_GET_SIZE(self) == 1 &&
3232 Py_UNICODE_ISDIGIT(*p))
3233 return PyInt_FromLong(1);
3234
3235 e = p + PyUnicode_GET_SIZE(self);
3236 for (; p < e; p++) {
3237 if (!Py_UNICODE_ISDIGIT(*p))
3238 return PyInt_FromLong(0);
3239 }
3240 return PyInt_FromLong(1);
3241}
3242
3243static char isnumeric__doc__[] =
3244"S.isnumeric() -> int\n\
3245\n\
3246Return 1 if there are only numeric characters in S,\n\
32470 otherwise.";
3248
3249static PyObject*
3250unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3251{
3252 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3253 register const Py_UNICODE *e;
3254
3255 if (!PyArg_NoArgs(args))
3256 return NULL;
3257
3258 /* Shortcut for single character strings */
3259 if (PyUnicode_GET_SIZE(self) == 1 &&
3260 Py_UNICODE_ISNUMERIC(*p))
3261 return PyInt_FromLong(1);
3262
3263 e = p + PyUnicode_GET_SIZE(self);
3264 for (; p < e; p++) {
3265 if (!Py_UNICODE_ISNUMERIC(*p))
3266 return PyInt_FromLong(0);
3267 }
3268 return PyInt_FromLong(1);
3269}
3270
3271static char join__doc__[] =
3272"S.join(sequence) -> unicode\n\
3273\n\
3274Return a string which is the concatenation of the strings in the\n\
3275sequence. The separator between elements is S.";
3276
3277static PyObject*
3278unicode_join(PyUnicodeObject *self, PyObject *args)
3279{
3280 PyObject *data;
3281 if (!PyArg_ParseTuple(args, "O:join", &data))
3282 return NULL;
3283
3284 return PyUnicode_Join((PyObject *)self, data);
3285}
3286
3287static int
3288unicode_length(PyUnicodeObject *self)
3289{
3290 return self->length;
3291}
3292
3293static char ljust__doc__[] =
3294"S.ljust(width) -> unicode\n\
3295\n\
3296Return S left justified in a Unicode string of length width. Padding is\n\
3297done using spaces.";
3298
3299static PyObject *
3300unicode_ljust(PyUnicodeObject *self, PyObject *args)
3301{
3302 int width;
3303 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3304 return NULL;
3305
3306 if (self->length >= width) {
3307 Py_INCREF(self);
3308 return (PyObject*) self;
3309 }
3310
3311 return (PyObject*) pad(self, 0, width - self->length, ' ');
3312}
3313
3314static char lower__doc__[] =
3315"S.lower() -> unicode\n\
3316\n\
3317Return a copy of the string S converted to lowercase.";
3318
3319static PyObject*
3320unicode_lower(PyUnicodeObject *self, PyObject *args)
3321{
3322 if (!PyArg_NoArgs(args))
3323 return NULL;
3324 return fixup(self, fixlower);
3325}
3326
3327static char lstrip__doc__[] =
3328"S.lstrip() -> unicode\n\
3329\n\
3330Return a copy of the string S with leading whitespace removed.";
3331
3332static PyObject *
3333unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3334{
3335 if (!PyArg_NoArgs(args))
3336 return NULL;
3337 return strip(self, 1, 0);
3338}
3339
3340static PyObject*
3341unicode_repeat(PyUnicodeObject *str, int len)
3342{
3343 PyUnicodeObject *u;
3344 Py_UNICODE *p;
3345
3346 if (len < 0)
3347 len = 0;
3348
3349 if (len == 1) {
3350 /* no repeat, return original string */
3351 Py_INCREF(str);
3352 return (PyObject*) str;
3353 }
3354
3355 u = _PyUnicode_New(len * str->length);
3356 if (!u)
3357 return NULL;
3358
3359 p = u->str;
3360
3361 while (len-- > 0) {
3362 Py_UNICODE_COPY(p, str->str, str->length);
3363 p += str->length;
3364 }
3365
3366 return (PyObject*) u;
3367}
3368
3369PyObject *PyUnicode_Replace(PyObject *obj,
3370 PyObject *subobj,
3371 PyObject *replobj,
3372 int maxcount)
3373{
3374 PyObject *self;
3375 PyObject *str1;
3376 PyObject *str2;
3377 PyObject *result;
3378
3379 self = PyUnicode_FromObject(obj);
3380 if (self == NULL)
3381 return NULL;
3382 str1 = PyUnicode_FromObject(subobj);
3383 if (str1 == NULL) {
3384 Py_DECREF(self);
3385 return NULL;
3386 }
3387 str2 = PyUnicode_FromObject(replobj);
3388 if (str2 == NULL) {
3389 Py_DECREF(self);
3390 Py_DECREF(str1);
3391 return NULL;
3392 }
3393 result = replace((PyUnicodeObject *)self,
3394 (PyUnicodeObject *)str1,
3395 (PyUnicodeObject *)str2,
3396 maxcount);
3397 Py_DECREF(self);
3398 Py_DECREF(str1);
3399 Py_DECREF(str2);
3400 return result;
3401}
3402
3403static char replace__doc__[] =
3404"S.replace (old, new[, maxsplit]) -> unicode\n\
3405\n\
3406Return a copy of S with all occurrences of substring\n\
3407old replaced by new. If the optional argument maxsplit is\n\
3408given, only the first maxsplit occurrences are replaced.";
3409
3410static PyObject*
3411unicode_replace(PyUnicodeObject *self, PyObject *args)
3412{
3413 PyUnicodeObject *str1;
3414 PyUnicodeObject *str2;
3415 int maxcount = -1;
3416 PyObject *result;
3417
3418 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
3419 return NULL;
3420 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
3421 if (str1 == NULL)
3422 return NULL;
3423 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
3424 if (str2 == NULL)
3425 return NULL;
3426
3427 result = replace(self, str1, str2, maxcount);
3428
3429 Py_DECREF(str1);
3430 Py_DECREF(str2);
3431 return result;
3432}
3433
3434static
3435PyObject *unicode_repr(PyObject *unicode)
3436{
3437 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
3438 PyUnicode_GET_SIZE(unicode),
3439 1);
3440}
3441
3442static char rfind__doc__[] =
3443"S.rfind(sub [,start [,end]]) -> int\n\
3444\n\
3445Return the highest index in S where substring sub is found,\n\
3446such that sub is contained within s[start,end]. Optional\n\
3447arguments start and end are interpreted as in slice notation.\n\
3448\n\
3449Return -1 on failure.";
3450
3451static PyObject *
3452unicode_rfind(PyUnicodeObject *self, PyObject *args)
3453{
3454 PyUnicodeObject *substring;
3455 int start = 0;
3456 int end = INT_MAX;
3457 PyObject *result;
3458
3459 if (!PyArg_ParseTuple(args, "O|ii:rfind", &substring, &start, &end))
3460 return NULL;
3461 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3462 (PyObject *)substring);
3463 if (substring == NULL)
3464 return NULL;
3465
3466 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
3467
3468 Py_DECREF(substring);
3469 return result;
3470}
3471
3472static char rindex__doc__[] =
3473"S.rindex(sub [,start [,end]]) -> int\n\
3474\n\
3475Like S.rfind() but raise ValueError when the substring is not found.";
3476
3477static PyObject *
3478unicode_rindex(PyUnicodeObject *self, PyObject *args)
3479{
3480 int result;
3481 PyUnicodeObject *substring;
3482 int start = 0;
3483 int end = INT_MAX;
3484
3485 if (!PyArg_ParseTuple(args, "O|ii:rindex", &substring, &start, &end))
3486 return NULL;
3487 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3488 (PyObject *)substring);
3489 if (substring == NULL)
3490 return NULL;
3491
3492 result = findstring(self, substring, start, end, -1);
3493
3494 Py_DECREF(substring);
3495 if (result < 0) {
3496 PyErr_SetString(PyExc_ValueError, "substring not found");
3497 return NULL;
3498 }
3499 return PyInt_FromLong(result);
3500}
3501
3502static char rjust__doc__[] =
3503"S.rjust(width) -> unicode\n\
3504\n\
3505Return S right justified in a Unicode string of length width. Padding is\n\
3506done using spaces.";
3507
3508static PyObject *
3509unicode_rjust(PyUnicodeObject *self, PyObject *args)
3510{
3511 int width;
3512 if (!PyArg_ParseTuple(args, "i:rjust", &width))
3513 return NULL;
3514
3515 if (self->length >= width) {
3516 Py_INCREF(self);
3517 return (PyObject*) self;
3518 }
3519
3520 return (PyObject*) pad(self, width - self->length, 0, ' ');
3521}
3522
3523static char rstrip__doc__[] =
3524"S.rstrip() -> unicode\n\
3525\n\
3526Return a copy of the string S with trailing whitespace removed.";
3527
3528static PyObject *
3529unicode_rstrip(PyUnicodeObject *self, PyObject *args)
3530{
3531 if (!PyArg_NoArgs(args))
3532 return NULL;
3533 return strip(self, 0, 1);
3534}
3535
3536static PyObject*
3537unicode_slice(PyUnicodeObject *self, int start, int end)
3538{
3539 /* standard clamping */
3540 if (start < 0)
3541 start = 0;
3542 if (end < 0)
3543 end = 0;
3544 if (end > self->length)
3545 end = self->length;
3546 if (start == 0 && end == self->length) {
3547 /* full slice, return original string */
3548 Py_INCREF(self);
3549 return (PyObject*) self;
3550 }
3551 if (start > end)
3552 start = end;
3553 /* copy slice */
3554 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
3555 end - start);
3556}
3557
3558PyObject *PyUnicode_Split(PyObject *s,
3559 PyObject *sep,
3560 int maxsplit)
3561{
3562 PyObject *result;
3563
3564 s = PyUnicode_FromObject(s);
3565 if (s == NULL)
3566 return NULL;
3567 if (sep != NULL) {
3568 sep = PyUnicode_FromObject(sep);
3569 if (sep == NULL) {
3570 Py_DECREF(s);
3571 return NULL;
3572 }
3573 }
3574
3575 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
3576
3577 Py_DECREF(s);
3578 Py_XDECREF(sep);
3579 return result;
3580}
3581
3582static char split__doc__[] =
3583"S.split([sep [,maxsplit]]) -> list of strings\n\
3584\n\
3585Return a list of the words in S, using sep as the\n\
3586delimiter string. If maxsplit is given, at most maxsplit\n\
3587splits are done. If sep is not specified, any whitespace string\n\
3588is a separator.";
3589
3590static PyObject*
3591unicode_split(PyUnicodeObject *self, PyObject *args)
3592{
3593 PyObject *substring = Py_None;
3594 int maxcount = -1;
3595
3596 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
3597 return NULL;
3598
3599 if (substring == Py_None)
3600 return split(self, NULL, maxcount);
3601 else if (PyUnicode_Check(substring))
3602 return split(self, (PyUnicodeObject *)substring, maxcount);
3603 else
3604 return PyUnicode_Split((PyObject *)self, substring, maxcount);
3605}
3606
3607static char splitlines__doc__[] =
3608"S.splitlines([maxsplit]]) -> list of strings\n\
3609\n\
3610Return a list of the lines in S, breaking at line boundaries.\n\
3611If maxsplit is given, at most maxsplit are done. Line breaks are not\n\
3612included in the resulting list.";
3613
3614static PyObject*
3615unicode_splitlines(PyUnicodeObject *self, PyObject *args)
3616{
3617 int maxcount = -1;
3618
3619 if (!PyArg_ParseTuple(args, "|i:splitlines", &maxcount))
3620 return NULL;
3621
3622 return PyUnicode_Splitlines((PyObject *)self, maxcount);
3623}
3624
3625static
3626PyObject *unicode_str(PyUnicodeObject *self)
3627{
3628 return PyUnicode_AsUTF8String((PyObject *)self);
3629}
3630
3631static char strip__doc__[] =
3632"S.strip() -> unicode\n\
3633\n\
3634Return a copy of S with leading and trailing whitespace removed.";
3635
3636static PyObject *
3637unicode_strip(PyUnicodeObject *self, PyObject *args)
3638{
3639 if (!PyArg_NoArgs(args))
3640 return NULL;
3641 return strip(self, 1, 1);
3642}
3643
3644static char swapcase__doc__[] =
3645"S.swapcase() -> unicode\n\
3646\n\
3647Return a copy of S with uppercase characters converted to lowercase\n\
3648and vice versa.";
3649
3650static PyObject*
3651unicode_swapcase(PyUnicodeObject *self, PyObject *args)
3652{
3653 if (!PyArg_NoArgs(args))
3654 return NULL;
3655 return fixup(self, fixswapcase);
3656}
3657
3658static char translate__doc__[] =
3659"S.translate(table) -> unicode\n\
3660\n\
3661Return a copy of the string S, where all characters have been mapped\n\
3662through the given translation table, which must be a mapping of\n\
3663Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
3664are left untouched. Characters mapped to None are deleted.";
3665
3666static PyObject*
3667unicode_translate(PyUnicodeObject *self, PyObject *args)
3668{
3669 PyObject *table;
3670
3671 if (!PyArg_ParseTuple(args, "O:translate", &table))
3672 return NULL;
3673 return PyUnicode_TranslateCharmap(self->str,
3674 self->length,
3675 table,
3676 "ignore");
3677}
3678
3679static char upper__doc__[] =
3680"S.upper() -> unicode\n\
3681\n\
3682Return a copy of S converted to uppercase.";
3683
3684static PyObject*
3685unicode_upper(PyUnicodeObject *self, PyObject *args)
3686{
3687 if (!PyArg_NoArgs(args))
3688 return NULL;
3689 return fixup(self, fixupper);
3690}
3691
3692#if 0
3693static char zfill__doc__[] =
3694"S.zfill(width) -> unicode\n\
3695\n\
3696Pad a numeric string x with zeros on the left, to fill a field\n\
3697of the specified width. The string x is never truncated.";
3698
3699static PyObject *
3700unicode_zfill(PyUnicodeObject *self, PyObject *args)
3701{
3702 int fill;
3703 PyUnicodeObject *u;
3704
3705 int width;
3706 if (!PyArg_ParseTuple(args, "i:zfill", &width))
3707 return NULL;
3708
3709 if (self->length >= width) {
3710 Py_INCREF(self);
3711 return (PyObject*) self;
3712 }
3713
3714 fill = width - self->length;
3715
3716 u = pad(self, fill, 0, '0');
3717
3718 if (u->str[fill] == '+' || u->str[fill] == '-') {
3719 /* move sign to beginning of string */
3720 u->str[0] = u->str[fill];
3721 u->str[fill] = '0';
3722 }
3723
3724 return (PyObject*) u;
3725}
3726#endif
3727
3728#if 0
3729static PyObject*
3730unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
3731{
3732 if (!PyArg_NoArgs(args))
3733 return NULL;
3734 return PyInt_FromLong(unicode_freelist_size);
3735}
3736#endif
3737
3738static char startswith__doc__[] =
3739"S.startswith(prefix[, start[, end]]) -> int\n\
3740\n\
3741Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
3742optional start, test S beginning at that position. With optional end, stop\n\
3743comparing S at that position.";
3744
3745static PyObject *
3746unicode_startswith(PyUnicodeObject *self,
3747 PyObject *args)
3748{
3749 PyUnicodeObject *substring;
3750 int start = 0;
3751 int end = INT_MAX;
3752 PyObject *result;
3753
3754 if (!PyArg_ParseTuple(args, "O|ii:startswith", &substring, &start, &end))
3755 return NULL;
3756 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3757 (PyObject *)substring);
3758 if (substring == NULL)
3759 return NULL;
3760
3761 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
3762
3763 Py_DECREF(substring);
3764 return result;
3765}
3766
3767
3768static char endswith__doc__[] =
3769"S.endswith(suffix[, start[, end]]) -> int\n\
3770\n\
3771Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
3772optional start, test S beginning at that position. With optional end, stop\n\
3773comparing S at that position.";
3774
3775static PyObject *
3776unicode_endswith(PyUnicodeObject *self,
3777 PyObject *args)
3778{
3779 PyUnicodeObject *substring;
3780 int start = 0;
3781 int end = INT_MAX;
3782 PyObject *result;
3783
3784 if (!PyArg_ParseTuple(args, "O|ii:endswith", &substring, &start, &end))
3785 return NULL;
3786 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3787 (PyObject *)substring);
3788 if (substring == NULL)
3789 return NULL;
3790
3791 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
3792
3793 Py_DECREF(substring);
3794 return result;
3795}
3796
3797
3798static PyMethodDef unicode_methods[] = {
3799
3800 /* Order is according to common usage: often used methods should
3801 appear first, since lookup is done sequentially. */
3802
3803 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
3804 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
3805 {"split", (PyCFunction) unicode_split, 1, split__doc__},
3806 {"join", (PyCFunction) unicode_join, 1, join__doc__},
3807 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
3808 {"title", (PyCFunction) unicode_title, 0, title__doc__},
3809 {"center", (PyCFunction) unicode_center, 1, center__doc__},
3810 {"count", (PyCFunction) unicode_count, 1, count__doc__},
3811 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
3812 {"find", (PyCFunction) unicode_find, 1, find__doc__},
3813 {"index", (PyCFunction) unicode_index, 1, index__doc__},
3814 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
3815 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
3816 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
3817/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
3818 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
3819 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
3820 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
3821 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
3822 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
3823 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
3824 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
3825 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
3826 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
3827 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
3828 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
3829 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
3830 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
3831 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
3832 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
3833 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
3834 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
3835 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
3836#if 0
3837 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
3838 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
3839#endif
3840
3841#if 0
3842 /* This one is just used for debugging the implementation. */
3843 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
3844#endif
3845
3846 {NULL, NULL}
3847};
3848
3849static PyObject *
3850unicode_getattr(PyUnicodeObject *self, char *name)
3851{
3852 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
3853}
3854
3855static PySequenceMethods unicode_as_sequence = {
3856 (inquiry) unicode_length, /* sq_length */
3857 (binaryfunc) PyUnicode_Concat, /* sq_concat */
3858 (intargfunc) unicode_repeat, /* sq_repeat */
3859 (intargfunc) unicode_getitem, /* sq_item */
3860 (intintargfunc) unicode_slice, /* sq_slice */
3861 0, /* sq_ass_item */
3862 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003863 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00003864};
3865
3866static int
3867unicode_buffer_getreadbuf(PyUnicodeObject *self,
3868 int index,
3869 const void **ptr)
3870{
3871 if (index != 0) {
3872 PyErr_SetString(PyExc_SystemError,
3873 "accessing non-existent unicode segment");
3874 return -1;
3875 }
3876 *ptr = (void *) self->str;
3877 return PyUnicode_GET_DATA_SIZE(self);
3878}
3879
3880static int
3881unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
3882 const void **ptr)
3883{
3884 PyErr_SetString(PyExc_TypeError,
3885 "cannot use unicode as modifyable buffer");
3886 return -1;
3887}
3888
3889static int
3890unicode_buffer_getsegcount(PyUnicodeObject *self,
3891 int *lenp)
3892{
3893 if (lenp)
3894 *lenp = PyUnicode_GET_DATA_SIZE(self);
3895 return 1;
3896}
3897
3898static int
3899unicode_buffer_getcharbuf(PyUnicodeObject *self,
3900 int index,
3901 const void **ptr)
3902{
3903 PyObject *str;
3904
3905 if (index != 0) {
3906 PyErr_SetString(PyExc_SystemError,
3907 "accessing non-existent unicode segment");
3908 return -1;
3909 }
3910 str = utf8_string(self, NULL);
3911 if (str == NULL)
3912 return -1;
3913 *ptr = (void *) PyString_AS_STRING(str);
3914 return PyString_GET_SIZE(str);
3915}
3916
3917/* Helpers for PyUnicode_Format() */
3918
3919static PyObject *
3920getnextarg(args, arglen, p_argidx)
3921 PyObject *args;
3922int arglen;
3923int *p_argidx;
3924{
3925 int argidx = *p_argidx;
3926 if (argidx < arglen) {
3927 (*p_argidx)++;
3928 if (arglen < 0)
3929 return args;
3930 else
3931 return PyTuple_GetItem(args, argidx);
3932 }
3933 PyErr_SetString(PyExc_TypeError,
3934 "not enough arguments for format string");
3935 return NULL;
3936}
3937
/* Bit flags collected while parsing a format specifier in
   PyUnicode_Format(); each one records a flag character seen after
   the '%'. */
#define F_LJUST (1<<0)  /* '-' (also set for a negative '*' width) */
#define F_SIGN  (1<<1)  /* '+' */
#define F_BLANK (1<<2)  /* ' ' */
#define F_ALT   (1<<3)  /* '#' */
#define F_ZERO  (1<<4)  /* '0' */
3943
3944static
3945#ifdef HAVE_STDARG_PROTOTYPES
3946int usprintf(register Py_UNICODE *buffer, char *format, ...)
3947#else
3948int usprintf(va_alist) va_dcl
3949#endif
3950{
3951 register int i;
3952 int len;
3953 va_list va;
3954 char *charbuffer;
3955#ifdef HAVE_STDARG_PROTOTYPES
3956 va_start(va, format);
3957#else
3958 Py_UNICODE *args;
3959 char *format;
3960
3961 va_start(va);
3962 buffer = va_arg(va, Py_UNICODE *);
3963 format = va_arg(va, char *);
3964#endif
3965
3966 /* First, format the string as char array, then expand to Py_UNICODE
3967 array. */
3968 charbuffer = (char *)buffer;
3969 len = vsprintf(charbuffer, format, va);
3970 for (i = len - 1; i >= 0; i--)
3971 buffer[i] = (Py_UNICODE) charbuffer[i];
3972
3973 va_end(va);
3974 return len;
3975}
3976
3977static int
3978formatfloat(Py_UNICODE *buf,
3979 int flags,
3980 int prec,
3981 int type,
3982 PyObject *v)
3983{
3984 char fmt[20];
3985 double x;
3986
3987 x = PyFloat_AsDouble(v);
3988 if (x == -1.0 && PyErr_Occurred())
3989 return -1;
3990 if (prec < 0)
3991 prec = 6;
3992 if (prec > 50)
3993 prec = 50; /* Arbitrary limitation */
3994 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
3995 type = 'g';
3996 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
3997 return usprintf(buf, fmt, x);
3998}
3999
4000static int
4001formatint(Py_UNICODE *buf,
4002 int flags,
4003 int prec,
4004 int type,
4005 PyObject *v)
4006{
4007 char fmt[20];
4008 long x;
4009
4010 x = PyInt_AsLong(v);
4011 if (x == -1 && PyErr_Occurred())
4012 return -1;
4013 if (prec < 0)
4014 prec = 1;
4015 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4016 return usprintf(buf, fmt, x);
4017}
4018
4019static int
4020formatchar(Py_UNICODE *buf,
4021 PyObject *v)
4022{
4023 if (PyUnicode_Check(v))
4024 buf[0] = PyUnicode_AS_UNICODE(v)[0];
4025
4026 else if (PyString_Check(v))
4027 buf[0] = (Py_UNICODE) PyString_AS_STRING(v)[0];
4028
4029 else {
4030 /* Integer input truncated to a character */
4031 long x;
4032 x = PyInt_AsLong(v);
4033 if (x == -1 && PyErr_Occurred())
4034 return -1;
4035 buf[0] = (char) x;
4036 }
4037 buf[1] = '\0';
4038 return 1;
4039}
4040
4041PyObject *PyUnicode_Format(PyObject *format,
4042 PyObject *args)
4043{
4044 Py_UNICODE *fmt, *res;
4045 int fmtcnt, rescnt, reslen, arglen, argidx;
4046 int args_owned = 0;
4047 PyUnicodeObject *result = NULL;
4048 PyObject *dict = NULL;
4049 PyObject *uformat;
4050
4051 if (format == NULL || args == NULL) {
4052 PyErr_BadInternalCall();
4053 return NULL;
4054 }
4055 uformat = PyUnicode_FromObject(format);
4056 fmt = PyUnicode_AS_UNICODE(uformat);
4057 fmtcnt = PyUnicode_GET_SIZE(uformat);
4058
4059 reslen = rescnt = fmtcnt + 100;
4060 result = _PyUnicode_New(reslen);
4061 if (result == NULL)
4062 goto onError;
4063 res = PyUnicode_AS_UNICODE(result);
4064
4065 if (PyTuple_Check(args)) {
4066 arglen = PyTuple_Size(args);
4067 argidx = 0;
4068 }
4069 else {
4070 arglen = -1;
4071 argidx = -2;
4072 }
4073 if (args->ob_type->tp_as_mapping)
4074 dict = args;
4075
4076 while (--fmtcnt >= 0) {
4077 if (*fmt != '%') {
4078 if (--rescnt < 0) {
4079 rescnt = fmtcnt + 100;
4080 reslen += rescnt;
4081 if (_PyUnicode_Resize(result, reslen) < 0)
4082 return NULL;
4083 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4084 --rescnt;
4085 }
4086 *res++ = *fmt++;
4087 }
4088 else {
4089 /* Got a format specifier */
4090 int flags = 0;
4091 int width = -1;
4092 int prec = -1;
4093 int size = 0;
4094 Py_UNICODE c = '\0';
4095 Py_UNICODE fill;
4096 PyObject *v = NULL;
4097 PyObject *temp = NULL;
4098 Py_UNICODE *buf;
4099 Py_UNICODE sign;
4100 int len;
4101 Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
4102
4103 fmt++;
4104 if (*fmt == '(') {
4105 Py_UNICODE *keystart;
4106 int keylen;
4107 PyObject *key;
4108 int pcount = 1;
4109
4110 if (dict == NULL) {
4111 PyErr_SetString(PyExc_TypeError,
4112 "format requires a mapping");
4113 goto onError;
4114 }
4115 ++fmt;
4116 --fmtcnt;
4117 keystart = fmt;
4118 /* Skip over balanced parentheses */
4119 while (pcount > 0 && --fmtcnt >= 0) {
4120 if (*fmt == ')')
4121 --pcount;
4122 else if (*fmt == '(')
4123 ++pcount;
4124 fmt++;
4125 }
4126 keylen = fmt - keystart - 1;
4127 if (fmtcnt < 0 || pcount > 0) {
4128 PyErr_SetString(PyExc_ValueError,
4129 "incomplete format key");
4130 goto onError;
4131 }
4132 /* keys are converted to strings (using UTF-8) and
4133 then looked up since Python uses strings to hold
4134 variables names etc. in its namespaces and we
4135 wouldn't want to break common idioms. The
4136 alternative would be using Unicode objects for the
4137 lookup but u"abc" and "abc" have different hash
4138 values (on purpose). */
4139 key = PyUnicode_EncodeUTF8(keystart,
4140 keylen,
4141 NULL);
4142 if (key == NULL)
4143 goto onError;
4144 if (args_owned) {
4145 Py_DECREF(args);
4146 args_owned = 0;
4147 }
4148 args = PyObject_GetItem(dict, key);
4149 Py_DECREF(key);
4150 if (args == NULL) {
4151 goto onError;
4152 }
4153 args_owned = 1;
4154 arglen = -1;
4155 argidx = -2;
4156 }
4157 while (--fmtcnt >= 0) {
4158 switch (c = *fmt++) {
4159 case '-': flags |= F_LJUST; continue;
4160 case '+': flags |= F_SIGN; continue;
4161 case ' ': flags |= F_BLANK; continue;
4162 case '#': flags |= F_ALT; continue;
4163 case '0': flags |= F_ZERO; continue;
4164 }
4165 break;
4166 }
4167 if (c == '*') {
4168 v = getnextarg(args, arglen, &argidx);
4169 if (v == NULL)
4170 goto onError;
4171 if (!PyInt_Check(v)) {
4172 PyErr_SetString(PyExc_TypeError,
4173 "* wants int");
4174 goto onError;
4175 }
4176 width = PyInt_AsLong(v);
4177 if (width < 0) {
4178 flags |= F_LJUST;
4179 width = -width;
4180 }
4181 if (--fmtcnt >= 0)
4182 c = *fmt++;
4183 }
4184 else if (c >= '0' && c <= '9') {
4185 width = c - '0';
4186 while (--fmtcnt >= 0) {
4187 c = *fmt++;
4188 if (c < '0' || c > '9')
4189 break;
4190 if ((width*10) / 10 != width) {
4191 PyErr_SetString(PyExc_ValueError,
4192 "width too big");
4193 goto onError;
4194 }
4195 width = width*10 + (c - '0');
4196 }
4197 }
4198 if (c == '.') {
4199 prec = 0;
4200 if (--fmtcnt >= 0)
4201 c = *fmt++;
4202 if (c == '*') {
4203 v = getnextarg(args, arglen, &argidx);
4204 if (v == NULL)
4205 goto onError;
4206 if (!PyInt_Check(v)) {
4207 PyErr_SetString(PyExc_TypeError,
4208 "* wants int");
4209 goto onError;
4210 }
4211 prec = PyInt_AsLong(v);
4212 if (prec < 0)
4213 prec = 0;
4214 if (--fmtcnt >= 0)
4215 c = *fmt++;
4216 }
4217 else if (c >= '0' && c <= '9') {
4218 prec = c - '0';
4219 while (--fmtcnt >= 0) {
4220 c = Py_CHARMASK(*fmt++);
4221 if (c < '0' || c > '9')
4222 break;
4223 if ((prec*10) / 10 != prec) {
4224 PyErr_SetString(PyExc_ValueError,
4225 "prec too big");
4226 goto onError;
4227 }
4228 prec = prec*10 + (c - '0');
4229 }
4230 }
4231 } /* prec */
4232 if (fmtcnt >= 0) {
4233 if (c == 'h' || c == 'l' || c == 'L') {
4234 size = c;
4235 if (--fmtcnt >= 0)
4236 c = *fmt++;
4237 }
4238 }
4239 if (fmtcnt < 0) {
4240 PyErr_SetString(PyExc_ValueError,
4241 "incomplete format");
4242 goto onError;
4243 }
4244 if (c != '%') {
4245 v = getnextarg(args, arglen, &argidx);
4246 if (v == NULL)
4247 goto onError;
4248 }
4249 sign = 0;
4250 fill = ' ';
4251 switch (c) {
4252
4253 case '%':
4254 buf = tmpbuf;
4255 buf[0] = '%';
4256 len = 1;
4257 break;
4258
4259 case 's':
4260 case 'r':
4261 if (PyUnicode_Check(v) && c == 's') {
4262 temp = v;
4263 Py_INCREF(temp);
4264 }
4265 else {
4266 PyObject *unicode;
4267 if (c == 's')
4268 temp = PyObject_Str(v);
4269 else
4270 temp = PyObject_Repr(v);
4271 if (temp == NULL)
4272 goto onError;
4273 if (!PyString_Check(temp)) {
4274 /* XXX Note: this should never happen, since
4275 PyObject_Repr() and PyObject_Str() assure
4276 this */
4277 Py_DECREF(temp);
4278 PyErr_SetString(PyExc_TypeError,
4279 "%s argument has non-string str()");
4280 goto onError;
4281 }
4282 unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
4283 PyString_GET_SIZE(temp),
4284 "strict");
4285 Py_DECREF(temp);
4286 temp = unicode;
4287 if (temp == NULL)
4288 goto onError;
4289 }
4290 buf = PyUnicode_AS_UNICODE(temp);
4291 len = PyUnicode_GET_SIZE(temp);
4292 if (prec >= 0 && len > prec)
4293 len = prec;
4294 break;
4295
4296 case 'i':
4297 case 'd':
4298 case 'u':
4299 case 'o':
4300 case 'x':
4301 case 'X':
4302 if (c == 'i')
4303 c = 'd';
4304 buf = tmpbuf;
4305 len = formatint(buf, flags, prec, c, v);
4306 if (len < 0)
4307 goto onError;
4308 sign = (c == 'd');
4309 if (flags & F_ZERO) {
4310 fill = '0';
4311 if ((flags&F_ALT) &&
4312 (c == 'x' || c == 'X') &&
4313 buf[0] == '0' && buf[1] == c) {
4314 *res++ = *buf++;
4315 *res++ = *buf++;
4316 rescnt -= 2;
4317 len -= 2;
4318 width -= 2;
4319 if (width < 0)
4320 width = 0;
4321 }
4322 }
4323 break;
4324
4325 case 'e':
4326 case 'E':
4327 case 'f':
4328 case 'g':
4329 case 'G':
4330 buf = tmpbuf;
4331 len = formatfloat(buf, flags, prec, c, v);
4332 if (len < 0)
4333 goto onError;
4334 sign = 1;
4335 if (flags&F_ZERO)
4336 fill = '0';
4337 break;
4338
4339 case 'c':
4340 buf = tmpbuf;
4341 len = formatchar(buf, v);
4342 if (len < 0)
4343 goto onError;
4344 break;
4345
4346 default:
4347 PyErr_Format(PyExc_ValueError,
4348 "unsupported format character '%c' (0x%x)",
4349 c, c);
4350 goto onError;
4351 }
4352 if (sign) {
4353 if (*buf == '-' || *buf == '+') {
4354 sign = *buf++;
4355 len--;
4356 }
4357 else if (flags & F_SIGN)
4358 sign = '+';
4359 else if (flags & F_BLANK)
4360 sign = ' ';
4361 else
4362 sign = 0;
4363 }
4364 if (width < len)
4365 width = len;
4366 if (rescnt < width + (sign != 0)) {
4367 reslen -= rescnt;
4368 rescnt = width + fmtcnt + 100;
4369 reslen += rescnt;
4370 if (_PyUnicode_Resize(result, reslen) < 0)
4371 return NULL;
4372 res = PyUnicode_AS_UNICODE(result)
4373 + reslen - rescnt;
4374 }
4375 if (sign) {
4376 if (fill != ' ')
4377 *res++ = sign;
4378 rescnt--;
4379 if (width > len)
4380 width--;
4381 }
4382 if (width > len && !(flags & F_LJUST)) {
4383 do {
4384 --rescnt;
4385 *res++ = fill;
4386 } while (--width > len);
4387 }
4388 if (sign && fill == ' ')
4389 *res++ = sign;
4390 memcpy(res, buf, len * sizeof(Py_UNICODE));
4391 res += len;
4392 rescnt -= len;
4393 while (--width >= len) {
4394 --rescnt;
4395 *res++ = ' ';
4396 }
4397 if (dict && (argidx < arglen) && c != '%') {
4398 PyErr_SetString(PyExc_TypeError,
4399 "not all arguments converted");
4400 goto onError;
4401 }
4402 Py_XDECREF(temp);
4403 } /* '%' */
4404 } /* until end */
4405 if (argidx < arglen && !dict) {
4406 PyErr_SetString(PyExc_TypeError,
4407 "not all arguments converted");
4408 goto onError;
4409 }
4410
4411 if (args_owned) {
4412 Py_DECREF(args);
4413 }
4414 Py_DECREF(uformat);
4415 _PyUnicode_Resize(result, reslen - rescnt);
4416 return (PyObject *)result;
4417
4418 onError:
4419 Py_XDECREF(result);
4420 Py_DECREF(uformat);
4421 if (args_owned) {
4422 Py_DECREF(args);
4423 }
4424 return NULL;
4425}
4426
/* Buffer protocol slots for the unicode type; the slot order follows
   the proc typedefs named in the casts. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,      /* read buffer */
    (getwritebufferproc) unicode_buffer_getwritebuf,    /* write buffer (always fails) */
    (getsegcountproc) unicode_buffer_getsegcount,       /* segment count */
    (getcharbufferproc) unicode_buffer_getcharbuf,      /* character buffer */
};
4433
/* Type object for Unicode strings.  Slots that are 0 or NULL are not
   provided by this type. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_size */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)_PyUnicode_Free,        /* tp_dealloc */
    0,                                  /* tp_print */
    (getattrfunc)unicode_getattr,       /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    0,                                  /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash */
    0,                                  /* tp_call */
    (reprfunc) unicode_str,             /* tp_str */
    (getattrofunc) NULL,                /* tp_getattro */
    (setattrofunc) NULL,                /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
};
4458
4459/* Initialize the Unicode implementation */
4460
4461void _PyUnicode_Init()
4462{
4463 /* Doublecheck the configuration... */
4464 if (sizeof(Py_UNICODE) != 2)
4465 Py_FatalError("Unicode configuration error: "
4466 "sizeof(Py_UNICODE) != 2 bytes");
4467
4468 unicode_empty = _PyUnicode_New(0);
4469}
4470
4471/* Finalize the Unicode implementation */
4472
4473void
4474_PyUnicode_Fini()
4475{
4476 PyUnicodeObject *u = unicode_freelist;
4477
4478 while (u != NULL) {
4479 PyUnicodeObject *v = u;
4480 u = *(PyUnicodeObject **)u;
4481 free(v);
4482 }
4483 Py_XDECREF(unicode_empty);
4484}