blob: d63165ea05b64c1e483dfb6c6934ef9c330f3a97 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "mymath.h"
68#include "unicodeobject.h"
69
70#if defined(HAVE_LIMITS_H)
71#include <limits.h>
72#else
73#define INT_MAX 2147483647
74#endif
75
76/* Limit for the Unicode object free list */
77
78#define MAX_UNICODE_FREELIST_SIZE 1024
79
80/* Limit for the Unicode object free list stay alive optimization.
81
82 The implementation will keep allocated Unicode memory intact for
83 all objects on the free list having a size less than this
84 limit. This reduces malloc() overhead for small Unicode objects.
85
Barry Warsaw51ac5802000-03-20 16:36:48 +000086 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumd57fd912000-03-10 22:53:23 +000087 (sizeof(PyUnicodeObject) + STAYALIVE_SIZE_LIMIT +
88 malloc()-overhead) bytes of unused garbage.
89
90 Setting the limit to 0 effectively turns the feature off.
91
92 XXX The feature is currently turned off because there are
93 apparently some lingering bugs in its implementation which I
94 haven't yet been able to sort out.
95
96*/
97
98#define STAYALIVE_SIZE_LIMIT 0
99
100/* Endianness switches; defaults to little endian */
101
102#ifdef WORDS_BIGENDIAN
103# define BYTEORDER_IS_BIG_ENDIAN
104#else
105# define BYTEORDER_IS_LITTLE_ENDIAN
106#endif
107
108/* --- Globals ------------------------------------------------------------ */
109
110/* The empty Unicode object */
111static PyUnicodeObject *unicode_empty = NULL;
112
113/* Free list for Unicode objects */
114static PyUnicodeObject *unicode_freelist = NULL;
115static int unicode_freelist_size = 0;
116
117/* --- Unicode Object ----------------------------------------------------- */
118
119static
120int _PyUnicode_Resize(register PyUnicodeObject *unicode,
121 int length)
122{
123 void *oldstr;
124
125 /* Shortcut if there's nothing to do. */
126 if (unicode->length == length)
127 return 0;
128
129 /* Resizing unicode_empty is not allowed. */
130 if (unicode == unicode_empty) {
131 PyErr_SetString(PyExc_SystemError,
132 "can't resize empty unicode object");
133 return -1;
134 }
135
136 /* We allocate one more byte to make sure the string is
137 Ux0000 terminated -- XXX is this needed ? */
138 oldstr = unicode->str;
139 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
140 if (!unicode->str) {
141 unicode->str = oldstr;
142 PyErr_NoMemory();
143 return -1;
144 }
145 unicode->str[length] = 0;
146 unicode->length = length;
147
148 /* Reset the object caches */
149 if (unicode->utf8str) {
150 Py_DECREF(unicode->utf8str);
151 unicode->utf8str = NULL;
152 }
153 unicode->hash = -1;
154
155 return 0;
156}
157
158/* We allocate one more byte to make sure the string is
159 Ux0000 terminated -- XXX is this needed ?
160
161 XXX This allocator could further be enhanced by assuring that the
162 free list never reduces its size below 1.
163
164*/
165
166static
167PyUnicodeObject *_PyUnicode_New(int length)
168{
169 register PyUnicodeObject *unicode;
170
171 /* Optimization for empty strings */
172 if (length == 0 && unicode_empty != NULL) {
173 Py_INCREF(unicode_empty);
174 return unicode_empty;
175 }
176
177 /* Unicode freelist & memory allocation */
178 if (unicode_freelist) {
179 unicode = unicode_freelist;
180 unicode_freelist = *(PyUnicodeObject **)unicode_freelist;
181 unicode_freelist_size--;
182 unicode->ob_type = &PyUnicode_Type;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000183 _Py_NewReference((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000184 if (unicode->str) {
185 if (unicode->length < length &&
186 _PyUnicode_Resize(unicode, length)) {
187 free(unicode->str);
188 PyMem_DEL(unicode);
189 return NULL;
190 }
191 }
192 else
193 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
194 }
195 else {
196 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
197 if (unicode == NULL)
198 return NULL;
199 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
200 }
201
Barry Warsaw51ac5802000-03-20 16:36:48 +0000202 if (!unicode->str)
203 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 unicode->str[length] = 0;
205 unicode->length = length;
206 unicode->hash = -1;
207 unicode->utf8str = NULL;
208 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000209
210 onError:
211 _Py_ForgetReference((PyObject *)unicode);
212 PyMem_DEL(unicode);
213 PyErr_NoMemory();
214 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000215}
216
217static
218void _PyUnicode_Free(register PyUnicodeObject *unicode)
219{
220 Py_XDECREF(unicode->utf8str);
221 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
222 if (unicode->length >= STAYALIVE_SIZE_LIMIT) {
223 free(unicode->str);
224 unicode->str = NULL;
225 unicode->length = 0;
226 }
227 *(PyUnicodeObject **)unicode = unicode_freelist;
228 unicode_freelist = unicode;
229 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000230 }
231 else {
232 free(unicode->str);
233 PyMem_DEL(unicode);
234 }
235}
236
237PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
238 int size)
239{
240 PyUnicodeObject *unicode;
241
242 unicode = _PyUnicode_New(size);
243 if (!unicode)
244 return NULL;
245
246 /* Copy the Unicode data into the new object */
247 if (u != NULL)
248 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
249
250 return (PyObject *)unicode;
251}
252
253#ifdef HAVE_WCHAR_H
254
255PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
256 int size)
257{
258 PyUnicodeObject *unicode;
259
260 if (w == NULL) {
261 PyErr_BadInternalCall();
262 return NULL;
263 }
264
265 unicode = _PyUnicode_New(size);
266 if (!unicode)
267 return NULL;
268
269 /* Copy the wchar_t data into the new object */
270#ifdef HAVE_USABLE_WCHAR_T
271 memcpy(unicode->str, w, size * sizeof(wchar_t));
272#else
273 {
274 register Py_UNICODE *u;
275 register int i;
276 u = PyUnicode_AS_UNICODE(unicode);
277 for (i = size; i >= 0; i--)
278 *u++ = *w++;
279 }
280#endif
281
282 return (PyObject *)unicode;
283}
284
285int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
286 register wchar_t *w,
287 int size)
288{
289 if (unicode == NULL) {
290 PyErr_BadInternalCall();
291 return -1;
292 }
293 if (size > PyUnicode_GET_SIZE(unicode))
294 size = PyUnicode_GET_SIZE(unicode);
295#ifdef HAVE_USABLE_WCHAR_T
296 memcpy(w, unicode->str, size * sizeof(wchar_t));
297#else
298 {
299 register Py_UNICODE *u;
300 register int i;
301 u = PyUnicode_AS_UNICODE(unicode);
302 for (i = size; i >= 0; i--)
303 *w++ = *u++;
304 }
305#endif
306
307 return size;
308}
309
310#endif
311
312PyObject *PyUnicode_FromObject(register PyObject *obj)
313{
314 const char *s;
315 int len;
316
317 if (obj == NULL) {
318 PyErr_BadInternalCall();
319 return NULL;
320 }
321 else if (PyUnicode_Check(obj)) {
322 Py_INCREF(obj);
323 return obj;
324 }
325 else if (PyString_Check(obj)) {
326 s = PyString_AS_STRING(obj);
327 len = PyString_GET_SIZE(obj);
328 }
329 else if (PyObject_AsCharBuffer(obj, &s, &len))
330 return NULL;
331 if (len == 0) {
332 Py_INCREF(unicode_empty);
333 return (PyObject *)unicode_empty;
334 }
335 return PyUnicode_DecodeUTF8(s, len, "strict");
336}
337
338PyObject *PyUnicode_Decode(const char *s,
339 int size,
340 const char *encoding,
341 const char *errors)
342{
343 PyObject *buffer = NULL, *unicode;
344
345 /* Shortcut for the default encoding UTF-8 */
346 if (encoding == NULL ||
347 (strcmp(encoding, "utf-8") == 0))
348 return PyUnicode_DecodeUTF8(s, size, errors);
349
350 /* Decode via the codec registry */
351 buffer = PyBuffer_FromMemory((void *)s, size);
352 if (buffer == NULL)
353 goto onError;
354 unicode = PyCodec_Decode(buffer, encoding, errors);
355 if (unicode == NULL)
356 goto onError;
357 if (!PyUnicode_Check(unicode)) {
358 PyErr_Format(PyExc_TypeError,
359 "decoder did not return an unicode object (type=%s)",
360 unicode->ob_type->tp_name);
361 Py_DECREF(unicode);
362 goto onError;
363 }
364 Py_DECREF(buffer);
365 return unicode;
366
367 onError:
368 Py_XDECREF(buffer);
369 return NULL;
370}
371
372PyObject *PyUnicode_Encode(const Py_UNICODE *s,
373 int size,
374 const char *encoding,
375 const char *errors)
376{
377 PyObject *v, *unicode;
378
379 unicode = PyUnicode_FromUnicode(s, size);
380 if (unicode == NULL)
381 return NULL;
382 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
383 Py_DECREF(unicode);
384 return v;
385}
386
387PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
388 const char *encoding,
389 const char *errors)
390{
391 PyObject *v;
392
393 if (!PyUnicode_Check(unicode)) {
394 PyErr_BadArgument();
395 goto onError;
396 }
397 /* Shortcut for the default encoding UTF-8 */
398 if ((encoding == NULL ||
399 (strcmp(encoding, "utf-8") == 0)) &&
400 errors == NULL)
401 return PyUnicode_AsUTF8String(unicode);
402
403 /* Encode via the codec registry */
404 v = PyCodec_Encode(unicode, encoding, errors);
405 if (v == NULL)
406 goto onError;
407 /* XXX Should we really enforce this ? */
408 if (!PyString_Check(v)) {
409 PyErr_Format(PyExc_TypeError,
410 "encoder did not return a string object (type=%s)",
411 v->ob_type->tp_name);
412 Py_DECREF(v);
413 goto onError;
414 }
415 return v;
416
417 onError:
418 return NULL;
419}
420
421Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
422{
423 if (!PyUnicode_Check(unicode)) {
424 PyErr_BadArgument();
425 goto onError;
426 }
427 return PyUnicode_AS_UNICODE(unicode);
428
429 onError:
430 return NULL;
431}
432
433int PyUnicode_GetSize(PyObject *unicode)
434{
435 if (!PyUnicode_Check(unicode)) {
436 PyErr_BadArgument();
437 goto onError;
438 }
439 return PyUnicode_GET_SIZE(unicode);
440
441 onError:
442 return -1;
443}
444
445/* --- UTF-8 Codec -------------------------------------------------------- */
446
/* Map a UTF-8 lead byte to the total length of its sequence.  Zero
   means the byte is not a legal lead byte.  See RFC 2279 for details.
   The table is never written to, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00-0x7F: single byte (US-ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: continuation bytes, illegal as lead byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFD: four-, five- and six-byte sequences; 0xFE/0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
468
469static
470int utf8_decoding_error(const char **source,
471 Py_UNICODE **dest,
472 const char *errors,
473 const char *details)
474{
475 if ((errors == NULL) ||
476 (strcmp(errors,"strict") == 0)) {
477 PyErr_Format(PyExc_UnicodeError,
478 "UTF-8 decoding error: %s",
479 details);
480 return -1;
481 }
482 else if (strcmp(errors,"ignore") == 0) {
483 (*source)++;
484 return 0;
485 }
486 else if (strcmp(errors,"replace") == 0) {
487 (*source)++;
488 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
489 (*dest)++;
490 return 0;
491 }
492 else {
493 PyErr_Format(PyExc_ValueError,
Barry Warsaw51ac5802000-03-20 16:36:48 +0000494 "UTF-8 decoding error; unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000495 errors);
496 return -1;
497 }
498}
499
500#define UTF8_ERROR(details) do { \
501 if (utf8_decoding_error(&s, &p, errors, details)) \
502 goto onError; \
503 continue; \
504} while (0)
505
506PyObject *PyUnicode_DecodeUTF8(const char *s,
507 int size,
508 const char *errors)
509{
510 int n;
511 const char *e;
512 PyUnicodeObject *unicode;
513 Py_UNICODE *p;
514
515 /* Note: size will always be longer than the resulting Unicode
516 character count */
517 unicode = _PyUnicode_New(size);
518 if (!unicode)
519 return NULL;
520 if (size == 0)
521 return (PyObject *)unicode;
522
523 /* Unpack UTF-8 encoded data */
524 p = unicode->str;
525 e = s + size;
526
527 while (s < e) {
528 register Py_UNICODE ch = (unsigned char)*s;
529
530 if (ch < 0x80) {
531 *p++ = ch;
532 s++;
533 continue;
534 }
535
536 n = utf8_code_length[ch];
537
538 if (s + n > e)
539 UTF8_ERROR("unexpected end of data");
540
541 switch (n) {
542
543 case 0:
544 UTF8_ERROR("unexpected code byte");
545 break;
546
547 case 1:
548 UTF8_ERROR("internal error");
549 break;
550
551 case 2:
552 if ((s[1] & 0xc0) != 0x80)
553 UTF8_ERROR("invalid data");
554 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
555 if (ch < 0x80)
556 UTF8_ERROR("illegal encoding");
557 else
558 *p++ = ch;
559 break;
560
561 case 3:
562 if ((s[1] & 0xc0) != 0x80 ||
563 (s[2] & 0xc0) != 0x80)
564 UTF8_ERROR("invalid data");
565 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
566 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
567 UTF8_ERROR("illegal encoding");
568 else
569 *p++ = ch;
570 break;
571
572 default:
573 /* Other sizes are only needed for UCS-4 */
574 UTF8_ERROR("unsupported Unicode code range");
575 }
576 s += n;
577 }
578
579 /* Adjust length */
580 if (_PyUnicode_Resize(unicode, p - unicode->str))
581 goto onError;
582
583 return (PyObject *)unicode;
584
585onError:
586 Py_DECREF(unicode);
587 return NULL;
588}
589
590#undef UTF8_ERROR
591
592static
593int utf8_encoding_error(const Py_UNICODE **source,
594 char **dest,
595 const char *errors,
596 const char *details)
597{
598 if ((errors == NULL) ||
599 (strcmp(errors,"strict") == 0)) {
600 PyErr_Format(PyExc_UnicodeError,
601 "UTF-8 encoding error: %s",
602 details);
603 return -1;
604 }
605 else if (strcmp(errors,"ignore") == 0) {
606 return 0;
607 }
608 else if (strcmp(errors,"replace") == 0) {
609 **dest = '?';
610 (*dest)++;
611 return 0;
612 }
613 else {
614 PyErr_Format(PyExc_ValueError,
615 "UTF-8 encoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +0000616 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000617 errors);
618 return -1;
619 }
620}
621
622PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
623 int size,
624 const char *errors)
625{
626 PyObject *v;
627 char *p;
628 char *q;
629
630 v = PyString_FromStringAndSize(NULL, 3 * size);
631 if (v == NULL)
632 return NULL;
633 if (size == 0)
634 goto done;
635
636 p = q = PyString_AS_STRING(v);
637 while (size-- > 0) {
638 Py_UNICODE ch = *s++;
639 if (ch < 0x80)
640 *p++ = (char) ch;
641 else if (ch < 0x0800) {
642 *p++ = 0xc0 | (ch >> 6);
643 *p++ = 0x80 | (ch & 0x3f);
644 } else if (0xD800 <= ch && ch <= 0xDFFF) {
645 /* These byte ranges are reserved for UTF-16 surrogate
646 bytes which the Python implementation currently does
647 not support. */
648 printf("code range problem: U+%04x\n", ch);
649 if (utf8_encoding_error(&s, &p, errors,
650 "unsupported code range"))
651 goto onError;
652 } else {
653 *p++ = 0xe0 | (ch >> 12);
654 *p++ = 0x80 | ((ch >> 6) & 0x3f);
655 *p++ = 0x80 | (ch & 0x3f);
656 }
657 }
658 *p = '\0';
659 _PyString_Resize(&v, p - q);
660
661 done:
662 return v;
663
664 onError:
665 Py_DECREF(v);
666 return NULL;
667}
668
669/* Return a Python string holding the UTF-8 encoded value of the
670 Unicode object.
671
672 The resulting string is cached in the Unicode object for subsequent
673 usage by this function. The cached version is needed to implement
674 the character buffer interface.
675
676 The refcount of the string is *not* incremented.
677
678*/
679
680static
681PyObject *utf8_string(PyUnicodeObject *self,
682 const char *errors)
683{
684 PyObject *v = self->utf8str;
685
686 if (v)
687 return v;
688 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(self),
689 PyUnicode_GET_SIZE(self),
690 errors);
691 if (v && errors == NULL)
692 self->utf8str = v;
693 return v;
694}
695
696PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
697{
698 PyObject *str;
699
700 if (!PyUnicode_Check(unicode)) {
701 PyErr_BadArgument();
702 return NULL;
703 }
704 str = utf8_string((PyUnicodeObject *)unicode, NULL);
705 if (str == NULL)
706 return NULL;
707 Py_INCREF(str);
708 return str;
709}
710
711/* --- UTF-16 Codec ------------------------------------------------------- */
712
713static
714int utf16_decoding_error(const Py_UNICODE **source,
715 Py_UNICODE **dest,
716 const char *errors,
717 const char *details)
718{
719 if ((errors == NULL) ||
720 (strcmp(errors,"strict") == 0)) {
721 PyErr_Format(PyExc_UnicodeError,
722 "UTF-16 decoding error: %s",
723 details);
724 return -1;
725 }
726 else if (strcmp(errors,"ignore") == 0) {
727 return 0;
728 }
729 else if (strcmp(errors,"replace") == 0) {
730 if (dest) {
731 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
732 (*dest)++;
733 }
734 return 0;
735 }
736 else {
737 PyErr_Format(PyExc_ValueError,
Barry Warsaw51ac5802000-03-20 16:36:48 +0000738 "UTF-16 decoding error; unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000739 errors);
740 return -1;
741 }
742}
743
744#define UTF16_ERROR(details) do { \
745 if (utf16_decoding_error(&q, &p, errors, details)) \
746 goto onError; \
747 continue; \
748} while(0)
749
750PyObject *PyUnicode_DecodeUTF16(const char *s,
751 int size,
752 const char *errors,
753 int *byteorder)
754{
755 PyUnicodeObject *unicode;
756 Py_UNICODE *p;
757 const Py_UNICODE *q, *e;
758 int bo = 0;
759
760 /* size should be an even number */
761 if (size % sizeof(Py_UNICODE) != 0) {
762 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
763 return NULL;
764 /* The remaining input chars are ignored if we fall through
765 here... */
766 }
767
768 /* Note: size will always be longer than the resulting Unicode
769 character count */
770 unicode = _PyUnicode_New(size);
771 if (!unicode)
772 return NULL;
773 if (size == 0)
774 return (PyObject *)unicode;
775
776 /* Unpack UTF-16 encoded data */
777 p = unicode->str;
778 q = (Py_UNICODE *)s;
779 e = q + (size / sizeof(Py_UNICODE));
780
781 if (byteorder)
782 bo = *byteorder;
783
784 while (q < e) {
785 register Py_UNICODE ch = *q++;
786
787 /* Check for BOM marks (U+FEFF) in the input and adjust
788 current byte order setting accordingly. Swap input
789 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
790 !) */
791#ifdef BYTEORDER_IS_LITTLE_ENDIAN
792 if (ch == 0xFEFF) {
793 bo = -1;
794 continue;
795 } else if (ch == 0xFFFE) {
796 bo = 1;
797 continue;
798 }
799 if (bo == 1)
800 ch = (ch >> 8) | (ch << 8);
801#else
802 if (ch == 0xFEFF) {
803 bo = 1;
804 continue;
805 } else if (ch == 0xFFFE) {
806 bo = -1;
807 continue;
808 }
809 if (bo == -1)
810 ch = (ch >> 8) | (ch << 8);
811#endif
812 if (ch < 0xD800 || ch > 0xDFFF) {
813 *p++ = ch;
814 continue;
815 }
816
817 /* UTF-16 code pair: */
818 if (q >= e)
819 UTF16_ERROR("unexpected end of data");
820 if (0xDC00 <= *q && *q <= 0xDFFF) {
821 q++;
822 if (0xD800 <= *q && *q <= 0xDBFF)
823 /* This is valid data (a UTF-16 surrogate pair), but
824 we are not able to store this information since our
825 Py_UNICODE type only has 16 bits... this might
826 change someday, even though it's unlikely. */
827 UTF16_ERROR("code pairs are not supported");
828 else
829 continue;
830 }
831 UTF16_ERROR("illegal encoding");
832 }
833
834 if (byteorder)
835 *byteorder = bo;
836
837 /* Adjust length */
838 if (_PyUnicode_Resize(unicode, p - unicode->str))
839 goto onError;
840
841 return (PyObject *)unicode;
842
843onError:
844 Py_DECREF(unicode);
845 return NULL;
846}
847
848#undef UTF16_ERROR
849
850PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
851 int size,
852 const char *errors,
853 int byteorder)
854{
855 PyObject *v;
856 Py_UNICODE *p;
857 char *q;
858
859 /* We don't create UTF-16 pairs... */
860 v = PyString_FromStringAndSize(NULL,
861 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
862 if (v == NULL)
863 return NULL;
864 if (size == 0)
865 goto done;
866
867 q = PyString_AS_STRING(v);
868 p = (Py_UNICODE *)q;
869
870 if (byteorder == 0)
871 *p++ = 0xFEFF;
872 if (byteorder == 0 ||
873#ifdef BYTEORDER_IS_LITTLE_ENDIAN
874 byteorder == -1
875#else
876 byteorder == 1
877#endif
878 )
879 memcpy(p, s, size * sizeof(Py_UNICODE));
880 else
881 while (size-- > 0) {
882 Py_UNICODE ch = *s++;
883 *p++ = (ch >> 8) | (ch << 8);
884 }
885 done:
886 return v;
887}
888
889PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
890{
891 if (!PyUnicode_Check(unicode)) {
892 PyErr_BadArgument();
893 return NULL;
894 }
895 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
896 PyUnicode_GET_SIZE(unicode),
897 NULL,
898 0);
899}
900
901/* --- Unicode Escape Codec ----------------------------------------------- */
902
903static
904int unicodeescape_decoding_error(const char **source,
905 unsigned int *x,
906 const char *errors,
907 const char *details)
908{
909 if ((errors == NULL) ||
910 (strcmp(errors,"strict") == 0)) {
911 PyErr_Format(PyExc_UnicodeError,
912 "Unicode-Escape decoding error: %s",
913 details);
914 return -1;
915 }
916 else if (strcmp(errors,"ignore") == 0) {
917 return 0;
918 }
919 else if (strcmp(errors,"replace") == 0) {
920 *x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
921 return 0;
922 }
923 else {
924 PyErr_Format(PyExc_ValueError,
925 "Unicode-Escape decoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +0000926 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000927 errors);
928 return -1;
929 }
930}
931
932PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
933 int size,
934 const char *errors)
935{
936 PyUnicodeObject *v;
937 Py_UNICODE *p = NULL, *buf = NULL;
938 const char *end;
939
940 /* Escaped strings will always be longer than the resulting
941 Unicode string, so we start with size here and then reduce the
942 length after conversion to the true value. */
943 v = _PyUnicode_New(size);
944 if (v == NULL)
945 goto onError;
946 if (size == 0)
947 return (PyObject *)v;
948 p = buf = PyUnicode_AS_UNICODE(v);
949 end = s + size;
950 while (s < end) {
951 unsigned char c;
952 unsigned int x;
953 int i;
954
955 /* Non-escape characters are interpreted as Unicode ordinals */
956 if (*s != '\\') {
957 *p++ = (unsigned char)*s++;
958 continue;
959 }
960
961 /* \ - Escapes */
962 s++;
963 switch (*s++) {
964
965 /* \x escapes */
966 case '\n': break;
967 case '\\': *p++ = '\\'; break;
968 case '\'': *p++ = '\''; break;
969 case '\"': *p++ = '\"'; break;
970 case 'b': *p++ = '\b'; break;
971 case 'f': *p++ = '\014'; break; /* FF */
972 case 't': *p++ = '\t'; break;
973 case 'n': *p++ = '\n'; break;
974 case 'r': *p++ = '\r'; break;
975 case 'v': *p++ = '\013'; break; /* VT */
976 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
977
978 /* \OOO (octal) escapes */
979 case '0': case '1': case '2': case '3':
980 case '4': case '5': case '6': case '7':
981 c = s[-1] - '0';
982 if ('0' <= *s && *s <= '7') {
983 c = (c<<3) + *s++ - '0';
984 if ('0' <= *s && *s <= '7')
985 c = (c<<3) + *s++ - '0';
986 }
987 *p++ = c;
988 break;
989
990 /* \xXXXX escape with 0-4 hex digits */
991 case 'x':
992 x = 0;
993 c = (unsigned char)*s;
994 if (isxdigit(c)) {
995 do {
996 x = (x<<4) & ~0xF;
997 if ('0' <= c && c <= '9')
998 x += c - '0';
999 else if ('a' <= c && c <= 'f')
1000 x += 10 + c - 'a';
1001 else
1002 x += 10 + c - 'A';
1003 c = (unsigned char)*++s;
1004 } while (isxdigit(c));
1005 *p++ = x;
1006 } else {
1007 *p++ = '\\';
1008 *p++ = (unsigned char)s[-1];
1009 }
1010 break;
1011
1012 /* \uXXXX with 4 hex digits */
1013 case 'u':
1014 for (x = 0, i = 0; i < 4; i++) {
1015 c = (unsigned char)s[i];
1016 if (!isxdigit(c)) {
1017 if (unicodeescape_decoding_error(&s, &x, errors,
1018 "truncated \\uXXXX"))
1019 goto onError;
1020 i++;
1021 break;
1022 }
1023 x = (x<<4) & ~0xF;
1024 if (c >= '0' && c <= '9')
1025 x += c - '0';
1026 else if (c >= 'a' && c <= 'f')
1027 x += 10 + c - 'a';
1028 else
1029 x += 10 + c - 'A';
1030 }
1031 s += i;
1032 *p++ = x;
1033 break;
1034
1035 default:
1036 *p++ = '\\';
1037 *p++ = (unsigned char)s[-1];
1038 break;
1039 }
1040 }
1041 _PyUnicode_Resize(v, (int)(p - buf));
1042 return (PyObject *)v;
1043
1044 onError:
1045 Py_XDECREF(v);
1046 return NULL;
1047}
1048
1049/* Return a Unicode-Escape string version of the Unicode object.
1050
1051 If quotes is true, the string is enclosed in u"" or u'' quotes as
1052 appropriate.
1053
1054*/
1055
Barry Warsaw51ac5802000-03-20 16:36:48 +00001056static const Py_UNICODE *findchar(const Py_UNICODE *s,
1057 int size,
1058 Py_UNICODE ch);
1059
Guido van Rossumd57fd912000-03-10 22:53:23 +00001060static
1061PyObject *unicodeescape_string(const Py_UNICODE *s,
1062 int size,
1063 int quotes)
1064{
1065 PyObject *repr;
1066 char *p;
1067 char *q;
1068
1069 static const char *hexdigit = "0123456789ABCDEF";
1070
1071 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1072 if (repr == NULL)
1073 return NULL;
1074
1075 p = q = PyString_AS_STRING(repr);
1076
1077 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078 *p++ = 'u';
1079 *p++ = (findchar(s, size, '\'') &&
1080 !findchar(s, size, '"')) ? '"' : '\'';
1081 }
1082 while (size-- > 0) {
1083 Py_UNICODE ch = *s++;
1084 /* Escape quotes */
1085 if (quotes && (ch == q[1] || ch == '\\')) {
1086 *p++ = '\\';
1087 *p++ = (char) ch;
1088 }
1089 /* Map 16-bit characters to '\uxxxx' */
1090 else if (ch >= 256) {
1091 *p++ = '\\';
1092 *p++ = 'u';
1093 *p++ = hexdigit[(ch >> 12) & 0xf];
1094 *p++ = hexdigit[(ch >> 8) & 0xf];
1095 *p++ = hexdigit[(ch >> 4) & 0xf];
1096 *p++ = hexdigit[ch & 15];
1097 }
1098 /* Map non-printable US ASCII to '\ooo' */
1099 else if (ch < ' ' || ch >= 128) {
1100 *p++ = '\\';
1101 *p++ = hexdigit[(ch >> 6) & 7];
1102 *p++ = hexdigit[(ch >> 3) & 7];
1103 *p++ = hexdigit[ch & 7];
1104 }
1105 /* Copy everything else as-is */
1106 else
1107 *p++ = (char) ch;
1108 }
1109 if (quotes)
1110 *p++ = q[1];
1111
1112 *p = '\0';
1113 _PyString_Resize(&repr, p - q);
1114
1115 return repr;
1116}
1117
1118PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1119 int size)
1120{
1121 return unicodeescape_string(s, size, 0);
1122}
1123
1124PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1125{
1126 if (!PyUnicode_Check(unicode)) {
1127 PyErr_BadArgument();
1128 return NULL;
1129 }
1130 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1131 PyUnicode_GET_SIZE(unicode));
1132}
1133
1134/* --- Raw Unicode Escape Codec ------------------------------------------- */
1135
1136PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1137 int size,
1138 const char *errors)
1139{
1140 PyUnicodeObject *v;
1141 Py_UNICODE *p, *buf;
1142 const char *end;
1143 const char *bs;
1144
1145 /* Escaped strings will always be longer than the resulting
1146 Unicode string, so we start with size here and then reduce the
1147 length after conversion to the true value. */
1148 v = _PyUnicode_New(size);
1149 if (v == NULL)
1150 goto onError;
1151 if (size == 0)
1152 return (PyObject *)v;
1153 p = buf = PyUnicode_AS_UNICODE(v);
1154 end = s + size;
1155 while (s < end) {
1156 unsigned char c;
1157 unsigned int x;
1158 int i;
1159
1160 /* Non-escape characters are interpreted as Unicode ordinals */
1161 if (*s != '\\') {
1162 *p++ = (unsigned char)*s++;
1163 continue;
1164 }
1165
1166 /* \u-escapes are only interpreted iff the number of leading
1167 backslashes if odd */
1168 bs = s;
1169 for (;s < end;) {
1170 if (*s != '\\')
1171 break;
1172 *p++ = (unsigned char)*s++;
1173 }
1174 if (((s - bs) & 1) == 0 ||
1175 s >= end ||
1176 *s != 'u') {
1177 continue;
1178 }
1179 p--;
1180 s++;
1181
1182 /* \uXXXX with 4 hex digits */
1183 for (x = 0, i = 0; i < 4; i++) {
1184 c = (unsigned char)s[i];
1185 if (!isxdigit(c)) {
1186 if (unicodeescape_decoding_error(&s, &x, errors,
1187 "truncated \\uXXXX"))
1188 goto onError;
1189 i++;
1190 break;
1191 }
1192 x = (x<<4) & ~0xF;
1193 if (c >= '0' && c <= '9')
1194 x += c - '0';
1195 else if (c >= 'a' && c <= 'f')
1196 x += 10 + c - 'a';
1197 else
1198 x += 10 + c - 'A';
1199 }
1200 s += i;
1201 *p++ = x;
1202 }
1203 _PyUnicode_Resize(v, (int)(p - buf));
1204 return (PyObject *)v;
1205
1206 onError:
1207 Py_XDECREF(v);
1208 return NULL;
1209}
1210
1211PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1212 int size)
1213{
1214 PyObject *repr;
1215 char *p;
1216 char *q;
1217
1218 static const char *hexdigit = "0123456789ABCDEF";
1219
1220 repr = PyString_FromStringAndSize(NULL, 6 * size);
1221 if (repr == NULL)
1222 return NULL;
1223
1224 p = q = PyString_AS_STRING(repr);
1225 while (size-- > 0) {
1226 Py_UNICODE ch = *s++;
1227 /* Map 16-bit characters to '\uxxxx' */
1228 if (ch >= 256) {
1229 *p++ = '\\';
1230 *p++ = 'u';
1231 *p++ = hexdigit[(ch >> 12) & 0xf];
1232 *p++ = hexdigit[(ch >> 8) & 0xf];
1233 *p++ = hexdigit[(ch >> 4) & 0xf];
1234 *p++ = hexdigit[ch & 15];
1235 }
1236 /* Copy everything else as-is */
1237 else
1238 *p++ = (char) ch;
1239 }
1240 *p = '\0';
1241 _PyString_Resize(&repr, p - q);
1242
1243 return repr;
1244}
1245
1246PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1247{
1248 if (!PyUnicode_Check(unicode)) {
1249 PyErr_BadArgument();
1250 return NULL;
1251 }
1252 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1253 PyUnicode_GET_SIZE(unicode));
1254}
1255
1256/* --- Latin-1 Codec ------------------------------------------------------ */
1257
1258PyObject *PyUnicode_DecodeLatin1(const char *s,
1259 int size,
1260 const char *errors)
1261{
1262 PyUnicodeObject *v;
1263 Py_UNICODE *p;
1264
1265 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1266 v = _PyUnicode_New(size);
1267 if (v == NULL)
1268 goto onError;
1269 if (size == 0)
1270 return (PyObject *)v;
1271 p = PyUnicode_AS_UNICODE(v);
1272 while (size-- > 0)
1273 *p++ = (unsigned char)*s++;
1274 return (PyObject *)v;
1275
1276 onError:
1277 Py_XDECREF(v);
1278 return NULL;
1279}
1280
1281static
1282int latin1_encoding_error(const Py_UNICODE **source,
1283 char **dest,
1284 const char *errors,
1285 const char *details)
1286{
1287 if ((errors == NULL) ||
1288 (strcmp(errors,"strict") == 0)) {
1289 PyErr_Format(PyExc_UnicodeError,
1290 "Latin-1 encoding error: %s",
1291 details);
1292 return -1;
1293 }
1294 else if (strcmp(errors,"ignore") == 0) {
1295 return 0;
1296 }
1297 else if (strcmp(errors,"replace") == 0) {
1298 **dest = '?';
1299 return 0;
1300 }
1301 else {
1302 PyErr_Format(PyExc_ValueError,
1303 "Latin-1 encoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001304 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001305 errors);
1306 return -1;
1307 }
1308}
1309
1310PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1311 int size,
1312 const char *errors)
1313{
1314 PyObject *repr;
1315 char *s;
1316 repr = PyString_FromStringAndSize(NULL, size);
1317 if (repr == NULL)
1318 return NULL;
1319
1320 s = PyString_AS_STRING(repr);
1321 while (size-- > 0) {
1322 Py_UNICODE ch = *p++;
1323 if (ch >= 256) {
1324 if (latin1_encoding_error(&p, &s, errors,
1325 "ordinal not in range(256)"))
1326 goto onError;
1327 }
1328 else
1329 *s++ = (char)ch;
1330 }
1331 return repr;
1332
1333 onError:
1334 Py_DECREF(repr);
1335 return NULL;
1336}
1337
1338PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1339{
1340 if (!PyUnicode_Check(unicode)) {
1341 PyErr_BadArgument();
1342 return NULL;
1343 }
1344 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1345 PyUnicode_GET_SIZE(unicode),
1346 NULL);
1347}
1348
1349/* --- 7-bit ASCII Codec -------------------------------------------------- */
1350
1351static
1352int ascii_decoding_error(const char **source,
1353 Py_UNICODE **dest,
1354 const char *errors,
1355 const char *details)
1356{
1357 if ((errors == NULL) ||
1358 (strcmp(errors,"strict") == 0)) {
1359 PyErr_Format(PyExc_UnicodeError,
1360 "ASCII decoding error: %s",
1361 details);
1362 return -1;
1363 }
1364 else if (strcmp(errors,"ignore") == 0) {
1365 return 0;
1366 }
1367 else if (strcmp(errors,"replace") == 0) {
1368 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1369 (*dest)++;
1370 return 0;
1371 }
1372 else {
1373 PyErr_Format(PyExc_ValueError,
1374 "ASCII decoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001375 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001376 errors);
1377 return -1;
1378 }
1379}
1380
1381PyObject *PyUnicode_DecodeASCII(const char *s,
1382 int size,
1383 const char *errors)
1384{
1385 PyUnicodeObject *v;
1386 Py_UNICODE *p;
1387
1388 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1389 v = _PyUnicode_New(size);
1390 if (v == NULL)
1391 goto onError;
1392 if (size == 0)
1393 return (PyObject *)v;
1394 p = PyUnicode_AS_UNICODE(v);
1395 while (size-- > 0) {
1396 register unsigned char c;
1397
1398 c = (unsigned char)*s++;
1399 if (c < 128)
1400 *p++ = c;
1401 else if (ascii_decoding_error(&s, &p, errors,
1402 "ordinal not in range(128)"))
1403 goto onError;
1404 }
1405 if (p - PyUnicode_AS_UNICODE(v) < size)
1406 _PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));
1407 return (PyObject *)v;
1408
1409 onError:
1410 Py_XDECREF(v);
1411 return NULL;
1412}
1413
1414static
1415int ascii_encoding_error(const Py_UNICODE **source,
1416 char **dest,
1417 const char *errors,
1418 const char *details)
1419{
1420 if ((errors == NULL) ||
1421 (strcmp(errors,"strict") == 0)) {
1422 PyErr_Format(PyExc_UnicodeError,
1423 "ASCII encoding error: %s",
1424 details);
1425 return -1;
1426 }
1427 else if (strcmp(errors,"ignore") == 0) {
1428 return 0;
1429 }
1430 else if (strcmp(errors,"replace") == 0) {
1431 **dest = '?';
1432 return 0;
1433 }
1434 else {
1435 PyErr_Format(PyExc_ValueError,
1436 "ASCII encoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001437 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001438 errors);
1439 return -1;
1440 }
1441}
1442
1443PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1444 int size,
1445 const char *errors)
1446{
1447 PyObject *repr;
1448 char *s;
1449 repr = PyString_FromStringAndSize(NULL, size);
1450 if (repr == NULL)
1451 return NULL;
1452
1453 s = PyString_AS_STRING(repr);
1454 while (size-- > 0) {
1455 Py_UNICODE ch = *p++;
1456 if (ch >= 128) {
1457 if (ascii_encoding_error(&p, &s, errors,
1458 "ordinal not in range(128)"))
1459 goto onError;
1460 }
1461 else
1462 *s++ = (char)ch;
1463 }
1464 return repr;
1465
1466 onError:
1467 Py_DECREF(repr);
1468 return NULL;
1469}
1470
1471PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1472{
1473 if (!PyUnicode_Check(unicode)) {
1474 PyErr_BadArgument();
1475 return NULL;
1476 }
1477 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1478 PyUnicode_GET_SIZE(unicode),
1479 NULL);
1480}
1481
1482/* --- Character Mapping Codec -------------------------------------------- */
1483
1484static
1485int charmap_decoding_error(const char **source,
1486 Py_UNICODE **dest,
1487 const char *errors,
1488 const char *details)
1489{
1490 if ((errors == NULL) ||
1491 (strcmp(errors,"strict") == 0)) {
1492 PyErr_Format(PyExc_UnicodeError,
1493 "charmap decoding error: %s",
1494 details);
1495 return -1;
1496 }
1497 else if (strcmp(errors,"ignore") == 0) {
1498 return 0;
1499 }
1500 else if (strcmp(errors,"replace") == 0) {
1501 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1502 (*dest)++;
1503 return 0;
1504 }
1505 else {
1506 PyErr_Format(PyExc_ValueError,
1507 "charmap decoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001508 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001509 errors);
1510 return -1;
1511 }
1512}
1513
1514PyObject *PyUnicode_DecodeCharmap(const char *s,
1515 int size,
1516 PyObject *mapping,
1517 const char *errors)
1518{
1519 PyUnicodeObject *v;
1520 Py_UNICODE *p;
1521
1522 /* Default to Latin-1 */
1523 if (mapping == NULL)
1524 return PyUnicode_DecodeLatin1(s, size, errors);
1525
1526 v = _PyUnicode_New(size);
1527 if (v == NULL)
1528 goto onError;
1529 if (size == 0)
1530 return (PyObject *)v;
1531 p = PyUnicode_AS_UNICODE(v);
1532 while (size-- > 0) {
1533 unsigned char ch = *s++;
1534 PyObject *w, *x;
1535
1536 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1537 w = PyInt_FromLong((long)ch);
1538 if (w == NULL)
1539 goto onError;
1540 x = PyObject_GetItem(mapping, w);
1541 Py_DECREF(w);
1542 if (x == NULL) {
1543 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1544 /* No mapping found: default to Latin-1 mapping */
1545 PyErr_Clear();
1546 *p++ = (Py_UNICODE)ch;
1547 continue;
1548 }
1549 goto onError;
1550 }
1551
1552 /* Apply mapping */
1553 if (PyInt_Check(x)) {
1554 int value = PyInt_AS_LONG(x);
1555 if (value < 0 || value > 65535) {
1556 PyErr_SetString(PyExc_TypeError,
1557 "character mapping must be in range(65336)");
1558 Py_DECREF(x);
1559 goto onError;
1560 }
1561 *p++ = (Py_UNICODE)value;
1562 }
1563 else if (x == Py_None) {
1564 /* undefined mapping */
1565 if (charmap_decoding_error(&s, &p, errors,
1566 "character maps to <undefined>")) {
1567 Py_DECREF(x);
1568 goto onError;
1569 }
1570 }
1571 else if (PyUnicode_Check(x)) {
1572 if (PyUnicode_GET_SIZE(x) != 1) {
1573 /* 1-n mapping */
1574 PyErr_SetString(PyExc_NotImplementedError,
1575 "1-n mappings are currently not implemented");
1576 Py_DECREF(x);
1577 goto onError;
1578 }
1579 *p++ = *PyUnicode_AS_UNICODE(x);
1580 }
1581 else {
1582 /* wrong return value */
1583 PyErr_SetString(PyExc_TypeError,
1584 "character mapping must return integer, None or unicode");
1585 Py_DECREF(x);
1586 goto onError;
1587 }
1588 Py_DECREF(x);
1589 }
1590 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1591 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1592 goto onError;
1593 return (PyObject *)v;
1594
1595 onError:
1596 Py_XDECREF(v);
1597 return NULL;
1598}
1599
1600static
1601int charmap_encoding_error(const Py_UNICODE **source,
1602 char **dest,
1603 const char *errors,
1604 const char *details)
1605{
1606 if ((errors == NULL) ||
1607 (strcmp(errors,"strict") == 0)) {
1608 PyErr_Format(PyExc_UnicodeError,
1609 "charmap encoding error: %s",
1610 details);
1611 return -1;
1612 }
1613 else if (strcmp(errors,"ignore") == 0) {
1614 return 0;
1615 }
1616 else if (strcmp(errors,"replace") == 0) {
1617 **dest = '?';
1618 (*dest)++;
1619 return 0;
1620 }
1621 else {
1622 PyErr_Format(PyExc_ValueError,
1623 "charmap encoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001624 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001625 errors);
1626 return -1;
1627 }
1628}
1629
1630PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
1631 int size,
1632 PyObject *mapping,
1633 const char *errors)
1634{
1635 PyObject *v;
1636 char *s;
1637
1638 /* Default to Latin-1 */
1639 if (mapping == NULL)
1640 return PyUnicode_EncodeLatin1(p, size, errors);
1641
1642 v = PyString_FromStringAndSize(NULL, size);
1643 if (v == NULL)
1644 return NULL;
1645 s = PyString_AS_STRING(v);
1646 while (size-- > 0) {
1647 Py_UNICODE ch = *p++;
1648 PyObject *w, *x;
1649
1650 /* Get mapping (Unicode ordinal -> string char, integer or None) */
1651 w = PyInt_FromLong((long)ch);
1652 if (w == NULL)
1653 goto onError;
1654 x = PyObject_GetItem(mapping, w);
1655 Py_DECREF(w);
1656 if (x == NULL) {
1657 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1658 /* No mapping found: default to Latin-1 mapping if possible */
1659 PyErr_Clear();
1660 if (ch < 256) {
1661 *s++ = (char)ch;
1662 continue;
1663 }
1664 else if (!charmap_encoding_error(&p, &s, errors,
1665 "missing character mapping"))
1666 continue;
1667 }
1668 goto onError;
1669 }
1670
1671 /* Apply mapping */
1672 if (PyInt_Check(x)) {
1673 int value = PyInt_AS_LONG(x);
1674 if (value < 0 || value > 255) {
1675 PyErr_SetString(PyExc_TypeError,
1676 "character mapping must be in range(256)");
1677 Py_DECREF(x);
1678 goto onError;
1679 }
1680 *s++ = (char)value;
1681 }
1682 else if (x == Py_None) {
1683 /* undefined mapping */
1684 if (charmap_encoding_error(&p, &s, errors,
1685 "character maps to <undefined>")) {
1686 Py_DECREF(x);
1687 goto onError;
1688 }
1689 }
1690 else if (PyString_Check(x)) {
1691 if (PyString_GET_SIZE(x) != 1) {
1692 /* 1-n mapping */
1693 PyErr_SetString(PyExc_NotImplementedError,
1694 "1-n mappings are currently not implemented");
1695 Py_DECREF(x);
1696 goto onError;
1697 }
1698 *s++ = *PyString_AS_STRING(x);
1699 }
1700 else {
1701 /* wrong return value */
1702 PyErr_SetString(PyExc_TypeError,
1703 "character mapping must return integer, None or unicode");
1704 Py_DECREF(x);
1705 goto onError;
1706 }
1707 Py_DECREF(x);
1708 }
1709 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
1710 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
1711 goto onError;
1712 return v;
1713
1714 onError:
1715 Py_DECREF(v);
1716 return NULL;
1717}
1718
1719PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
1720 PyObject *mapping)
1721{
1722 if (!PyUnicode_Check(unicode) || mapping == NULL) {
1723 PyErr_BadArgument();
1724 return NULL;
1725 }
1726 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
1727 PyUnicode_GET_SIZE(unicode),
1728 mapping,
1729 NULL);
1730}
1731
1732static
1733int translate_error(const Py_UNICODE **source,
1734 Py_UNICODE **dest,
1735 const char *errors,
1736 const char *details)
1737{
1738 if ((errors == NULL) ||
1739 (strcmp(errors,"strict") == 0)) {
1740 PyErr_Format(PyExc_UnicodeError,
1741 "translate error: %s",
1742 details);
1743 return -1;
1744 }
1745 else if (strcmp(errors,"ignore") == 0) {
1746 return 0;
1747 }
1748 else if (strcmp(errors,"replace") == 0) {
1749 **dest = '?';
1750 (*dest)++;
1751 return 0;
1752 }
1753 else {
1754 PyErr_Format(PyExc_ValueError,
1755 "translate error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001756 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001757 errors);
1758 return -1;
1759 }
1760}
1761
1762PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
1763 int size,
1764 PyObject *mapping,
1765 const char *errors)
1766{
1767 PyUnicodeObject *v;
1768 Py_UNICODE *p;
1769
1770 if (mapping == NULL) {
1771 PyErr_BadArgument();
1772 return NULL;
1773 }
1774
1775 /* Output will never be longer than input */
1776 v = _PyUnicode_New(size);
1777 if (v == NULL)
1778 goto onError;
1779 if (size == 0)
1780 goto done;
1781 p = PyUnicode_AS_UNICODE(v);
1782 while (size-- > 0) {
1783 Py_UNICODE ch = *s++;
1784 PyObject *w, *x;
1785
1786 /* Get mapping */
1787 w = PyInt_FromLong(ch);
1788 if (w == NULL)
1789 goto onError;
1790 x = PyObject_GetItem(mapping, w);
1791 Py_DECREF(w);
1792 if (x == NULL) {
1793 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1794 /* No mapping found: default to 1-1 mapping */
1795 PyErr_Clear();
1796 *p++ = ch;
1797 continue;
1798 }
1799 goto onError;
1800 }
1801
1802 /* Apply mapping */
1803 if (PyInt_Check(x))
1804 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
1805 else if (x == Py_None) {
1806 /* undefined mapping */
1807 if (translate_error(&s, &p, errors,
1808 "character maps to <undefined>")) {
1809 Py_DECREF(x);
1810 goto onError;
1811 }
1812 }
1813 else if (PyUnicode_Check(x)) {
1814 if (PyUnicode_GET_SIZE(x) != 1) {
1815 /* 1-n mapping */
1816 PyErr_SetString(PyExc_NotImplementedError,
1817 "1-n mappings are currently not implemented");
1818 Py_DECREF(x);
1819 goto onError;
1820 }
1821 *p++ = *PyUnicode_AS_UNICODE(x);
1822 }
1823 else {
1824 /* wrong return value */
1825 PyErr_SetString(PyExc_TypeError,
1826 "translate mapping must return integer, None or unicode");
1827 Py_DECREF(x);
1828 goto onError;
1829 }
1830 Py_DECREF(x);
1831 }
1832 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1833 _PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));
1834
1835 done:
1836 return (PyObject *)v;
1837
1838 onError:
1839 Py_XDECREF(v);
1840 return NULL;
1841}
1842
1843PyObject *PyUnicode_Translate(PyObject *str,
1844 PyObject *mapping,
1845 const char *errors)
1846{
1847 PyObject *result;
1848
1849 str = PyUnicode_FromObject(str);
1850 if (str == NULL)
1851 goto onError;
1852 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
1853 PyUnicode_GET_SIZE(str),
1854 mapping,
1855 errors);
1856 Py_DECREF(str);
1857 return result;
1858
1859 onError:
1860 Py_XDECREF(str);
1861 return NULL;
1862}
1863
1864/* --- Helpers ------------------------------------------------------------ */
1865
1866static
1867int count(PyUnicodeObject *self,
1868 int start,
1869 int end,
1870 PyUnicodeObject *substring)
1871{
1872 int count = 0;
1873
1874 end -= substring->length;
1875
1876 while (start <= end)
1877 if (Py_UNICODE_MATCH(self, start, substring)) {
1878 count++;
1879 start += substring->length;
1880 } else
1881 start++;
1882
1883 return count;
1884}
1885
1886int PyUnicode_Count(PyObject *str,
1887 PyObject *substr,
1888 int start,
1889 int end)
1890{
1891 int result;
1892
1893 str = PyUnicode_FromObject(str);
1894 if (str == NULL)
1895 return -1;
1896 substr = PyUnicode_FromObject(substr);
1897 if (substr == NULL) {
1898 Py_DECREF(substr);
1899 return -1;
1900 }
1901
1902 result = count((PyUnicodeObject *)str,
1903 start, end,
1904 (PyUnicodeObject *)substr);
1905
1906 Py_DECREF(str);
1907 Py_DECREF(substr);
1908 return result;
1909}
1910
1911static
1912int findstring(PyUnicodeObject *self,
1913 PyUnicodeObject *substring,
1914 int start,
1915 int end,
1916 int direction)
1917{
1918 if (start < 0)
1919 start += self->length;
1920 if (start < 0)
1921 start = 0;
1922
1923 if (substring->length == 0)
1924 return start;
1925
1926 if (end > self->length)
1927 end = self->length;
1928 if (end < 0)
1929 end += self->length;
1930 if (end < 0)
1931 end = 0;
1932
1933 end -= substring->length;
1934
1935 if (direction < 0) {
1936 for (; end >= start; end--)
1937 if (Py_UNICODE_MATCH(self, end, substring))
1938 return end;
1939 } else {
1940 for (; start <= end; start++)
1941 if (Py_UNICODE_MATCH(self, start, substring))
1942 return start;
1943 }
1944
1945 return -1;
1946}
1947
1948int PyUnicode_Find(PyObject *str,
1949 PyObject *substr,
1950 int start,
1951 int end,
1952 int direction)
1953{
1954 int result;
1955
1956 str = PyUnicode_FromObject(str);
1957 if (str == NULL)
1958 return -1;
1959 substr = PyUnicode_FromObject(substr);
1960 if (substr == NULL) {
1961 Py_DECREF(substr);
1962 return -1;
1963 }
1964
1965 result = findstring((PyUnicodeObject *)str,
1966 (PyUnicodeObject *)substr,
1967 start, end, direction);
1968 Py_DECREF(str);
1969 Py_DECREF(substr);
1970 return result;
1971}
1972
1973static
1974int tailmatch(PyUnicodeObject *self,
1975 PyUnicodeObject *substring,
1976 int start,
1977 int end,
1978 int direction)
1979{
1980 if (start < 0)
1981 start += self->length;
1982 if (start < 0)
1983 start = 0;
1984
1985 if (substring->length == 0)
1986 return 1;
1987
1988 if (end > self->length)
1989 end = self->length;
1990 if (end < 0)
1991 end += self->length;
1992 if (end < 0)
1993 end = 0;
1994
1995 end -= substring->length;
1996 if (end < start)
1997 return 0;
1998
1999 if (direction > 0) {
2000 if (Py_UNICODE_MATCH(self, end, substring))
2001 return 1;
2002 } else {
2003 if (Py_UNICODE_MATCH(self, start, substring))
2004 return 1;
2005 }
2006
2007 return 0;
2008}
2009
2010int PyUnicode_Tailmatch(PyObject *str,
2011 PyObject *substr,
2012 int start,
2013 int end,
2014 int direction)
2015{
2016 int result;
2017
2018 str = PyUnicode_FromObject(str);
2019 if (str == NULL)
2020 return -1;
2021 substr = PyUnicode_FromObject(substr);
2022 if (substr == NULL) {
2023 Py_DECREF(substr);
2024 return -1;
2025 }
2026
2027 result = tailmatch((PyUnicodeObject *)str,
2028 (PyUnicodeObject *)substr,
2029 start, end, direction);
2030 Py_DECREF(str);
2031 Py_DECREF(substr);
2032 return result;
2033}
2034
2035static
2036const Py_UNICODE *findchar(const Py_UNICODE *s,
2037 int size,
2038 Py_UNICODE ch)
2039{
2040 /* like wcschr, but doesn't stop at NULL characters */
2041
2042 while (size-- > 0) {
2043 if (*s == ch)
2044 return s;
2045 s++;
2046 }
2047
2048 return NULL;
2049}
2050
2051/* Apply fixfct filter to the Unicode object self and return a
2052 reference to the modified object */
2053
2054static
2055PyObject *fixup(PyUnicodeObject *self,
2056 int (*fixfct)(PyUnicodeObject *s))
2057{
2058
2059 PyUnicodeObject *u;
2060
2061 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2062 self->length);
2063 if (u == NULL)
2064 return NULL;
2065 if (!fixfct(u)) {
2066 /* fixfct should return TRUE if it modified the buffer. If
2067 FALSE, return a reference to the original buffer instead
2068 (to save space, not time) */
2069 Py_INCREF(self);
2070 Py_DECREF(u);
2071 return (PyObject*) self;
2072 }
2073 return (PyObject*) u;
2074}
2075
2076static
2077int fixupper(PyUnicodeObject *self)
2078{
2079 int len = self->length;
2080 Py_UNICODE *s = self->str;
2081 int status = 0;
2082
2083 while (len-- > 0) {
2084 register Py_UNICODE ch;
2085
2086 ch = Py_UNICODE_TOUPPER(*s);
2087 if (ch != *s) {
2088 status = 1;
2089 *s = ch;
2090 }
2091 s++;
2092 }
2093
2094 return status;
2095}
2096
2097static
2098int fixlower(PyUnicodeObject *self)
2099{
2100 int len = self->length;
2101 Py_UNICODE *s = self->str;
2102 int status = 0;
2103
2104 while (len-- > 0) {
2105 register Py_UNICODE ch;
2106
2107 ch = Py_UNICODE_TOLOWER(*s);
2108 if (ch != *s) {
2109 status = 1;
2110 *s = ch;
2111 }
2112 s++;
2113 }
2114
2115 return status;
2116}
2117
2118static
2119int fixswapcase(PyUnicodeObject *self)
2120{
2121 int len = self->length;
2122 Py_UNICODE *s = self->str;
2123 int status = 0;
2124
2125 while (len-- > 0) {
2126 if (Py_UNICODE_ISUPPER(*s)) {
2127 *s = Py_UNICODE_TOLOWER(*s);
2128 status = 1;
2129 } else if (Py_UNICODE_ISLOWER(*s)) {
2130 *s = Py_UNICODE_TOUPPER(*s);
2131 status = 1;
2132 }
2133 s++;
2134 }
2135
2136 return status;
2137}
2138
2139static
2140int fixcapitalize(PyUnicodeObject *self)
2141{
2142 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2143 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2144 return 1;
2145 }
2146 return 0;
2147}
2148
2149static
2150int fixtitle(PyUnicodeObject *self)
2151{
2152 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2153 register Py_UNICODE *e;
2154 int previous_is_cased;
2155
2156 /* Shortcut for single character strings */
2157 if (PyUnicode_GET_SIZE(self) == 1) {
2158 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2159 if (*p != ch) {
2160 *p = ch;
2161 return 1;
2162 }
2163 else
2164 return 0;
2165 }
2166
2167 e = p + PyUnicode_GET_SIZE(self);
2168 previous_is_cased = 0;
2169 for (; p < e; p++) {
2170 register const Py_UNICODE ch = *p;
2171
2172 if (previous_is_cased)
2173 *p = Py_UNICODE_TOLOWER(ch);
2174 else
2175 *p = Py_UNICODE_TOTITLE(ch);
2176
2177 if (Py_UNICODE_ISLOWER(ch) ||
2178 Py_UNICODE_ISUPPER(ch) ||
2179 Py_UNICODE_ISTITLE(ch))
2180 previous_is_cased = 1;
2181 else
2182 previous_is_cased = 0;
2183 }
2184 return 1;
2185}
2186
2187PyObject *PyUnicode_Join(PyObject *separator,
2188 PyObject *seq)
2189{
2190 Py_UNICODE *sep;
2191 int seplen;
2192 PyUnicodeObject *res = NULL;
2193 int reslen = 0;
2194 Py_UNICODE *p;
2195 int seqlen = 0;
2196 int sz = 100;
2197 int i;
2198
2199 seqlen = PySequence_Length(seq);
2200 if (seqlen < 0 && PyErr_Occurred())
2201 return NULL;
2202
2203 if (separator == NULL) {
2204 Py_UNICODE blank = ' ';
2205 sep = &blank;
2206 seplen = 1;
2207 }
2208 else {
2209 separator = PyUnicode_FromObject(separator);
2210 if (separator == NULL)
2211 return NULL;
2212 sep = PyUnicode_AS_UNICODE(separator);
2213 seplen = PyUnicode_GET_SIZE(separator);
2214 }
2215
2216 res = _PyUnicode_New(sz);
2217 if (res == NULL)
2218 goto onError;
2219 p = PyUnicode_AS_UNICODE(res);
2220 reslen = 0;
2221
2222 for (i = 0; i < seqlen; i++) {
2223 int itemlen;
2224 PyObject *item;
2225
2226 item = PySequence_GetItem(seq, i);
2227 if (item == NULL)
2228 goto onError;
2229 if (!PyUnicode_Check(item)) {
2230 PyObject *v;
2231 v = PyUnicode_FromObject(item);
2232 Py_DECREF(item);
2233 item = v;
2234 if (item == NULL)
2235 goto onError;
2236 }
2237 itemlen = PyUnicode_GET_SIZE(item);
2238 while (reslen + itemlen + seplen >= sz) {
2239 if (_PyUnicode_Resize(res, sz*2))
2240 goto onError;
2241 sz *= 2;
2242 p = PyUnicode_AS_UNICODE(res) + reslen;
2243 }
2244 if (i > 0) {
2245 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2246 p += seplen;
2247 reslen += seplen;
2248 }
2249 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2250 p += itemlen;
2251 reslen += itemlen;
2252 Py_DECREF(item);
2253 }
2254 if (_PyUnicode_Resize(res, reslen))
2255 goto onError;
2256
2257 Py_XDECREF(separator);
2258 return (PyObject *)res;
2259
2260 onError:
2261 Py_XDECREF(separator);
2262 Py_DECREF(res);
2263 return NULL;
2264}
2265
2266static
2267PyUnicodeObject *pad(PyUnicodeObject *self,
2268 int left,
2269 int right,
2270 Py_UNICODE fill)
2271{
2272 PyUnicodeObject *u;
2273
2274 if (left < 0)
2275 left = 0;
2276 if (right < 0)
2277 right = 0;
2278
2279 if (left == 0 && right == 0) {
2280 Py_INCREF(self);
2281 return self;
2282 }
2283
2284 u = _PyUnicode_New(left + self->length + right);
2285 if (u) {
2286 if (left)
2287 Py_UNICODE_FILL(u->str, fill, left);
2288 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2289 if (right)
2290 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2291 }
2292
2293 return u;
2294}
2295
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The macro relies on names from the expanding function: a
   `PyObject *str` scratch variable, the list object `list`, and an
   `onError` label that is jumped to when the slice cannot be created or
   appended.  It is wrapped in do { } while (0) so that it behaves as a
   single statement, and its arguments are parenthesised so that
   expressions can be passed safely. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2306
2307static
2308PyObject *split_whitespace(PyUnicodeObject *self,
2309 PyObject *list,
2310 int maxcount)
2311{
2312 register int i;
2313 register int j;
2314 int len = self->length;
2315 PyObject *str;
2316
2317 for (i = j = 0; i < len; ) {
2318 /* find a token */
2319 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2320 i++;
2321 j = i;
2322 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2323 i++;
2324 if (j < i) {
2325 if (maxcount-- <= 0)
2326 break;
2327 SPLIT_APPEND(self->str, j, i);
2328 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2329 i++;
2330 j = i;
2331 }
2332 }
2333 if (j < len) {
2334 SPLIT_APPEND(self->str, j, len);
2335 }
2336 return list;
2337
2338 onError:
2339 Py_DECREF(list);
2340 return NULL;
2341}
2342
2343PyObject *PyUnicode_Splitlines(PyObject *string,
2344 int maxcount)
2345{
2346 register int i;
2347 register int j;
2348 int len;
2349 PyObject *list;
2350 PyObject *str;
2351 Py_UNICODE *data;
2352
2353 string = PyUnicode_FromObject(string);
2354 if (string == NULL)
2355 return NULL;
2356 data = PyUnicode_AS_UNICODE(string);
2357 len = PyUnicode_GET_SIZE(string);
2358
2359 if (maxcount < 0)
2360 maxcount = INT_MAX;
2361
2362 list = PyList_New(0);
2363 if (!list)
2364 goto onError;
2365
2366 for (i = j = 0; i < len; ) {
2367 /* Find a line and append it */
2368 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2369 i++;
2370 if (maxcount-- <= 0)
2371 break;
2372 SPLIT_APPEND(data, j, i);
2373
2374 /* Skip the line break reading CRLF as one line break */
2375 if (i < len) {
2376 if (data[i] == '\r' && i + 1 < len &&
2377 data[i+1] == '\n')
2378 i += 2;
2379 else
2380 i++;
2381 }
2382 j = i;
2383 }
2384 if (j < len) {
2385 SPLIT_APPEND(data, j, len);
2386 }
2387
2388 Py_DECREF(string);
2389 return list;
2390
2391 onError:
2392 Py_DECREF(list);
2393 Py_DECREF(string);
2394 return NULL;
2395}
2396
2397static
2398PyObject *split_char(PyUnicodeObject *self,
2399 PyObject *list,
2400 Py_UNICODE ch,
2401 int maxcount)
2402{
2403 register int i;
2404 register int j;
2405 int len = self->length;
2406 PyObject *str;
2407
2408 for (i = j = 0; i < len; ) {
2409 if (self->str[i] == ch) {
2410 if (maxcount-- <= 0)
2411 break;
2412 SPLIT_APPEND(self->str, j, i);
2413 i = j = i + 1;
2414 } else
2415 i++;
2416 }
2417 if (j <= len) {
2418 SPLIT_APPEND(self->str, j, len);
2419 }
2420 return list;
2421
2422 onError:
2423 Py_DECREF(list);
2424 return NULL;
2425}
2426
2427static
2428PyObject *split_substring(PyUnicodeObject *self,
2429 PyObject *list,
2430 PyUnicodeObject *substring,
2431 int maxcount)
2432{
2433 register int i;
2434 register int j;
2435 int len = self->length;
2436 int sublen = substring->length;
2437 PyObject *str;
2438
2439 for (i = j = 0; i < len - sublen; ) {
2440 if (Py_UNICODE_MATCH(self, i, substring)) {
2441 if (maxcount-- <= 0)
2442 break;
2443 SPLIT_APPEND(self->str, j, i);
2444 i = j = i + sublen;
2445 } else
2446 i++;
2447 }
2448 if (j <= len) {
2449 SPLIT_APPEND(self->str, j, len);
2450 }
2451 return list;
2452
2453 onError:
2454 Py_DECREF(list);
2455 return NULL;
2456}
2457
2458#undef SPLIT_APPEND
2459
2460static
2461PyObject *split(PyUnicodeObject *self,
2462 PyUnicodeObject *substring,
2463 int maxcount)
2464{
2465 PyObject *list;
2466
2467 if (maxcount < 0)
2468 maxcount = INT_MAX;
2469
2470 list = PyList_New(0);
2471 if (!list)
2472 return NULL;
2473
2474 if (substring == NULL)
2475 return split_whitespace(self,list,maxcount);
2476
2477 else if (substring->length == 1)
2478 return split_char(self,list,substring->str[0],maxcount);
2479
2480 else if (substring->length == 0) {
2481 Py_DECREF(list);
2482 PyErr_SetString(PyExc_ValueError, "empty separator");
2483 return NULL;
2484 }
2485 else
2486 return split_substring(self,list,substring,maxcount);
2487}
2488
2489static
2490PyObject *strip(PyUnicodeObject *self,
2491 int left,
2492 int right)
2493{
2494 Py_UNICODE *p = self->str;
2495 int start = 0;
2496 int end = self->length;
2497
2498 if (left)
2499 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2500 start++;
2501
2502 if (right)
2503 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2504 end--;
2505
2506 if (start == 0 && end == self->length) {
2507 /* couldn't strip anything off, return original string */
2508 Py_INCREF(self);
2509 return (PyObject*) self;
2510 }
2511
2512 return (PyObject*) PyUnicode_FromUnicode(
2513 self->str + start,
2514 end - start
2515 );
2516}
2517
2518static
2519PyObject *replace(PyUnicodeObject *self,
2520 PyUnicodeObject *str1,
2521 PyUnicodeObject *str2,
2522 int maxcount)
2523{
2524 PyUnicodeObject *u;
2525
2526 if (maxcount < 0)
2527 maxcount = INT_MAX;
2528
2529 if (str1->length == 1 && str2->length == 1) {
2530 int i;
2531
2532 /* replace characters */
2533 if (!findchar(self->str, self->length, str1->str[0])) {
2534 /* nothing to replace, return original string */
2535 Py_INCREF(self);
2536 u = self;
2537 } else {
2538 Py_UNICODE u1 = str1->str[0];
2539 Py_UNICODE u2 = str2->str[0];
2540
2541 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
2542 self->str,
2543 self->length
2544 );
2545 if (u)
2546 for (i = 0; i < u->length; i++)
2547 if (u->str[i] == u1) {
2548 if (--maxcount < 0)
2549 break;
2550 u->str[i] = u2;
2551 }
2552 }
2553
2554 } else {
2555 int n, i;
2556 Py_UNICODE *p;
2557
2558 /* replace strings */
2559 n = count(self, 0, self->length, str1);
2560 if (n > maxcount)
2561 n = maxcount;
2562 if (n == 0) {
2563 /* nothing to replace, return original string */
2564 Py_INCREF(self);
2565 u = self;
2566 } else {
2567 u = _PyUnicode_New(
2568 self->length + n * (str2->length - str1->length));
2569 if (u) {
2570 i = 0;
2571 p = u->str;
2572 while (i <= self->length - str1->length)
2573 if (Py_UNICODE_MATCH(self, i, str1)) {
2574 /* replace string segment */
2575 Py_UNICODE_COPY(p, str2->str, str2->length);
2576 p += str2->length;
2577 i += str1->length;
2578 if (--n <= 0) {
2579 /* copy remaining part */
2580 Py_UNICODE_COPY(p, self->str+i, self->length-i);
2581 break;
2582 }
2583 } else
2584 *p++ = self->str[i++];
2585 }
2586 }
2587 }
2588
2589 return (PyObject *) u;
2590}
2591
2592/* --- Unicode Object Methods --------------------------------------------- */
2593
2594static char title__doc__[] =
2595"S.title() -> unicode\n\
2596\n\
2597Return a titlecased version of S, i.e. words start with title case\n\
2598characters, all remaining cased characters have lower case.";
2599
2600static PyObject*
2601unicode_title(PyUnicodeObject *self, PyObject *args)
2602{
2603 if (!PyArg_NoArgs(args))
2604 return NULL;
2605 return fixup(self, fixtitle);
2606}
2607
2608static char capitalize__doc__[] =
2609"S.capitalize() -> unicode\n\
2610\n\
2611Return a capitalized version of S, i.e. make the first character\n\
2612have upper case.";
2613
2614static PyObject*
2615unicode_capitalize(PyUnicodeObject *self, PyObject *args)
2616{
2617 if (!PyArg_NoArgs(args))
2618 return NULL;
2619 return fixup(self, fixcapitalize);
2620}
2621
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords() (compiled out): split S at whitespace, capitalize each
   word and join the words again with single blanks. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *item;
    int i;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, words);

onError:
    /* item is NULL here when capitalizing a word failed */
    Py_DECREF(words);
    return (PyObject *)item;
}
#endif
2662
2663static char center__doc__[] =
2664"S.center(width) -> unicode\n\
2665\n\
2666Return S centered in a Unicode string of length width. Padding is done\n\
2667using spaces.";
2668
2669static PyObject *
2670unicode_center(PyUnicodeObject *self, PyObject *args)
2671{
2672 int marg, left;
2673 int width;
2674
2675 if (!PyArg_ParseTuple(args, "i:center", &width))
2676 return NULL;
2677
2678 if (self->length >= width) {
2679 Py_INCREF(self);
2680 return (PyObject*) self;
2681 }
2682
2683 marg = width - self->length;
2684 left = marg / 2 + (marg & width & 1);
2685
2686 return (PyObject*) pad(self, left, marg - left, ' ');
2687}
2688
2689static int
2690unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
2691{
2692 int len1, len2;
2693 Py_UNICODE *s1 = str1->str;
2694 Py_UNICODE *s2 = str2->str;
2695
2696 len1 = str1->length;
2697 len2 = str2->length;
2698
2699 while (len1 > 0 && len2 > 0) {
2700 int cmp = (*s1++) - (*s2++);
2701 if (cmp)
2702 /* This should make Christian happy! */
2703 return (cmp < 0) ? -1 : (cmp != 0);
2704 len1--, len2--;
2705 }
2706
2707 return (len1 < len2) ? -1 : (len1 != len2);
2708}
2709
2710int PyUnicode_Compare(PyObject *left,
2711 PyObject *right)
2712{
2713 PyUnicodeObject *u = NULL, *v = NULL;
2714 int result;
2715
2716 /* Coerce the two arguments */
2717 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2718 if (u == NULL)
2719 goto onError;
2720 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2721 if (v == NULL)
2722 goto onError;
2723
2724 /* Shortcut for emtpy or interned objects */
2725 if (v == u) {
2726 Py_DECREF(u);
2727 Py_DECREF(v);
2728 return 0;
2729 }
2730
2731 result = unicode_compare(u, v);
2732
2733 Py_DECREF(u);
2734 Py_DECREF(v);
2735 return result;
2736
2737onError:
2738 Py_XDECREF(u);
2739 Py_XDECREF(v);
2740 return -1;
2741}
2742
Guido van Rossum403d68b2000-03-13 15:55:09 +00002743int PyUnicode_Contains(PyObject *container,
2744 PyObject *element)
2745{
2746 PyUnicodeObject *u = NULL, *v = NULL;
2747 int result;
2748 register const Py_UNICODE *p, *e;
2749 register Py_UNICODE ch;
2750
2751 /* Coerce the two arguments */
2752 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
2753 if (u == NULL)
2754 goto onError;
2755 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
2756 if (v == NULL)
2757 goto onError;
2758
2759 /* Check v in u */
2760 if (PyUnicode_GET_SIZE(v) != 1) {
2761 PyErr_SetString(PyExc_TypeError,
2762 "string member test needs char left operand");
2763 goto onError;
2764 }
2765 ch = *PyUnicode_AS_UNICODE(v);
2766 p = PyUnicode_AS_UNICODE(u);
2767 e = p + PyUnicode_GET_SIZE(u);
2768 result = 0;
2769 while (p < e) {
2770 if (*p++ == ch) {
2771 result = 1;
2772 break;
2773 }
2774 }
2775
2776 Py_DECREF(u);
2777 Py_DECREF(v);
2778 return result;
2779
2780onError:
2781 Py_XDECREF(u);
2782 Py_XDECREF(v);
2783 return -1;
2784}
2785
Guido van Rossumd57fd912000-03-10 22:53:23 +00002786/* Concat to string or Unicode object giving a new Unicode object. */
2787
2788PyObject *PyUnicode_Concat(PyObject *left,
2789 PyObject *right)
2790{
2791 PyUnicodeObject *u = NULL, *v = NULL, *w;
2792
2793 /* Coerce the two arguments */
2794 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2795 if (u == NULL)
2796 goto onError;
2797 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2798 if (v == NULL)
2799 goto onError;
2800
2801 /* Shortcuts */
2802 if (v == unicode_empty) {
2803 Py_DECREF(v);
2804 return (PyObject *)u;
2805 }
2806 if (u == unicode_empty) {
2807 Py_DECREF(u);
2808 return (PyObject *)v;
2809 }
2810
2811 /* Concat the two Unicode strings */
2812 w = _PyUnicode_New(u->length + v->length);
2813 if (w == NULL)
2814 goto onError;
2815 Py_UNICODE_COPY(w->str, u->str, u->length);
2816 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
2817
2818 Py_DECREF(u);
2819 Py_DECREF(v);
2820 return (PyObject *)w;
2821
2822onError:
2823 Py_XDECREF(u);
2824 Py_XDECREF(v);
2825 return NULL;
2826}
2827
2828static char count__doc__[] =
2829"S.count(sub[, start[, end]]) -> int\n\
2830\n\
2831Return the number of occurrences of substring sub in Unicode string\n\
2832S[start:end]. Optional arguments start and end are\n\
2833interpreted as in slice notation.";
2834
2835static PyObject *
2836unicode_count(PyUnicodeObject *self, PyObject *args)
2837{
2838 PyUnicodeObject *substring;
2839 int start = 0;
2840 int end = INT_MAX;
2841 PyObject *result;
2842
2843 if (!PyArg_ParseTuple(args, "O|ii:count", &substring, &start, &end))
2844 return NULL;
2845
2846 substring = (PyUnicodeObject *)PyUnicode_FromObject(
2847 (PyObject *)substring);
2848 if (substring == NULL)
2849 return NULL;
2850
2851 if (substring->length == 0) {
2852 Py_DECREF(substring);
2853 return PyInt_FromLong((long) 0);
2854 }
2855
2856 if (start < 0)
2857 start += self->length;
2858 if (start < 0)
2859 start = 0;
2860 if (end > self->length)
2861 end = self->length;
2862 if (end < 0)
2863 end += self->length;
2864 if (end < 0)
2865 end = 0;
2866
2867 result = PyInt_FromLong((long) count(self, start, end, substring));
2868
2869 Py_DECREF(substring);
2870 return result;
2871}
2872
2873static char encode__doc__[] =
2874"S.encode([encoding[,errors]]) -> string\n\
2875\n\
2876Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
2877errors may be given to set a different error handling scheme. Default\n\
2878is 'strict' meaning that encoding errors raise a ValueError. Other\n\
2879possible values are 'ignore' and 'replace'.";
2880
2881static PyObject *
2882unicode_encode(PyUnicodeObject *self, PyObject *args)
2883{
2884 char *encoding = NULL;
2885 char *errors = NULL;
2886 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
2887 return NULL;
2888 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
2889}
2890
2891static char expandtabs__doc__[] =
2892"S.expandtabs([tabsize]) -> unicode\n\
2893\n\
2894Return a copy of S where all tab characters are expanded using spaces.\n\
2895If tabsize is not given, a tab size of 8 characters is assumed.";
2896
2897static PyObject*
2898unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
2899{
2900 Py_UNICODE *e;
2901 Py_UNICODE *p;
2902 Py_UNICODE *q;
2903 int i, j;
2904 PyUnicodeObject *u;
2905 int tabsize = 8;
2906
2907 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
2908 return NULL;
2909
2910 /* First pass: determine size of ouput string */
2911 i = j = 0;
2912 e = self->str + self->length;
2913 for (p = self->str; p < e; p++)
2914 if (*p == '\t') {
2915 if (tabsize > 0)
2916 j += tabsize - (j % tabsize);
2917 }
2918 else {
2919 j++;
2920 if (*p == '\n' || *p == '\r') {
2921 i += j;
2922 j = 0;
2923 }
2924 }
2925
2926 /* Second pass: create output string and fill it */
2927 u = _PyUnicode_New(i + j);
2928 if (!u)
2929 return NULL;
2930
2931 j = 0;
2932 q = u->str;
2933
2934 for (p = self->str; p < e; p++)
2935 if (*p == '\t') {
2936 if (tabsize > 0) {
2937 i = tabsize - (j % tabsize);
2938 j += i;
2939 while (i--)
2940 *q++ = ' ';
2941 }
2942 }
2943 else {
2944 j++;
2945 *q++ = *p;
2946 if (*p == '\n' || *p == '\r')
2947 j = 0;
2948 }
2949
2950 return (PyObject*) u;
2951}
2952
2953static char find__doc__[] =
2954"S.find(sub [,start [,end]]) -> int\n\
2955\n\
2956Return the lowest index in S where substring sub is found,\n\
2957such that sub is contained within s[start,end]. Optional\n\
2958arguments start and end are interpreted as in slice notation.\n\
2959\n\
2960Return -1 on failure.";
2961
2962static PyObject *
2963unicode_find(PyUnicodeObject *self, PyObject *args)
2964{
2965 PyUnicodeObject *substring;
2966 int start = 0;
2967 int end = INT_MAX;
2968 PyObject *result;
2969
2970 if (!PyArg_ParseTuple(args, "O|ii:find", &substring, &start, &end))
2971 return NULL;
2972 substring = (PyUnicodeObject *)PyUnicode_FromObject(
2973 (PyObject *)substring);
2974 if (substring == NULL)
2975 return NULL;
2976
2977 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
2978
2979 Py_DECREF(substring);
2980 return result;
2981}
2982
2983static PyObject *
2984unicode_getitem(PyUnicodeObject *self, int index)
2985{
2986 if (index < 0 || index >= self->length) {
2987 PyErr_SetString(PyExc_IndexError, "string index out of range");
2988 return NULL;
2989 }
2990
2991 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
2992}
2993
2994static long
2995unicode_hash(PyUnicodeObject *self)
2996{
2997 long hash;
2998 PyObject *utf8;
2999
3000 /* Since Unicode objects compare equal to their UTF-8 string
3001 counterparts, they should also use the UTF-8 strings as basis
3002 for their hash value. This is needed to assure that strings and
3003 Unicode objects behave in the same way as dictionary
3004 keys. Unfortunately, this costs some performance and also some
3005 memory if the cached UTF-8 representation is not used later
3006 on. */
3007 if (self->hash != -1)
3008 return self->hash;
3009 utf8 = utf8_string(self, NULL);
3010 if (utf8 == NULL)
3011 return -1;
3012 hash = PyObject_Hash(utf8);
3013 if (hash == -1)
3014 return -1;
3015 self->hash = hash;
3016 return hash;
3017}
3018
3019static char index__doc__[] =
3020"S.index(sub [,start [,end]]) -> int\n\
3021\n\
3022Like S.find() but raise ValueError when the substring is not found.";
3023
3024static PyObject *
3025unicode_index(PyUnicodeObject *self, PyObject *args)
3026{
3027 int result;
3028 PyUnicodeObject *substring;
3029 int start = 0;
3030 int end = INT_MAX;
3031
3032 if (!PyArg_ParseTuple(args, "O|ii:index", &substring, &start, &end))
3033 return NULL;
3034
3035 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3036 (PyObject *)substring);
3037 if (substring == NULL)
3038 return NULL;
3039
3040 result = findstring(self, substring, start, end, 1);
3041
3042 Py_DECREF(substring);
3043 if (result < 0) {
3044 PyErr_SetString(PyExc_ValueError, "substring not found");
3045 return NULL;
3046 }
3047 return PyInt_FromLong(result);
3048}
3049
3050static char islower__doc__[] =
3051"S.islower() -> int\n\
3052\n\
3053Return 1 if all cased characters in S are lowercase and there is\n\
3054at least one cased character in S, 0 otherwise.";
3055
3056static PyObject*
3057unicode_islower(PyUnicodeObject *self, PyObject *args)
3058{
3059 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3060 register const Py_UNICODE *e;
3061 int cased;
3062
3063 if (!PyArg_NoArgs(args))
3064 return NULL;
3065
3066 /* Shortcut for single character strings */
3067 if (PyUnicode_GET_SIZE(self) == 1)
3068 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3069
3070 e = p + PyUnicode_GET_SIZE(self);
3071 cased = 0;
3072 for (; p < e; p++) {
3073 register const Py_UNICODE ch = *p;
3074
3075 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3076 return PyInt_FromLong(0);
3077 else if (!cased && Py_UNICODE_ISLOWER(ch))
3078 cased = 1;
3079 }
3080 return PyInt_FromLong(cased);
3081}
3082
3083static char isupper__doc__[] =
3084"S.isupper() -> int\n\
3085\n\
3086Return 1 if all cased characters in S are uppercase and there is\n\
3087at least one cased character in S, 0 otherwise.";
3088
3089static PyObject*
3090unicode_isupper(PyUnicodeObject *self, PyObject *args)
3091{
3092 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3093 register const Py_UNICODE *e;
3094 int cased;
3095
3096 if (!PyArg_NoArgs(args))
3097 return NULL;
3098
3099 /* Shortcut for single character strings */
3100 if (PyUnicode_GET_SIZE(self) == 1)
3101 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3102
3103 e = p + PyUnicode_GET_SIZE(self);
3104 cased = 0;
3105 for (; p < e; p++) {
3106 register const Py_UNICODE ch = *p;
3107
3108 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3109 return PyInt_FromLong(0);
3110 else if (!cased && Py_UNICODE_ISUPPER(ch))
3111 cased = 1;
3112 }
3113 return PyInt_FromLong(cased);
3114}
3115
3116static char istitle__doc__[] =
3117"S.istitle() -> int\n\
3118\n\
3119Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3120may only follow uncased characters and lowercase characters only cased\n\
3121ones. Return 0 otherwise.";
3122
3123static PyObject*
3124unicode_istitle(PyUnicodeObject *self, PyObject *args)
3125{
3126 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3127 register const Py_UNICODE *e;
3128 int cased, previous_is_cased;
3129
3130 if (!PyArg_NoArgs(args))
3131 return NULL;
3132
3133 /* Shortcut for single character strings */
3134 if (PyUnicode_GET_SIZE(self) == 1)
3135 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3136 (Py_UNICODE_ISUPPER(*p) != 0));
3137
3138 e = p + PyUnicode_GET_SIZE(self);
3139 cased = 0;
3140 previous_is_cased = 0;
3141 for (; p < e; p++) {
3142 register const Py_UNICODE ch = *p;
3143
3144 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3145 if (previous_is_cased)
3146 return PyInt_FromLong(0);
3147 previous_is_cased = 1;
3148 cased = 1;
3149 }
3150 else if (Py_UNICODE_ISLOWER(ch)) {
3151 if (!previous_is_cased)
3152 return PyInt_FromLong(0);
3153 previous_is_cased = 1;
3154 cased = 1;
3155 }
3156 else
3157 previous_is_cased = 0;
3158 }
3159 return PyInt_FromLong(cased);
3160}
3161
3162static char isspace__doc__[] =
3163"S.isspace() -> int\n\
3164\n\
3165Return 1 if there are only whitespace characters in S,\n\
31660 otherwise.";
3167
3168static PyObject*
3169unicode_isspace(PyUnicodeObject *self, PyObject *args)
3170{
3171 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3172 register const Py_UNICODE *e;
3173
3174 if (!PyArg_NoArgs(args))
3175 return NULL;
3176
3177 /* Shortcut for single character strings */
3178 if (PyUnicode_GET_SIZE(self) == 1 &&
3179 Py_UNICODE_ISSPACE(*p))
3180 return PyInt_FromLong(1);
3181
3182 e = p + PyUnicode_GET_SIZE(self);
3183 for (; p < e; p++) {
3184 if (!Py_UNICODE_ISSPACE(*p))
3185 return PyInt_FromLong(0);
3186 }
3187 return PyInt_FromLong(1);
3188}
3189
3190static char isdecimal__doc__[] =
3191"S.isdecimal() -> int\n\
3192\n\
3193Return 1 if there are only decimal characters in S,\n\
31940 otherwise.";
3195
3196static PyObject*
3197unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3198{
3199 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3200 register const Py_UNICODE *e;
3201
3202 if (!PyArg_NoArgs(args))
3203 return NULL;
3204
3205 /* Shortcut for single character strings */
3206 if (PyUnicode_GET_SIZE(self) == 1 &&
3207 Py_UNICODE_ISDECIMAL(*p))
3208 return PyInt_FromLong(1);
3209
3210 e = p + PyUnicode_GET_SIZE(self);
3211 for (; p < e; p++) {
3212 if (!Py_UNICODE_ISDECIMAL(*p))
3213 return PyInt_FromLong(0);
3214 }
3215 return PyInt_FromLong(1);
3216}
3217
3218static char isdigit__doc__[] =
3219"S.isdigit() -> int\n\
3220\n\
3221Return 1 if there are only digit characters in S,\n\
32220 otherwise.";
3223
3224static PyObject*
3225unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3226{
3227 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3228 register const Py_UNICODE *e;
3229
3230 if (!PyArg_NoArgs(args))
3231 return NULL;
3232
3233 /* Shortcut for single character strings */
3234 if (PyUnicode_GET_SIZE(self) == 1 &&
3235 Py_UNICODE_ISDIGIT(*p))
3236 return PyInt_FromLong(1);
3237
3238 e = p + PyUnicode_GET_SIZE(self);
3239 for (; p < e; p++) {
3240 if (!Py_UNICODE_ISDIGIT(*p))
3241 return PyInt_FromLong(0);
3242 }
3243 return PyInt_FromLong(1);
3244}
3245
3246static char isnumeric__doc__[] =
3247"S.isnumeric() -> int\n\
3248\n\
3249Return 1 if there are only numeric characters in S,\n\
32500 otherwise.";
3251
3252static PyObject*
3253unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3254{
3255 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3256 register const Py_UNICODE *e;
3257
3258 if (!PyArg_NoArgs(args))
3259 return NULL;
3260
3261 /* Shortcut for single character strings */
3262 if (PyUnicode_GET_SIZE(self) == 1 &&
3263 Py_UNICODE_ISNUMERIC(*p))
3264 return PyInt_FromLong(1);
3265
3266 e = p + PyUnicode_GET_SIZE(self);
3267 for (; p < e; p++) {
3268 if (!Py_UNICODE_ISNUMERIC(*p))
3269 return PyInt_FromLong(0);
3270 }
3271 return PyInt_FromLong(1);
3272}
3273
3274static char join__doc__[] =
3275"S.join(sequence) -> unicode\n\
3276\n\
3277Return a string which is the concatenation of the strings in the\n\
3278sequence. The separator between elements is S.";
3279
3280static PyObject*
3281unicode_join(PyUnicodeObject *self, PyObject *args)
3282{
3283 PyObject *data;
3284 if (!PyArg_ParseTuple(args, "O:join", &data))
3285 return NULL;
3286
3287 return PyUnicode_Join((PyObject *)self, data);
3288}
3289
3290static int
3291unicode_length(PyUnicodeObject *self)
3292{
3293 return self->length;
3294}
3295
3296static char ljust__doc__[] =
3297"S.ljust(width) -> unicode\n\
3298\n\
3299Return S left justified in a Unicode string of length width. Padding is\n\
3300done using spaces.";
3301
3302static PyObject *
3303unicode_ljust(PyUnicodeObject *self, PyObject *args)
3304{
3305 int width;
3306 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3307 return NULL;
3308
3309 if (self->length >= width) {
3310 Py_INCREF(self);
3311 return (PyObject*) self;
3312 }
3313
3314 return (PyObject*) pad(self, 0, width - self->length, ' ');
3315}
3316
3317static char lower__doc__[] =
3318"S.lower() -> unicode\n\
3319\n\
3320Return a copy of the string S converted to lowercase.";
3321
3322static PyObject*
3323unicode_lower(PyUnicodeObject *self, PyObject *args)
3324{
3325 if (!PyArg_NoArgs(args))
3326 return NULL;
3327 return fixup(self, fixlower);
3328}
3329
3330static char lstrip__doc__[] =
3331"S.lstrip() -> unicode\n\
3332\n\
3333Return a copy of the string S with leading whitespace removed.";
3334
3335static PyObject *
3336unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3337{
3338 if (!PyArg_NoArgs(args))
3339 return NULL;
3340 return strip(self, 1, 0);
3341}
3342
3343static PyObject*
3344unicode_repeat(PyUnicodeObject *str, int len)
3345{
3346 PyUnicodeObject *u;
3347 Py_UNICODE *p;
3348
3349 if (len < 0)
3350 len = 0;
3351
3352 if (len == 1) {
3353 /* no repeat, return original string */
3354 Py_INCREF(str);
3355 return (PyObject*) str;
3356 }
3357
3358 u = _PyUnicode_New(len * str->length);
3359 if (!u)
3360 return NULL;
3361
3362 p = u->str;
3363
3364 while (len-- > 0) {
3365 Py_UNICODE_COPY(p, str->str, str->length);
3366 p += str->length;
3367 }
3368
3369 return (PyObject*) u;
3370}
3371
3372PyObject *PyUnicode_Replace(PyObject *obj,
3373 PyObject *subobj,
3374 PyObject *replobj,
3375 int maxcount)
3376{
3377 PyObject *self;
3378 PyObject *str1;
3379 PyObject *str2;
3380 PyObject *result;
3381
3382 self = PyUnicode_FromObject(obj);
3383 if (self == NULL)
3384 return NULL;
3385 str1 = PyUnicode_FromObject(subobj);
3386 if (str1 == NULL) {
3387 Py_DECREF(self);
3388 return NULL;
3389 }
3390 str2 = PyUnicode_FromObject(replobj);
3391 if (str2 == NULL) {
3392 Py_DECREF(self);
3393 Py_DECREF(str1);
3394 return NULL;
3395 }
3396 result = replace((PyUnicodeObject *)self,
3397 (PyUnicodeObject *)str1,
3398 (PyUnicodeObject *)str2,
3399 maxcount);
3400 Py_DECREF(self);
3401 Py_DECREF(str1);
3402 Py_DECREF(str2);
3403 return result;
3404}
3405
3406static char replace__doc__[] =
3407"S.replace (old, new[, maxsplit]) -> unicode\n\
3408\n\
3409Return a copy of S with all occurrences of substring\n\
3410old replaced by new. If the optional argument maxsplit is\n\
3411given, only the first maxsplit occurrences are replaced.";
3412
3413static PyObject*
3414unicode_replace(PyUnicodeObject *self, PyObject *args)
3415{
3416 PyUnicodeObject *str1;
3417 PyUnicodeObject *str2;
3418 int maxcount = -1;
3419 PyObject *result;
3420
3421 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
3422 return NULL;
3423 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
3424 if (str1 == NULL)
3425 return NULL;
3426 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
3427 if (str2 == NULL)
3428 return NULL;
3429
3430 result = replace(self, str1, str2, maxcount);
3431
3432 Py_DECREF(str1);
3433 Py_DECREF(str2);
3434 return result;
3435}
3436
3437static
3438PyObject *unicode_repr(PyObject *unicode)
3439{
3440 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
3441 PyUnicode_GET_SIZE(unicode),
3442 1);
3443}
3444
3445static char rfind__doc__[] =
3446"S.rfind(sub [,start [,end]]) -> int\n\
3447\n\
3448Return the highest index in S where substring sub is found,\n\
3449such that sub is contained within s[start,end]. Optional\n\
3450arguments start and end are interpreted as in slice notation.\n\
3451\n\
3452Return -1 on failure.";
3453
3454static PyObject *
3455unicode_rfind(PyUnicodeObject *self, PyObject *args)
3456{
3457 PyUnicodeObject *substring;
3458 int start = 0;
3459 int end = INT_MAX;
3460 PyObject *result;
3461
3462 if (!PyArg_ParseTuple(args, "O|ii:rfind", &substring, &start, &end))
3463 return NULL;
3464 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3465 (PyObject *)substring);
3466 if (substring == NULL)
3467 return NULL;
3468
3469 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
3470
3471 Py_DECREF(substring);
3472 return result;
3473}
3474
3475static char rindex__doc__[] =
3476"S.rindex(sub [,start [,end]]) -> int\n\
3477\n\
3478Like S.rfind() but raise ValueError when the substring is not found.";
3479
3480static PyObject *
3481unicode_rindex(PyUnicodeObject *self, PyObject *args)
3482{
3483 int result;
3484 PyUnicodeObject *substring;
3485 int start = 0;
3486 int end = INT_MAX;
3487
3488 if (!PyArg_ParseTuple(args, "O|ii:rindex", &substring, &start, &end))
3489 return NULL;
3490 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3491 (PyObject *)substring);
3492 if (substring == NULL)
3493 return NULL;
3494
3495 result = findstring(self, substring, start, end, -1);
3496
3497 Py_DECREF(substring);
3498 if (result < 0) {
3499 PyErr_SetString(PyExc_ValueError, "substring not found");
3500 return NULL;
3501 }
3502 return PyInt_FromLong(result);
3503}
3504
3505static char rjust__doc__[] =
3506"S.rjust(width) -> unicode\n\
3507\n\
3508Return S right justified in a Unicode string of length width. Padding is\n\
3509done using spaces.";
3510
3511static PyObject *
3512unicode_rjust(PyUnicodeObject *self, PyObject *args)
3513{
3514 int width;
3515 if (!PyArg_ParseTuple(args, "i:rjust", &width))
3516 return NULL;
3517
3518 if (self->length >= width) {
3519 Py_INCREF(self);
3520 return (PyObject*) self;
3521 }
3522
3523 return (PyObject*) pad(self, width - self->length, 0, ' ');
3524}
3525
3526static char rstrip__doc__[] =
3527"S.rstrip() -> unicode\n\
3528\n\
3529Return a copy of the string S with trailing whitespace removed.";
3530
3531static PyObject *
3532unicode_rstrip(PyUnicodeObject *self, PyObject *args)
3533{
3534 if (!PyArg_NoArgs(args))
3535 return NULL;
3536 return strip(self, 0, 1);
3537}
3538
3539static PyObject*
3540unicode_slice(PyUnicodeObject *self, int start, int end)
3541{
3542 /* standard clamping */
3543 if (start < 0)
3544 start = 0;
3545 if (end < 0)
3546 end = 0;
3547 if (end > self->length)
3548 end = self->length;
3549 if (start == 0 && end == self->length) {
3550 /* full slice, return original string */
3551 Py_INCREF(self);
3552 return (PyObject*) self;
3553 }
3554 if (start > end)
3555 start = end;
3556 /* copy slice */
3557 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
3558 end - start);
3559}
3560
3561PyObject *PyUnicode_Split(PyObject *s,
3562 PyObject *sep,
3563 int maxsplit)
3564{
3565 PyObject *result;
3566
3567 s = PyUnicode_FromObject(s);
3568 if (s == NULL)
3569 return NULL;
3570 if (sep != NULL) {
3571 sep = PyUnicode_FromObject(sep);
3572 if (sep == NULL) {
3573 Py_DECREF(s);
3574 return NULL;
3575 }
3576 }
3577
3578 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
3579
3580 Py_DECREF(s);
3581 Py_XDECREF(sep);
3582 return result;
3583}
3584
3585static char split__doc__[] =
3586"S.split([sep [,maxsplit]]) -> list of strings\n\
3587\n\
3588Return a list of the words in S, using sep as the\n\
3589delimiter string. If maxsplit is given, at most maxsplit\n\
3590splits are done. If sep is not specified, any whitespace string\n\
3591is a separator.";
3592
3593static PyObject*
3594unicode_split(PyUnicodeObject *self, PyObject *args)
3595{
3596 PyObject *substring = Py_None;
3597 int maxcount = -1;
3598
3599 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
3600 return NULL;
3601
3602 if (substring == Py_None)
3603 return split(self, NULL, maxcount);
3604 else if (PyUnicode_Check(substring))
3605 return split(self, (PyUnicodeObject *)substring, maxcount);
3606 else
3607 return PyUnicode_Split((PyObject *)self, substring, maxcount);
3608}
3609
3610static char splitlines__doc__[] =
3611"S.splitlines([maxsplit]]) -> list of strings\n\
3612\n\
3613Return a list of the lines in S, breaking at line boundaries.\n\
3614If maxsplit is given, at most maxsplit are done. Line breaks are not\n\
3615included in the resulting list.";
3616
3617static PyObject*
3618unicode_splitlines(PyUnicodeObject *self, PyObject *args)
3619{
3620 int maxcount = -1;
3621
3622 if (!PyArg_ParseTuple(args, "|i:splitlines", &maxcount))
3623 return NULL;
3624
3625 return PyUnicode_Splitlines((PyObject *)self, maxcount);
3626}
3627
3628static
3629PyObject *unicode_str(PyUnicodeObject *self)
3630{
3631 return PyUnicode_AsUTF8String((PyObject *)self);
3632}
3633
3634static char strip__doc__[] =
3635"S.strip() -> unicode\n\
3636\n\
3637Return a copy of S with leading and trailing whitespace removed.";
3638
3639static PyObject *
3640unicode_strip(PyUnicodeObject *self, PyObject *args)
3641{
3642 if (!PyArg_NoArgs(args))
3643 return NULL;
3644 return strip(self, 1, 1);
3645}
3646
3647static char swapcase__doc__[] =
3648"S.swapcase() -> unicode\n\
3649\n\
3650Return a copy of S with uppercase characters converted to lowercase\n\
3651and vice versa.";
3652
3653static PyObject*
3654unicode_swapcase(PyUnicodeObject *self, PyObject *args)
3655{
3656 if (!PyArg_NoArgs(args))
3657 return NULL;
3658 return fixup(self, fixswapcase);
3659}
3660
3661static char translate__doc__[] =
3662"S.translate(table) -> unicode\n\
3663\n\
3664Return a copy of the string S, where all characters have been mapped\n\
3665through the given translation table, which must be a mapping of\n\
3666Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
3667are left untouched. Characters mapped to None are deleted.";
3668
3669static PyObject*
3670unicode_translate(PyUnicodeObject *self, PyObject *args)
3671{
3672 PyObject *table;
3673
3674 if (!PyArg_ParseTuple(args, "O:translate", &table))
3675 return NULL;
3676 return PyUnicode_TranslateCharmap(self->str,
3677 self->length,
3678 table,
3679 "ignore");
3680}
3681
3682static char upper__doc__[] =
3683"S.upper() -> unicode\n\
3684\n\
3685Return a copy of S converted to uppercase.";
3686
3687static PyObject*
3688unicode_upper(PyUnicodeObject *self, PyObject *args)
3689{
3690 if (!PyArg_NoArgs(args))
3691 return NULL;
3692 return fixup(self, fixupper);
3693}
3694
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Implementation of S.zfill(width) (currently compiled out).

   Returns self (new reference) when it is already at least width
   long.  Otherwise the string is left-padded with '0' via pad(); if
   the original text started with '+' or '-', the sign is moved in
   front of the zeros.  Returns NULL if argument parsing or pad()
   fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    if (u == NULL)
        /* pad() failed; u->str must not be touched */
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
3730
#if 0
/* Debugging helper (currently compiled out): takes no arguments and
   returns the value of unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
3740
3741static char startswith__doc__[] =
3742"S.startswith(prefix[, start[, end]]) -> int\n\
3743\n\
3744Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
3745optional start, test S beginning at that position. With optional end, stop\n\
3746comparing S at that position.";
3747
3748static PyObject *
3749unicode_startswith(PyUnicodeObject *self,
3750 PyObject *args)
3751{
3752 PyUnicodeObject *substring;
3753 int start = 0;
3754 int end = INT_MAX;
3755 PyObject *result;
3756
3757 if (!PyArg_ParseTuple(args, "O|ii:startswith", &substring, &start, &end))
3758 return NULL;
3759 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3760 (PyObject *)substring);
3761 if (substring == NULL)
3762 return NULL;
3763
3764 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
3765
3766 Py_DECREF(substring);
3767 return result;
3768}
3769
3770
3771static char endswith__doc__[] =
3772"S.endswith(suffix[, start[, end]]) -> int\n\
3773\n\
3774Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
3775optional start, test S beginning at that position. With optional end, stop\n\
3776comparing S at that position.";
3777
3778static PyObject *
3779unicode_endswith(PyUnicodeObject *self,
3780 PyObject *args)
3781{
3782 PyUnicodeObject *substring;
3783 int start = 0;
3784 int end = INT_MAX;
3785 PyObject *result;
3786
3787 if (!PyArg_ParseTuple(args, "O|ii:endswith", &substring, &start, &end))
3788 return NULL;
3789 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3790 (PyObject *)substring);
3791 if (substring == NULL)
3792 return NULL;
3793
3794 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
3795
3796 Py_DECREF(substring);
3797 return result;
3798}
3799
3800
/* Method table for unicode objects.  unicode_getattr() passes this
   table to Py_FindMethod() to resolve attribute names.

   Each entry is {name, C implementation, flags, docstring}; the table
   is terminated by a {NULL, NULL} sentinel.
   NOTE(review): the flags column only ever holds 0 or 1 here --
   presumably 1 selects the tuple-argument calling convention and 0
   the argument-less one; confirm against the PyMethodDef definition. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
    /* Disabled entry, kept for reference: */
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    /* Compiled out: these methods are not exposed in this build. */
#if 0
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    /* Sentinel marking the end of the table. */
    {NULL, NULL}
};
3851
3852static PyObject *
3853unicode_getattr(PyUnicodeObject *self, char *name)
3854{
3855 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
3856}
3857
/* Sequence protocol slots for unicode objects.  The two assignment
   slots are 0: this table provides no item or slice assignment. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, /* sq_length */
    (binaryfunc) PyUnicode_Concat, /* sq_concat */
    (intargfunc) unicode_repeat, /* sq_repeat */
    (intargfunc) unicode_getitem, /* sq_item */
    (intintargfunc) unicode_slice, /* sq_slice */
    0, /* sq_ass_item */
    0, /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, /*sq_contains*/
};
3868
3869static int
3870unicode_buffer_getreadbuf(PyUnicodeObject *self,
3871 int index,
3872 const void **ptr)
3873{
3874 if (index != 0) {
3875 PyErr_SetString(PyExc_SystemError,
3876 "accessing non-existent unicode segment");
3877 return -1;
3878 }
3879 *ptr = (void *) self->str;
3880 return PyUnicode_GET_DATA_SIZE(self);
3881}
3882
3883static int
3884unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
3885 const void **ptr)
3886{
3887 PyErr_SetString(PyExc_TypeError,
3888 "cannot use unicode as modifyable buffer");
3889 return -1;
3890}
3891
3892static int
3893unicode_buffer_getsegcount(PyUnicodeObject *self,
3894 int *lenp)
3895{
3896 if (lenp)
3897 *lenp = PyUnicode_GET_DATA_SIZE(self);
3898 return 1;
3899}
3900
3901static int
3902unicode_buffer_getcharbuf(PyUnicodeObject *self,
3903 int index,
3904 const void **ptr)
3905{
3906 PyObject *str;
3907
3908 if (index != 0) {
3909 PyErr_SetString(PyExc_SystemError,
3910 "accessing non-existent unicode segment");
3911 return -1;
3912 }
3913 str = utf8_string(self, NULL);
3914 if (str == NULL)
3915 return -1;
3916 *ptr = (void *) PyString_AS_STRING(str);
3917 return PyString_GET_SIZE(str);
3918}
3919
3920/* Helpers for PyUnicode_Format() */
3921
3922static PyObject *
3923getnextarg(args, arglen, p_argidx)
3924 PyObject *args;
3925int arglen;
3926int *p_argidx;
3927{
3928 int argidx = *p_argidx;
3929 if (argidx < arglen) {
3930 (*p_argidx)++;
3931 if (arglen < 0)
3932 return args;
3933 else
3934 return PyTuple_GetItem(args, argidx);
3935 }
3936 PyErr_SetString(PyExc_TypeError,
3937 "not enough arguments for format string");
3938 return NULL;
3939}
3940
/* Flag bits collected by PyUnicode_Format() while it parses the flag
   characters of a conversion spec. */
#define F_LJUST (1<<0)  /* '-' flag, or a negative '*' width */
#define F_SIGN (1<<1)   /* '+' flag */
#define F_BLANK (1<<2)  /* ' ' flag */
#define F_ALT (1<<3)    /* '#' flag */
#define F_ZERO (1<<4)   /* '0' flag */
3946
3947static
3948#ifdef HAVE_STDARG_PROTOTYPES
3949int usprintf(register Py_UNICODE *buffer, char *format, ...)
3950#else
3951int usprintf(va_alist) va_dcl
3952#endif
3953{
3954 register int i;
3955 int len;
3956 va_list va;
3957 char *charbuffer;
3958#ifdef HAVE_STDARG_PROTOTYPES
3959 va_start(va, format);
3960#else
3961 Py_UNICODE *args;
3962 char *format;
3963
3964 va_start(va);
3965 buffer = va_arg(va, Py_UNICODE *);
3966 format = va_arg(va, char *);
3967#endif
3968
3969 /* First, format the string as char array, then expand to Py_UNICODE
3970 array. */
3971 charbuffer = (char *)buffer;
3972 len = vsprintf(charbuffer, format, va);
3973 for (i = len - 1; i >= 0; i--)
3974 buffer[i] = (Py_UNICODE) charbuffer[i];
3975
3976 va_end(va);
3977 return len;
3978}
3979
3980static int
3981formatfloat(Py_UNICODE *buf,
3982 int flags,
3983 int prec,
3984 int type,
3985 PyObject *v)
3986{
3987 char fmt[20];
3988 double x;
3989
3990 x = PyFloat_AsDouble(v);
3991 if (x == -1.0 && PyErr_Occurred())
3992 return -1;
3993 if (prec < 0)
3994 prec = 6;
3995 if (prec > 50)
3996 prec = 50; /* Arbitrary limitation */
3997 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
3998 type = 'g';
3999 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4000 return usprintf(buf, fmt, x);
4001}
4002
4003static int
4004formatint(Py_UNICODE *buf,
4005 int flags,
4006 int prec,
4007 int type,
4008 PyObject *v)
4009{
4010 char fmt[20];
4011 long x;
4012
4013 x = PyInt_AsLong(v);
4014 if (x == -1 && PyErr_Occurred())
4015 return -1;
4016 if (prec < 0)
4017 prec = 1;
4018 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4019 return usprintf(buf, fmt, x);
4020}
4021
4022static int
4023formatchar(Py_UNICODE *buf,
4024 PyObject *v)
4025{
4026 if (PyUnicode_Check(v))
4027 buf[0] = PyUnicode_AS_UNICODE(v)[0];
4028
4029 else if (PyString_Check(v))
4030 buf[0] = (Py_UNICODE) PyString_AS_STRING(v)[0];
4031
4032 else {
4033 /* Integer input truncated to a character */
4034 long x;
4035 x = PyInt_AsLong(v);
4036 if (x == -1 && PyErr_Occurred())
4037 return -1;
4038 buf[0] = (char) x;
4039 }
4040 buf[1] = '\0';
4041 return 1;
4042}
4043
4044PyObject *PyUnicode_Format(PyObject *format,
4045 PyObject *args)
4046{
4047 Py_UNICODE *fmt, *res;
4048 int fmtcnt, rescnt, reslen, arglen, argidx;
4049 int args_owned = 0;
4050 PyUnicodeObject *result = NULL;
4051 PyObject *dict = NULL;
4052 PyObject *uformat;
4053
4054 if (format == NULL || args == NULL) {
4055 PyErr_BadInternalCall();
4056 return NULL;
4057 }
4058 uformat = PyUnicode_FromObject(format);
4059 fmt = PyUnicode_AS_UNICODE(uformat);
4060 fmtcnt = PyUnicode_GET_SIZE(uformat);
4061
4062 reslen = rescnt = fmtcnt + 100;
4063 result = _PyUnicode_New(reslen);
4064 if (result == NULL)
4065 goto onError;
4066 res = PyUnicode_AS_UNICODE(result);
4067
4068 if (PyTuple_Check(args)) {
4069 arglen = PyTuple_Size(args);
4070 argidx = 0;
4071 }
4072 else {
4073 arglen = -1;
4074 argidx = -2;
4075 }
4076 if (args->ob_type->tp_as_mapping)
4077 dict = args;
4078
4079 while (--fmtcnt >= 0) {
4080 if (*fmt != '%') {
4081 if (--rescnt < 0) {
4082 rescnt = fmtcnt + 100;
4083 reslen += rescnt;
4084 if (_PyUnicode_Resize(result, reslen) < 0)
4085 return NULL;
4086 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4087 --rescnt;
4088 }
4089 *res++ = *fmt++;
4090 }
4091 else {
4092 /* Got a format specifier */
4093 int flags = 0;
4094 int width = -1;
4095 int prec = -1;
4096 int size = 0;
4097 Py_UNICODE c = '\0';
4098 Py_UNICODE fill;
4099 PyObject *v = NULL;
4100 PyObject *temp = NULL;
4101 Py_UNICODE *buf;
4102 Py_UNICODE sign;
4103 int len;
4104 Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
4105
4106 fmt++;
4107 if (*fmt == '(') {
4108 Py_UNICODE *keystart;
4109 int keylen;
4110 PyObject *key;
4111 int pcount = 1;
4112
4113 if (dict == NULL) {
4114 PyErr_SetString(PyExc_TypeError,
4115 "format requires a mapping");
4116 goto onError;
4117 }
4118 ++fmt;
4119 --fmtcnt;
4120 keystart = fmt;
4121 /* Skip over balanced parentheses */
4122 while (pcount > 0 && --fmtcnt >= 0) {
4123 if (*fmt == ')')
4124 --pcount;
4125 else if (*fmt == '(')
4126 ++pcount;
4127 fmt++;
4128 }
4129 keylen = fmt - keystart - 1;
4130 if (fmtcnt < 0 || pcount > 0) {
4131 PyErr_SetString(PyExc_ValueError,
4132 "incomplete format key");
4133 goto onError;
4134 }
4135 /* keys are converted to strings (using UTF-8) and
4136 then looked up since Python uses strings to hold
4137 variables names etc. in its namespaces and we
4138 wouldn't want to break common idioms. The
4139 alternative would be using Unicode objects for the
4140 lookup but u"abc" and "abc" have different hash
4141 values (on purpose). */
4142 key = PyUnicode_EncodeUTF8(keystart,
4143 keylen,
4144 NULL);
4145 if (key == NULL)
4146 goto onError;
4147 if (args_owned) {
4148 Py_DECREF(args);
4149 args_owned = 0;
4150 }
4151 args = PyObject_GetItem(dict, key);
4152 Py_DECREF(key);
4153 if (args == NULL) {
4154 goto onError;
4155 }
4156 args_owned = 1;
4157 arglen = -1;
4158 argidx = -2;
4159 }
4160 while (--fmtcnt >= 0) {
4161 switch (c = *fmt++) {
4162 case '-': flags |= F_LJUST; continue;
4163 case '+': flags |= F_SIGN; continue;
4164 case ' ': flags |= F_BLANK; continue;
4165 case '#': flags |= F_ALT; continue;
4166 case '0': flags |= F_ZERO; continue;
4167 }
4168 break;
4169 }
4170 if (c == '*') {
4171 v = getnextarg(args, arglen, &argidx);
4172 if (v == NULL)
4173 goto onError;
4174 if (!PyInt_Check(v)) {
4175 PyErr_SetString(PyExc_TypeError,
4176 "* wants int");
4177 goto onError;
4178 }
4179 width = PyInt_AsLong(v);
4180 if (width < 0) {
4181 flags |= F_LJUST;
4182 width = -width;
4183 }
4184 if (--fmtcnt >= 0)
4185 c = *fmt++;
4186 }
4187 else if (c >= '0' && c <= '9') {
4188 width = c - '0';
4189 while (--fmtcnt >= 0) {
4190 c = *fmt++;
4191 if (c < '0' || c > '9')
4192 break;
4193 if ((width*10) / 10 != width) {
4194 PyErr_SetString(PyExc_ValueError,
4195 "width too big");
4196 goto onError;
4197 }
4198 width = width*10 + (c - '0');
4199 }
4200 }
4201 if (c == '.') {
4202 prec = 0;
4203 if (--fmtcnt >= 0)
4204 c = *fmt++;
4205 if (c == '*') {
4206 v = getnextarg(args, arglen, &argidx);
4207 if (v == NULL)
4208 goto onError;
4209 if (!PyInt_Check(v)) {
4210 PyErr_SetString(PyExc_TypeError,
4211 "* wants int");
4212 goto onError;
4213 }
4214 prec = PyInt_AsLong(v);
4215 if (prec < 0)
4216 prec = 0;
4217 if (--fmtcnt >= 0)
4218 c = *fmt++;
4219 }
4220 else if (c >= '0' && c <= '9') {
4221 prec = c - '0';
4222 while (--fmtcnt >= 0) {
4223 c = Py_CHARMASK(*fmt++);
4224 if (c < '0' || c > '9')
4225 break;
4226 if ((prec*10) / 10 != prec) {
4227 PyErr_SetString(PyExc_ValueError,
4228 "prec too big");
4229 goto onError;
4230 }
4231 prec = prec*10 + (c - '0');
4232 }
4233 }
4234 } /* prec */
4235 if (fmtcnt >= 0) {
4236 if (c == 'h' || c == 'l' || c == 'L') {
4237 size = c;
4238 if (--fmtcnt >= 0)
4239 c = *fmt++;
4240 }
4241 }
4242 if (fmtcnt < 0) {
4243 PyErr_SetString(PyExc_ValueError,
4244 "incomplete format");
4245 goto onError;
4246 }
4247 if (c != '%') {
4248 v = getnextarg(args, arglen, &argidx);
4249 if (v == NULL)
4250 goto onError;
4251 }
4252 sign = 0;
4253 fill = ' ';
4254 switch (c) {
4255
4256 case '%':
4257 buf = tmpbuf;
4258 buf[0] = '%';
4259 len = 1;
4260 break;
4261
4262 case 's':
4263 case 'r':
4264 if (PyUnicode_Check(v) && c == 's') {
4265 temp = v;
4266 Py_INCREF(temp);
4267 }
4268 else {
4269 PyObject *unicode;
4270 if (c == 's')
4271 temp = PyObject_Str(v);
4272 else
4273 temp = PyObject_Repr(v);
4274 if (temp == NULL)
4275 goto onError;
4276 if (!PyString_Check(temp)) {
4277 /* XXX Note: this should never happen, since
4278 PyObject_Repr() and PyObject_Str() assure
4279 this */
4280 Py_DECREF(temp);
4281 PyErr_SetString(PyExc_TypeError,
4282 "%s argument has non-string str()");
4283 goto onError;
4284 }
4285 unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
4286 PyString_GET_SIZE(temp),
4287 "strict");
4288 Py_DECREF(temp);
4289 temp = unicode;
4290 if (temp == NULL)
4291 goto onError;
4292 }
4293 buf = PyUnicode_AS_UNICODE(temp);
4294 len = PyUnicode_GET_SIZE(temp);
4295 if (prec >= 0 && len > prec)
4296 len = prec;
4297 break;
4298
4299 case 'i':
4300 case 'd':
4301 case 'u':
4302 case 'o':
4303 case 'x':
4304 case 'X':
4305 if (c == 'i')
4306 c = 'd';
4307 buf = tmpbuf;
4308 len = formatint(buf, flags, prec, c, v);
4309 if (len < 0)
4310 goto onError;
4311 sign = (c == 'd');
4312 if (flags & F_ZERO) {
4313 fill = '0';
4314 if ((flags&F_ALT) &&
4315 (c == 'x' || c == 'X') &&
4316 buf[0] == '0' && buf[1] == c) {
4317 *res++ = *buf++;
4318 *res++ = *buf++;
4319 rescnt -= 2;
4320 len -= 2;
4321 width -= 2;
4322 if (width < 0)
4323 width = 0;
4324 }
4325 }
4326 break;
4327
4328 case 'e':
4329 case 'E':
4330 case 'f':
4331 case 'g':
4332 case 'G':
4333 buf = tmpbuf;
4334 len = formatfloat(buf, flags, prec, c, v);
4335 if (len < 0)
4336 goto onError;
4337 sign = 1;
4338 if (flags&F_ZERO)
4339 fill = '0';
4340 break;
4341
4342 case 'c':
4343 buf = tmpbuf;
4344 len = formatchar(buf, v);
4345 if (len < 0)
4346 goto onError;
4347 break;
4348
4349 default:
4350 PyErr_Format(PyExc_ValueError,
4351 "unsupported format character '%c' (0x%x)",
4352 c, c);
4353 goto onError;
4354 }
4355 if (sign) {
4356 if (*buf == '-' || *buf == '+') {
4357 sign = *buf++;
4358 len--;
4359 }
4360 else if (flags & F_SIGN)
4361 sign = '+';
4362 else if (flags & F_BLANK)
4363 sign = ' ';
4364 else
4365 sign = 0;
4366 }
4367 if (width < len)
4368 width = len;
4369 if (rescnt < width + (sign != 0)) {
4370 reslen -= rescnt;
4371 rescnt = width + fmtcnt + 100;
4372 reslen += rescnt;
4373 if (_PyUnicode_Resize(result, reslen) < 0)
4374 return NULL;
4375 res = PyUnicode_AS_UNICODE(result)
4376 + reslen - rescnt;
4377 }
4378 if (sign) {
4379 if (fill != ' ')
4380 *res++ = sign;
4381 rescnt--;
4382 if (width > len)
4383 width--;
4384 }
4385 if (width > len && !(flags & F_LJUST)) {
4386 do {
4387 --rescnt;
4388 *res++ = fill;
4389 } while (--width > len);
4390 }
4391 if (sign && fill == ' ')
4392 *res++ = sign;
4393 memcpy(res, buf, len * sizeof(Py_UNICODE));
4394 res += len;
4395 rescnt -= len;
4396 while (--width >= len) {
4397 --rescnt;
4398 *res++ = ' ';
4399 }
4400 if (dict && (argidx < arglen) && c != '%') {
4401 PyErr_SetString(PyExc_TypeError,
4402 "not all arguments converted");
4403 goto onError;
4404 }
4405 Py_XDECREF(temp);
4406 } /* '%' */
4407 } /* until end */
4408 if (argidx < arglen && !dict) {
4409 PyErr_SetString(PyExc_TypeError,
4410 "not all arguments converted");
4411 goto onError;
4412 }
4413
4414 if (args_owned) {
4415 Py_DECREF(args);
4416 }
4417 Py_DECREF(uformat);
4418 _PyUnicode_Resize(result, reslen - rescnt);
4419 return (PyObject *)result;
4420
4421 onError:
4422 Py_XDECREF(result);
4423 Py_DECREF(uformat);
4424 if (args_owned) {
4425 Py_DECREF(args);
4426 }
4427 return NULL;
4428}
4429
/* Buffer protocol slots for unicode objects, wired to the
   unicode_buffer_* functions defined in this file. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,   /* read-only view of self->str */
    (getwritebufferproc) unicode_buffer_getwritebuf, /* always fails with TypeError */
    (getsegcountproc) unicode_buffer_getsegcount,    /* always one segment */
    (getcharbufferproc) unicode_buffer_getcharbuf,   /* data of utf8_string(self, NULL) */
};
4436
/* Type object for unicode objects.  Slots given as 0 or NULL are not
   implemented by this type; the sequence and buffer protocols are
   provided through unicode_as_sequence and unicode_as_buffer. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0, /* ob_size */
    "unicode", /* tp_name */
    sizeof(PyUnicodeObject), /* tp_size */
    0, /* tp_itemsize */
    /* Slots */
    (destructor)_PyUnicode_Free, /* tp_dealloc */
    0, /* tp_print */
    (getattrfunc)unicode_getattr, /* tp_getattr */
    0, /* tp_setattr */
    (cmpfunc) unicode_compare, /* tp_compare */
    (reprfunc) unicode_repr, /* tp_repr */
    0, /* tp_as_number */
    &unicode_as_sequence, /* tp_as_sequence */
    0, /* tp_as_mapping */
    (hashfunc) unicode_hash, /* tp_hash*/
    0, /* tp_call*/
    (reprfunc) unicode_str, /* tp_str */
    (getattrofunc) NULL, /* tp_getattro */
    (setattrofunc) NULL, /* tp_setattro */
    &unicode_as_buffer, /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT, /* tp_flags */
};
4461
4462/* Initialize the Unicode implementation */
4463
4464void _PyUnicode_Init()
4465{
4466 /* Doublecheck the configuration... */
4467 if (sizeof(Py_UNICODE) != 2)
4468 Py_FatalError("Unicode configuration error: "
4469 "sizeof(Py_UNICODE) != 2 bytes");
4470
4471 unicode_empty = _PyUnicode_New(0);
4472}
4473
4474/* Finalize the Unicode implementation */
4475
4476void
4477_PyUnicode_Fini()
4478{
4479 PyUnicodeObject *u = unicode_freelist;
4480
4481 while (u != NULL) {
4482 PyUnicodeObject *v = u;
4483 u = *(PyUnicodeObject **)u;
4484 free(v);
4485 }
4486 Py_XDECREF(unicode_empty);
4487}