blob: e4bbcff20cb0548ab700c167d78fa427663c5eaa [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "mymath.h"
68#include "unicodeobject.h"
69
70#if defined(HAVE_LIMITS_H)
71#include <limits.h>
72#else
73#define INT_MAX 2147483647
74#endif
75
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000076#ifdef MS_WIN32
77#include <windows.h>
78#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +000079/* Limit for the Unicode object free list */
80
81#define MAX_UNICODE_FREELIST_SIZE 1024
82
83/* Limit for the Unicode object free list stay alive optimization.
84
85 The implementation will keep allocated Unicode memory intact for
86 all objects on the free list having a size less than this
87 limit. This reduces malloc() overhead for small Unicode objects.
88
Barry Warsaw51ac5802000-03-20 16:36:48 +000089 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumd57fd912000-03-10 22:53:23 +000090 (sizeof(PyUnicodeObject) + STAYALIVE_SIZE_LIMIT +
91 malloc()-overhead) bytes of unused garbage.
92
93 Setting the limit to 0 effectively turns the feature off.
94
95 XXX The feature is currently turned off because there are
96 apparently some lingering bugs in its implementation which I
97 haven't yet been able to sort out.
98
99*/
100
101#define STAYALIVE_SIZE_LIMIT 0
102
103/* Endianness switches; defaults to little endian */
104
105#ifdef WORDS_BIGENDIAN
106# define BYTEORDER_IS_BIG_ENDIAN
107#else
108# define BYTEORDER_IS_LITTLE_ENDIAN
109#endif
110
111/* --- Globals ------------------------------------------------------------ */
112
113/* The empty Unicode object */
114static PyUnicodeObject *unicode_empty = NULL;
115
116/* Free list for Unicode objects */
117static PyUnicodeObject *unicode_freelist = NULL;
118static int unicode_freelist_size = 0;
119
120/* --- Unicode Object ----------------------------------------------------- */
121
122static
123int _PyUnicode_Resize(register PyUnicodeObject *unicode,
124 int length)
125{
126 void *oldstr;
127
128 /* Shortcut if there's nothing to do. */
129 if (unicode->length == length)
130 return 0;
131
132 /* Resizing unicode_empty is not allowed. */
133 if (unicode == unicode_empty) {
134 PyErr_SetString(PyExc_SystemError,
135 "can't resize empty unicode object");
136 return -1;
137 }
138
139 /* We allocate one more byte to make sure the string is
140 Ux0000 terminated -- XXX is this needed ? */
141 oldstr = unicode->str;
142 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
143 if (!unicode->str) {
144 unicode->str = oldstr;
145 PyErr_NoMemory();
146 return -1;
147 }
148 unicode->str[length] = 0;
149 unicode->length = length;
150
151 /* Reset the object caches */
152 if (unicode->utf8str) {
153 Py_DECREF(unicode->utf8str);
154 unicode->utf8str = NULL;
155 }
156 unicode->hash = -1;
157
158 return 0;
159}
160
/* We allocate one extra Py_UNICODE element (not just one byte) so that
   the buffer is always U+0000 terminated -- XXX is this needed?

   XXX This allocator could be further enhanced by ensuring that the
   free list never shrinks below a size of 1.

*/
168
169static
170PyUnicodeObject *_PyUnicode_New(int length)
171{
172 register PyUnicodeObject *unicode;
173
174 /* Optimization for empty strings */
175 if (length == 0 && unicode_empty != NULL) {
176 Py_INCREF(unicode_empty);
177 return unicode_empty;
178 }
179
180 /* Unicode freelist & memory allocation */
181 if (unicode_freelist) {
182 unicode = unicode_freelist;
183 unicode_freelist = *(PyUnicodeObject **)unicode_freelist;
184 unicode_freelist_size--;
185 unicode->ob_type = &PyUnicode_Type;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000186 _Py_NewReference((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 if (unicode->str) {
188 if (unicode->length < length &&
189 _PyUnicode_Resize(unicode, length)) {
190 free(unicode->str);
191 PyMem_DEL(unicode);
192 return NULL;
193 }
194 }
195 else
196 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
197 }
198 else {
199 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
200 if (unicode == NULL)
201 return NULL;
202 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
203 }
204
Barry Warsaw51ac5802000-03-20 16:36:48 +0000205 if (!unicode->str)
206 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 unicode->str[length] = 0;
208 unicode->length = length;
209 unicode->hash = -1;
210 unicode->utf8str = NULL;
211 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212
213 onError:
214 _Py_ForgetReference((PyObject *)unicode);
215 PyMem_DEL(unicode);
216 PyErr_NoMemory();
217 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218}
219
220static
221void _PyUnicode_Free(register PyUnicodeObject *unicode)
222{
223 Py_XDECREF(unicode->utf8str);
224 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
225 if (unicode->length >= STAYALIVE_SIZE_LIMIT) {
226 free(unicode->str);
227 unicode->str = NULL;
228 unicode->length = 0;
229 }
230 *(PyUnicodeObject **)unicode = unicode_freelist;
231 unicode_freelist = unicode;
232 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 }
234 else {
235 free(unicode->str);
236 PyMem_DEL(unicode);
237 }
238}
239
240PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
241 int size)
242{
243 PyUnicodeObject *unicode;
244
245 unicode = _PyUnicode_New(size);
246 if (!unicode)
247 return NULL;
248
249 /* Copy the Unicode data into the new object */
250 if (u != NULL)
251 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
252
253 return (PyObject *)unicode;
254}
255
256#ifdef HAVE_WCHAR_H
257
258PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
259 int size)
260{
261 PyUnicodeObject *unicode;
262
263 if (w == NULL) {
264 PyErr_BadInternalCall();
265 return NULL;
266 }
267
268 unicode = _PyUnicode_New(size);
269 if (!unicode)
270 return NULL;
271
272 /* Copy the wchar_t data into the new object */
273#ifdef HAVE_USABLE_WCHAR_T
274 memcpy(unicode->str, w, size * sizeof(wchar_t));
275#else
276 {
277 register Py_UNICODE *u;
278 register int i;
279 u = PyUnicode_AS_UNICODE(unicode);
280 for (i = size; i >= 0; i--)
281 *u++ = *w++;
282 }
283#endif
284
285 return (PyObject *)unicode;
286}
287
288int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
289 register wchar_t *w,
290 int size)
291{
292 if (unicode == NULL) {
293 PyErr_BadInternalCall();
294 return -1;
295 }
296 if (size > PyUnicode_GET_SIZE(unicode))
297 size = PyUnicode_GET_SIZE(unicode);
298#ifdef HAVE_USABLE_WCHAR_T
299 memcpy(w, unicode->str, size * sizeof(wchar_t));
300#else
301 {
302 register Py_UNICODE *u;
303 register int i;
304 u = PyUnicode_AS_UNICODE(unicode);
305 for (i = size; i >= 0; i--)
306 *w++ = *u++;
307 }
308#endif
309
310 return size;
311}
312
313#endif
314
315PyObject *PyUnicode_FromObject(register PyObject *obj)
316{
317 const char *s;
318 int len;
319
320 if (obj == NULL) {
321 PyErr_BadInternalCall();
322 return NULL;
323 }
324 else if (PyUnicode_Check(obj)) {
325 Py_INCREF(obj);
326 return obj;
327 }
328 else if (PyString_Check(obj)) {
329 s = PyString_AS_STRING(obj);
330 len = PyString_GET_SIZE(obj);
331 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000332 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
333 /* Overwrite the error message with something more useful in
334 case of a TypeError. */
335 if (PyErr_ExceptionMatches(PyExc_TypeError))
336 PyErr_SetString(PyExc_TypeError,
337 "coercing to Unicode: need string or charbuffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000338 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000339 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000340 if (len == 0) {
341 Py_INCREF(unicode_empty);
342 return (PyObject *)unicode_empty;
343 }
344 return PyUnicode_DecodeUTF8(s, len, "strict");
345}
346
347PyObject *PyUnicode_Decode(const char *s,
348 int size,
349 const char *encoding,
350 const char *errors)
351{
352 PyObject *buffer = NULL, *unicode;
353
354 /* Shortcut for the default encoding UTF-8 */
355 if (encoding == NULL ||
356 (strcmp(encoding, "utf-8") == 0))
357 return PyUnicode_DecodeUTF8(s, size, errors);
358
359 /* Decode via the codec registry */
360 buffer = PyBuffer_FromMemory((void *)s, size);
361 if (buffer == NULL)
362 goto onError;
363 unicode = PyCodec_Decode(buffer, encoding, errors);
364 if (unicode == NULL)
365 goto onError;
366 if (!PyUnicode_Check(unicode)) {
367 PyErr_Format(PyExc_TypeError,
368 "decoder did not return an unicode object (type=%s)",
369 unicode->ob_type->tp_name);
370 Py_DECREF(unicode);
371 goto onError;
372 }
373 Py_DECREF(buffer);
374 return unicode;
375
376 onError:
377 Py_XDECREF(buffer);
378 return NULL;
379}
380
381PyObject *PyUnicode_Encode(const Py_UNICODE *s,
382 int size,
383 const char *encoding,
384 const char *errors)
385{
386 PyObject *v, *unicode;
387
388 unicode = PyUnicode_FromUnicode(s, size);
389 if (unicode == NULL)
390 return NULL;
391 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
392 Py_DECREF(unicode);
393 return v;
394}
395
396PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
397 const char *encoding,
398 const char *errors)
399{
400 PyObject *v;
401
402 if (!PyUnicode_Check(unicode)) {
403 PyErr_BadArgument();
404 goto onError;
405 }
406 /* Shortcut for the default encoding UTF-8 */
407 if ((encoding == NULL ||
408 (strcmp(encoding, "utf-8") == 0)) &&
409 errors == NULL)
410 return PyUnicode_AsUTF8String(unicode);
411
412 /* Encode via the codec registry */
413 v = PyCodec_Encode(unicode, encoding, errors);
414 if (v == NULL)
415 goto onError;
416 /* XXX Should we really enforce this ? */
417 if (!PyString_Check(v)) {
418 PyErr_Format(PyExc_TypeError,
419 "encoder did not return a string object (type=%s)",
420 v->ob_type->tp_name);
421 Py_DECREF(v);
422 goto onError;
423 }
424 return v;
425
426 onError:
427 return NULL;
428}
429
430Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
431{
432 if (!PyUnicode_Check(unicode)) {
433 PyErr_BadArgument();
434 goto onError;
435 }
436 return PyUnicode_AS_UNICODE(unicode);
437
438 onError:
439 return NULL;
440}
441
442int PyUnicode_GetSize(PyObject *unicode)
443{
444 if (!PyUnicode_Check(unicode)) {
445 PyErr_BadArgument();
446 goto onError;
447 }
448 return PyUnicode_GET_SIZE(unicode);
449
450 onError:
451 return -1;
452}
453
454/* --- UTF-8 Codec -------------------------------------------------------- */
455
/* Map a UTF-8 lead byte to the length of the sequence it introduces.
   Zero means the byte is not a legal lead byte.  See RFC 2279 for
   details.  The table is never modified, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: not valid as a lead byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xFD: four-, five- and six-byte sequences; 0xFE, 0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
477
478static
479int utf8_decoding_error(const char **source,
480 Py_UNICODE **dest,
481 const char *errors,
482 const char *details)
483{
484 if ((errors == NULL) ||
485 (strcmp(errors,"strict") == 0)) {
486 PyErr_Format(PyExc_UnicodeError,
487 "UTF-8 decoding error: %s",
488 details);
489 return -1;
490 }
491 else if (strcmp(errors,"ignore") == 0) {
492 (*source)++;
493 return 0;
494 }
495 else if (strcmp(errors,"replace") == 0) {
496 (*source)++;
497 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
498 (*dest)++;
499 return 0;
500 }
501 else {
502 PyErr_Format(PyExc_ValueError,
Barry Warsaw51ac5802000-03-20 16:36:48 +0000503 "UTF-8 decoding error; unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000504 errors);
505 return -1;
506 }
507}
508
509#define UTF8_ERROR(details) do { \
510 if (utf8_decoding_error(&s, &p, errors, details)) \
511 goto onError; \
512 continue; \
513} while (0)
514
515PyObject *PyUnicode_DecodeUTF8(const char *s,
516 int size,
517 const char *errors)
518{
519 int n;
520 const char *e;
521 PyUnicodeObject *unicode;
522 Py_UNICODE *p;
523
524 /* Note: size will always be longer than the resulting Unicode
525 character count */
526 unicode = _PyUnicode_New(size);
527 if (!unicode)
528 return NULL;
529 if (size == 0)
530 return (PyObject *)unicode;
531
532 /* Unpack UTF-8 encoded data */
533 p = unicode->str;
534 e = s + size;
535
536 while (s < e) {
537 register Py_UNICODE ch = (unsigned char)*s;
538
539 if (ch < 0x80) {
540 *p++ = ch;
541 s++;
542 continue;
543 }
544
545 n = utf8_code_length[ch];
546
547 if (s + n > e)
548 UTF8_ERROR("unexpected end of data");
549
550 switch (n) {
551
552 case 0:
553 UTF8_ERROR("unexpected code byte");
554 break;
555
556 case 1:
557 UTF8_ERROR("internal error");
558 break;
559
560 case 2:
561 if ((s[1] & 0xc0) != 0x80)
562 UTF8_ERROR("invalid data");
563 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
564 if (ch < 0x80)
565 UTF8_ERROR("illegal encoding");
566 else
567 *p++ = ch;
568 break;
569
570 case 3:
571 if ((s[1] & 0xc0) != 0x80 ||
572 (s[2] & 0xc0) != 0x80)
573 UTF8_ERROR("invalid data");
574 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
575 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
576 UTF8_ERROR("illegal encoding");
577 else
578 *p++ = ch;
579 break;
580
581 default:
582 /* Other sizes are only needed for UCS-4 */
583 UTF8_ERROR("unsupported Unicode code range");
584 }
585 s += n;
586 }
587
588 /* Adjust length */
589 if (_PyUnicode_Resize(unicode, p - unicode->str))
590 goto onError;
591
592 return (PyObject *)unicode;
593
594onError:
595 Py_DECREF(unicode);
596 return NULL;
597}
598
599#undef UTF8_ERROR
600
601static
602int utf8_encoding_error(const Py_UNICODE **source,
603 char **dest,
604 const char *errors,
605 const char *details)
606{
607 if ((errors == NULL) ||
608 (strcmp(errors,"strict") == 0)) {
609 PyErr_Format(PyExc_UnicodeError,
610 "UTF-8 encoding error: %s",
611 details);
612 return -1;
613 }
614 else if (strcmp(errors,"ignore") == 0) {
615 return 0;
616 }
617 else if (strcmp(errors,"replace") == 0) {
618 **dest = '?';
619 (*dest)++;
620 return 0;
621 }
622 else {
623 PyErr_Format(PyExc_ValueError,
624 "UTF-8 encoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +0000625 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000626 errors);
627 return -1;
628 }
629}
630
631PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
632 int size,
633 const char *errors)
634{
635 PyObject *v;
636 char *p;
637 char *q;
638
639 v = PyString_FromStringAndSize(NULL, 3 * size);
640 if (v == NULL)
641 return NULL;
642 if (size == 0)
643 goto done;
644
645 p = q = PyString_AS_STRING(v);
646 while (size-- > 0) {
647 Py_UNICODE ch = *s++;
648 if (ch < 0x80)
649 *p++ = (char) ch;
650 else if (ch < 0x0800) {
651 *p++ = 0xc0 | (ch >> 6);
652 *p++ = 0x80 | (ch & 0x3f);
653 } else if (0xD800 <= ch && ch <= 0xDFFF) {
654 /* These byte ranges are reserved for UTF-16 surrogate
655 bytes which the Python implementation currently does
656 not support. */
657 printf("code range problem: U+%04x\n", ch);
658 if (utf8_encoding_error(&s, &p, errors,
659 "unsupported code range"))
660 goto onError;
661 } else {
662 *p++ = 0xe0 | (ch >> 12);
663 *p++ = 0x80 | ((ch >> 6) & 0x3f);
664 *p++ = 0x80 | (ch & 0x3f);
665 }
666 }
667 *p = '\0';
668 _PyString_Resize(&v, p - q);
669
670 done:
671 return v;
672
673 onError:
674 Py_DECREF(v);
675 return NULL;
676}
677
678/* Return a Python string holding the UTF-8 encoded value of the
679 Unicode object.
680
681 The resulting string is cached in the Unicode object for subsequent
682 usage by this function. The cached version is needed to implement
683 the character buffer interface.
684
685 The refcount of the string is *not* incremented.
686
687*/
688
689static
690PyObject *utf8_string(PyUnicodeObject *self,
691 const char *errors)
692{
693 PyObject *v = self->utf8str;
694
695 if (v)
696 return v;
697 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(self),
698 PyUnicode_GET_SIZE(self),
699 errors);
700 if (v && errors == NULL)
701 self->utf8str = v;
702 return v;
703}
704
705PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
706{
707 PyObject *str;
708
709 if (!PyUnicode_Check(unicode)) {
710 PyErr_BadArgument();
711 return NULL;
712 }
713 str = utf8_string((PyUnicodeObject *)unicode, NULL);
714 if (str == NULL)
715 return NULL;
716 Py_INCREF(str);
717 return str;
718}
719
720/* --- UTF-16 Codec ------------------------------------------------------- */
721
722static
723int utf16_decoding_error(const Py_UNICODE **source,
724 Py_UNICODE **dest,
725 const char *errors,
726 const char *details)
727{
728 if ((errors == NULL) ||
729 (strcmp(errors,"strict") == 0)) {
730 PyErr_Format(PyExc_UnicodeError,
731 "UTF-16 decoding error: %s",
732 details);
733 return -1;
734 }
735 else if (strcmp(errors,"ignore") == 0) {
736 return 0;
737 }
738 else if (strcmp(errors,"replace") == 0) {
739 if (dest) {
740 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
741 (*dest)++;
742 }
743 return 0;
744 }
745 else {
746 PyErr_Format(PyExc_ValueError,
Barry Warsaw51ac5802000-03-20 16:36:48 +0000747 "UTF-16 decoding error; unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000748 errors);
749 return -1;
750 }
751}
752
753#define UTF16_ERROR(details) do { \
754 if (utf16_decoding_error(&q, &p, errors, details)) \
755 goto onError; \
756 continue; \
757} while(0)
758
759PyObject *PyUnicode_DecodeUTF16(const char *s,
760 int size,
761 const char *errors,
762 int *byteorder)
763{
764 PyUnicodeObject *unicode;
765 Py_UNICODE *p;
766 const Py_UNICODE *q, *e;
767 int bo = 0;
768
769 /* size should be an even number */
770 if (size % sizeof(Py_UNICODE) != 0) {
771 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
772 return NULL;
773 /* The remaining input chars are ignored if we fall through
774 here... */
775 }
776
777 /* Note: size will always be longer than the resulting Unicode
778 character count */
779 unicode = _PyUnicode_New(size);
780 if (!unicode)
781 return NULL;
782 if (size == 0)
783 return (PyObject *)unicode;
784
785 /* Unpack UTF-16 encoded data */
786 p = unicode->str;
787 q = (Py_UNICODE *)s;
788 e = q + (size / sizeof(Py_UNICODE));
789
790 if (byteorder)
791 bo = *byteorder;
792
793 while (q < e) {
794 register Py_UNICODE ch = *q++;
795
796 /* Check for BOM marks (U+FEFF) in the input and adjust
797 current byte order setting accordingly. Swap input
798 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
799 !) */
800#ifdef BYTEORDER_IS_LITTLE_ENDIAN
801 if (ch == 0xFEFF) {
802 bo = -1;
803 continue;
804 } else if (ch == 0xFFFE) {
805 bo = 1;
806 continue;
807 }
808 if (bo == 1)
809 ch = (ch >> 8) | (ch << 8);
810#else
811 if (ch == 0xFEFF) {
812 bo = 1;
813 continue;
814 } else if (ch == 0xFFFE) {
815 bo = -1;
816 continue;
817 }
818 if (bo == -1)
819 ch = (ch >> 8) | (ch << 8);
820#endif
821 if (ch < 0xD800 || ch > 0xDFFF) {
822 *p++ = ch;
823 continue;
824 }
825
826 /* UTF-16 code pair: */
827 if (q >= e)
828 UTF16_ERROR("unexpected end of data");
829 if (0xDC00 <= *q && *q <= 0xDFFF) {
830 q++;
831 if (0xD800 <= *q && *q <= 0xDBFF)
832 /* This is valid data (a UTF-16 surrogate pair), but
833 we are not able to store this information since our
834 Py_UNICODE type only has 16 bits... this might
835 change someday, even though it's unlikely. */
836 UTF16_ERROR("code pairs are not supported");
837 else
838 continue;
839 }
840 UTF16_ERROR("illegal encoding");
841 }
842
843 if (byteorder)
844 *byteorder = bo;
845
846 /* Adjust length */
847 if (_PyUnicode_Resize(unicode, p - unicode->str))
848 goto onError;
849
850 return (PyObject *)unicode;
851
852onError:
853 Py_DECREF(unicode);
854 return NULL;
855}
856
857#undef UTF16_ERROR
858
859PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
860 int size,
861 const char *errors,
862 int byteorder)
863{
864 PyObject *v;
865 Py_UNICODE *p;
866 char *q;
867
868 /* We don't create UTF-16 pairs... */
869 v = PyString_FromStringAndSize(NULL,
870 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
871 if (v == NULL)
872 return NULL;
873 if (size == 0)
874 goto done;
875
876 q = PyString_AS_STRING(v);
877 p = (Py_UNICODE *)q;
878
879 if (byteorder == 0)
880 *p++ = 0xFEFF;
881 if (byteorder == 0 ||
882#ifdef BYTEORDER_IS_LITTLE_ENDIAN
883 byteorder == -1
884#else
885 byteorder == 1
886#endif
887 )
888 memcpy(p, s, size * sizeof(Py_UNICODE));
889 else
890 while (size-- > 0) {
891 Py_UNICODE ch = *s++;
892 *p++ = (ch >> 8) | (ch << 8);
893 }
894 done:
895 return v;
896}
897
898PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
899{
900 if (!PyUnicode_Check(unicode)) {
901 PyErr_BadArgument();
902 return NULL;
903 }
904 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
905 PyUnicode_GET_SIZE(unicode),
906 NULL,
907 0);
908}
909
910/* --- Unicode Escape Codec ----------------------------------------------- */
911
912static
913int unicodeescape_decoding_error(const char **source,
914 unsigned int *x,
915 const char *errors,
916 const char *details)
917{
918 if ((errors == NULL) ||
919 (strcmp(errors,"strict") == 0)) {
920 PyErr_Format(PyExc_UnicodeError,
921 "Unicode-Escape decoding error: %s",
922 details);
923 return -1;
924 }
925 else if (strcmp(errors,"ignore") == 0) {
926 return 0;
927 }
928 else if (strcmp(errors,"replace") == 0) {
929 *x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
930 return 0;
931 }
932 else {
933 PyErr_Format(PyExc_ValueError,
934 "Unicode-Escape decoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +0000935 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000936 errors);
937 return -1;
938 }
939}
940
941PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
942 int size,
943 const char *errors)
944{
945 PyUnicodeObject *v;
946 Py_UNICODE *p = NULL, *buf = NULL;
947 const char *end;
948
949 /* Escaped strings will always be longer than the resulting
950 Unicode string, so we start with size here and then reduce the
951 length after conversion to the true value. */
952 v = _PyUnicode_New(size);
953 if (v == NULL)
954 goto onError;
955 if (size == 0)
956 return (PyObject *)v;
957 p = buf = PyUnicode_AS_UNICODE(v);
958 end = s + size;
959 while (s < end) {
960 unsigned char c;
961 unsigned int x;
962 int i;
963
964 /* Non-escape characters are interpreted as Unicode ordinals */
965 if (*s != '\\') {
966 *p++ = (unsigned char)*s++;
967 continue;
968 }
969
970 /* \ - Escapes */
971 s++;
972 switch (*s++) {
973
974 /* \x escapes */
975 case '\n': break;
976 case '\\': *p++ = '\\'; break;
977 case '\'': *p++ = '\''; break;
978 case '\"': *p++ = '\"'; break;
979 case 'b': *p++ = '\b'; break;
980 case 'f': *p++ = '\014'; break; /* FF */
981 case 't': *p++ = '\t'; break;
982 case 'n': *p++ = '\n'; break;
983 case 'r': *p++ = '\r'; break;
984 case 'v': *p++ = '\013'; break; /* VT */
985 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
986
987 /* \OOO (octal) escapes */
988 case '0': case '1': case '2': case '3':
989 case '4': case '5': case '6': case '7':
990 c = s[-1] - '0';
991 if ('0' <= *s && *s <= '7') {
992 c = (c<<3) + *s++ - '0';
993 if ('0' <= *s && *s <= '7')
994 c = (c<<3) + *s++ - '0';
995 }
996 *p++ = c;
997 break;
998
999 /* \xXXXX escape with 0-4 hex digits */
1000 case 'x':
1001 x = 0;
1002 c = (unsigned char)*s;
1003 if (isxdigit(c)) {
1004 do {
1005 x = (x<<4) & ~0xF;
1006 if ('0' <= c && c <= '9')
1007 x += c - '0';
1008 else if ('a' <= c && c <= 'f')
1009 x += 10 + c - 'a';
1010 else
1011 x += 10 + c - 'A';
1012 c = (unsigned char)*++s;
1013 } while (isxdigit(c));
1014 *p++ = x;
1015 } else {
1016 *p++ = '\\';
1017 *p++ = (unsigned char)s[-1];
1018 }
1019 break;
1020
1021 /* \uXXXX with 4 hex digits */
1022 case 'u':
1023 for (x = 0, i = 0; i < 4; i++) {
1024 c = (unsigned char)s[i];
1025 if (!isxdigit(c)) {
1026 if (unicodeescape_decoding_error(&s, &x, errors,
1027 "truncated \\uXXXX"))
1028 goto onError;
1029 i++;
1030 break;
1031 }
1032 x = (x<<4) & ~0xF;
1033 if (c >= '0' && c <= '9')
1034 x += c - '0';
1035 else if (c >= 'a' && c <= 'f')
1036 x += 10 + c - 'a';
1037 else
1038 x += 10 + c - 'A';
1039 }
1040 s += i;
1041 *p++ = x;
1042 break;
1043
1044 default:
1045 *p++ = '\\';
1046 *p++ = (unsigned char)s[-1];
1047 break;
1048 }
1049 }
1050 _PyUnicode_Resize(v, (int)(p - buf));
1051 return (PyObject *)v;
1052
1053 onError:
1054 Py_XDECREF(v);
1055 return NULL;
1056}
1057
1058/* Return a Unicode-Escape string version of the Unicode object.
1059
1060 If quotes is true, the string is enclosed in u"" or u'' quotes as
1061 appropriate.
1062
1063*/
1064
Barry Warsaw51ac5802000-03-20 16:36:48 +00001065static const Py_UNICODE *findchar(const Py_UNICODE *s,
1066 int size,
1067 Py_UNICODE ch);
1068
Guido van Rossumd57fd912000-03-10 22:53:23 +00001069static
1070PyObject *unicodeescape_string(const Py_UNICODE *s,
1071 int size,
1072 int quotes)
1073{
1074 PyObject *repr;
1075 char *p;
1076 char *q;
1077
1078 static const char *hexdigit = "0123456789ABCDEF";
1079
1080 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1081 if (repr == NULL)
1082 return NULL;
1083
1084 p = q = PyString_AS_STRING(repr);
1085
1086 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001087 *p++ = 'u';
1088 *p++ = (findchar(s, size, '\'') &&
1089 !findchar(s, size, '"')) ? '"' : '\'';
1090 }
1091 while (size-- > 0) {
1092 Py_UNICODE ch = *s++;
1093 /* Escape quotes */
1094 if (quotes && (ch == q[1] || ch == '\\')) {
1095 *p++ = '\\';
1096 *p++ = (char) ch;
1097 }
1098 /* Map 16-bit characters to '\uxxxx' */
1099 else if (ch >= 256) {
1100 *p++ = '\\';
1101 *p++ = 'u';
1102 *p++ = hexdigit[(ch >> 12) & 0xf];
1103 *p++ = hexdigit[(ch >> 8) & 0xf];
1104 *p++ = hexdigit[(ch >> 4) & 0xf];
1105 *p++ = hexdigit[ch & 15];
1106 }
1107 /* Map non-printable US ASCII to '\ooo' */
1108 else if (ch < ' ' || ch >= 128) {
1109 *p++ = '\\';
1110 *p++ = hexdigit[(ch >> 6) & 7];
1111 *p++ = hexdigit[(ch >> 3) & 7];
1112 *p++ = hexdigit[ch & 7];
1113 }
1114 /* Copy everything else as-is */
1115 else
1116 *p++ = (char) ch;
1117 }
1118 if (quotes)
1119 *p++ = q[1];
1120
1121 *p = '\0';
1122 _PyString_Resize(&repr, p - q);
1123
1124 return repr;
1125}
1126
1127PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1128 int size)
1129{
1130 return unicodeescape_string(s, size, 0);
1131}
1132
1133PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1134{
1135 if (!PyUnicode_Check(unicode)) {
1136 PyErr_BadArgument();
1137 return NULL;
1138 }
1139 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1140 PyUnicode_GET_SIZE(unicode));
1141}
1142
1143/* --- Raw Unicode Escape Codec ------------------------------------------- */
1144
1145PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1146 int size,
1147 const char *errors)
1148{
1149 PyUnicodeObject *v;
1150 Py_UNICODE *p, *buf;
1151 const char *end;
1152 const char *bs;
1153
1154 /* Escaped strings will always be longer than the resulting
1155 Unicode string, so we start with size here and then reduce the
1156 length after conversion to the true value. */
1157 v = _PyUnicode_New(size);
1158 if (v == NULL)
1159 goto onError;
1160 if (size == 0)
1161 return (PyObject *)v;
1162 p = buf = PyUnicode_AS_UNICODE(v);
1163 end = s + size;
1164 while (s < end) {
1165 unsigned char c;
1166 unsigned int x;
1167 int i;
1168
1169 /* Non-escape characters are interpreted as Unicode ordinals */
1170 if (*s != '\\') {
1171 *p++ = (unsigned char)*s++;
1172 continue;
1173 }
1174
1175 /* \u-escapes are only interpreted iff the number of leading
1176 backslashes if odd */
1177 bs = s;
1178 for (;s < end;) {
1179 if (*s != '\\')
1180 break;
1181 *p++ = (unsigned char)*s++;
1182 }
1183 if (((s - bs) & 1) == 0 ||
1184 s >= end ||
1185 *s != 'u') {
1186 continue;
1187 }
1188 p--;
1189 s++;
1190
1191 /* \uXXXX with 4 hex digits */
1192 for (x = 0, i = 0; i < 4; i++) {
1193 c = (unsigned char)s[i];
1194 if (!isxdigit(c)) {
1195 if (unicodeescape_decoding_error(&s, &x, errors,
1196 "truncated \\uXXXX"))
1197 goto onError;
1198 i++;
1199 break;
1200 }
1201 x = (x<<4) & ~0xF;
1202 if (c >= '0' && c <= '9')
1203 x += c - '0';
1204 else if (c >= 'a' && c <= 'f')
1205 x += 10 + c - 'a';
1206 else
1207 x += 10 + c - 'A';
1208 }
1209 s += i;
1210 *p++ = x;
1211 }
1212 _PyUnicode_Resize(v, (int)(p - buf));
1213 return (PyObject *)v;
1214
1215 onError:
1216 Py_XDECREF(v);
1217 return NULL;
1218}
1219
1220PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1221 int size)
1222{
1223 PyObject *repr;
1224 char *p;
1225 char *q;
1226
1227 static const char *hexdigit = "0123456789ABCDEF";
1228
1229 repr = PyString_FromStringAndSize(NULL, 6 * size);
1230 if (repr == NULL)
1231 return NULL;
1232
1233 p = q = PyString_AS_STRING(repr);
1234 while (size-- > 0) {
1235 Py_UNICODE ch = *s++;
1236 /* Map 16-bit characters to '\uxxxx' */
1237 if (ch >= 256) {
1238 *p++ = '\\';
1239 *p++ = 'u';
1240 *p++ = hexdigit[(ch >> 12) & 0xf];
1241 *p++ = hexdigit[(ch >> 8) & 0xf];
1242 *p++ = hexdigit[(ch >> 4) & 0xf];
1243 *p++ = hexdigit[ch & 15];
1244 }
1245 /* Copy everything else as-is */
1246 else
1247 *p++ = (char) ch;
1248 }
1249 *p = '\0';
1250 _PyString_Resize(&repr, p - q);
1251
1252 return repr;
1253}
1254
1255PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1256{
1257 if (!PyUnicode_Check(unicode)) {
1258 PyErr_BadArgument();
1259 return NULL;
1260 }
1261 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1262 PyUnicode_GET_SIZE(unicode));
1263}
1264
1265/* --- Latin-1 Codec ------------------------------------------------------ */
1266
1267PyObject *PyUnicode_DecodeLatin1(const char *s,
1268 int size,
1269 const char *errors)
1270{
1271 PyUnicodeObject *v;
1272 Py_UNICODE *p;
1273
1274 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1275 v = _PyUnicode_New(size);
1276 if (v == NULL)
1277 goto onError;
1278 if (size == 0)
1279 return (PyObject *)v;
1280 p = PyUnicode_AS_UNICODE(v);
1281 while (size-- > 0)
1282 *p++ = (unsigned char)*s++;
1283 return (PyObject *)v;
1284
1285 onError:
1286 Py_XDECREF(v);
1287 return NULL;
1288}
1289
1290static
1291int latin1_encoding_error(const Py_UNICODE **source,
1292 char **dest,
1293 const char *errors,
1294 const char *details)
1295{
1296 if ((errors == NULL) ||
1297 (strcmp(errors,"strict") == 0)) {
1298 PyErr_Format(PyExc_UnicodeError,
1299 "Latin-1 encoding error: %s",
1300 details);
1301 return -1;
1302 }
1303 else if (strcmp(errors,"ignore") == 0) {
1304 return 0;
1305 }
1306 else if (strcmp(errors,"replace") == 0) {
1307 **dest = '?';
1308 return 0;
1309 }
1310 else {
1311 PyErr_Format(PyExc_ValueError,
1312 "Latin-1 encoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001313 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001314 errors);
1315 return -1;
1316 }
1317}
1318
1319PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1320 int size,
1321 const char *errors)
1322{
1323 PyObject *repr;
1324 char *s;
1325 repr = PyString_FromStringAndSize(NULL, size);
1326 if (repr == NULL)
1327 return NULL;
1328
1329 s = PyString_AS_STRING(repr);
1330 while (size-- > 0) {
1331 Py_UNICODE ch = *p++;
1332 if (ch >= 256) {
1333 if (latin1_encoding_error(&p, &s, errors,
1334 "ordinal not in range(256)"))
1335 goto onError;
1336 }
1337 else
1338 *s++ = (char)ch;
1339 }
1340 return repr;
1341
1342 onError:
1343 Py_DECREF(repr);
1344 return NULL;
1345}
1346
1347PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1348{
1349 if (!PyUnicode_Check(unicode)) {
1350 PyErr_BadArgument();
1351 return NULL;
1352 }
1353 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1354 PyUnicode_GET_SIZE(unicode),
1355 NULL);
1356}
1357
1358/* --- 7-bit ASCII Codec -------------------------------------------------- */
1359
1360static
1361int ascii_decoding_error(const char **source,
1362 Py_UNICODE **dest,
1363 const char *errors,
1364 const char *details)
1365{
1366 if ((errors == NULL) ||
1367 (strcmp(errors,"strict") == 0)) {
1368 PyErr_Format(PyExc_UnicodeError,
1369 "ASCII decoding error: %s",
1370 details);
1371 return -1;
1372 }
1373 else if (strcmp(errors,"ignore") == 0) {
1374 return 0;
1375 }
1376 else if (strcmp(errors,"replace") == 0) {
1377 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1378 (*dest)++;
1379 return 0;
1380 }
1381 else {
1382 PyErr_Format(PyExc_ValueError,
1383 "ASCII decoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001384 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001385 errors);
1386 return -1;
1387 }
1388}
1389
1390PyObject *PyUnicode_DecodeASCII(const char *s,
1391 int size,
1392 const char *errors)
1393{
1394 PyUnicodeObject *v;
1395 Py_UNICODE *p;
1396
1397 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1398 v = _PyUnicode_New(size);
1399 if (v == NULL)
1400 goto onError;
1401 if (size == 0)
1402 return (PyObject *)v;
1403 p = PyUnicode_AS_UNICODE(v);
1404 while (size-- > 0) {
1405 register unsigned char c;
1406
1407 c = (unsigned char)*s++;
1408 if (c < 128)
1409 *p++ = c;
1410 else if (ascii_decoding_error(&s, &p, errors,
1411 "ordinal not in range(128)"))
1412 goto onError;
1413 }
1414 if (p - PyUnicode_AS_UNICODE(v) < size)
1415 _PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));
1416 return (PyObject *)v;
1417
1418 onError:
1419 Py_XDECREF(v);
1420 return NULL;
1421}
1422
1423static
1424int ascii_encoding_error(const Py_UNICODE **source,
1425 char **dest,
1426 const char *errors,
1427 const char *details)
1428{
1429 if ((errors == NULL) ||
1430 (strcmp(errors,"strict") == 0)) {
1431 PyErr_Format(PyExc_UnicodeError,
1432 "ASCII encoding error: %s",
1433 details);
1434 return -1;
1435 }
1436 else if (strcmp(errors,"ignore") == 0) {
1437 return 0;
1438 }
1439 else if (strcmp(errors,"replace") == 0) {
1440 **dest = '?';
1441 return 0;
1442 }
1443 else {
1444 PyErr_Format(PyExc_ValueError,
1445 "ASCII encoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001446 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001447 errors);
1448 return -1;
1449 }
1450}
1451
1452PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1453 int size,
1454 const char *errors)
1455{
1456 PyObject *repr;
1457 char *s;
1458 repr = PyString_FromStringAndSize(NULL, size);
1459 if (repr == NULL)
1460 return NULL;
1461
1462 s = PyString_AS_STRING(repr);
1463 while (size-- > 0) {
1464 Py_UNICODE ch = *p++;
1465 if (ch >= 128) {
1466 if (ascii_encoding_error(&p, &s, errors,
1467 "ordinal not in range(128)"))
1468 goto onError;
1469 }
1470 else
1471 *s++ = (char)ch;
1472 }
1473 return repr;
1474
1475 onError:
1476 Py_DECREF(repr);
1477 return NULL;
1478}
1479
1480PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1481{
1482 if (!PyUnicode_Check(unicode)) {
1483 PyErr_BadArgument();
1484 return NULL;
1485 }
1486 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1487 PyUnicode_GET_SIZE(unicode),
1488 NULL);
1489}
1490
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001491#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001492
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001493/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001494
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001495PyObject *PyUnicode_DecodeMBCS(const char *s,
1496 int size,
1497 const char *errors)
1498{
1499 PyUnicodeObject *v;
1500 Py_UNICODE *p;
1501
1502 /* First get the size of the result */
1503 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
1504 if (usize==0)
1505 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1506
1507 v = _PyUnicode_New(usize);
1508 if (v == NULL)
1509 return NULL;
1510 if (usize == 0)
1511 return (PyObject *)v;
1512 p = PyUnicode_AS_UNICODE(v);
1513 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1514 Py_DECREF(v);
1515 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1516 }
1517
1518 return (PyObject *)v;
1519}
1520
1521PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1522 int size,
1523 const char *errors)
1524{
1525 PyObject *repr;
1526 char *s;
1527
1528 /* First get the size of the result */
1529 DWORD mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
1530 if (mbcssize==0)
1531 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1532
1533 repr = PyString_FromStringAndSize(NULL, mbcssize);
1534 if (repr == NULL)
1535 return NULL;
1536 if (mbcssize==0)
1537 return repr;
1538
1539 /* Do the conversion */
1540 s = PyString_AS_STRING(repr);
1541 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1542 Py_DECREF(repr);
1543 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1544 }
1545 return repr;
1546}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001547
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001548#endif /* MS_WIN32 */
1549
Guido van Rossumd57fd912000-03-10 22:53:23 +00001550/* --- Character Mapping Codec -------------------------------------------- */
1551
1552static
1553int charmap_decoding_error(const char **source,
1554 Py_UNICODE **dest,
1555 const char *errors,
1556 const char *details)
1557{
1558 if ((errors == NULL) ||
1559 (strcmp(errors,"strict") == 0)) {
1560 PyErr_Format(PyExc_UnicodeError,
1561 "charmap decoding error: %s",
1562 details);
1563 return -1;
1564 }
1565 else if (strcmp(errors,"ignore") == 0) {
1566 return 0;
1567 }
1568 else if (strcmp(errors,"replace") == 0) {
1569 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1570 (*dest)++;
1571 return 0;
1572 }
1573 else {
1574 PyErr_Format(PyExc_ValueError,
1575 "charmap decoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001576 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001577 errors);
1578 return -1;
1579 }
1580}
1581
1582PyObject *PyUnicode_DecodeCharmap(const char *s,
1583 int size,
1584 PyObject *mapping,
1585 const char *errors)
1586{
1587 PyUnicodeObject *v;
1588 Py_UNICODE *p;
1589
1590 /* Default to Latin-1 */
1591 if (mapping == NULL)
1592 return PyUnicode_DecodeLatin1(s, size, errors);
1593
1594 v = _PyUnicode_New(size);
1595 if (v == NULL)
1596 goto onError;
1597 if (size == 0)
1598 return (PyObject *)v;
1599 p = PyUnicode_AS_UNICODE(v);
1600 while (size-- > 0) {
1601 unsigned char ch = *s++;
1602 PyObject *w, *x;
1603
1604 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1605 w = PyInt_FromLong((long)ch);
1606 if (w == NULL)
1607 goto onError;
1608 x = PyObject_GetItem(mapping, w);
1609 Py_DECREF(w);
1610 if (x == NULL) {
1611 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1612 /* No mapping found: default to Latin-1 mapping */
1613 PyErr_Clear();
1614 *p++ = (Py_UNICODE)ch;
1615 continue;
1616 }
1617 goto onError;
1618 }
1619
1620 /* Apply mapping */
1621 if (PyInt_Check(x)) {
1622 int value = PyInt_AS_LONG(x);
1623 if (value < 0 || value > 65535) {
1624 PyErr_SetString(PyExc_TypeError,
1625 "character mapping must be in range(65336)");
1626 Py_DECREF(x);
1627 goto onError;
1628 }
1629 *p++ = (Py_UNICODE)value;
1630 }
1631 else if (x == Py_None) {
1632 /* undefined mapping */
1633 if (charmap_decoding_error(&s, &p, errors,
1634 "character maps to <undefined>")) {
1635 Py_DECREF(x);
1636 goto onError;
1637 }
1638 }
1639 else if (PyUnicode_Check(x)) {
1640 if (PyUnicode_GET_SIZE(x) != 1) {
1641 /* 1-n mapping */
1642 PyErr_SetString(PyExc_NotImplementedError,
1643 "1-n mappings are currently not implemented");
1644 Py_DECREF(x);
1645 goto onError;
1646 }
1647 *p++ = *PyUnicode_AS_UNICODE(x);
1648 }
1649 else {
1650 /* wrong return value */
1651 PyErr_SetString(PyExc_TypeError,
1652 "character mapping must return integer, None or unicode");
1653 Py_DECREF(x);
1654 goto onError;
1655 }
1656 Py_DECREF(x);
1657 }
1658 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1659 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1660 goto onError;
1661 return (PyObject *)v;
1662
1663 onError:
1664 Py_XDECREF(v);
1665 return NULL;
1666}
1667
1668static
1669int charmap_encoding_error(const Py_UNICODE **source,
1670 char **dest,
1671 const char *errors,
1672 const char *details)
1673{
1674 if ((errors == NULL) ||
1675 (strcmp(errors,"strict") == 0)) {
1676 PyErr_Format(PyExc_UnicodeError,
1677 "charmap encoding error: %s",
1678 details);
1679 return -1;
1680 }
1681 else if (strcmp(errors,"ignore") == 0) {
1682 return 0;
1683 }
1684 else if (strcmp(errors,"replace") == 0) {
1685 **dest = '?';
1686 (*dest)++;
1687 return 0;
1688 }
1689 else {
1690 PyErr_Format(PyExc_ValueError,
1691 "charmap encoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001692 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001693 errors);
1694 return -1;
1695 }
1696}
1697
1698PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
1699 int size,
1700 PyObject *mapping,
1701 const char *errors)
1702{
1703 PyObject *v;
1704 char *s;
1705
1706 /* Default to Latin-1 */
1707 if (mapping == NULL)
1708 return PyUnicode_EncodeLatin1(p, size, errors);
1709
1710 v = PyString_FromStringAndSize(NULL, size);
1711 if (v == NULL)
1712 return NULL;
1713 s = PyString_AS_STRING(v);
1714 while (size-- > 0) {
1715 Py_UNICODE ch = *p++;
1716 PyObject *w, *x;
1717
1718 /* Get mapping (Unicode ordinal -> string char, integer or None) */
1719 w = PyInt_FromLong((long)ch);
1720 if (w == NULL)
1721 goto onError;
1722 x = PyObject_GetItem(mapping, w);
1723 Py_DECREF(w);
1724 if (x == NULL) {
1725 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1726 /* No mapping found: default to Latin-1 mapping if possible */
1727 PyErr_Clear();
1728 if (ch < 256) {
1729 *s++ = (char)ch;
1730 continue;
1731 }
1732 else if (!charmap_encoding_error(&p, &s, errors,
1733 "missing character mapping"))
1734 continue;
1735 }
1736 goto onError;
1737 }
1738
1739 /* Apply mapping */
1740 if (PyInt_Check(x)) {
1741 int value = PyInt_AS_LONG(x);
1742 if (value < 0 || value > 255) {
1743 PyErr_SetString(PyExc_TypeError,
1744 "character mapping must be in range(256)");
1745 Py_DECREF(x);
1746 goto onError;
1747 }
1748 *s++ = (char)value;
1749 }
1750 else if (x == Py_None) {
1751 /* undefined mapping */
1752 if (charmap_encoding_error(&p, &s, errors,
1753 "character maps to <undefined>")) {
1754 Py_DECREF(x);
1755 goto onError;
1756 }
1757 }
1758 else if (PyString_Check(x)) {
1759 if (PyString_GET_SIZE(x) != 1) {
1760 /* 1-n mapping */
1761 PyErr_SetString(PyExc_NotImplementedError,
1762 "1-n mappings are currently not implemented");
1763 Py_DECREF(x);
1764 goto onError;
1765 }
1766 *s++ = *PyString_AS_STRING(x);
1767 }
1768 else {
1769 /* wrong return value */
1770 PyErr_SetString(PyExc_TypeError,
1771 "character mapping must return integer, None or unicode");
1772 Py_DECREF(x);
1773 goto onError;
1774 }
1775 Py_DECREF(x);
1776 }
1777 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
1778 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
1779 goto onError;
1780 return v;
1781
1782 onError:
1783 Py_DECREF(v);
1784 return NULL;
1785}
1786
1787PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
1788 PyObject *mapping)
1789{
1790 if (!PyUnicode_Check(unicode) || mapping == NULL) {
1791 PyErr_BadArgument();
1792 return NULL;
1793 }
1794 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
1795 PyUnicode_GET_SIZE(unicode),
1796 mapping,
1797 NULL);
1798}
1799
1800static
1801int translate_error(const Py_UNICODE **source,
1802 Py_UNICODE **dest,
1803 const char *errors,
1804 const char *details)
1805{
1806 if ((errors == NULL) ||
1807 (strcmp(errors,"strict") == 0)) {
1808 PyErr_Format(PyExc_UnicodeError,
1809 "translate error: %s",
1810 details);
1811 return -1;
1812 }
1813 else if (strcmp(errors,"ignore") == 0) {
1814 return 0;
1815 }
1816 else if (strcmp(errors,"replace") == 0) {
1817 **dest = '?';
1818 (*dest)++;
1819 return 0;
1820 }
1821 else {
1822 PyErr_Format(PyExc_ValueError,
1823 "translate error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001824 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001825 errors);
1826 return -1;
1827 }
1828}
1829
1830PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
1831 int size,
1832 PyObject *mapping,
1833 const char *errors)
1834{
1835 PyUnicodeObject *v;
1836 Py_UNICODE *p;
1837
1838 if (mapping == NULL) {
1839 PyErr_BadArgument();
1840 return NULL;
1841 }
1842
1843 /* Output will never be longer than input */
1844 v = _PyUnicode_New(size);
1845 if (v == NULL)
1846 goto onError;
1847 if (size == 0)
1848 goto done;
1849 p = PyUnicode_AS_UNICODE(v);
1850 while (size-- > 0) {
1851 Py_UNICODE ch = *s++;
1852 PyObject *w, *x;
1853
1854 /* Get mapping */
1855 w = PyInt_FromLong(ch);
1856 if (w == NULL)
1857 goto onError;
1858 x = PyObject_GetItem(mapping, w);
1859 Py_DECREF(w);
1860 if (x == NULL) {
1861 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1862 /* No mapping found: default to 1-1 mapping */
1863 PyErr_Clear();
1864 *p++ = ch;
1865 continue;
1866 }
1867 goto onError;
1868 }
1869
1870 /* Apply mapping */
1871 if (PyInt_Check(x))
1872 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
1873 else if (x == Py_None) {
1874 /* undefined mapping */
1875 if (translate_error(&s, &p, errors,
1876 "character maps to <undefined>")) {
1877 Py_DECREF(x);
1878 goto onError;
1879 }
1880 }
1881 else if (PyUnicode_Check(x)) {
1882 if (PyUnicode_GET_SIZE(x) != 1) {
1883 /* 1-n mapping */
1884 PyErr_SetString(PyExc_NotImplementedError,
1885 "1-n mappings are currently not implemented");
1886 Py_DECREF(x);
1887 goto onError;
1888 }
1889 *p++ = *PyUnicode_AS_UNICODE(x);
1890 }
1891 else {
1892 /* wrong return value */
1893 PyErr_SetString(PyExc_TypeError,
1894 "translate mapping must return integer, None or unicode");
1895 Py_DECREF(x);
1896 goto onError;
1897 }
1898 Py_DECREF(x);
1899 }
1900 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1901 _PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));
1902
1903 done:
1904 return (PyObject *)v;
1905
1906 onError:
1907 Py_XDECREF(v);
1908 return NULL;
1909}
1910
1911PyObject *PyUnicode_Translate(PyObject *str,
1912 PyObject *mapping,
1913 const char *errors)
1914{
1915 PyObject *result;
1916
1917 str = PyUnicode_FromObject(str);
1918 if (str == NULL)
1919 goto onError;
1920 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
1921 PyUnicode_GET_SIZE(str),
1922 mapping,
1923 errors);
1924 Py_DECREF(str);
1925 return result;
1926
1927 onError:
1928 Py_XDECREF(str);
1929 return NULL;
1930}
1931
Guido van Rossum9e896b32000-04-05 20:11:21 +00001932/* --- Decimal Encoder ---------------------------------------------------- */
1933
1934int PyUnicode_EncodeDecimal(Py_UNICODE *s,
1935 int length,
1936 char *output,
1937 const char *errors)
1938{
1939 Py_UNICODE *p, *end;
1940
1941 if (output == NULL) {
1942 PyErr_BadArgument();
1943 return -1;
1944 }
1945
1946 p = s;
1947 end = s + length;
1948 while (p < end) {
1949 register Py_UNICODE ch = *p++;
1950 int decimal;
1951
1952 if (Py_UNICODE_ISSPACE(ch)) {
1953 *output++ = ' ';
1954 continue;
1955 }
1956 decimal = Py_UNICODE_TODECIMAL(ch);
1957 if (decimal >= 0) {
1958 *output++ = '0' + decimal;
1959 continue;
1960 }
Guido van Rossumba477042000-04-06 18:18:10 +00001961 if (0 < ch && ch < 256) {
Guido van Rossum34888ed2000-04-05 21:29:50 +00001962 *output++ = (char) ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00001963 continue;
1964 }
1965 /* All other characters are considered invalid */
1966 if (errors == NULL || strcmp(errors, "strict") == 0) {
1967 PyErr_SetString(PyExc_ValueError,
1968 "invalid decimal Unicode string");
1969 goto onError;
1970 }
1971 else if (strcmp(errors, "ignore") == 0)
1972 continue;
1973 else if (strcmp(errors, "replace") == 0) {
1974 *output++ = '?';
1975 continue;
1976 }
1977 }
1978 /* 0-terminate the output string */
1979 *output++ = '\0';
1980 return 0;
1981
1982 onError:
1983 return -1;
1984}
1985
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986/* --- Helpers ------------------------------------------------------------ */
1987
1988static
1989int count(PyUnicodeObject *self,
1990 int start,
1991 int end,
1992 PyUnicodeObject *substring)
1993{
1994 int count = 0;
1995
1996 end -= substring->length;
1997
1998 while (start <= end)
1999 if (Py_UNICODE_MATCH(self, start, substring)) {
2000 count++;
2001 start += substring->length;
2002 } else
2003 start++;
2004
2005 return count;
2006}
2007
2008int PyUnicode_Count(PyObject *str,
2009 PyObject *substr,
2010 int start,
2011 int end)
2012{
2013 int result;
2014
2015 str = PyUnicode_FromObject(str);
2016 if (str == NULL)
2017 return -1;
2018 substr = PyUnicode_FromObject(substr);
2019 if (substr == NULL) {
2020 Py_DECREF(substr);
2021 return -1;
2022 }
2023
2024 result = count((PyUnicodeObject *)str,
2025 start, end,
2026 (PyUnicodeObject *)substr);
2027
2028 Py_DECREF(str);
2029 Py_DECREF(substr);
2030 return result;
2031}
2032
2033static
2034int findstring(PyUnicodeObject *self,
2035 PyUnicodeObject *substring,
2036 int start,
2037 int end,
2038 int direction)
2039{
2040 if (start < 0)
2041 start += self->length;
2042 if (start < 0)
2043 start = 0;
2044
2045 if (substring->length == 0)
2046 return start;
2047
2048 if (end > self->length)
2049 end = self->length;
2050 if (end < 0)
2051 end += self->length;
2052 if (end < 0)
2053 end = 0;
2054
2055 end -= substring->length;
2056
2057 if (direction < 0) {
2058 for (; end >= start; end--)
2059 if (Py_UNICODE_MATCH(self, end, substring))
2060 return end;
2061 } else {
2062 for (; start <= end; start++)
2063 if (Py_UNICODE_MATCH(self, start, substring))
2064 return start;
2065 }
2066
2067 return -1;
2068}
2069
2070int PyUnicode_Find(PyObject *str,
2071 PyObject *substr,
2072 int start,
2073 int end,
2074 int direction)
2075{
2076 int result;
2077
2078 str = PyUnicode_FromObject(str);
2079 if (str == NULL)
2080 return -1;
2081 substr = PyUnicode_FromObject(substr);
2082 if (substr == NULL) {
2083 Py_DECREF(substr);
2084 return -1;
2085 }
2086
2087 result = findstring((PyUnicodeObject *)str,
2088 (PyUnicodeObject *)substr,
2089 start, end, direction);
2090 Py_DECREF(str);
2091 Py_DECREF(substr);
2092 return result;
2093}
2094
2095static
2096int tailmatch(PyUnicodeObject *self,
2097 PyUnicodeObject *substring,
2098 int start,
2099 int end,
2100 int direction)
2101{
2102 if (start < 0)
2103 start += self->length;
2104 if (start < 0)
2105 start = 0;
2106
2107 if (substring->length == 0)
2108 return 1;
2109
2110 if (end > self->length)
2111 end = self->length;
2112 if (end < 0)
2113 end += self->length;
2114 if (end < 0)
2115 end = 0;
2116
2117 end -= substring->length;
2118 if (end < start)
2119 return 0;
2120
2121 if (direction > 0) {
2122 if (Py_UNICODE_MATCH(self, end, substring))
2123 return 1;
2124 } else {
2125 if (Py_UNICODE_MATCH(self, start, substring))
2126 return 1;
2127 }
2128
2129 return 0;
2130}
2131
2132int PyUnicode_Tailmatch(PyObject *str,
2133 PyObject *substr,
2134 int start,
2135 int end,
2136 int direction)
2137{
2138 int result;
2139
2140 str = PyUnicode_FromObject(str);
2141 if (str == NULL)
2142 return -1;
2143 substr = PyUnicode_FromObject(substr);
2144 if (substr == NULL) {
2145 Py_DECREF(substr);
2146 return -1;
2147 }
2148
2149 result = tailmatch((PyUnicodeObject *)str,
2150 (PyUnicodeObject *)substr,
2151 start, end, direction);
2152 Py_DECREF(str);
2153 Py_DECREF(substr);
2154 return result;
2155}
2156
2157static
2158const Py_UNICODE *findchar(const Py_UNICODE *s,
2159 int size,
2160 Py_UNICODE ch)
2161{
2162 /* like wcschr, but doesn't stop at NULL characters */
2163
2164 while (size-- > 0) {
2165 if (*s == ch)
2166 return s;
2167 s++;
2168 }
2169
2170 return NULL;
2171}
2172
2173/* Apply fixfct filter to the Unicode object self and return a
2174 reference to the modified object */
2175
2176static
2177PyObject *fixup(PyUnicodeObject *self,
2178 int (*fixfct)(PyUnicodeObject *s))
2179{
2180
2181 PyUnicodeObject *u;
2182
2183 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2184 self->length);
2185 if (u == NULL)
2186 return NULL;
2187 if (!fixfct(u)) {
2188 /* fixfct should return TRUE if it modified the buffer. If
2189 FALSE, return a reference to the original buffer instead
2190 (to save space, not time) */
2191 Py_INCREF(self);
2192 Py_DECREF(u);
2193 return (PyObject*) self;
2194 }
2195 return (PyObject*) u;
2196}
2197
2198static
2199int fixupper(PyUnicodeObject *self)
2200{
2201 int len = self->length;
2202 Py_UNICODE *s = self->str;
2203 int status = 0;
2204
2205 while (len-- > 0) {
2206 register Py_UNICODE ch;
2207
2208 ch = Py_UNICODE_TOUPPER(*s);
2209 if (ch != *s) {
2210 status = 1;
2211 *s = ch;
2212 }
2213 s++;
2214 }
2215
2216 return status;
2217}
2218
2219static
2220int fixlower(PyUnicodeObject *self)
2221{
2222 int len = self->length;
2223 Py_UNICODE *s = self->str;
2224 int status = 0;
2225
2226 while (len-- > 0) {
2227 register Py_UNICODE ch;
2228
2229 ch = Py_UNICODE_TOLOWER(*s);
2230 if (ch != *s) {
2231 status = 1;
2232 *s = ch;
2233 }
2234 s++;
2235 }
2236
2237 return status;
2238}
2239
2240static
2241int fixswapcase(PyUnicodeObject *self)
2242{
2243 int len = self->length;
2244 Py_UNICODE *s = self->str;
2245 int status = 0;
2246
2247 while (len-- > 0) {
2248 if (Py_UNICODE_ISUPPER(*s)) {
2249 *s = Py_UNICODE_TOLOWER(*s);
2250 status = 1;
2251 } else if (Py_UNICODE_ISLOWER(*s)) {
2252 *s = Py_UNICODE_TOUPPER(*s);
2253 status = 1;
2254 }
2255 s++;
2256 }
2257
2258 return status;
2259}
2260
2261static
2262int fixcapitalize(PyUnicodeObject *self)
2263{
2264 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2265 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2266 return 1;
2267 }
2268 return 0;
2269}
2270
2271static
2272int fixtitle(PyUnicodeObject *self)
2273{
2274 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2275 register Py_UNICODE *e;
2276 int previous_is_cased;
2277
2278 /* Shortcut for single character strings */
2279 if (PyUnicode_GET_SIZE(self) == 1) {
2280 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2281 if (*p != ch) {
2282 *p = ch;
2283 return 1;
2284 }
2285 else
2286 return 0;
2287 }
2288
2289 e = p + PyUnicode_GET_SIZE(self);
2290 previous_is_cased = 0;
2291 for (; p < e; p++) {
2292 register const Py_UNICODE ch = *p;
2293
2294 if (previous_is_cased)
2295 *p = Py_UNICODE_TOLOWER(ch);
2296 else
2297 *p = Py_UNICODE_TOTITLE(ch);
2298
2299 if (Py_UNICODE_ISLOWER(ch) ||
2300 Py_UNICODE_ISUPPER(ch) ||
2301 Py_UNICODE_ISTITLE(ch))
2302 previous_is_cased = 1;
2303 else
2304 previous_is_cased = 0;
2305 }
2306 return 1;
2307}
2308
2309PyObject *PyUnicode_Join(PyObject *separator,
2310 PyObject *seq)
2311{
2312 Py_UNICODE *sep;
2313 int seplen;
2314 PyUnicodeObject *res = NULL;
2315 int reslen = 0;
2316 Py_UNICODE *p;
2317 int seqlen = 0;
2318 int sz = 100;
2319 int i;
2320
2321 seqlen = PySequence_Length(seq);
2322 if (seqlen < 0 && PyErr_Occurred())
2323 return NULL;
2324
2325 if (separator == NULL) {
2326 Py_UNICODE blank = ' ';
2327 sep = &blank;
2328 seplen = 1;
2329 }
2330 else {
2331 separator = PyUnicode_FromObject(separator);
2332 if (separator == NULL)
2333 return NULL;
2334 sep = PyUnicode_AS_UNICODE(separator);
2335 seplen = PyUnicode_GET_SIZE(separator);
2336 }
2337
2338 res = _PyUnicode_New(sz);
2339 if (res == NULL)
2340 goto onError;
2341 p = PyUnicode_AS_UNICODE(res);
2342 reslen = 0;
2343
2344 for (i = 0; i < seqlen; i++) {
2345 int itemlen;
2346 PyObject *item;
2347
2348 item = PySequence_GetItem(seq, i);
2349 if (item == NULL)
2350 goto onError;
2351 if (!PyUnicode_Check(item)) {
2352 PyObject *v;
2353 v = PyUnicode_FromObject(item);
2354 Py_DECREF(item);
2355 item = v;
2356 if (item == NULL)
2357 goto onError;
2358 }
2359 itemlen = PyUnicode_GET_SIZE(item);
2360 while (reslen + itemlen + seplen >= sz) {
2361 if (_PyUnicode_Resize(res, sz*2))
2362 goto onError;
2363 sz *= 2;
2364 p = PyUnicode_AS_UNICODE(res) + reslen;
2365 }
2366 if (i > 0) {
2367 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2368 p += seplen;
2369 reslen += seplen;
2370 }
2371 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2372 p += itemlen;
2373 reslen += itemlen;
2374 Py_DECREF(item);
2375 }
2376 if (_PyUnicode_Resize(res, reslen))
2377 goto onError;
2378
2379 Py_XDECREF(separator);
2380 return (PyObject *)res;
2381
2382 onError:
2383 Py_XDECREF(separator);
2384 Py_DECREF(res);
2385 return NULL;
2386}
2387
2388static
2389PyUnicodeObject *pad(PyUnicodeObject *self,
2390 int left,
2391 int right,
2392 Py_UNICODE fill)
2393{
2394 PyUnicodeObject *u;
2395
2396 if (left < 0)
2397 left = 0;
2398 if (right < 0)
2399 right = 0;
2400
2401 if (left == 0 && right == 0) {
2402 Py_INCREF(self);
2403 return self;
2404 }
2405
2406 u = _PyUnicode_New(left + self->length + right);
2407 if (u) {
2408 if (left)
2409 Py_UNICODE_FILL(u->str, fill, left);
2410 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2411 if (right)
2412 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2413 }
2414
2415 return u;
2416}
2417
/* Append the slice data[left:right] to the list being built.

   The macro relies on the using function to provide the variables
   `str' (scratch PyObject pointer) and `list' (the result list) as
   well as an `onError' label, which is jumped to on failure.  It
   expands to a single statement and must be followed by a
   semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2428
2429static
2430PyObject *split_whitespace(PyUnicodeObject *self,
2431 PyObject *list,
2432 int maxcount)
2433{
2434 register int i;
2435 register int j;
2436 int len = self->length;
2437 PyObject *str;
2438
2439 for (i = j = 0; i < len; ) {
2440 /* find a token */
2441 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2442 i++;
2443 j = i;
2444 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2445 i++;
2446 if (j < i) {
2447 if (maxcount-- <= 0)
2448 break;
2449 SPLIT_APPEND(self->str, j, i);
2450 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2451 i++;
2452 j = i;
2453 }
2454 }
2455 if (j < len) {
2456 SPLIT_APPEND(self->str, j, len);
2457 }
2458 return list;
2459
2460 onError:
2461 Py_DECREF(list);
2462 return NULL;
2463}
2464
2465PyObject *PyUnicode_Splitlines(PyObject *string,
2466 int maxcount)
2467{
2468 register int i;
2469 register int j;
2470 int len;
2471 PyObject *list;
2472 PyObject *str;
2473 Py_UNICODE *data;
2474
2475 string = PyUnicode_FromObject(string);
2476 if (string == NULL)
2477 return NULL;
2478 data = PyUnicode_AS_UNICODE(string);
2479 len = PyUnicode_GET_SIZE(string);
2480
2481 if (maxcount < 0)
2482 maxcount = INT_MAX;
2483
2484 list = PyList_New(0);
2485 if (!list)
2486 goto onError;
2487
2488 for (i = j = 0; i < len; ) {
2489 /* Find a line and append it */
2490 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2491 i++;
2492 if (maxcount-- <= 0)
2493 break;
2494 SPLIT_APPEND(data, j, i);
2495
2496 /* Skip the line break reading CRLF as one line break */
2497 if (i < len) {
2498 if (data[i] == '\r' && i + 1 < len &&
2499 data[i+1] == '\n')
2500 i += 2;
2501 else
2502 i++;
2503 }
2504 j = i;
2505 }
2506 if (j < len) {
2507 SPLIT_APPEND(data, j, len);
2508 }
2509
2510 Py_DECREF(string);
2511 return list;
2512
2513 onError:
2514 Py_DECREF(list);
2515 Py_DECREF(string);
2516 return NULL;
2517}
2518
2519static
2520PyObject *split_char(PyUnicodeObject *self,
2521 PyObject *list,
2522 Py_UNICODE ch,
2523 int maxcount)
2524{
2525 register int i;
2526 register int j;
2527 int len = self->length;
2528 PyObject *str;
2529
2530 for (i = j = 0; i < len; ) {
2531 if (self->str[i] == ch) {
2532 if (maxcount-- <= 0)
2533 break;
2534 SPLIT_APPEND(self->str, j, i);
2535 i = j = i + 1;
2536 } else
2537 i++;
2538 }
2539 if (j <= len) {
2540 SPLIT_APPEND(self->str, j, len);
2541 }
2542 return list;
2543
2544 onError:
2545 Py_DECREF(list);
2546 return NULL;
2547}
2548
2549static
2550PyObject *split_substring(PyUnicodeObject *self,
2551 PyObject *list,
2552 PyUnicodeObject *substring,
2553 int maxcount)
2554{
2555 register int i;
2556 register int j;
2557 int len = self->length;
2558 int sublen = substring->length;
2559 PyObject *str;
2560
2561 for (i = j = 0; i < len - sublen; ) {
2562 if (Py_UNICODE_MATCH(self, i, substring)) {
2563 if (maxcount-- <= 0)
2564 break;
2565 SPLIT_APPEND(self->str, j, i);
2566 i = j = i + sublen;
2567 } else
2568 i++;
2569 }
2570 if (j <= len) {
2571 SPLIT_APPEND(self->str, j, len);
2572 }
2573 return list;
2574
2575 onError:
2576 Py_DECREF(list);
2577 return NULL;
2578}
2579
2580#undef SPLIT_APPEND
2581
2582static
2583PyObject *split(PyUnicodeObject *self,
2584 PyUnicodeObject *substring,
2585 int maxcount)
2586{
2587 PyObject *list;
2588
2589 if (maxcount < 0)
2590 maxcount = INT_MAX;
2591
2592 list = PyList_New(0);
2593 if (!list)
2594 return NULL;
2595
2596 if (substring == NULL)
2597 return split_whitespace(self,list,maxcount);
2598
2599 else if (substring->length == 1)
2600 return split_char(self,list,substring->str[0],maxcount);
2601
2602 else if (substring->length == 0) {
2603 Py_DECREF(list);
2604 PyErr_SetString(PyExc_ValueError, "empty separator");
2605 return NULL;
2606 }
2607 else
2608 return split_substring(self,list,substring,maxcount);
2609}
2610
2611static
2612PyObject *strip(PyUnicodeObject *self,
2613 int left,
2614 int right)
2615{
2616 Py_UNICODE *p = self->str;
2617 int start = 0;
2618 int end = self->length;
2619
2620 if (left)
2621 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2622 start++;
2623
2624 if (right)
2625 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2626 end--;
2627
2628 if (start == 0 && end == self->length) {
2629 /* couldn't strip anything off, return original string */
2630 Py_INCREF(self);
2631 return (PyObject*) self;
2632 }
2633
2634 return (PyObject*) PyUnicode_FromUnicode(
2635 self->str + start,
2636 end - start
2637 );
2638}
2639
2640static
2641PyObject *replace(PyUnicodeObject *self,
2642 PyUnicodeObject *str1,
2643 PyUnicodeObject *str2,
2644 int maxcount)
2645{
2646 PyUnicodeObject *u;
2647
2648 if (maxcount < 0)
2649 maxcount = INT_MAX;
2650
2651 if (str1->length == 1 && str2->length == 1) {
2652 int i;
2653
2654 /* replace characters */
2655 if (!findchar(self->str, self->length, str1->str[0])) {
2656 /* nothing to replace, return original string */
2657 Py_INCREF(self);
2658 u = self;
2659 } else {
2660 Py_UNICODE u1 = str1->str[0];
2661 Py_UNICODE u2 = str2->str[0];
2662
2663 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
2664 self->str,
2665 self->length
2666 );
2667 if (u)
2668 for (i = 0; i < u->length; i++)
2669 if (u->str[i] == u1) {
2670 if (--maxcount < 0)
2671 break;
2672 u->str[i] = u2;
2673 }
2674 }
2675
2676 } else {
2677 int n, i;
2678 Py_UNICODE *p;
2679
2680 /* replace strings */
2681 n = count(self, 0, self->length, str1);
2682 if (n > maxcount)
2683 n = maxcount;
2684 if (n == 0) {
2685 /* nothing to replace, return original string */
2686 Py_INCREF(self);
2687 u = self;
2688 } else {
2689 u = _PyUnicode_New(
2690 self->length + n * (str2->length - str1->length));
2691 if (u) {
2692 i = 0;
2693 p = u->str;
2694 while (i <= self->length - str1->length)
2695 if (Py_UNICODE_MATCH(self, i, str1)) {
2696 /* replace string segment */
2697 Py_UNICODE_COPY(p, str2->str, str2->length);
2698 p += str2->length;
2699 i += str1->length;
2700 if (--n <= 0) {
2701 /* copy remaining part */
2702 Py_UNICODE_COPY(p, self->str+i, self->length-i);
2703 break;
2704 }
2705 } else
2706 *p++ = self->str[i++];
2707 }
2708 }
2709 }
2710
2711 return (PyObject *) u;
2712}
2713
2714/* --- Unicode Object Methods --------------------------------------------- */
2715
2716static char title__doc__[] =
2717"S.title() -> unicode\n\
2718\n\
2719Return a titlecased version of S, i.e. words start with title case\n\
2720characters, all remaining cased characters have lower case.";
2721
2722static PyObject*
2723unicode_title(PyUnicodeObject *self, PyObject *args)
2724{
2725 if (!PyArg_NoArgs(args))
2726 return NULL;
2727 return fixup(self, fixtitle);
2728}
2729
2730static char capitalize__doc__[] =
2731"S.capitalize() -> unicode\n\
2732\n\
2733Return a capitalized version of S, i.e. make the first character\n\
2734have upper case.";
2735
2736static PyObject*
2737unicode_capitalize(PyUnicodeObject *self, PyObject *args)
2738{
2739 if (!PyArg_NoArgs(args))
2740 return NULL;
2741 return fixup(self, fixcapitalize);
2742}
2743
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords() (currently compiled out): split on whitespace,
   capitalize every word via fixup(), then join the words again with
   PyUnicode_Join(NULL, ...). */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *joined;
    int n;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list */
    for (n = 0; n < PyList_GET_SIZE(words); n++) {
        PyObject *old = PyList_GET_ITEM(words, n);
        PyObject *cap = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (cap == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old);
        PyList_SET_ITEM(words, n, cap);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
2784
2785static char center__doc__[] =
2786"S.center(width) -> unicode\n\
2787\n\
2788Return S centered in a Unicode string of length width. Padding is done\n\
2789using spaces.";
2790
2791static PyObject *
2792unicode_center(PyUnicodeObject *self, PyObject *args)
2793{
2794 int marg, left;
2795 int width;
2796
2797 if (!PyArg_ParseTuple(args, "i:center", &width))
2798 return NULL;
2799
2800 if (self->length >= width) {
2801 Py_INCREF(self);
2802 return (PyObject*) self;
2803 }
2804
2805 marg = width - self->length;
2806 left = marg / 2 + (marg & width & 1);
2807
2808 return (PyObject*) pad(self, left, marg - left, ' ');
2809}
2810
2811static int
2812unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
2813{
2814 int len1, len2;
2815 Py_UNICODE *s1 = str1->str;
2816 Py_UNICODE *s2 = str2->str;
2817
2818 len1 = str1->length;
2819 len2 = str2->length;
2820
2821 while (len1 > 0 && len2 > 0) {
2822 int cmp = (*s1++) - (*s2++);
2823 if (cmp)
2824 /* This should make Christian happy! */
2825 return (cmp < 0) ? -1 : (cmp != 0);
2826 len1--, len2--;
2827 }
2828
2829 return (len1 < len2) ? -1 : (len1 != len2);
2830}
2831
2832int PyUnicode_Compare(PyObject *left,
2833 PyObject *right)
2834{
2835 PyUnicodeObject *u = NULL, *v = NULL;
2836 int result;
2837
2838 /* Coerce the two arguments */
2839 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2840 if (u == NULL)
2841 goto onError;
2842 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2843 if (v == NULL)
2844 goto onError;
2845
2846 /* Shortcut for emtpy or interned objects */
2847 if (v == u) {
2848 Py_DECREF(u);
2849 Py_DECREF(v);
2850 return 0;
2851 }
2852
2853 result = unicode_compare(u, v);
2854
2855 Py_DECREF(u);
2856 Py_DECREF(v);
2857 return result;
2858
2859onError:
2860 Py_XDECREF(u);
2861 Py_XDECREF(v);
2862 return -1;
2863}
2864
Guido van Rossum403d68b2000-03-13 15:55:09 +00002865int PyUnicode_Contains(PyObject *container,
2866 PyObject *element)
2867{
2868 PyUnicodeObject *u = NULL, *v = NULL;
2869 int result;
2870 register const Py_UNICODE *p, *e;
2871 register Py_UNICODE ch;
2872
2873 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00002874 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
2875 if (v == NULL)
2876 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002877 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
2878 if (u == NULL) {
2879 Py_DECREF(v);
2880 goto onError;
2881 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00002882
2883 /* Check v in u */
2884 if (PyUnicode_GET_SIZE(v) != 1) {
2885 PyErr_SetString(PyExc_TypeError,
2886 "string member test needs char left operand");
2887 goto onError;
2888 }
2889 ch = *PyUnicode_AS_UNICODE(v);
2890 p = PyUnicode_AS_UNICODE(u);
2891 e = p + PyUnicode_GET_SIZE(u);
2892 result = 0;
2893 while (p < e) {
2894 if (*p++ == ch) {
2895 result = 1;
2896 break;
2897 }
2898 }
2899
2900 Py_DECREF(u);
2901 Py_DECREF(v);
2902 return result;
2903
2904onError:
2905 Py_XDECREF(u);
2906 Py_XDECREF(v);
2907 return -1;
2908}
2909
Guido van Rossumd57fd912000-03-10 22:53:23 +00002910/* Concat to string or Unicode object giving a new Unicode object. */
2911
2912PyObject *PyUnicode_Concat(PyObject *left,
2913 PyObject *right)
2914{
2915 PyUnicodeObject *u = NULL, *v = NULL, *w;
2916
2917 /* Coerce the two arguments */
2918 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2919 if (u == NULL)
2920 goto onError;
2921 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2922 if (v == NULL)
2923 goto onError;
2924
2925 /* Shortcuts */
2926 if (v == unicode_empty) {
2927 Py_DECREF(v);
2928 return (PyObject *)u;
2929 }
2930 if (u == unicode_empty) {
2931 Py_DECREF(u);
2932 return (PyObject *)v;
2933 }
2934
2935 /* Concat the two Unicode strings */
2936 w = _PyUnicode_New(u->length + v->length);
2937 if (w == NULL)
2938 goto onError;
2939 Py_UNICODE_COPY(w->str, u->str, u->length);
2940 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
2941
2942 Py_DECREF(u);
2943 Py_DECREF(v);
2944 return (PyObject *)w;
2945
2946onError:
2947 Py_XDECREF(u);
2948 Py_XDECREF(v);
2949 return NULL;
2950}
2951
2952static char count__doc__[] =
2953"S.count(sub[, start[, end]]) -> int\n\
2954\n\
2955Return the number of occurrences of substring sub in Unicode string\n\
2956S[start:end]. Optional arguments start and end are\n\
2957interpreted as in slice notation.";
2958
2959static PyObject *
2960unicode_count(PyUnicodeObject *self, PyObject *args)
2961{
2962 PyUnicodeObject *substring;
2963 int start = 0;
2964 int end = INT_MAX;
2965 PyObject *result;
2966
2967 if (!PyArg_ParseTuple(args, "O|ii:count", &substring, &start, &end))
2968 return NULL;
2969
2970 substring = (PyUnicodeObject *)PyUnicode_FromObject(
2971 (PyObject *)substring);
2972 if (substring == NULL)
2973 return NULL;
2974
2975 if (substring->length == 0) {
2976 Py_DECREF(substring);
2977 return PyInt_FromLong((long) 0);
2978 }
2979
2980 if (start < 0)
2981 start += self->length;
2982 if (start < 0)
2983 start = 0;
2984 if (end > self->length)
2985 end = self->length;
2986 if (end < 0)
2987 end += self->length;
2988 if (end < 0)
2989 end = 0;
2990
2991 result = PyInt_FromLong((long) count(self, start, end, substring));
2992
2993 Py_DECREF(substring);
2994 return result;
2995}
2996
2997static char encode__doc__[] =
2998"S.encode([encoding[,errors]]) -> string\n\
2999\n\
3000Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
3001errors may be given to set a different error handling scheme. Default\n\
3002is 'strict' meaning that encoding errors raise a ValueError. Other\n\
3003possible values are 'ignore' and 'replace'.";
3004
3005static PyObject *
3006unicode_encode(PyUnicodeObject *self, PyObject *args)
3007{
3008 char *encoding = NULL;
3009 char *errors = NULL;
3010 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3011 return NULL;
3012 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3013}
3014
3015static char expandtabs__doc__[] =
3016"S.expandtabs([tabsize]) -> unicode\n\
3017\n\
3018Return a copy of S where all tab characters are expanded using spaces.\n\
3019If tabsize is not given, a tab size of 8 characters is assumed.";
3020
3021static PyObject*
3022unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3023{
3024 Py_UNICODE *e;
3025 Py_UNICODE *p;
3026 Py_UNICODE *q;
3027 int i, j;
3028 PyUnicodeObject *u;
3029 int tabsize = 8;
3030
3031 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3032 return NULL;
3033
3034 /* First pass: determine size of ouput string */
3035 i = j = 0;
3036 e = self->str + self->length;
3037 for (p = self->str; p < e; p++)
3038 if (*p == '\t') {
3039 if (tabsize > 0)
3040 j += tabsize - (j % tabsize);
3041 }
3042 else {
3043 j++;
3044 if (*p == '\n' || *p == '\r') {
3045 i += j;
3046 j = 0;
3047 }
3048 }
3049
3050 /* Second pass: create output string and fill it */
3051 u = _PyUnicode_New(i + j);
3052 if (!u)
3053 return NULL;
3054
3055 j = 0;
3056 q = u->str;
3057
3058 for (p = self->str; p < e; p++)
3059 if (*p == '\t') {
3060 if (tabsize > 0) {
3061 i = tabsize - (j % tabsize);
3062 j += i;
3063 while (i--)
3064 *q++ = ' ';
3065 }
3066 }
3067 else {
3068 j++;
3069 *q++ = *p;
3070 if (*p == '\n' || *p == '\r')
3071 j = 0;
3072 }
3073
3074 return (PyObject*) u;
3075}
3076
3077static char find__doc__[] =
3078"S.find(sub [,start [,end]]) -> int\n\
3079\n\
3080Return the lowest index in S where substring sub is found,\n\
3081such that sub is contained within s[start,end]. Optional\n\
3082arguments start and end are interpreted as in slice notation.\n\
3083\n\
3084Return -1 on failure.";
3085
3086static PyObject *
3087unicode_find(PyUnicodeObject *self, PyObject *args)
3088{
3089 PyUnicodeObject *substring;
3090 int start = 0;
3091 int end = INT_MAX;
3092 PyObject *result;
3093
3094 if (!PyArg_ParseTuple(args, "O|ii:find", &substring, &start, &end))
3095 return NULL;
3096 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3097 (PyObject *)substring);
3098 if (substring == NULL)
3099 return NULL;
3100
3101 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3102
3103 Py_DECREF(substring);
3104 return result;
3105}
3106
3107static PyObject *
3108unicode_getitem(PyUnicodeObject *self, int index)
3109{
3110 if (index < 0 || index >= self->length) {
3111 PyErr_SetString(PyExc_IndexError, "string index out of range");
3112 return NULL;
3113 }
3114
3115 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3116}
3117
3118static long
3119unicode_hash(PyUnicodeObject *self)
3120{
3121 long hash;
3122 PyObject *utf8;
3123
3124 /* Since Unicode objects compare equal to their UTF-8 string
3125 counterparts, they should also use the UTF-8 strings as basis
3126 for their hash value. This is needed to assure that strings and
3127 Unicode objects behave in the same way as dictionary
3128 keys. Unfortunately, this costs some performance and also some
3129 memory if the cached UTF-8 representation is not used later
3130 on. */
3131 if (self->hash != -1)
3132 return self->hash;
3133 utf8 = utf8_string(self, NULL);
3134 if (utf8 == NULL)
3135 return -1;
3136 hash = PyObject_Hash(utf8);
3137 if (hash == -1)
3138 return -1;
3139 self->hash = hash;
3140 return hash;
3141}
3142
3143static char index__doc__[] =
3144"S.index(sub [,start [,end]]) -> int\n\
3145\n\
3146Like S.find() but raise ValueError when the substring is not found.";
3147
3148static PyObject *
3149unicode_index(PyUnicodeObject *self, PyObject *args)
3150{
3151 int result;
3152 PyUnicodeObject *substring;
3153 int start = 0;
3154 int end = INT_MAX;
3155
3156 if (!PyArg_ParseTuple(args, "O|ii:index", &substring, &start, &end))
3157 return NULL;
3158
3159 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3160 (PyObject *)substring);
3161 if (substring == NULL)
3162 return NULL;
3163
3164 result = findstring(self, substring, start, end, 1);
3165
3166 Py_DECREF(substring);
3167 if (result < 0) {
3168 PyErr_SetString(PyExc_ValueError, "substring not found");
3169 return NULL;
3170 }
3171 return PyInt_FromLong(result);
3172}
3173
3174static char islower__doc__[] =
3175"S.islower() -> int\n\
3176\n\
3177Return 1 if all cased characters in S are lowercase and there is\n\
3178at least one cased character in S, 0 otherwise.";
3179
3180static PyObject*
3181unicode_islower(PyUnicodeObject *self, PyObject *args)
3182{
3183 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3184 register const Py_UNICODE *e;
3185 int cased;
3186
3187 if (!PyArg_NoArgs(args))
3188 return NULL;
3189
3190 /* Shortcut for single character strings */
3191 if (PyUnicode_GET_SIZE(self) == 1)
3192 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3193
3194 e = p + PyUnicode_GET_SIZE(self);
3195 cased = 0;
3196 for (; p < e; p++) {
3197 register const Py_UNICODE ch = *p;
3198
3199 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3200 return PyInt_FromLong(0);
3201 else if (!cased && Py_UNICODE_ISLOWER(ch))
3202 cased = 1;
3203 }
3204 return PyInt_FromLong(cased);
3205}
3206
3207static char isupper__doc__[] =
3208"S.isupper() -> int\n\
3209\n\
3210Return 1 if all cased characters in S are uppercase and there is\n\
3211at least one cased character in S, 0 otherwise.";
3212
3213static PyObject*
3214unicode_isupper(PyUnicodeObject *self, PyObject *args)
3215{
3216 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3217 register const Py_UNICODE *e;
3218 int cased;
3219
3220 if (!PyArg_NoArgs(args))
3221 return NULL;
3222
3223 /* Shortcut for single character strings */
3224 if (PyUnicode_GET_SIZE(self) == 1)
3225 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3226
3227 e = p + PyUnicode_GET_SIZE(self);
3228 cased = 0;
3229 for (; p < e; p++) {
3230 register const Py_UNICODE ch = *p;
3231
3232 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3233 return PyInt_FromLong(0);
3234 else if (!cased && Py_UNICODE_ISUPPER(ch))
3235 cased = 1;
3236 }
3237 return PyInt_FromLong(cased);
3238}
3239
3240static char istitle__doc__[] =
3241"S.istitle() -> int\n\
3242\n\
3243Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3244may only follow uncased characters and lowercase characters only cased\n\
3245ones. Return 0 otherwise.";
3246
3247static PyObject*
3248unicode_istitle(PyUnicodeObject *self, PyObject *args)
3249{
3250 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3251 register const Py_UNICODE *e;
3252 int cased, previous_is_cased;
3253
3254 if (!PyArg_NoArgs(args))
3255 return NULL;
3256
3257 /* Shortcut for single character strings */
3258 if (PyUnicode_GET_SIZE(self) == 1)
3259 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3260 (Py_UNICODE_ISUPPER(*p) != 0));
3261
3262 e = p + PyUnicode_GET_SIZE(self);
3263 cased = 0;
3264 previous_is_cased = 0;
3265 for (; p < e; p++) {
3266 register const Py_UNICODE ch = *p;
3267
3268 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3269 if (previous_is_cased)
3270 return PyInt_FromLong(0);
3271 previous_is_cased = 1;
3272 cased = 1;
3273 }
3274 else if (Py_UNICODE_ISLOWER(ch)) {
3275 if (!previous_is_cased)
3276 return PyInt_FromLong(0);
3277 previous_is_cased = 1;
3278 cased = 1;
3279 }
3280 else
3281 previous_is_cased = 0;
3282 }
3283 return PyInt_FromLong(cased);
3284}
3285
3286static char isspace__doc__[] =
3287"S.isspace() -> int\n\
3288\n\
3289Return 1 if there are only whitespace characters in S,\n\
32900 otherwise.";
3291
3292static PyObject*
3293unicode_isspace(PyUnicodeObject *self, PyObject *args)
3294{
3295 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3296 register const Py_UNICODE *e;
3297
3298 if (!PyArg_NoArgs(args))
3299 return NULL;
3300
3301 /* Shortcut for single character strings */
3302 if (PyUnicode_GET_SIZE(self) == 1 &&
3303 Py_UNICODE_ISSPACE(*p))
3304 return PyInt_FromLong(1);
3305
3306 e = p + PyUnicode_GET_SIZE(self);
3307 for (; p < e; p++) {
3308 if (!Py_UNICODE_ISSPACE(*p))
3309 return PyInt_FromLong(0);
3310 }
3311 return PyInt_FromLong(1);
3312}
3313
3314static char isdecimal__doc__[] =
3315"S.isdecimal() -> int\n\
3316\n\
3317Return 1 if there are only decimal characters in S,\n\
33180 otherwise.";
3319
3320static PyObject*
3321unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3322{
3323 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3324 register const Py_UNICODE *e;
3325
3326 if (!PyArg_NoArgs(args))
3327 return NULL;
3328
3329 /* Shortcut for single character strings */
3330 if (PyUnicode_GET_SIZE(self) == 1 &&
3331 Py_UNICODE_ISDECIMAL(*p))
3332 return PyInt_FromLong(1);
3333
3334 e = p + PyUnicode_GET_SIZE(self);
3335 for (; p < e; p++) {
3336 if (!Py_UNICODE_ISDECIMAL(*p))
3337 return PyInt_FromLong(0);
3338 }
3339 return PyInt_FromLong(1);
3340}
3341
3342static char isdigit__doc__[] =
3343"S.isdigit() -> int\n\
3344\n\
3345Return 1 if there are only digit characters in S,\n\
33460 otherwise.";
3347
3348static PyObject*
3349unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3350{
3351 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3352 register const Py_UNICODE *e;
3353
3354 if (!PyArg_NoArgs(args))
3355 return NULL;
3356
3357 /* Shortcut for single character strings */
3358 if (PyUnicode_GET_SIZE(self) == 1 &&
3359 Py_UNICODE_ISDIGIT(*p))
3360 return PyInt_FromLong(1);
3361
3362 e = p + PyUnicode_GET_SIZE(self);
3363 for (; p < e; p++) {
3364 if (!Py_UNICODE_ISDIGIT(*p))
3365 return PyInt_FromLong(0);
3366 }
3367 return PyInt_FromLong(1);
3368}
3369
3370static char isnumeric__doc__[] =
3371"S.isnumeric() -> int\n\
3372\n\
3373Return 1 if there are only numeric characters in S,\n\
33740 otherwise.";
3375
3376static PyObject*
3377unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3378{
3379 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3380 register const Py_UNICODE *e;
3381
3382 if (!PyArg_NoArgs(args))
3383 return NULL;
3384
3385 /* Shortcut for single character strings */
3386 if (PyUnicode_GET_SIZE(self) == 1 &&
3387 Py_UNICODE_ISNUMERIC(*p))
3388 return PyInt_FromLong(1);
3389
3390 e = p + PyUnicode_GET_SIZE(self);
3391 for (; p < e; p++) {
3392 if (!Py_UNICODE_ISNUMERIC(*p))
3393 return PyInt_FromLong(0);
3394 }
3395 return PyInt_FromLong(1);
3396}
3397
3398static char join__doc__[] =
3399"S.join(sequence) -> unicode\n\
3400\n\
3401Return a string which is the concatenation of the strings in the\n\
3402sequence. The separator between elements is S.";
3403
3404static PyObject*
3405unicode_join(PyUnicodeObject *self, PyObject *args)
3406{
3407 PyObject *data;
3408 if (!PyArg_ParseTuple(args, "O:join", &data))
3409 return NULL;
3410
3411 return PyUnicode_Join((PyObject *)self, data);
3412}
3413
3414static int
3415unicode_length(PyUnicodeObject *self)
3416{
3417 return self->length;
3418}
3419
3420static char ljust__doc__[] =
3421"S.ljust(width) -> unicode\n\
3422\n\
3423Return S left justified in a Unicode string of length width. Padding is\n\
3424done using spaces.";
3425
3426static PyObject *
3427unicode_ljust(PyUnicodeObject *self, PyObject *args)
3428{
3429 int width;
3430 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3431 return NULL;
3432
3433 if (self->length >= width) {
3434 Py_INCREF(self);
3435 return (PyObject*) self;
3436 }
3437
3438 return (PyObject*) pad(self, 0, width - self->length, ' ');
3439}
3440
3441static char lower__doc__[] =
3442"S.lower() -> unicode\n\
3443\n\
3444Return a copy of the string S converted to lowercase.";
3445
3446static PyObject*
3447unicode_lower(PyUnicodeObject *self, PyObject *args)
3448{
3449 if (!PyArg_NoArgs(args))
3450 return NULL;
3451 return fixup(self, fixlower);
3452}
3453
3454static char lstrip__doc__[] =
3455"S.lstrip() -> unicode\n\
3456\n\
3457Return a copy of the string S with leading whitespace removed.";
3458
3459static PyObject *
3460unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3461{
3462 if (!PyArg_NoArgs(args))
3463 return NULL;
3464 return strip(self, 1, 0);
3465}
3466
3467static PyObject*
3468unicode_repeat(PyUnicodeObject *str, int len)
3469{
3470 PyUnicodeObject *u;
3471 Py_UNICODE *p;
3472
3473 if (len < 0)
3474 len = 0;
3475
3476 if (len == 1) {
3477 /* no repeat, return original string */
3478 Py_INCREF(str);
3479 return (PyObject*) str;
3480 }
3481
3482 u = _PyUnicode_New(len * str->length);
3483 if (!u)
3484 return NULL;
3485
3486 p = u->str;
3487
3488 while (len-- > 0) {
3489 Py_UNICODE_COPY(p, str->str, str->length);
3490 p += str->length;
3491 }
3492
3493 return (PyObject*) u;
3494}
3495
3496PyObject *PyUnicode_Replace(PyObject *obj,
3497 PyObject *subobj,
3498 PyObject *replobj,
3499 int maxcount)
3500{
3501 PyObject *self;
3502 PyObject *str1;
3503 PyObject *str2;
3504 PyObject *result;
3505
3506 self = PyUnicode_FromObject(obj);
3507 if (self == NULL)
3508 return NULL;
3509 str1 = PyUnicode_FromObject(subobj);
3510 if (str1 == NULL) {
3511 Py_DECREF(self);
3512 return NULL;
3513 }
3514 str2 = PyUnicode_FromObject(replobj);
3515 if (str2 == NULL) {
3516 Py_DECREF(self);
3517 Py_DECREF(str1);
3518 return NULL;
3519 }
3520 result = replace((PyUnicodeObject *)self,
3521 (PyUnicodeObject *)str1,
3522 (PyUnicodeObject *)str2,
3523 maxcount);
3524 Py_DECREF(self);
3525 Py_DECREF(str1);
3526 Py_DECREF(str2);
3527 return result;
3528}
3529
3530static char replace__doc__[] =
3531"S.replace (old, new[, maxsplit]) -> unicode\n\
3532\n\
3533Return a copy of S with all occurrences of substring\n\
3534old replaced by new. If the optional argument maxsplit is\n\
3535given, only the first maxsplit occurrences are replaced.";
3536
3537static PyObject*
3538unicode_replace(PyUnicodeObject *self, PyObject *args)
3539{
3540 PyUnicodeObject *str1;
3541 PyUnicodeObject *str2;
3542 int maxcount = -1;
3543 PyObject *result;
3544
3545 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
3546 return NULL;
3547 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
3548 if (str1 == NULL)
3549 return NULL;
3550 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
3551 if (str2 == NULL)
3552 return NULL;
3553
3554 result = replace(self, str1, str2, maxcount);
3555
3556 Py_DECREF(str1);
3557 Py_DECREF(str2);
3558 return result;
3559}
3560
3561static
3562PyObject *unicode_repr(PyObject *unicode)
3563{
3564 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
3565 PyUnicode_GET_SIZE(unicode),
3566 1);
3567}
3568
3569static char rfind__doc__[] =
3570"S.rfind(sub [,start [,end]]) -> int\n\
3571\n\
3572Return the highest index in S where substring sub is found,\n\
3573such that sub is contained within s[start,end]. Optional\n\
3574arguments start and end are interpreted as in slice notation.\n\
3575\n\
3576Return -1 on failure.";
3577
3578static PyObject *
3579unicode_rfind(PyUnicodeObject *self, PyObject *args)
3580{
3581 PyUnicodeObject *substring;
3582 int start = 0;
3583 int end = INT_MAX;
3584 PyObject *result;
3585
3586 if (!PyArg_ParseTuple(args, "O|ii:rfind", &substring, &start, &end))
3587 return NULL;
3588 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3589 (PyObject *)substring);
3590 if (substring == NULL)
3591 return NULL;
3592
3593 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
3594
3595 Py_DECREF(substring);
3596 return result;
3597}
3598
3599static char rindex__doc__[] =
3600"S.rindex(sub [,start [,end]]) -> int\n\
3601\n\
3602Like S.rfind() but raise ValueError when the substring is not found.";
3603
3604static PyObject *
3605unicode_rindex(PyUnicodeObject *self, PyObject *args)
3606{
3607 int result;
3608 PyUnicodeObject *substring;
3609 int start = 0;
3610 int end = INT_MAX;
3611
3612 if (!PyArg_ParseTuple(args, "O|ii:rindex", &substring, &start, &end))
3613 return NULL;
3614 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3615 (PyObject *)substring);
3616 if (substring == NULL)
3617 return NULL;
3618
3619 result = findstring(self, substring, start, end, -1);
3620
3621 Py_DECREF(substring);
3622 if (result < 0) {
3623 PyErr_SetString(PyExc_ValueError, "substring not found");
3624 return NULL;
3625 }
3626 return PyInt_FromLong(result);
3627}
3628
3629static char rjust__doc__[] =
3630"S.rjust(width) -> unicode\n\
3631\n\
3632Return S right justified in a Unicode string of length width. Padding is\n\
3633done using spaces.";
3634
3635static PyObject *
3636unicode_rjust(PyUnicodeObject *self, PyObject *args)
3637{
3638 int width;
3639 if (!PyArg_ParseTuple(args, "i:rjust", &width))
3640 return NULL;
3641
3642 if (self->length >= width) {
3643 Py_INCREF(self);
3644 return (PyObject*) self;
3645 }
3646
3647 return (PyObject*) pad(self, width - self->length, 0, ' ');
3648}
3649
3650static char rstrip__doc__[] =
3651"S.rstrip() -> unicode\n\
3652\n\
3653Return a copy of the string S with trailing whitespace removed.";
3654
3655static PyObject *
3656unicode_rstrip(PyUnicodeObject *self, PyObject *args)
3657{
3658 if (!PyArg_NoArgs(args))
3659 return NULL;
3660 return strip(self, 0, 1);
3661}
3662
3663static PyObject*
3664unicode_slice(PyUnicodeObject *self, int start, int end)
3665{
3666 /* standard clamping */
3667 if (start < 0)
3668 start = 0;
3669 if (end < 0)
3670 end = 0;
3671 if (end > self->length)
3672 end = self->length;
3673 if (start == 0 && end == self->length) {
3674 /* full slice, return original string */
3675 Py_INCREF(self);
3676 return (PyObject*) self;
3677 }
3678 if (start > end)
3679 start = end;
3680 /* copy slice */
3681 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
3682 end - start);
3683}
3684
3685PyObject *PyUnicode_Split(PyObject *s,
3686 PyObject *sep,
3687 int maxsplit)
3688{
3689 PyObject *result;
3690
3691 s = PyUnicode_FromObject(s);
3692 if (s == NULL)
3693 return NULL;
3694 if (sep != NULL) {
3695 sep = PyUnicode_FromObject(sep);
3696 if (sep == NULL) {
3697 Py_DECREF(s);
3698 return NULL;
3699 }
3700 }
3701
3702 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
3703
3704 Py_DECREF(s);
3705 Py_XDECREF(sep);
3706 return result;
3707}
3708
3709static char split__doc__[] =
3710"S.split([sep [,maxsplit]]) -> list of strings\n\
3711\n\
3712Return a list of the words in S, using sep as the\n\
3713delimiter string. If maxsplit is given, at most maxsplit\n\
3714splits are done. If sep is not specified, any whitespace string\n\
3715is a separator.";
3716
3717static PyObject*
3718unicode_split(PyUnicodeObject *self, PyObject *args)
3719{
3720 PyObject *substring = Py_None;
3721 int maxcount = -1;
3722
3723 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
3724 return NULL;
3725
3726 if (substring == Py_None)
3727 return split(self, NULL, maxcount);
3728 else if (PyUnicode_Check(substring))
3729 return split(self, (PyUnicodeObject *)substring, maxcount);
3730 else
3731 return PyUnicode_Split((PyObject *)self, substring, maxcount);
3732}
3733
3734static char splitlines__doc__[] =
3735"S.splitlines([maxsplit]]) -> list of strings\n\
3736\n\
3737Return a list of the lines in S, breaking at line boundaries.\n\
3738If maxsplit is given, at most maxsplit are done. Line breaks are not\n\
3739included in the resulting list.";
3740
3741static PyObject*
3742unicode_splitlines(PyUnicodeObject *self, PyObject *args)
3743{
3744 int maxcount = -1;
3745
3746 if (!PyArg_ParseTuple(args, "|i:splitlines", &maxcount))
3747 return NULL;
3748
3749 return PyUnicode_Splitlines((PyObject *)self, maxcount);
3750}
3751
3752static
3753PyObject *unicode_str(PyUnicodeObject *self)
3754{
3755 return PyUnicode_AsUTF8String((PyObject *)self);
3756}
3757
3758static char strip__doc__[] =
3759"S.strip() -> unicode\n\
3760\n\
3761Return a copy of S with leading and trailing whitespace removed.";
3762
3763static PyObject *
3764unicode_strip(PyUnicodeObject *self, PyObject *args)
3765{
3766 if (!PyArg_NoArgs(args))
3767 return NULL;
3768 return strip(self, 1, 1);
3769}
3770
3771static char swapcase__doc__[] =
3772"S.swapcase() -> unicode\n\
3773\n\
3774Return a copy of S with uppercase characters converted to lowercase\n\
3775and vice versa.";
3776
3777static PyObject*
3778unicode_swapcase(PyUnicodeObject *self, PyObject *args)
3779{
3780 if (!PyArg_NoArgs(args))
3781 return NULL;
3782 return fixup(self, fixswapcase);
3783}
3784
3785static char translate__doc__[] =
3786"S.translate(table) -> unicode\n\
3787\n\
3788Return a copy of the string S, where all characters have been mapped\n\
3789through the given translation table, which must be a mapping of\n\
3790Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
3791are left untouched. Characters mapped to None are deleted.";
3792
3793static PyObject*
3794unicode_translate(PyUnicodeObject *self, PyObject *args)
3795{
3796 PyObject *table;
3797
3798 if (!PyArg_ParseTuple(args, "O:translate", &table))
3799 return NULL;
3800 return PyUnicode_TranslateCharmap(self->str,
3801 self->length,
3802 table,
3803 "ignore");
3804}
3805
3806static char upper__doc__[] =
3807"S.upper() -> unicode\n\
3808\n\
3809Return a copy of S converted to uppercase.";
3810
3811static PyObject*
3812unicode_upper(PyUnicodeObject *self, PyObject *args)
3813{
3814 if (!PyArg_NoArgs(args))
3815 return NULL;
3816 return fixup(self, fixupper);
3817}
3818
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* S.zfill(width) (currently compiled out): left-pads self with '0' up
   to width characters.  A leading '+' or '-' is moved in front of the
   padding.  Returns self unchanged when it is already wide enough and
   NULL on error. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() result must be checked before it is dereferenced below */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original string */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
3854
#if 0
/* Debugging aid (compiled out): reports the current value of
   unicode_freelist_size as a Python int.  Takes no arguments. */
static PyObject *
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
3864
3865static char startswith__doc__[] =
3866"S.startswith(prefix[, start[, end]]) -> int\n\
3867\n\
3868Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
3869optional start, test S beginning at that position. With optional end, stop\n\
3870comparing S at that position.";
3871
3872static PyObject *
3873unicode_startswith(PyUnicodeObject *self,
3874 PyObject *args)
3875{
3876 PyUnicodeObject *substring;
3877 int start = 0;
3878 int end = INT_MAX;
3879 PyObject *result;
3880
3881 if (!PyArg_ParseTuple(args, "O|ii:startswith", &substring, &start, &end))
3882 return NULL;
3883 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3884 (PyObject *)substring);
3885 if (substring == NULL)
3886 return NULL;
3887
3888 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
3889
3890 Py_DECREF(substring);
3891 return result;
3892}
3893
3894
3895static char endswith__doc__[] =
3896"S.endswith(suffix[, start[, end]]) -> int\n\
3897\n\
3898Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
3899optional start, test S beginning at that position. With optional end, stop\n\
3900comparing S at that position.";
3901
3902static PyObject *
3903unicode_endswith(PyUnicodeObject *self,
3904 PyObject *args)
3905{
3906 PyUnicodeObject *substring;
3907 int start = 0;
3908 int end = INT_MAX;
3909 PyObject *result;
3910
3911 if (!PyArg_ParseTuple(args, "O|ii:endswith", &substring, &start, &end))
3912 return NULL;
3913 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3914 (PyObject *)substring);
3915 if (substring == NULL)
3916 return NULL;
3917
3918 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
3919
3920 Py_DECREF(substring);
3921 return result;
3922}
3923
3924
3925static PyMethodDef unicode_methods[] = {
3926
3927 /* Order is according to common usage: often used methods should
3928 appear first, since lookup is done sequentially. */
3929
3930 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
3931 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
3932 {"split", (PyCFunction) unicode_split, 1, split__doc__},
3933 {"join", (PyCFunction) unicode_join, 1, join__doc__},
3934 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
3935 {"title", (PyCFunction) unicode_title, 0, title__doc__},
3936 {"center", (PyCFunction) unicode_center, 1, center__doc__},
3937 {"count", (PyCFunction) unicode_count, 1, count__doc__},
3938 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
3939 {"find", (PyCFunction) unicode_find, 1, find__doc__},
3940 {"index", (PyCFunction) unicode_index, 1, index__doc__},
3941 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
3942 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
3943 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
3944/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
3945 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
3946 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
3947 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
3948 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
3949 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
3950 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
3951 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
3952 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
3953 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
3954 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
3955 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
3956 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
3957 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
3958 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
3959 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
3960 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
3961 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
3962 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
3963#if 0
3964 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
3965 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
3966#endif
3967
3968#if 0
3969 /* This one is just used for debugging the implementation. */
3970 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
3971#endif
3972
3973 {NULL, NULL}
3974};
3975
3976static PyObject *
3977unicode_getattr(PyUnicodeObject *self, char *name)
3978{
3979 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
3980}
3981
3982static PySequenceMethods unicode_as_sequence = {
3983 (inquiry) unicode_length, /* sq_length */
3984 (binaryfunc) PyUnicode_Concat, /* sq_concat */
3985 (intargfunc) unicode_repeat, /* sq_repeat */
3986 (intargfunc) unicode_getitem, /* sq_item */
3987 (intintargfunc) unicode_slice, /* sq_slice */
3988 0, /* sq_ass_item */
3989 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003990 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00003991};
3992
3993static int
3994unicode_buffer_getreadbuf(PyUnicodeObject *self,
3995 int index,
3996 const void **ptr)
3997{
3998 if (index != 0) {
3999 PyErr_SetString(PyExc_SystemError,
4000 "accessing non-existent unicode segment");
4001 return -1;
4002 }
4003 *ptr = (void *) self->str;
4004 return PyUnicode_GET_DATA_SIZE(self);
4005}
4006
4007static int
4008unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4009 const void **ptr)
4010{
4011 PyErr_SetString(PyExc_TypeError,
4012 "cannot use unicode as modifyable buffer");
4013 return -1;
4014}
4015
4016static int
4017unicode_buffer_getsegcount(PyUnicodeObject *self,
4018 int *lenp)
4019{
4020 if (lenp)
4021 *lenp = PyUnicode_GET_DATA_SIZE(self);
4022 return 1;
4023}
4024
4025static int
4026unicode_buffer_getcharbuf(PyUnicodeObject *self,
4027 int index,
4028 const void **ptr)
4029{
4030 PyObject *str;
4031
4032 if (index != 0) {
4033 PyErr_SetString(PyExc_SystemError,
4034 "accessing non-existent unicode segment");
4035 return -1;
4036 }
4037 str = utf8_string(self, NULL);
4038 if (str == NULL)
4039 return -1;
4040 *ptr = (void *) PyString_AS_STRING(str);
4041 return PyString_GET_SIZE(str);
4042}
4043
4044/* Helpers for PyUnicode_Format() */
4045
4046static PyObject *
4047getnextarg(args, arglen, p_argidx)
4048 PyObject *args;
4049int arglen;
4050int *p_argidx;
4051{
4052 int argidx = *p_argidx;
4053 if (argidx < arglen) {
4054 (*p_argidx)++;
4055 if (arglen < 0)
4056 return args;
4057 else
4058 return PyTuple_GetItem(args, argidx);
4059 }
4060 PyErr_SetString(PyExc_TypeError,
4061 "not enough arguments for format string");
4062 return NULL;
4063}
4064
/* Conversion flag bits collected while parsing a format specifier;
   each is set by the flag character shown. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
4070
4071static
4072#ifdef HAVE_STDARG_PROTOTYPES
4073int usprintf(register Py_UNICODE *buffer, char *format, ...)
4074#else
4075int usprintf(va_alist) va_dcl
4076#endif
4077{
4078 register int i;
4079 int len;
4080 va_list va;
4081 char *charbuffer;
4082#ifdef HAVE_STDARG_PROTOTYPES
4083 va_start(va, format);
4084#else
4085 Py_UNICODE *args;
4086 char *format;
4087
4088 va_start(va);
4089 buffer = va_arg(va, Py_UNICODE *);
4090 format = va_arg(va, char *);
4091#endif
4092
4093 /* First, format the string as char array, then expand to Py_UNICODE
4094 array. */
4095 charbuffer = (char *)buffer;
4096 len = vsprintf(charbuffer, format, va);
4097 for (i = len - 1; i >= 0; i--)
4098 buffer[i] = (Py_UNICODE) charbuffer[i];
4099
4100 va_end(va);
4101 return len;
4102}
4103
4104static int
4105formatfloat(Py_UNICODE *buf,
4106 int flags,
4107 int prec,
4108 int type,
4109 PyObject *v)
4110{
4111 char fmt[20];
4112 double x;
4113
4114 x = PyFloat_AsDouble(v);
4115 if (x == -1.0 && PyErr_Occurred())
4116 return -1;
4117 if (prec < 0)
4118 prec = 6;
4119 if (prec > 50)
4120 prec = 50; /* Arbitrary limitation */
4121 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4122 type = 'g';
4123 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4124 return usprintf(buf, fmt, x);
4125}
4126
4127static int
4128formatint(Py_UNICODE *buf,
4129 int flags,
4130 int prec,
4131 int type,
4132 PyObject *v)
4133{
4134 char fmt[20];
4135 long x;
4136
4137 x = PyInt_AsLong(v);
4138 if (x == -1 && PyErr_Occurred())
4139 return -1;
4140 if (prec < 0)
4141 prec = 1;
4142 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4143 return usprintf(buf, fmt, x);
4144}
4145
4146static int
4147formatchar(Py_UNICODE *buf,
4148 PyObject *v)
4149{
4150 if (PyUnicode_Check(v))
4151 buf[0] = PyUnicode_AS_UNICODE(v)[0];
4152
4153 else if (PyString_Check(v))
4154 buf[0] = (Py_UNICODE) PyString_AS_STRING(v)[0];
4155
4156 else {
4157 /* Integer input truncated to a character */
4158 long x;
4159 x = PyInt_AsLong(v);
4160 if (x == -1 && PyErr_Occurred())
4161 return -1;
4162 buf[0] = (char) x;
4163 }
4164 buf[1] = '\0';
4165 return 1;
4166}
4167
4168PyObject *PyUnicode_Format(PyObject *format,
4169 PyObject *args)
4170{
4171 Py_UNICODE *fmt, *res;
4172 int fmtcnt, rescnt, reslen, arglen, argidx;
4173 int args_owned = 0;
4174 PyUnicodeObject *result = NULL;
4175 PyObject *dict = NULL;
4176 PyObject *uformat;
4177
4178 if (format == NULL || args == NULL) {
4179 PyErr_BadInternalCall();
4180 return NULL;
4181 }
4182 uformat = PyUnicode_FromObject(format);
4183 fmt = PyUnicode_AS_UNICODE(uformat);
4184 fmtcnt = PyUnicode_GET_SIZE(uformat);
4185
4186 reslen = rescnt = fmtcnt + 100;
4187 result = _PyUnicode_New(reslen);
4188 if (result == NULL)
4189 goto onError;
4190 res = PyUnicode_AS_UNICODE(result);
4191
4192 if (PyTuple_Check(args)) {
4193 arglen = PyTuple_Size(args);
4194 argidx = 0;
4195 }
4196 else {
4197 arglen = -1;
4198 argidx = -2;
4199 }
4200 if (args->ob_type->tp_as_mapping)
4201 dict = args;
4202
4203 while (--fmtcnt >= 0) {
4204 if (*fmt != '%') {
4205 if (--rescnt < 0) {
4206 rescnt = fmtcnt + 100;
4207 reslen += rescnt;
4208 if (_PyUnicode_Resize(result, reslen) < 0)
4209 return NULL;
4210 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4211 --rescnt;
4212 }
4213 *res++ = *fmt++;
4214 }
4215 else {
4216 /* Got a format specifier */
4217 int flags = 0;
4218 int width = -1;
4219 int prec = -1;
4220 int size = 0;
4221 Py_UNICODE c = '\0';
4222 Py_UNICODE fill;
4223 PyObject *v = NULL;
4224 PyObject *temp = NULL;
4225 Py_UNICODE *buf;
4226 Py_UNICODE sign;
4227 int len;
4228 Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
4229
4230 fmt++;
4231 if (*fmt == '(') {
4232 Py_UNICODE *keystart;
4233 int keylen;
4234 PyObject *key;
4235 int pcount = 1;
4236
4237 if (dict == NULL) {
4238 PyErr_SetString(PyExc_TypeError,
4239 "format requires a mapping");
4240 goto onError;
4241 }
4242 ++fmt;
4243 --fmtcnt;
4244 keystart = fmt;
4245 /* Skip over balanced parentheses */
4246 while (pcount > 0 && --fmtcnt >= 0) {
4247 if (*fmt == ')')
4248 --pcount;
4249 else if (*fmt == '(')
4250 ++pcount;
4251 fmt++;
4252 }
4253 keylen = fmt - keystart - 1;
4254 if (fmtcnt < 0 || pcount > 0) {
4255 PyErr_SetString(PyExc_ValueError,
4256 "incomplete format key");
4257 goto onError;
4258 }
4259 /* keys are converted to strings (using UTF-8) and
4260 then looked up since Python uses strings to hold
4261 variables names etc. in its namespaces and we
4262 wouldn't want to break common idioms. The
4263 alternative would be using Unicode objects for the
4264 lookup but u"abc" and "abc" have different hash
4265 values (on purpose). */
4266 key = PyUnicode_EncodeUTF8(keystart,
4267 keylen,
4268 NULL);
4269 if (key == NULL)
4270 goto onError;
4271 if (args_owned) {
4272 Py_DECREF(args);
4273 args_owned = 0;
4274 }
4275 args = PyObject_GetItem(dict, key);
4276 Py_DECREF(key);
4277 if (args == NULL) {
4278 goto onError;
4279 }
4280 args_owned = 1;
4281 arglen = -1;
4282 argidx = -2;
4283 }
4284 while (--fmtcnt >= 0) {
4285 switch (c = *fmt++) {
4286 case '-': flags |= F_LJUST; continue;
4287 case '+': flags |= F_SIGN; continue;
4288 case ' ': flags |= F_BLANK; continue;
4289 case '#': flags |= F_ALT; continue;
4290 case '0': flags |= F_ZERO; continue;
4291 }
4292 break;
4293 }
4294 if (c == '*') {
4295 v = getnextarg(args, arglen, &argidx);
4296 if (v == NULL)
4297 goto onError;
4298 if (!PyInt_Check(v)) {
4299 PyErr_SetString(PyExc_TypeError,
4300 "* wants int");
4301 goto onError;
4302 }
4303 width = PyInt_AsLong(v);
4304 if (width < 0) {
4305 flags |= F_LJUST;
4306 width = -width;
4307 }
4308 if (--fmtcnt >= 0)
4309 c = *fmt++;
4310 }
4311 else if (c >= '0' && c <= '9') {
4312 width = c - '0';
4313 while (--fmtcnt >= 0) {
4314 c = *fmt++;
4315 if (c < '0' || c > '9')
4316 break;
4317 if ((width*10) / 10 != width) {
4318 PyErr_SetString(PyExc_ValueError,
4319 "width too big");
4320 goto onError;
4321 }
4322 width = width*10 + (c - '0');
4323 }
4324 }
4325 if (c == '.') {
4326 prec = 0;
4327 if (--fmtcnt >= 0)
4328 c = *fmt++;
4329 if (c == '*') {
4330 v = getnextarg(args, arglen, &argidx);
4331 if (v == NULL)
4332 goto onError;
4333 if (!PyInt_Check(v)) {
4334 PyErr_SetString(PyExc_TypeError,
4335 "* wants int");
4336 goto onError;
4337 }
4338 prec = PyInt_AsLong(v);
4339 if (prec < 0)
4340 prec = 0;
4341 if (--fmtcnt >= 0)
4342 c = *fmt++;
4343 }
4344 else if (c >= '0' && c <= '9') {
4345 prec = c - '0';
4346 while (--fmtcnt >= 0) {
4347 c = Py_CHARMASK(*fmt++);
4348 if (c < '0' || c > '9')
4349 break;
4350 if ((prec*10) / 10 != prec) {
4351 PyErr_SetString(PyExc_ValueError,
4352 "prec too big");
4353 goto onError;
4354 }
4355 prec = prec*10 + (c - '0');
4356 }
4357 }
4358 } /* prec */
4359 if (fmtcnt >= 0) {
4360 if (c == 'h' || c == 'l' || c == 'L') {
4361 size = c;
4362 if (--fmtcnt >= 0)
4363 c = *fmt++;
4364 }
4365 }
4366 if (fmtcnt < 0) {
4367 PyErr_SetString(PyExc_ValueError,
4368 "incomplete format");
4369 goto onError;
4370 }
4371 if (c != '%') {
4372 v = getnextarg(args, arglen, &argidx);
4373 if (v == NULL)
4374 goto onError;
4375 }
4376 sign = 0;
4377 fill = ' ';
4378 switch (c) {
4379
4380 case '%':
4381 buf = tmpbuf;
4382 buf[0] = '%';
4383 len = 1;
4384 break;
4385
4386 case 's':
4387 case 'r':
4388 if (PyUnicode_Check(v) && c == 's') {
4389 temp = v;
4390 Py_INCREF(temp);
4391 }
4392 else {
4393 PyObject *unicode;
4394 if (c == 's')
4395 temp = PyObject_Str(v);
4396 else
4397 temp = PyObject_Repr(v);
4398 if (temp == NULL)
4399 goto onError;
4400 if (!PyString_Check(temp)) {
4401 /* XXX Note: this should never happen, since
4402 PyObject_Repr() and PyObject_Str() assure
4403 this */
4404 Py_DECREF(temp);
4405 PyErr_SetString(PyExc_TypeError,
4406 "%s argument has non-string str()");
4407 goto onError;
4408 }
4409 unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
4410 PyString_GET_SIZE(temp),
4411 "strict");
4412 Py_DECREF(temp);
4413 temp = unicode;
4414 if (temp == NULL)
4415 goto onError;
4416 }
4417 buf = PyUnicode_AS_UNICODE(temp);
4418 len = PyUnicode_GET_SIZE(temp);
4419 if (prec >= 0 && len > prec)
4420 len = prec;
4421 break;
4422
4423 case 'i':
4424 case 'd':
4425 case 'u':
4426 case 'o':
4427 case 'x':
4428 case 'X':
4429 if (c == 'i')
4430 c = 'd';
4431 buf = tmpbuf;
4432 len = formatint(buf, flags, prec, c, v);
4433 if (len < 0)
4434 goto onError;
4435 sign = (c == 'd');
4436 if (flags & F_ZERO) {
4437 fill = '0';
4438 if ((flags&F_ALT) &&
4439 (c == 'x' || c == 'X') &&
4440 buf[0] == '0' && buf[1] == c) {
4441 *res++ = *buf++;
4442 *res++ = *buf++;
4443 rescnt -= 2;
4444 len -= 2;
4445 width -= 2;
4446 if (width < 0)
4447 width = 0;
4448 }
4449 }
4450 break;
4451
4452 case 'e':
4453 case 'E':
4454 case 'f':
4455 case 'g':
4456 case 'G':
4457 buf = tmpbuf;
4458 len = formatfloat(buf, flags, prec, c, v);
4459 if (len < 0)
4460 goto onError;
4461 sign = 1;
4462 if (flags&F_ZERO)
4463 fill = '0';
4464 break;
4465
4466 case 'c':
4467 buf = tmpbuf;
4468 len = formatchar(buf, v);
4469 if (len < 0)
4470 goto onError;
4471 break;
4472
4473 default:
4474 PyErr_Format(PyExc_ValueError,
4475 "unsupported format character '%c' (0x%x)",
4476 c, c);
4477 goto onError;
4478 }
4479 if (sign) {
4480 if (*buf == '-' || *buf == '+') {
4481 sign = *buf++;
4482 len--;
4483 }
4484 else if (flags & F_SIGN)
4485 sign = '+';
4486 else if (flags & F_BLANK)
4487 sign = ' ';
4488 else
4489 sign = 0;
4490 }
4491 if (width < len)
4492 width = len;
4493 if (rescnt < width + (sign != 0)) {
4494 reslen -= rescnt;
4495 rescnt = width + fmtcnt + 100;
4496 reslen += rescnt;
4497 if (_PyUnicode_Resize(result, reslen) < 0)
4498 return NULL;
4499 res = PyUnicode_AS_UNICODE(result)
4500 + reslen - rescnt;
4501 }
4502 if (sign) {
4503 if (fill != ' ')
4504 *res++ = sign;
4505 rescnt--;
4506 if (width > len)
4507 width--;
4508 }
4509 if (width > len && !(flags & F_LJUST)) {
4510 do {
4511 --rescnt;
4512 *res++ = fill;
4513 } while (--width > len);
4514 }
4515 if (sign && fill == ' ')
4516 *res++ = sign;
4517 memcpy(res, buf, len * sizeof(Py_UNICODE));
4518 res += len;
4519 rescnt -= len;
4520 while (--width >= len) {
4521 --rescnt;
4522 *res++ = ' ';
4523 }
4524 if (dict && (argidx < arglen) && c != '%') {
4525 PyErr_SetString(PyExc_TypeError,
4526 "not all arguments converted");
4527 goto onError;
4528 }
4529 Py_XDECREF(temp);
4530 } /* '%' */
4531 } /* until end */
4532 if (argidx < arglen && !dict) {
4533 PyErr_SetString(PyExc_TypeError,
4534 "not all arguments converted");
4535 goto onError;
4536 }
4537
4538 if (args_owned) {
4539 Py_DECREF(args);
4540 }
4541 Py_DECREF(uformat);
4542 _PyUnicode_Resize(result, reslen - rescnt);
4543 return (PyObject *)result;
4544
4545 onError:
4546 Py_XDECREF(result);
4547 Py_DECREF(uformat);
4548 if (args_owned) {
4549 Py_DECREF(args);
4550 }
4551 return NULL;
4552}
4553
4554static PyBufferProcs unicode_as_buffer = {
4555 (getreadbufferproc) unicode_buffer_getreadbuf,
4556 (getwritebufferproc) unicode_buffer_getwritebuf,
4557 (getsegcountproc) unicode_buffer_getsegcount,
4558 (getcharbufferproc) unicode_buffer_getcharbuf,
4559};
4560
4561PyTypeObject PyUnicode_Type = {
4562 PyObject_HEAD_INIT(&PyType_Type)
4563 0, /* ob_size */
4564 "unicode", /* tp_name */
4565 sizeof(PyUnicodeObject), /* tp_size */
4566 0, /* tp_itemsize */
4567 /* Slots */
4568 (destructor)_PyUnicode_Free, /* tp_dealloc */
4569 0, /* tp_print */
4570 (getattrfunc)unicode_getattr, /* tp_getattr */
4571 0, /* tp_setattr */
4572 (cmpfunc) unicode_compare, /* tp_compare */
4573 (reprfunc) unicode_repr, /* tp_repr */
4574 0, /* tp_as_number */
4575 &unicode_as_sequence, /* tp_as_sequence */
4576 0, /* tp_as_mapping */
4577 (hashfunc) unicode_hash, /* tp_hash*/
4578 0, /* tp_call*/
4579 (reprfunc) unicode_str, /* tp_str */
4580 (getattrofunc) NULL, /* tp_getattro */
4581 (setattrofunc) NULL, /* tp_setattro */
4582 &unicode_as_buffer, /* tp_as_buffer */
4583 Py_TPFLAGS_DEFAULT, /* tp_flags */
4584};
4585
4586/* Initialize the Unicode implementation */
4587
4588void _PyUnicode_Init()
4589{
4590 /* Doublecheck the configuration... */
4591 if (sizeof(Py_UNICODE) != 2)
4592 Py_FatalError("Unicode configuration error: "
4593 "sizeof(Py_UNICODE) != 2 bytes");
4594
4595 unicode_empty = _PyUnicode_New(0);
4596}
4597
4598/* Finalize the Unicode implementation */
4599
4600void
4601_PyUnicode_Fini()
4602{
4603 PyUnicodeObject *u = unicode_freelist;
4604
4605 while (u != NULL) {
4606 PyUnicodeObject *v = u;
4607 u = *(PyUnicodeObject **)u;
4608 free(v);
4609 }
4610 Py_XDECREF(unicode_empty);
4611}