blob: 8c1db7eb00d0c8e1350152b6f311fa9a4c1cb2bf [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "mymath.h"
68#include "unicodeobject.h"
69
70#if defined(HAVE_LIMITS_H)
71#include <limits.h>
72#else
73#define INT_MAX 2147483647
74#endif
75
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000076#ifdef MS_WIN32
77#include <windows.h>
78#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +000079/* Limit for the Unicode object free list */
80
81#define MAX_UNICODE_FREELIST_SIZE 1024
82
83/* Limit for the Unicode object free list stay alive optimization.
84
85 The implementation will keep allocated Unicode memory intact for
86 all objects on the free list having a size less than this
87 limit. This reduces malloc() overhead for small Unicode objects.
88
Barry Warsaw51ac5802000-03-20 16:36:48 +000089 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumd57fd912000-03-10 22:53:23 +000090 (sizeof(PyUnicodeObject) + STAYALIVE_SIZE_LIMIT +
91 malloc()-overhead) bytes of unused garbage.
92
93 Setting the limit to 0 effectively turns the feature off.
94
95 XXX The feature is currently turned off because there are
96 apparently some lingering bugs in its implementation which I
97 haven't yet been able to sort out.
98
99*/
100
101#define STAYALIVE_SIZE_LIMIT 0
102
103/* Endianness switches; defaults to little endian */
104
105#ifdef WORDS_BIGENDIAN
106# define BYTEORDER_IS_BIG_ENDIAN
107#else
108# define BYTEORDER_IS_LITTLE_ENDIAN
109#endif
110
111/* --- Globals ------------------------------------------------------------ */
112
113/* The empty Unicode object */
114static PyUnicodeObject *unicode_empty = NULL;
115
116/* Free list for Unicode objects */
117static PyUnicodeObject *unicode_freelist = NULL;
118static int unicode_freelist_size = 0;
119
120/* --- Unicode Object ----------------------------------------------------- */
121
122static
123int _PyUnicode_Resize(register PyUnicodeObject *unicode,
124 int length)
125{
126 void *oldstr;
127
128 /* Shortcut if there's nothing to do. */
129 if (unicode->length == length)
130 return 0;
131
132 /* Resizing unicode_empty is not allowed. */
133 if (unicode == unicode_empty) {
134 PyErr_SetString(PyExc_SystemError,
135 "can't resize empty unicode object");
136 return -1;
137 }
138
139 /* We allocate one more byte to make sure the string is
140 Ux0000 terminated -- XXX is this needed ? */
141 oldstr = unicode->str;
142 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
143 if (!unicode->str) {
144 unicode->str = oldstr;
145 PyErr_NoMemory();
146 return -1;
147 }
148 unicode->str[length] = 0;
149 unicode->length = length;
150
151 /* Reset the object caches */
152 if (unicode->utf8str) {
153 Py_DECREF(unicode->utf8str);
154 unicode->utf8str = NULL;
155 }
156 unicode->hash = -1;
157
158 return 0;
159}
160
/* We allocate one more Py_UNICODE element to make sure the string is
   U+0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
168
169static
170PyUnicodeObject *_PyUnicode_New(int length)
171{
172 register PyUnicodeObject *unicode;
173
174 /* Optimization for empty strings */
175 if (length == 0 && unicode_empty != NULL) {
176 Py_INCREF(unicode_empty);
177 return unicode_empty;
178 }
179
180 /* Unicode freelist & memory allocation */
181 if (unicode_freelist) {
182 unicode = unicode_freelist;
183 unicode_freelist = *(PyUnicodeObject **)unicode_freelist;
184 unicode_freelist_size--;
185 unicode->ob_type = &PyUnicode_Type;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000186 _Py_NewReference((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 if (unicode->str) {
188 if (unicode->length < length &&
189 _PyUnicode_Resize(unicode, length)) {
190 free(unicode->str);
191 PyMem_DEL(unicode);
192 return NULL;
193 }
194 }
195 else
196 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
197 }
198 else {
199 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
200 if (unicode == NULL)
201 return NULL;
202 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
203 }
204
Barry Warsaw51ac5802000-03-20 16:36:48 +0000205 if (!unicode->str)
206 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 unicode->str[length] = 0;
208 unicode->length = length;
209 unicode->hash = -1;
210 unicode->utf8str = NULL;
211 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212
213 onError:
214 _Py_ForgetReference((PyObject *)unicode);
215 PyMem_DEL(unicode);
216 PyErr_NoMemory();
217 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218}
219
220static
221void _PyUnicode_Free(register PyUnicodeObject *unicode)
222{
223 Py_XDECREF(unicode->utf8str);
224 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
225 if (unicode->length >= STAYALIVE_SIZE_LIMIT) {
226 free(unicode->str);
227 unicode->str = NULL;
228 unicode->length = 0;
229 }
230 *(PyUnicodeObject **)unicode = unicode_freelist;
231 unicode_freelist = unicode;
232 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 }
234 else {
235 free(unicode->str);
236 PyMem_DEL(unicode);
237 }
238}
239
240PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
241 int size)
242{
243 PyUnicodeObject *unicode;
244
245 unicode = _PyUnicode_New(size);
246 if (!unicode)
247 return NULL;
248
249 /* Copy the Unicode data into the new object */
250 if (u != NULL)
251 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
252
253 return (PyObject *)unicode;
254}
255
256#ifdef HAVE_WCHAR_H
257
258PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
259 int size)
260{
261 PyUnicodeObject *unicode;
262
263 if (w == NULL) {
264 PyErr_BadInternalCall();
265 return NULL;
266 }
267
268 unicode = _PyUnicode_New(size);
269 if (!unicode)
270 return NULL;
271
272 /* Copy the wchar_t data into the new object */
273#ifdef HAVE_USABLE_WCHAR_T
274 memcpy(unicode->str, w, size * sizeof(wchar_t));
275#else
276 {
277 register Py_UNICODE *u;
278 register int i;
279 u = PyUnicode_AS_UNICODE(unicode);
280 for (i = size; i >= 0; i--)
281 *u++ = *w++;
282 }
283#endif
284
285 return (PyObject *)unicode;
286}
287
288int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
289 register wchar_t *w,
290 int size)
291{
292 if (unicode == NULL) {
293 PyErr_BadInternalCall();
294 return -1;
295 }
296 if (size > PyUnicode_GET_SIZE(unicode))
297 size = PyUnicode_GET_SIZE(unicode);
298#ifdef HAVE_USABLE_WCHAR_T
299 memcpy(w, unicode->str, size * sizeof(wchar_t));
300#else
301 {
302 register Py_UNICODE *u;
303 register int i;
304 u = PyUnicode_AS_UNICODE(unicode);
305 for (i = size; i >= 0; i--)
306 *w++ = *u++;
307 }
308#endif
309
310 return size;
311}
312
313#endif
314
315PyObject *PyUnicode_FromObject(register PyObject *obj)
316{
317 const char *s;
318 int len;
319
320 if (obj == NULL) {
321 PyErr_BadInternalCall();
322 return NULL;
323 }
324 else if (PyUnicode_Check(obj)) {
325 Py_INCREF(obj);
326 return obj;
327 }
328 else if (PyString_Check(obj)) {
329 s = PyString_AS_STRING(obj);
330 len = PyString_GET_SIZE(obj);
331 }
332 else if (PyObject_AsCharBuffer(obj, &s, &len))
333 return NULL;
334 if (len == 0) {
335 Py_INCREF(unicode_empty);
336 return (PyObject *)unicode_empty;
337 }
338 return PyUnicode_DecodeUTF8(s, len, "strict");
339}
340
341PyObject *PyUnicode_Decode(const char *s,
342 int size,
343 const char *encoding,
344 const char *errors)
345{
346 PyObject *buffer = NULL, *unicode;
347
348 /* Shortcut for the default encoding UTF-8 */
349 if (encoding == NULL ||
350 (strcmp(encoding, "utf-8") == 0))
351 return PyUnicode_DecodeUTF8(s, size, errors);
352
353 /* Decode via the codec registry */
354 buffer = PyBuffer_FromMemory((void *)s, size);
355 if (buffer == NULL)
356 goto onError;
357 unicode = PyCodec_Decode(buffer, encoding, errors);
358 if (unicode == NULL)
359 goto onError;
360 if (!PyUnicode_Check(unicode)) {
361 PyErr_Format(PyExc_TypeError,
362 "decoder did not return an unicode object (type=%s)",
363 unicode->ob_type->tp_name);
364 Py_DECREF(unicode);
365 goto onError;
366 }
367 Py_DECREF(buffer);
368 return unicode;
369
370 onError:
371 Py_XDECREF(buffer);
372 return NULL;
373}
374
375PyObject *PyUnicode_Encode(const Py_UNICODE *s,
376 int size,
377 const char *encoding,
378 const char *errors)
379{
380 PyObject *v, *unicode;
381
382 unicode = PyUnicode_FromUnicode(s, size);
383 if (unicode == NULL)
384 return NULL;
385 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
386 Py_DECREF(unicode);
387 return v;
388}
389
390PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
391 const char *encoding,
392 const char *errors)
393{
394 PyObject *v;
395
396 if (!PyUnicode_Check(unicode)) {
397 PyErr_BadArgument();
398 goto onError;
399 }
400 /* Shortcut for the default encoding UTF-8 */
401 if ((encoding == NULL ||
402 (strcmp(encoding, "utf-8") == 0)) &&
403 errors == NULL)
404 return PyUnicode_AsUTF8String(unicode);
405
406 /* Encode via the codec registry */
407 v = PyCodec_Encode(unicode, encoding, errors);
408 if (v == NULL)
409 goto onError;
410 /* XXX Should we really enforce this ? */
411 if (!PyString_Check(v)) {
412 PyErr_Format(PyExc_TypeError,
413 "encoder did not return a string object (type=%s)",
414 v->ob_type->tp_name);
415 Py_DECREF(v);
416 goto onError;
417 }
418 return v;
419
420 onError:
421 return NULL;
422}
423
424Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
425{
426 if (!PyUnicode_Check(unicode)) {
427 PyErr_BadArgument();
428 goto onError;
429 }
430 return PyUnicode_AS_UNICODE(unicode);
431
432 onError:
433 return NULL;
434}
435
436int PyUnicode_GetSize(PyObject *unicode)
437{
438 if (!PyUnicode_Check(unicode)) {
439 PyErr_BadArgument();
440 goto onError;
441 }
442 return PyUnicode_GET_SIZE(unicode);
443
444 onError:
445 return -1;
446}
447
448/* --- UTF-8 Codec -------------------------------------------------------- */
449
/* Sequence length implied by each possible first byte of a UTF-8
   sequence; 0 marks a byte that cannot start a sequence.  See RFC 2279
   for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five, six bytes; 0xFE and 0xFF are illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
471
472static
473int utf8_decoding_error(const char **source,
474 Py_UNICODE **dest,
475 const char *errors,
476 const char *details)
477{
478 if ((errors == NULL) ||
479 (strcmp(errors,"strict") == 0)) {
480 PyErr_Format(PyExc_UnicodeError,
481 "UTF-8 decoding error: %s",
482 details);
483 return -1;
484 }
485 else if (strcmp(errors,"ignore") == 0) {
486 (*source)++;
487 return 0;
488 }
489 else if (strcmp(errors,"replace") == 0) {
490 (*source)++;
491 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
492 (*dest)++;
493 return 0;
494 }
495 else {
496 PyErr_Format(PyExc_ValueError,
Barry Warsaw51ac5802000-03-20 16:36:48 +0000497 "UTF-8 decoding error; unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 errors);
499 return -1;
500 }
501}
502
503#define UTF8_ERROR(details) do { \
504 if (utf8_decoding_error(&s, &p, errors, details)) \
505 goto onError; \
506 continue; \
507} while (0)
508
509PyObject *PyUnicode_DecodeUTF8(const char *s,
510 int size,
511 const char *errors)
512{
513 int n;
514 const char *e;
515 PyUnicodeObject *unicode;
516 Py_UNICODE *p;
517
518 /* Note: size will always be longer than the resulting Unicode
519 character count */
520 unicode = _PyUnicode_New(size);
521 if (!unicode)
522 return NULL;
523 if (size == 0)
524 return (PyObject *)unicode;
525
526 /* Unpack UTF-8 encoded data */
527 p = unicode->str;
528 e = s + size;
529
530 while (s < e) {
531 register Py_UNICODE ch = (unsigned char)*s;
532
533 if (ch < 0x80) {
534 *p++ = ch;
535 s++;
536 continue;
537 }
538
539 n = utf8_code_length[ch];
540
541 if (s + n > e)
542 UTF8_ERROR("unexpected end of data");
543
544 switch (n) {
545
546 case 0:
547 UTF8_ERROR("unexpected code byte");
548 break;
549
550 case 1:
551 UTF8_ERROR("internal error");
552 break;
553
554 case 2:
555 if ((s[1] & 0xc0) != 0x80)
556 UTF8_ERROR("invalid data");
557 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
558 if (ch < 0x80)
559 UTF8_ERROR("illegal encoding");
560 else
561 *p++ = ch;
562 break;
563
564 case 3:
565 if ((s[1] & 0xc0) != 0x80 ||
566 (s[2] & 0xc0) != 0x80)
567 UTF8_ERROR("invalid data");
568 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
569 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
570 UTF8_ERROR("illegal encoding");
571 else
572 *p++ = ch;
573 break;
574
575 default:
576 /* Other sizes are only needed for UCS-4 */
577 UTF8_ERROR("unsupported Unicode code range");
578 }
579 s += n;
580 }
581
582 /* Adjust length */
583 if (_PyUnicode_Resize(unicode, p - unicode->str))
584 goto onError;
585
586 return (PyObject *)unicode;
587
588onError:
589 Py_DECREF(unicode);
590 return NULL;
591}
592
593#undef UTF8_ERROR
594
595static
596int utf8_encoding_error(const Py_UNICODE **source,
597 char **dest,
598 const char *errors,
599 const char *details)
600{
601 if ((errors == NULL) ||
602 (strcmp(errors,"strict") == 0)) {
603 PyErr_Format(PyExc_UnicodeError,
604 "UTF-8 encoding error: %s",
605 details);
606 return -1;
607 }
608 else if (strcmp(errors,"ignore") == 0) {
609 return 0;
610 }
611 else if (strcmp(errors,"replace") == 0) {
612 **dest = '?';
613 (*dest)++;
614 return 0;
615 }
616 else {
617 PyErr_Format(PyExc_ValueError,
618 "UTF-8 encoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +0000619 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000620 errors);
621 return -1;
622 }
623}
624
625PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
626 int size,
627 const char *errors)
628{
629 PyObject *v;
630 char *p;
631 char *q;
632
633 v = PyString_FromStringAndSize(NULL, 3 * size);
634 if (v == NULL)
635 return NULL;
636 if (size == 0)
637 goto done;
638
639 p = q = PyString_AS_STRING(v);
640 while (size-- > 0) {
641 Py_UNICODE ch = *s++;
642 if (ch < 0x80)
643 *p++ = (char) ch;
644 else if (ch < 0x0800) {
645 *p++ = 0xc0 | (ch >> 6);
646 *p++ = 0x80 | (ch & 0x3f);
647 } else if (0xD800 <= ch && ch <= 0xDFFF) {
648 /* These byte ranges are reserved for UTF-16 surrogate
649 bytes which the Python implementation currently does
650 not support. */
651 printf("code range problem: U+%04x\n", ch);
652 if (utf8_encoding_error(&s, &p, errors,
653 "unsupported code range"))
654 goto onError;
655 } else {
656 *p++ = 0xe0 | (ch >> 12);
657 *p++ = 0x80 | ((ch >> 6) & 0x3f);
658 *p++ = 0x80 | (ch & 0x3f);
659 }
660 }
661 *p = '\0';
662 _PyString_Resize(&v, p - q);
663
664 done:
665 return v;
666
667 onError:
668 Py_DECREF(v);
669 return NULL;
670}
671
672/* Return a Python string holding the UTF-8 encoded value of the
673 Unicode object.
674
675 The resulting string is cached in the Unicode object for subsequent
676 usage by this function. The cached version is needed to implement
677 the character buffer interface.
678
679 The refcount of the string is *not* incremented.
680
681*/
682
683static
684PyObject *utf8_string(PyUnicodeObject *self,
685 const char *errors)
686{
687 PyObject *v = self->utf8str;
688
689 if (v)
690 return v;
691 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(self),
692 PyUnicode_GET_SIZE(self),
693 errors);
694 if (v && errors == NULL)
695 self->utf8str = v;
696 return v;
697}
698
699PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
700{
701 PyObject *str;
702
703 if (!PyUnicode_Check(unicode)) {
704 PyErr_BadArgument();
705 return NULL;
706 }
707 str = utf8_string((PyUnicodeObject *)unicode, NULL);
708 if (str == NULL)
709 return NULL;
710 Py_INCREF(str);
711 return str;
712}
713
714/* --- UTF-16 Codec ------------------------------------------------------- */
715
716static
717int utf16_decoding_error(const Py_UNICODE **source,
718 Py_UNICODE **dest,
719 const char *errors,
720 const char *details)
721{
722 if ((errors == NULL) ||
723 (strcmp(errors,"strict") == 0)) {
724 PyErr_Format(PyExc_UnicodeError,
725 "UTF-16 decoding error: %s",
726 details);
727 return -1;
728 }
729 else if (strcmp(errors,"ignore") == 0) {
730 return 0;
731 }
732 else if (strcmp(errors,"replace") == 0) {
733 if (dest) {
734 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
735 (*dest)++;
736 }
737 return 0;
738 }
739 else {
740 PyErr_Format(PyExc_ValueError,
Barry Warsaw51ac5802000-03-20 16:36:48 +0000741 "UTF-16 decoding error; unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000742 errors);
743 return -1;
744 }
745}
746
747#define UTF16_ERROR(details) do { \
748 if (utf16_decoding_error(&q, &p, errors, details)) \
749 goto onError; \
750 continue; \
751} while(0)
752
753PyObject *PyUnicode_DecodeUTF16(const char *s,
754 int size,
755 const char *errors,
756 int *byteorder)
757{
758 PyUnicodeObject *unicode;
759 Py_UNICODE *p;
760 const Py_UNICODE *q, *e;
761 int bo = 0;
762
763 /* size should be an even number */
764 if (size % sizeof(Py_UNICODE) != 0) {
765 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
766 return NULL;
767 /* The remaining input chars are ignored if we fall through
768 here... */
769 }
770
771 /* Note: size will always be longer than the resulting Unicode
772 character count */
773 unicode = _PyUnicode_New(size);
774 if (!unicode)
775 return NULL;
776 if (size == 0)
777 return (PyObject *)unicode;
778
779 /* Unpack UTF-16 encoded data */
780 p = unicode->str;
781 q = (Py_UNICODE *)s;
782 e = q + (size / sizeof(Py_UNICODE));
783
784 if (byteorder)
785 bo = *byteorder;
786
787 while (q < e) {
788 register Py_UNICODE ch = *q++;
789
790 /* Check for BOM marks (U+FEFF) in the input and adjust
791 current byte order setting accordingly. Swap input
792 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
793 !) */
794#ifdef BYTEORDER_IS_LITTLE_ENDIAN
795 if (ch == 0xFEFF) {
796 bo = -1;
797 continue;
798 } else if (ch == 0xFFFE) {
799 bo = 1;
800 continue;
801 }
802 if (bo == 1)
803 ch = (ch >> 8) | (ch << 8);
804#else
805 if (ch == 0xFEFF) {
806 bo = 1;
807 continue;
808 } else if (ch == 0xFFFE) {
809 bo = -1;
810 continue;
811 }
812 if (bo == -1)
813 ch = (ch >> 8) | (ch << 8);
814#endif
815 if (ch < 0xD800 || ch > 0xDFFF) {
816 *p++ = ch;
817 continue;
818 }
819
820 /* UTF-16 code pair: */
821 if (q >= e)
822 UTF16_ERROR("unexpected end of data");
823 if (0xDC00 <= *q && *q <= 0xDFFF) {
824 q++;
825 if (0xD800 <= *q && *q <= 0xDBFF)
826 /* This is valid data (a UTF-16 surrogate pair), but
827 we are not able to store this information since our
828 Py_UNICODE type only has 16 bits... this might
829 change someday, even though it's unlikely. */
830 UTF16_ERROR("code pairs are not supported");
831 else
832 continue;
833 }
834 UTF16_ERROR("illegal encoding");
835 }
836
837 if (byteorder)
838 *byteorder = bo;
839
840 /* Adjust length */
841 if (_PyUnicode_Resize(unicode, p - unicode->str))
842 goto onError;
843
844 return (PyObject *)unicode;
845
846onError:
847 Py_DECREF(unicode);
848 return NULL;
849}
850
851#undef UTF16_ERROR
852
853PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
854 int size,
855 const char *errors,
856 int byteorder)
857{
858 PyObject *v;
859 Py_UNICODE *p;
860 char *q;
861
862 /* We don't create UTF-16 pairs... */
863 v = PyString_FromStringAndSize(NULL,
864 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
865 if (v == NULL)
866 return NULL;
867 if (size == 0)
868 goto done;
869
870 q = PyString_AS_STRING(v);
871 p = (Py_UNICODE *)q;
872
873 if (byteorder == 0)
874 *p++ = 0xFEFF;
875 if (byteorder == 0 ||
876#ifdef BYTEORDER_IS_LITTLE_ENDIAN
877 byteorder == -1
878#else
879 byteorder == 1
880#endif
881 )
882 memcpy(p, s, size * sizeof(Py_UNICODE));
883 else
884 while (size-- > 0) {
885 Py_UNICODE ch = *s++;
886 *p++ = (ch >> 8) | (ch << 8);
887 }
888 done:
889 return v;
890}
891
892PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
893{
894 if (!PyUnicode_Check(unicode)) {
895 PyErr_BadArgument();
896 return NULL;
897 }
898 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
899 PyUnicode_GET_SIZE(unicode),
900 NULL,
901 0);
902}
903
904/* --- Unicode Escape Codec ----------------------------------------------- */
905
906static
907int unicodeescape_decoding_error(const char **source,
908 unsigned int *x,
909 const char *errors,
910 const char *details)
911{
912 if ((errors == NULL) ||
913 (strcmp(errors,"strict") == 0)) {
914 PyErr_Format(PyExc_UnicodeError,
915 "Unicode-Escape decoding error: %s",
916 details);
917 return -1;
918 }
919 else if (strcmp(errors,"ignore") == 0) {
920 return 0;
921 }
922 else if (strcmp(errors,"replace") == 0) {
923 *x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
924 return 0;
925 }
926 else {
927 PyErr_Format(PyExc_ValueError,
928 "Unicode-Escape decoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +0000929 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000930 errors);
931 return -1;
932 }
933}
934
935PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
936 int size,
937 const char *errors)
938{
939 PyUnicodeObject *v;
940 Py_UNICODE *p = NULL, *buf = NULL;
941 const char *end;
942
943 /* Escaped strings will always be longer than the resulting
944 Unicode string, so we start with size here and then reduce the
945 length after conversion to the true value. */
946 v = _PyUnicode_New(size);
947 if (v == NULL)
948 goto onError;
949 if (size == 0)
950 return (PyObject *)v;
951 p = buf = PyUnicode_AS_UNICODE(v);
952 end = s + size;
953 while (s < end) {
954 unsigned char c;
955 unsigned int x;
956 int i;
957
958 /* Non-escape characters are interpreted as Unicode ordinals */
959 if (*s != '\\') {
960 *p++ = (unsigned char)*s++;
961 continue;
962 }
963
964 /* \ - Escapes */
965 s++;
966 switch (*s++) {
967
968 /* \x escapes */
969 case '\n': break;
970 case '\\': *p++ = '\\'; break;
971 case '\'': *p++ = '\''; break;
972 case '\"': *p++ = '\"'; break;
973 case 'b': *p++ = '\b'; break;
974 case 'f': *p++ = '\014'; break; /* FF */
975 case 't': *p++ = '\t'; break;
976 case 'n': *p++ = '\n'; break;
977 case 'r': *p++ = '\r'; break;
978 case 'v': *p++ = '\013'; break; /* VT */
979 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
980
981 /* \OOO (octal) escapes */
982 case '0': case '1': case '2': case '3':
983 case '4': case '5': case '6': case '7':
984 c = s[-1] - '0';
985 if ('0' <= *s && *s <= '7') {
986 c = (c<<3) + *s++ - '0';
987 if ('0' <= *s && *s <= '7')
988 c = (c<<3) + *s++ - '0';
989 }
990 *p++ = c;
991 break;
992
993 /* \xXXXX escape with 0-4 hex digits */
994 case 'x':
995 x = 0;
996 c = (unsigned char)*s;
997 if (isxdigit(c)) {
998 do {
999 x = (x<<4) & ~0xF;
1000 if ('0' <= c && c <= '9')
1001 x += c - '0';
1002 else if ('a' <= c && c <= 'f')
1003 x += 10 + c - 'a';
1004 else
1005 x += 10 + c - 'A';
1006 c = (unsigned char)*++s;
1007 } while (isxdigit(c));
1008 *p++ = x;
1009 } else {
1010 *p++ = '\\';
1011 *p++ = (unsigned char)s[-1];
1012 }
1013 break;
1014
1015 /* \uXXXX with 4 hex digits */
1016 case 'u':
1017 for (x = 0, i = 0; i < 4; i++) {
1018 c = (unsigned char)s[i];
1019 if (!isxdigit(c)) {
1020 if (unicodeescape_decoding_error(&s, &x, errors,
1021 "truncated \\uXXXX"))
1022 goto onError;
1023 i++;
1024 break;
1025 }
1026 x = (x<<4) & ~0xF;
1027 if (c >= '0' && c <= '9')
1028 x += c - '0';
1029 else if (c >= 'a' && c <= 'f')
1030 x += 10 + c - 'a';
1031 else
1032 x += 10 + c - 'A';
1033 }
1034 s += i;
1035 *p++ = x;
1036 break;
1037
1038 default:
1039 *p++ = '\\';
1040 *p++ = (unsigned char)s[-1];
1041 break;
1042 }
1043 }
1044 _PyUnicode_Resize(v, (int)(p - buf));
1045 return (PyObject *)v;
1046
1047 onError:
1048 Py_XDECREF(v);
1049 return NULL;
1050}
1051
1052/* Return a Unicode-Escape string version of the Unicode object.
1053
1054 If quotes is true, the string is enclosed in u"" or u'' quotes as
1055 appropriate.
1056
1057*/
1058
Barry Warsaw51ac5802000-03-20 16:36:48 +00001059static const Py_UNICODE *findchar(const Py_UNICODE *s,
1060 int size,
1061 Py_UNICODE ch);
1062
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063static
1064PyObject *unicodeescape_string(const Py_UNICODE *s,
1065 int size,
1066 int quotes)
1067{
1068 PyObject *repr;
1069 char *p;
1070 char *q;
1071
1072 static const char *hexdigit = "0123456789ABCDEF";
1073
1074 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1075 if (repr == NULL)
1076 return NULL;
1077
1078 p = q = PyString_AS_STRING(repr);
1079
1080 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001081 *p++ = 'u';
1082 *p++ = (findchar(s, size, '\'') &&
1083 !findchar(s, size, '"')) ? '"' : '\'';
1084 }
1085 while (size-- > 0) {
1086 Py_UNICODE ch = *s++;
1087 /* Escape quotes */
1088 if (quotes && (ch == q[1] || ch == '\\')) {
1089 *p++ = '\\';
1090 *p++ = (char) ch;
1091 }
1092 /* Map 16-bit characters to '\uxxxx' */
1093 else if (ch >= 256) {
1094 *p++ = '\\';
1095 *p++ = 'u';
1096 *p++ = hexdigit[(ch >> 12) & 0xf];
1097 *p++ = hexdigit[(ch >> 8) & 0xf];
1098 *p++ = hexdigit[(ch >> 4) & 0xf];
1099 *p++ = hexdigit[ch & 15];
1100 }
1101 /* Map non-printable US ASCII to '\ooo' */
1102 else if (ch < ' ' || ch >= 128) {
1103 *p++ = '\\';
1104 *p++ = hexdigit[(ch >> 6) & 7];
1105 *p++ = hexdigit[(ch >> 3) & 7];
1106 *p++ = hexdigit[ch & 7];
1107 }
1108 /* Copy everything else as-is */
1109 else
1110 *p++ = (char) ch;
1111 }
1112 if (quotes)
1113 *p++ = q[1];
1114
1115 *p = '\0';
1116 _PyString_Resize(&repr, p - q);
1117
1118 return repr;
1119}
1120
1121PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1122 int size)
1123{
1124 return unicodeescape_string(s, size, 0);
1125}
1126
1127PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1128{
1129 if (!PyUnicode_Check(unicode)) {
1130 PyErr_BadArgument();
1131 return NULL;
1132 }
1133 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1134 PyUnicode_GET_SIZE(unicode));
1135}
1136
1137/* --- Raw Unicode Escape Codec ------------------------------------------- */
1138
1139PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1140 int size,
1141 const char *errors)
1142{
1143 PyUnicodeObject *v;
1144 Py_UNICODE *p, *buf;
1145 const char *end;
1146 const char *bs;
1147
1148 /* Escaped strings will always be longer than the resulting
1149 Unicode string, so we start with size here and then reduce the
1150 length after conversion to the true value. */
1151 v = _PyUnicode_New(size);
1152 if (v == NULL)
1153 goto onError;
1154 if (size == 0)
1155 return (PyObject *)v;
1156 p = buf = PyUnicode_AS_UNICODE(v);
1157 end = s + size;
1158 while (s < end) {
1159 unsigned char c;
1160 unsigned int x;
1161 int i;
1162
1163 /* Non-escape characters are interpreted as Unicode ordinals */
1164 if (*s != '\\') {
1165 *p++ = (unsigned char)*s++;
1166 continue;
1167 }
1168
1169 /* \u-escapes are only interpreted iff the number of leading
1170 backslashes if odd */
1171 bs = s;
1172 for (;s < end;) {
1173 if (*s != '\\')
1174 break;
1175 *p++ = (unsigned char)*s++;
1176 }
1177 if (((s - bs) & 1) == 0 ||
1178 s >= end ||
1179 *s != 'u') {
1180 continue;
1181 }
1182 p--;
1183 s++;
1184
1185 /* \uXXXX with 4 hex digits */
1186 for (x = 0, i = 0; i < 4; i++) {
1187 c = (unsigned char)s[i];
1188 if (!isxdigit(c)) {
1189 if (unicodeescape_decoding_error(&s, &x, errors,
1190 "truncated \\uXXXX"))
1191 goto onError;
1192 i++;
1193 break;
1194 }
1195 x = (x<<4) & ~0xF;
1196 if (c >= '0' && c <= '9')
1197 x += c - '0';
1198 else if (c >= 'a' && c <= 'f')
1199 x += 10 + c - 'a';
1200 else
1201 x += 10 + c - 'A';
1202 }
1203 s += i;
1204 *p++ = x;
1205 }
1206 _PyUnicode_Resize(v, (int)(p - buf));
1207 return (PyObject *)v;
1208
1209 onError:
1210 Py_XDECREF(v);
1211 return NULL;
1212}
1213
1214PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1215 int size)
1216{
1217 PyObject *repr;
1218 char *p;
1219 char *q;
1220
1221 static const char *hexdigit = "0123456789ABCDEF";
1222
1223 repr = PyString_FromStringAndSize(NULL, 6 * size);
1224 if (repr == NULL)
1225 return NULL;
1226
1227 p = q = PyString_AS_STRING(repr);
1228 while (size-- > 0) {
1229 Py_UNICODE ch = *s++;
1230 /* Map 16-bit characters to '\uxxxx' */
1231 if (ch >= 256) {
1232 *p++ = '\\';
1233 *p++ = 'u';
1234 *p++ = hexdigit[(ch >> 12) & 0xf];
1235 *p++ = hexdigit[(ch >> 8) & 0xf];
1236 *p++ = hexdigit[(ch >> 4) & 0xf];
1237 *p++ = hexdigit[ch & 15];
1238 }
1239 /* Copy everything else as-is */
1240 else
1241 *p++ = (char) ch;
1242 }
1243 *p = '\0';
1244 _PyString_Resize(&repr, p - q);
1245
1246 return repr;
1247}
1248
1249PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1250{
1251 if (!PyUnicode_Check(unicode)) {
1252 PyErr_BadArgument();
1253 return NULL;
1254 }
1255 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1256 PyUnicode_GET_SIZE(unicode));
1257}
1258
1259/* --- Latin-1 Codec ------------------------------------------------------ */
1260
1261PyObject *PyUnicode_DecodeLatin1(const char *s,
1262 int size,
1263 const char *errors)
1264{
1265 PyUnicodeObject *v;
1266 Py_UNICODE *p;
1267
1268 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1269 v = _PyUnicode_New(size);
1270 if (v == NULL)
1271 goto onError;
1272 if (size == 0)
1273 return (PyObject *)v;
1274 p = PyUnicode_AS_UNICODE(v);
1275 while (size-- > 0)
1276 *p++ = (unsigned char)*s++;
1277 return (PyObject *)v;
1278
1279 onError:
1280 Py_XDECREF(v);
1281 return NULL;
1282}
1283
1284static
1285int latin1_encoding_error(const Py_UNICODE **source,
1286 char **dest,
1287 const char *errors,
1288 const char *details)
1289{
1290 if ((errors == NULL) ||
1291 (strcmp(errors,"strict") == 0)) {
1292 PyErr_Format(PyExc_UnicodeError,
1293 "Latin-1 encoding error: %s",
1294 details);
1295 return -1;
1296 }
1297 else if (strcmp(errors,"ignore") == 0) {
1298 return 0;
1299 }
1300 else if (strcmp(errors,"replace") == 0) {
1301 **dest = '?';
1302 return 0;
1303 }
1304 else {
1305 PyErr_Format(PyExc_ValueError,
1306 "Latin-1 encoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001307 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 errors);
1309 return -1;
1310 }
1311}
1312
1313PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1314 int size,
1315 const char *errors)
1316{
1317 PyObject *repr;
1318 char *s;
1319 repr = PyString_FromStringAndSize(NULL, size);
1320 if (repr == NULL)
1321 return NULL;
1322
1323 s = PyString_AS_STRING(repr);
1324 while (size-- > 0) {
1325 Py_UNICODE ch = *p++;
1326 if (ch >= 256) {
1327 if (latin1_encoding_error(&p, &s, errors,
1328 "ordinal not in range(256)"))
1329 goto onError;
1330 }
1331 else
1332 *s++ = (char)ch;
1333 }
1334 return repr;
1335
1336 onError:
1337 Py_DECREF(repr);
1338 return NULL;
1339}
1340
1341PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1342{
1343 if (!PyUnicode_Check(unicode)) {
1344 PyErr_BadArgument();
1345 return NULL;
1346 }
1347 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1348 PyUnicode_GET_SIZE(unicode),
1349 NULL);
1350}
1351
1352/* --- 7-bit ASCII Codec -------------------------------------------------- */
1353
1354static
1355int ascii_decoding_error(const char **source,
1356 Py_UNICODE **dest,
1357 const char *errors,
1358 const char *details)
1359{
1360 if ((errors == NULL) ||
1361 (strcmp(errors,"strict") == 0)) {
1362 PyErr_Format(PyExc_UnicodeError,
1363 "ASCII decoding error: %s",
1364 details);
1365 return -1;
1366 }
1367 else if (strcmp(errors,"ignore") == 0) {
1368 return 0;
1369 }
1370 else if (strcmp(errors,"replace") == 0) {
1371 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1372 (*dest)++;
1373 return 0;
1374 }
1375 else {
1376 PyErr_Format(PyExc_ValueError,
1377 "ASCII decoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001378 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001379 errors);
1380 return -1;
1381 }
1382}
1383
1384PyObject *PyUnicode_DecodeASCII(const char *s,
1385 int size,
1386 const char *errors)
1387{
1388 PyUnicodeObject *v;
1389 Py_UNICODE *p;
1390
1391 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1392 v = _PyUnicode_New(size);
1393 if (v == NULL)
1394 goto onError;
1395 if (size == 0)
1396 return (PyObject *)v;
1397 p = PyUnicode_AS_UNICODE(v);
1398 while (size-- > 0) {
1399 register unsigned char c;
1400
1401 c = (unsigned char)*s++;
1402 if (c < 128)
1403 *p++ = c;
1404 else if (ascii_decoding_error(&s, &p, errors,
1405 "ordinal not in range(128)"))
1406 goto onError;
1407 }
1408 if (p - PyUnicode_AS_UNICODE(v) < size)
1409 _PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));
1410 return (PyObject *)v;
1411
1412 onError:
1413 Py_XDECREF(v);
1414 return NULL;
1415}
1416
1417static
1418int ascii_encoding_error(const Py_UNICODE **source,
1419 char **dest,
1420 const char *errors,
1421 const char *details)
1422{
1423 if ((errors == NULL) ||
1424 (strcmp(errors,"strict") == 0)) {
1425 PyErr_Format(PyExc_UnicodeError,
1426 "ASCII encoding error: %s",
1427 details);
1428 return -1;
1429 }
1430 else if (strcmp(errors,"ignore") == 0) {
1431 return 0;
1432 }
1433 else if (strcmp(errors,"replace") == 0) {
1434 **dest = '?';
1435 return 0;
1436 }
1437 else {
1438 PyErr_Format(PyExc_ValueError,
1439 "ASCII encoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001440 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441 errors);
1442 return -1;
1443 }
1444}
1445
1446PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1447 int size,
1448 const char *errors)
1449{
1450 PyObject *repr;
1451 char *s;
1452 repr = PyString_FromStringAndSize(NULL, size);
1453 if (repr == NULL)
1454 return NULL;
1455
1456 s = PyString_AS_STRING(repr);
1457 while (size-- > 0) {
1458 Py_UNICODE ch = *p++;
1459 if (ch >= 128) {
1460 if (ascii_encoding_error(&p, &s, errors,
1461 "ordinal not in range(128)"))
1462 goto onError;
1463 }
1464 else
1465 *s++ = (char)ch;
1466 }
1467 return repr;
1468
1469 onError:
1470 Py_DECREF(repr);
1471 return NULL;
1472}
1473
1474PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1475{
1476 if (!PyUnicode_Check(unicode)) {
1477 PyErr_BadArgument();
1478 return NULL;
1479 }
1480 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1481 PyUnicode_GET_SIZE(unicode),
1482 NULL);
1483}
1484
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001485#ifdef MS_WIN32
1486/* --- MBCS codecs for Windows -------------------------------------------- */
1487PyObject *PyUnicode_DecodeMBCS(const char *s,
1488 int size,
1489 const char *errors)
1490{
1491 PyUnicodeObject *v;
1492 Py_UNICODE *p;
1493
1494 /* First get the size of the result */
1495 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
1496 if (usize==0)
1497 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1498
1499 v = _PyUnicode_New(usize);
1500 if (v == NULL)
1501 return NULL;
1502 if (usize == 0)
1503 return (PyObject *)v;
1504 p = PyUnicode_AS_UNICODE(v);
1505 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1506 Py_DECREF(v);
1507 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1508 }
1509
1510 return (PyObject *)v;
1511}
1512
1513PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1514 int size,
1515 const char *errors)
1516{
1517 PyObject *repr;
1518 char *s;
1519
1520 /* First get the size of the result */
1521 DWORD mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
1522 if (mbcssize==0)
1523 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1524
1525 repr = PyString_FromStringAndSize(NULL, mbcssize);
1526 if (repr == NULL)
1527 return NULL;
1528 if (mbcssize==0)
1529 return repr;
1530
1531 /* Do the conversion */
1532 s = PyString_AS_STRING(repr);
1533 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1534 Py_DECREF(repr);
1535 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1536 }
1537 return repr;
1538}
1539#endif /* MS_WIN32 */
1540
Guido van Rossumd57fd912000-03-10 22:53:23 +00001541/* --- Character Mapping Codec -------------------------------------------- */
1542
1543static
1544int charmap_decoding_error(const char **source,
1545 Py_UNICODE **dest,
1546 const char *errors,
1547 const char *details)
1548{
1549 if ((errors == NULL) ||
1550 (strcmp(errors,"strict") == 0)) {
1551 PyErr_Format(PyExc_UnicodeError,
1552 "charmap decoding error: %s",
1553 details);
1554 return -1;
1555 }
1556 else if (strcmp(errors,"ignore") == 0) {
1557 return 0;
1558 }
1559 else if (strcmp(errors,"replace") == 0) {
1560 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1561 (*dest)++;
1562 return 0;
1563 }
1564 else {
1565 PyErr_Format(PyExc_ValueError,
1566 "charmap decoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001567 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001568 errors);
1569 return -1;
1570 }
1571}
1572
1573PyObject *PyUnicode_DecodeCharmap(const char *s,
1574 int size,
1575 PyObject *mapping,
1576 const char *errors)
1577{
1578 PyUnicodeObject *v;
1579 Py_UNICODE *p;
1580
1581 /* Default to Latin-1 */
1582 if (mapping == NULL)
1583 return PyUnicode_DecodeLatin1(s, size, errors);
1584
1585 v = _PyUnicode_New(size);
1586 if (v == NULL)
1587 goto onError;
1588 if (size == 0)
1589 return (PyObject *)v;
1590 p = PyUnicode_AS_UNICODE(v);
1591 while (size-- > 0) {
1592 unsigned char ch = *s++;
1593 PyObject *w, *x;
1594
1595 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1596 w = PyInt_FromLong((long)ch);
1597 if (w == NULL)
1598 goto onError;
1599 x = PyObject_GetItem(mapping, w);
1600 Py_DECREF(w);
1601 if (x == NULL) {
1602 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1603 /* No mapping found: default to Latin-1 mapping */
1604 PyErr_Clear();
1605 *p++ = (Py_UNICODE)ch;
1606 continue;
1607 }
1608 goto onError;
1609 }
1610
1611 /* Apply mapping */
1612 if (PyInt_Check(x)) {
1613 int value = PyInt_AS_LONG(x);
1614 if (value < 0 || value > 65535) {
1615 PyErr_SetString(PyExc_TypeError,
1616 "character mapping must be in range(65336)");
1617 Py_DECREF(x);
1618 goto onError;
1619 }
1620 *p++ = (Py_UNICODE)value;
1621 }
1622 else if (x == Py_None) {
1623 /* undefined mapping */
1624 if (charmap_decoding_error(&s, &p, errors,
1625 "character maps to <undefined>")) {
1626 Py_DECREF(x);
1627 goto onError;
1628 }
1629 }
1630 else if (PyUnicode_Check(x)) {
1631 if (PyUnicode_GET_SIZE(x) != 1) {
1632 /* 1-n mapping */
1633 PyErr_SetString(PyExc_NotImplementedError,
1634 "1-n mappings are currently not implemented");
1635 Py_DECREF(x);
1636 goto onError;
1637 }
1638 *p++ = *PyUnicode_AS_UNICODE(x);
1639 }
1640 else {
1641 /* wrong return value */
1642 PyErr_SetString(PyExc_TypeError,
1643 "character mapping must return integer, None or unicode");
1644 Py_DECREF(x);
1645 goto onError;
1646 }
1647 Py_DECREF(x);
1648 }
1649 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1650 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1651 goto onError;
1652 return (PyObject *)v;
1653
1654 onError:
1655 Py_XDECREF(v);
1656 return NULL;
1657}
1658
1659static
1660int charmap_encoding_error(const Py_UNICODE **source,
1661 char **dest,
1662 const char *errors,
1663 const char *details)
1664{
1665 if ((errors == NULL) ||
1666 (strcmp(errors,"strict") == 0)) {
1667 PyErr_Format(PyExc_UnicodeError,
1668 "charmap encoding error: %s",
1669 details);
1670 return -1;
1671 }
1672 else if (strcmp(errors,"ignore") == 0) {
1673 return 0;
1674 }
1675 else if (strcmp(errors,"replace") == 0) {
1676 **dest = '?';
1677 (*dest)++;
1678 return 0;
1679 }
1680 else {
1681 PyErr_Format(PyExc_ValueError,
1682 "charmap encoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001683 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684 errors);
1685 return -1;
1686 }
1687}
1688
1689PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
1690 int size,
1691 PyObject *mapping,
1692 const char *errors)
1693{
1694 PyObject *v;
1695 char *s;
1696
1697 /* Default to Latin-1 */
1698 if (mapping == NULL)
1699 return PyUnicode_EncodeLatin1(p, size, errors);
1700
1701 v = PyString_FromStringAndSize(NULL, size);
1702 if (v == NULL)
1703 return NULL;
1704 s = PyString_AS_STRING(v);
1705 while (size-- > 0) {
1706 Py_UNICODE ch = *p++;
1707 PyObject *w, *x;
1708
1709 /* Get mapping (Unicode ordinal -> string char, integer or None) */
1710 w = PyInt_FromLong((long)ch);
1711 if (w == NULL)
1712 goto onError;
1713 x = PyObject_GetItem(mapping, w);
1714 Py_DECREF(w);
1715 if (x == NULL) {
1716 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1717 /* No mapping found: default to Latin-1 mapping if possible */
1718 PyErr_Clear();
1719 if (ch < 256) {
1720 *s++ = (char)ch;
1721 continue;
1722 }
1723 else if (!charmap_encoding_error(&p, &s, errors,
1724 "missing character mapping"))
1725 continue;
1726 }
1727 goto onError;
1728 }
1729
1730 /* Apply mapping */
1731 if (PyInt_Check(x)) {
1732 int value = PyInt_AS_LONG(x);
1733 if (value < 0 || value > 255) {
1734 PyErr_SetString(PyExc_TypeError,
1735 "character mapping must be in range(256)");
1736 Py_DECREF(x);
1737 goto onError;
1738 }
1739 *s++ = (char)value;
1740 }
1741 else if (x == Py_None) {
1742 /* undefined mapping */
1743 if (charmap_encoding_error(&p, &s, errors,
1744 "character maps to <undefined>")) {
1745 Py_DECREF(x);
1746 goto onError;
1747 }
1748 }
1749 else if (PyString_Check(x)) {
1750 if (PyString_GET_SIZE(x) != 1) {
1751 /* 1-n mapping */
1752 PyErr_SetString(PyExc_NotImplementedError,
1753 "1-n mappings are currently not implemented");
1754 Py_DECREF(x);
1755 goto onError;
1756 }
1757 *s++ = *PyString_AS_STRING(x);
1758 }
1759 else {
1760 /* wrong return value */
1761 PyErr_SetString(PyExc_TypeError,
1762 "character mapping must return integer, None or unicode");
1763 Py_DECREF(x);
1764 goto onError;
1765 }
1766 Py_DECREF(x);
1767 }
1768 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
1769 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
1770 goto onError;
1771 return v;
1772
1773 onError:
1774 Py_DECREF(v);
1775 return NULL;
1776}
1777
1778PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
1779 PyObject *mapping)
1780{
1781 if (!PyUnicode_Check(unicode) || mapping == NULL) {
1782 PyErr_BadArgument();
1783 return NULL;
1784 }
1785 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
1786 PyUnicode_GET_SIZE(unicode),
1787 mapping,
1788 NULL);
1789}
1790
1791static
1792int translate_error(const Py_UNICODE **source,
1793 Py_UNICODE **dest,
1794 const char *errors,
1795 const char *details)
1796{
1797 if ((errors == NULL) ||
1798 (strcmp(errors,"strict") == 0)) {
1799 PyErr_Format(PyExc_UnicodeError,
1800 "translate error: %s",
1801 details);
1802 return -1;
1803 }
1804 else if (strcmp(errors,"ignore") == 0) {
1805 return 0;
1806 }
1807 else if (strcmp(errors,"replace") == 0) {
1808 **dest = '?';
1809 (*dest)++;
1810 return 0;
1811 }
1812 else {
1813 PyErr_Format(PyExc_ValueError,
1814 "translate error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001815 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001816 errors);
1817 return -1;
1818 }
1819}
1820
1821PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
1822 int size,
1823 PyObject *mapping,
1824 const char *errors)
1825{
1826 PyUnicodeObject *v;
1827 Py_UNICODE *p;
1828
1829 if (mapping == NULL) {
1830 PyErr_BadArgument();
1831 return NULL;
1832 }
1833
1834 /* Output will never be longer than input */
1835 v = _PyUnicode_New(size);
1836 if (v == NULL)
1837 goto onError;
1838 if (size == 0)
1839 goto done;
1840 p = PyUnicode_AS_UNICODE(v);
1841 while (size-- > 0) {
1842 Py_UNICODE ch = *s++;
1843 PyObject *w, *x;
1844
1845 /* Get mapping */
1846 w = PyInt_FromLong(ch);
1847 if (w == NULL)
1848 goto onError;
1849 x = PyObject_GetItem(mapping, w);
1850 Py_DECREF(w);
1851 if (x == NULL) {
1852 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1853 /* No mapping found: default to 1-1 mapping */
1854 PyErr_Clear();
1855 *p++ = ch;
1856 continue;
1857 }
1858 goto onError;
1859 }
1860
1861 /* Apply mapping */
1862 if (PyInt_Check(x))
1863 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
1864 else if (x == Py_None) {
1865 /* undefined mapping */
1866 if (translate_error(&s, &p, errors,
1867 "character maps to <undefined>")) {
1868 Py_DECREF(x);
1869 goto onError;
1870 }
1871 }
1872 else if (PyUnicode_Check(x)) {
1873 if (PyUnicode_GET_SIZE(x) != 1) {
1874 /* 1-n mapping */
1875 PyErr_SetString(PyExc_NotImplementedError,
1876 "1-n mappings are currently not implemented");
1877 Py_DECREF(x);
1878 goto onError;
1879 }
1880 *p++ = *PyUnicode_AS_UNICODE(x);
1881 }
1882 else {
1883 /* wrong return value */
1884 PyErr_SetString(PyExc_TypeError,
1885 "translate mapping must return integer, None or unicode");
1886 Py_DECREF(x);
1887 goto onError;
1888 }
1889 Py_DECREF(x);
1890 }
1891 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1892 _PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));
1893
1894 done:
1895 return (PyObject *)v;
1896
1897 onError:
1898 Py_XDECREF(v);
1899 return NULL;
1900}
1901
1902PyObject *PyUnicode_Translate(PyObject *str,
1903 PyObject *mapping,
1904 const char *errors)
1905{
1906 PyObject *result;
1907
1908 str = PyUnicode_FromObject(str);
1909 if (str == NULL)
1910 goto onError;
1911 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
1912 PyUnicode_GET_SIZE(str),
1913 mapping,
1914 errors);
1915 Py_DECREF(str);
1916 return result;
1917
1918 onError:
1919 Py_XDECREF(str);
1920 return NULL;
1921}
1922
1923/* --- Helpers ------------------------------------------------------------ */
1924
1925static
1926int count(PyUnicodeObject *self,
1927 int start,
1928 int end,
1929 PyUnicodeObject *substring)
1930{
1931 int count = 0;
1932
1933 end -= substring->length;
1934
1935 while (start <= end)
1936 if (Py_UNICODE_MATCH(self, start, substring)) {
1937 count++;
1938 start += substring->length;
1939 } else
1940 start++;
1941
1942 return count;
1943}
1944
1945int PyUnicode_Count(PyObject *str,
1946 PyObject *substr,
1947 int start,
1948 int end)
1949{
1950 int result;
1951
1952 str = PyUnicode_FromObject(str);
1953 if (str == NULL)
1954 return -1;
1955 substr = PyUnicode_FromObject(substr);
1956 if (substr == NULL) {
1957 Py_DECREF(substr);
1958 return -1;
1959 }
1960
1961 result = count((PyUnicodeObject *)str,
1962 start, end,
1963 (PyUnicodeObject *)substr);
1964
1965 Py_DECREF(str);
1966 Py_DECREF(substr);
1967 return result;
1968}
1969
1970static
1971int findstring(PyUnicodeObject *self,
1972 PyUnicodeObject *substring,
1973 int start,
1974 int end,
1975 int direction)
1976{
1977 if (start < 0)
1978 start += self->length;
1979 if (start < 0)
1980 start = 0;
1981
1982 if (substring->length == 0)
1983 return start;
1984
1985 if (end > self->length)
1986 end = self->length;
1987 if (end < 0)
1988 end += self->length;
1989 if (end < 0)
1990 end = 0;
1991
1992 end -= substring->length;
1993
1994 if (direction < 0) {
1995 for (; end >= start; end--)
1996 if (Py_UNICODE_MATCH(self, end, substring))
1997 return end;
1998 } else {
1999 for (; start <= end; start++)
2000 if (Py_UNICODE_MATCH(self, start, substring))
2001 return start;
2002 }
2003
2004 return -1;
2005}
2006
2007int PyUnicode_Find(PyObject *str,
2008 PyObject *substr,
2009 int start,
2010 int end,
2011 int direction)
2012{
2013 int result;
2014
2015 str = PyUnicode_FromObject(str);
2016 if (str == NULL)
2017 return -1;
2018 substr = PyUnicode_FromObject(substr);
2019 if (substr == NULL) {
2020 Py_DECREF(substr);
2021 return -1;
2022 }
2023
2024 result = findstring((PyUnicodeObject *)str,
2025 (PyUnicodeObject *)substr,
2026 start, end, direction);
2027 Py_DECREF(str);
2028 Py_DECREF(substr);
2029 return result;
2030}
2031
2032static
2033int tailmatch(PyUnicodeObject *self,
2034 PyUnicodeObject *substring,
2035 int start,
2036 int end,
2037 int direction)
2038{
2039 if (start < 0)
2040 start += self->length;
2041 if (start < 0)
2042 start = 0;
2043
2044 if (substring->length == 0)
2045 return 1;
2046
2047 if (end > self->length)
2048 end = self->length;
2049 if (end < 0)
2050 end += self->length;
2051 if (end < 0)
2052 end = 0;
2053
2054 end -= substring->length;
2055 if (end < start)
2056 return 0;
2057
2058 if (direction > 0) {
2059 if (Py_UNICODE_MATCH(self, end, substring))
2060 return 1;
2061 } else {
2062 if (Py_UNICODE_MATCH(self, start, substring))
2063 return 1;
2064 }
2065
2066 return 0;
2067}
2068
2069int PyUnicode_Tailmatch(PyObject *str,
2070 PyObject *substr,
2071 int start,
2072 int end,
2073 int direction)
2074{
2075 int result;
2076
2077 str = PyUnicode_FromObject(str);
2078 if (str == NULL)
2079 return -1;
2080 substr = PyUnicode_FromObject(substr);
2081 if (substr == NULL) {
2082 Py_DECREF(substr);
2083 return -1;
2084 }
2085
2086 result = tailmatch((PyUnicodeObject *)str,
2087 (PyUnicodeObject *)substr,
2088 start, end, direction);
2089 Py_DECREF(str);
2090 Py_DECREF(substr);
2091 return result;
2092}
2093
2094static
2095const Py_UNICODE *findchar(const Py_UNICODE *s,
2096 int size,
2097 Py_UNICODE ch)
2098{
2099 /* like wcschr, but doesn't stop at NULL characters */
2100
2101 while (size-- > 0) {
2102 if (*s == ch)
2103 return s;
2104 s++;
2105 }
2106
2107 return NULL;
2108}
2109
2110/* Apply fixfct filter to the Unicode object self and return a
2111 reference to the modified object */
2112
2113static
2114PyObject *fixup(PyUnicodeObject *self,
2115 int (*fixfct)(PyUnicodeObject *s))
2116{
2117
2118 PyUnicodeObject *u;
2119
2120 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2121 self->length);
2122 if (u == NULL)
2123 return NULL;
2124 if (!fixfct(u)) {
2125 /* fixfct should return TRUE if it modified the buffer. If
2126 FALSE, return a reference to the original buffer instead
2127 (to save space, not time) */
2128 Py_INCREF(self);
2129 Py_DECREF(u);
2130 return (PyObject*) self;
2131 }
2132 return (PyObject*) u;
2133}
2134
2135static
2136int fixupper(PyUnicodeObject *self)
2137{
2138 int len = self->length;
2139 Py_UNICODE *s = self->str;
2140 int status = 0;
2141
2142 while (len-- > 0) {
2143 register Py_UNICODE ch;
2144
2145 ch = Py_UNICODE_TOUPPER(*s);
2146 if (ch != *s) {
2147 status = 1;
2148 *s = ch;
2149 }
2150 s++;
2151 }
2152
2153 return status;
2154}
2155
2156static
2157int fixlower(PyUnicodeObject *self)
2158{
2159 int len = self->length;
2160 Py_UNICODE *s = self->str;
2161 int status = 0;
2162
2163 while (len-- > 0) {
2164 register Py_UNICODE ch;
2165
2166 ch = Py_UNICODE_TOLOWER(*s);
2167 if (ch != *s) {
2168 status = 1;
2169 *s = ch;
2170 }
2171 s++;
2172 }
2173
2174 return status;
2175}
2176
2177static
2178int fixswapcase(PyUnicodeObject *self)
2179{
2180 int len = self->length;
2181 Py_UNICODE *s = self->str;
2182 int status = 0;
2183
2184 while (len-- > 0) {
2185 if (Py_UNICODE_ISUPPER(*s)) {
2186 *s = Py_UNICODE_TOLOWER(*s);
2187 status = 1;
2188 } else if (Py_UNICODE_ISLOWER(*s)) {
2189 *s = Py_UNICODE_TOUPPER(*s);
2190 status = 1;
2191 }
2192 s++;
2193 }
2194
2195 return status;
2196}
2197
2198static
2199int fixcapitalize(PyUnicodeObject *self)
2200{
2201 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2202 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2203 return 1;
2204 }
2205 return 0;
2206}
2207
2208static
2209int fixtitle(PyUnicodeObject *self)
2210{
2211 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2212 register Py_UNICODE *e;
2213 int previous_is_cased;
2214
2215 /* Shortcut for single character strings */
2216 if (PyUnicode_GET_SIZE(self) == 1) {
2217 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2218 if (*p != ch) {
2219 *p = ch;
2220 return 1;
2221 }
2222 else
2223 return 0;
2224 }
2225
2226 e = p + PyUnicode_GET_SIZE(self);
2227 previous_is_cased = 0;
2228 for (; p < e; p++) {
2229 register const Py_UNICODE ch = *p;
2230
2231 if (previous_is_cased)
2232 *p = Py_UNICODE_TOLOWER(ch);
2233 else
2234 *p = Py_UNICODE_TOTITLE(ch);
2235
2236 if (Py_UNICODE_ISLOWER(ch) ||
2237 Py_UNICODE_ISUPPER(ch) ||
2238 Py_UNICODE_ISTITLE(ch))
2239 previous_is_cased = 1;
2240 else
2241 previous_is_cased = 0;
2242 }
2243 return 1;
2244}
2245
2246PyObject *PyUnicode_Join(PyObject *separator,
2247 PyObject *seq)
2248{
2249 Py_UNICODE *sep;
2250 int seplen;
2251 PyUnicodeObject *res = NULL;
2252 int reslen = 0;
2253 Py_UNICODE *p;
2254 int seqlen = 0;
2255 int sz = 100;
2256 int i;
2257
2258 seqlen = PySequence_Length(seq);
2259 if (seqlen < 0 && PyErr_Occurred())
2260 return NULL;
2261
2262 if (separator == NULL) {
2263 Py_UNICODE blank = ' ';
2264 sep = &blank;
2265 seplen = 1;
2266 }
2267 else {
2268 separator = PyUnicode_FromObject(separator);
2269 if (separator == NULL)
2270 return NULL;
2271 sep = PyUnicode_AS_UNICODE(separator);
2272 seplen = PyUnicode_GET_SIZE(separator);
2273 }
2274
2275 res = _PyUnicode_New(sz);
2276 if (res == NULL)
2277 goto onError;
2278 p = PyUnicode_AS_UNICODE(res);
2279 reslen = 0;
2280
2281 for (i = 0; i < seqlen; i++) {
2282 int itemlen;
2283 PyObject *item;
2284
2285 item = PySequence_GetItem(seq, i);
2286 if (item == NULL)
2287 goto onError;
2288 if (!PyUnicode_Check(item)) {
2289 PyObject *v;
2290 v = PyUnicode_FromObject(item);
2291 Py_DECREF(item);
2292 item = v;
2293 if (item == NULL)
2294 goto onError;
2295 }
2296 itemlen = PyUnicode_GET_SIZE(item);
2297 while (reslen + itemlen + seplen >= sz) {
2298 if (_PyUnicode_Resize(res, sz*2))
2299 goto onError;
2300 sz *= 2;
2301 p = PyUnicode_AS_UNICODE(res) + reslen;
2302 }
2303 if (i > 0) {
2304 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2305 p += seplen;
2306 reslen += seplen;
2307 }
2308 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2309 p += itemlen;
2310 reslen += itemlen;
2311 Py_DECREF(item);
2312 }
2313 if (_PyUnicode_Resize(res, reslen))
2314 goto onError;
2315
2316 Py_XDECREF(separator);
2317 return (PyObject *)res;
2318
2319 onError:
2320 Py_XDECREF(separator);
2321 Py_DECREF(res);
2322 return NULL;
2323}
2324
2325static
2326PyUnicodeObject *pad(PyUnicodeObject *self,
2327 int left,
2328 int right,
2329 Py_UNICODE fill)
2330{
2331 PyUnicodeObject *u;
2332
2333 if (left < 0)
2334 left = 0;
2335 if (right < 0)
2336 right = 0;
2337
2338 if (left == 0 && right == 0) {
2339 Py_INCREF(self);
2340 return self;
2341 }
2342
2343 u = _PyUnicode_New(left + self->length + right);
2344 if (u) {
2345 if (left)
2346 Py_UNICODE_FILL(u->str, fill, left);
2347 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2348 if (right)
2349 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2350 }
2351
2352 return u;
2353}
2354
/* Append the slice data[left:right] to the result list as a new
   Unicode object.

   The macro relies on names of the function it is expanded in: the
   local variables str (scratch PyObject *) and list (the result list)
   and the label onError, which is jumped to on failure. It expands to
   a single statement and must be followed by a semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2365
2366static
2367PyObject *split_whitespace(PyUnicodeObject *self,
2368 PyObject *list,
2369 int maxcount)
2370{
2371 register int i;
2372 register int j;
2373 int len = self->length;
2374 PyObject *str;
2375
2376 for (i = j = 0; i < len; ) {
2377 /* find a token */
2378 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2379 i++;
2380 j = i;
2381 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2382 i++;
2383 if (j < i) {
2384 if (maxcount-- <= 0)
2385 break;
2386 SPLIT_APPEND(self->str, j, i);
2387 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2388 i++;
2389 j = i;
2390 }
2391 }
2392 if (j < len) {
2393 SPLIT_APPEND(self->str, j, len);
2394 }
2395 return list;
2396
2397 onError:
2398 Py_DECREF(list);
2399 return NULL;
2400}
2401
2402PyObject *PyUnicode_Splitlines(PyObject *string,
2403 int maxcount)
2404{
2405 register int i;
2406 register int j;
2407 int len;
2408 PyObject *list;
2409 PyObject *str;
2410 Py_UNICODE *data;
2411
2412 string = PyUnicode_FromObject(string);
2413 if (string == NULL)
2414 return NULL;
2415 data = PyUnicode_AS_UNICODE(string);
2416 len = PyUnicode_GET_SIZE(string);
2417
2418 if (maxcount < 0)
2419 maxcount = INT_MAX;
2420
2421 list = PyList_New(0);
2422 if (!list)
2423 goto onError;
2424
2425 for (i = j = 0; i < len; ) {
2426 /* Find a line and append it */
2427 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2428 i++;
2429 if (maxcount-- <= 0)
2430 break;
2431 SPLIT_APPEND(data, j, i);
2432
2433 /* Skip the line break reading CRLF as one line break */
2434 if (i < len) {
2435 if (data[i] == '\r' && i + 1 < len &&
2436 data[i+1] == '\n')
2437 i += 2;
2438 else
2439 i++;
2440 }
2441 j = i;
2442 }
2443 if (j < len) {
2444 SPLIT_APPEND(data, j, len);
2445 }
2446
2447 Py_DECREF(string);
2448 return list;
2449
2450 onError:
2451 Py_DECREF(list);
2452 Py_DECREF(string);
2453 return NULL;
2454}
2455
2456static
2457PyObject *split_char(PyUnicodeObject *self,
2458 PyObject *list,
2459 Py_UNICODE ch,
2460 int maxcount)
2461{
2462 register int i;
2463 register int j;
2464 int len = self->length;
2465 PyObject *str;
2466
2467 for (i = j = 0; i < len; ) {
2468 if (self->str[i] == ch) {
2469 if (maxcount-- <= 0)
2470 break;
2471 SPLIT_APPEND(self->str, j, i);
2472 i = j = i + 1;
2473 } else
2474 i++;
2475 }
2476 if (j <= len) {
2477 SPLIT_APPEND(self->str, j, len);
2478 }
2479 return list;
2480
2481 onError:
2482 Py_DECREF(list);
2483 return NULL;
2484}
2485
2486static
2487PyObject *split_substring(PyUnicodeObject *self,
2488 PyObject *list,
2489 PyUnicodeObject *substring,
2490 int maxcount)
2491{
2492 register int i;
2493 register int j;
2494 int len = self->length;
2495 int sublen = substring->length;
2496 PyObject *str;
2497
2498 for (i = j = 0; i < len - sublen; ) {
2499 if (Py_UNICODE_MATCH(self, i, substring)) {
2500 if (maxcount-- <= 0)
2501 break;
2502 SPLIT_APPEND(self->str, j, i);
2503 i = j = i + sublen;
2504 } else
2505 i++;
2506 }
2507 if (j <= len) {
2508 SPLIT_APPEND(self->str, j, len);
2509 }
2510 return list;
2511
2512 onError:
2513 Py_DECREF(list);
2514 return NULL;
2515}
2516
2517#undef SPLIT_APPEND
2518
2519static
2520PyObject *split(PyUnicodeObject *self,
2521 PyUnicodeObject *substring,
2522 int maxcount)
2523{
2524 PyObject *list;
2525
2526 if (maxcount < 0)
2527 maxcount = INT_MAX;
2528
2529 list = PyList_New(0);
2530 if (!list)
2531 return NULL;
2532
2533 if (substring == NULL)
2534 return split_whitespace(self,list,maxcount);
2535
2536 else if (substring->length == 1)
2537 return split_char(self,list,substring->str[0],maxcount);
2538
2539 else if (substring->length == 0) {
2540 Py_DECREF(list);
2541 PyErr_SetString(PyExc_ValueError, "empty separator");
2542 return NULL;
2543 }
2544 else
2545 return split_substring(self,list,substring,maxcount);
2546}
2547
2548static
2549PyObject *strip(PyUnicodeObject *self,
2550 int left,
2551 int right)
2552{
2553 Py_UNICODE *p = self->str;
2554 int start = 0;
2555 int end = self->length;
2556
2557 if (left)
2558 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2559 start++;
2560
2561 if (right)
2562 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2563 end--;
2564
2565 if (start == 0 && end == self->length) {
2566 /* couldn't strip anything off, return original string */
2567 Py_INCREF(self);
2568 return (PyObject*) self;
2569 }
2570
2571 return (PyObject*) PyUnicode_FromUnicode(
2572 self->str + start,
2573 end - start
2574 );
2575}
2576
2577static
2578PyObject *replace(PyUnicodeObject *self,
2579 PyUnicodeObject *str1,
2580 PyUnicodeObject *str2,
2581 int maxcount)
2582{
2583 PyUnicodeObject *u;
2584
2585 if (maxcount < 0)
2586 maxcount = INT_MAX;
2587
2588 if (str1->length == 1 && str2->length == 1) {
2589 int i;
2590
2591 /* replace characters */
2592 if (!findchar(self->str, self->length, str1->str[0])) {
2593 /* nothing to replace, return original string */
2594 Py_INCREF(self);
2595 u = self;
2596 } else {
2597 Py_UNICODE u1 = str1->str[0];
2598 Py_UNICODE u2 = str2->str[0];
2599
2600 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
2601 self->str,
2602 self->length
2603 );
2604 if (u)
2605 for (i = 0; i < u->length; i++)
2606 if (u->str[i] == u1) {
2607 if (--maxcount < 0)
2608 break;
2609 u->str[i] = u2;
2610 }
2611 }
2612
2613 } else {
2614 int n, i;
2615 Py_UNICODE *p;
2616
2617 /* replace strings */
2618 n = count(self, 0, self->length, str1);
2619 if (n > maxcount)
2620 n = maxcount;
2621 if (n == 0) {
2622 /* nothing to replace, return original string */
2623 Py_INCREF(self);
2624 u = self;
2625 } else {
2626 u = _PyUnicode_New(
2627 self->length + n * (str2->length - str1->length));
2628 if (u) {
2629 i = 0;
2630 p = u->str;
2631 while (i <= self->length - str1->length)
2632 if (Py_UNICODE_MATCH(self, i, str1)) {
2633 /* replace string segment */
2634 Py_UNICODE_COPY(p, str2->str, str2->length);
2635 p += str2->length;
2636 i += str1->length;
2637 if (--n <= 0) {
2638 /* copy remaining part */
2639 Py_UNICODE_COPY(p, self->str+i, self->length-i);
2640 break;
2641 }
2642 } else
2643 *p++ = self->str[i++];
2644 }
2645 }
2646 }
2647
2648 return (PyObject *) u;
2649}
2650
2651/* --- Unicode Object Methods --------------------------------------------- */
2652
2653static char title__doc__[] =
2654"S.title() -> unicode\n\
2655\n\
2656Return a titlecased version of S, i.e. words start with title case\n\
2657characters, all remaining cased characters have lower case.";
2658
2659static PyObject*
2660unicode_title(PyUnicodeObject *self, PyObject *args)
2661{
2662 if (!PyArg_NoArgs(args))
2663 return NULL;
2664 return fixup(self, fixtitle);
2665}
2666
2667static char capitalize__doc__[] =
2668"S.capitalize() -> unicode\n\
2669\n\
2670Return a capitalized version of S, i.e. make the first character\n\
2671have upper case.";
2672
2673static PyObject*
2674unicode_capitalize(PyUnicodeObject *self, PyObject *args)
2675{
2676 if (!PyArg_NoArgs(args))
2677 return NULL;
2678 return fixup(self, fixcapitalize);
2679}
2680
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords() (compiled out): split self into words, capitalize every
   word via fixup(), then join the words again.  Returns NULL if any
   step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result = NULL;
    int pos;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Break the string into a list of words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap every list entry for its capitalized counterpart */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, pos),
                       fixcapitalize);
        if (result == NULL)
            goto done;
        /* drop the old entry before the slot is overwritten */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, result);
    }

    /* Glue the words back together */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
2721
2722static char center__doc__[] =
2723"S.center(width) -> unicode\n\
2724\n\
2725Return S centered in a Unicode string of length width. Padding is done\n\
2726using spaces.";
2727
2728static PyObject *
2729unicode_center(PyUnicodeObject *self, PyObject *args)
2730{
2731 int marg, left;
2732 int width;
2733
2734 if (!PyArg_ParseTuple(args, "i:center", &width))
2735 return NULL;
2736
2737 if (self->length >= width) {
2738 Py_INCREF(self);
2739 return (PyObject*) self;
2740 }
2741
2742 marg = width - self->length;
2743 left = marg / 2 + (marg & width & 1);
2744
2745 return (PyObject*) pad(self, left, marg - left, ' ');
2746}
2747
2748static int
2749unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
2750{
2751 int len1, len2;
2752 Py_UNICODE *s1 = str1->str;
2753 Py_UNICODE *s2 = str2->str;
2754
2755 len1 = str1->length;
2756 len2 = str2->length;
2757
2758 while (len1 > 0 && len2 > 0) {
2759 int cmp = (*s1++) - (*s2++);
2760 if (cmp)
2761 /* This should make Christian happy! */
2762 return (cmp < 0) ? -1 : (cmp != 0);
2763 len1--, len2--;
2764 }
2765
2766 return (len1 < len2) ? -1 : (len1 != len2);
2767}
2768
2769int PyUnicode_Compare(PyObject *left,
2770 PyObject *right)
2771{
2772 PyUnicodeObject *u = NULL, *v = NULL;
2773 int result;
2774
2775 /* Coerce the two arguments */
2776 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2777 if (u == NULL)
2778 goto onError;
2779 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2780 if (v == NULL)
2781 goto onError;
2782
2783 /* Shortcut for emtpy or interned objects */
2784 if (v == u) {
2785 Py_DECREF(u);
2786 Py_DECREF(v);
2787 return 0;
2788 }
2789
2790 result = unicode_compare(u, v);
2791
2792 Py_DECREF(u);
2793 Py_DECREF(v);
2794 return result;
2795
2796onError:
2797 Py_XDECREF(u);
2798 Py_XDECREF(v);
2799 return -1;
2800}
2801
Guido van Rossum403d68b2000-03-13 15:55:09 +00002802int PyUnicode_Contains(PyObject *container,
2803 PyObject *element)
2804{
2805 PyUnicodeObject *u = NULL, *v = NULL;
2806 int result;
2807 register const Py_UNICODE *p, *e;
2808 register Py_UNICODE ch;
2809
2810 /* Coerce the two arguments */
2811 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
2812 if (u == NULL)
2813 goto onError;
2814 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
2815 if (v == NULL)
2816 goto onError;
2817
2818 /* Check v in u */
2819 if (PyUnicode_GET_SIZE(v) != 1) {
2820 PyErr_SetString(PyExc_TypeError,
2821 "string member test needs char left operand");
2822 goto onError;
2823 }
2824 ch = *PyUnicode_AS_UNICODE(v);
2825 p = PyUnicode_AS_UNICODE(u);
2826 e = p + PyUnicode_GET_SIZE(u);
2827 result = 0;
2828 while (p < e) {
2829 if (*p++ == ch) {
2830 result = 1;
2831 break;
2832 }
2833 }
2834
2835 Py_DECREF(u);
2836 Py_DECREF(v);
2837 return result;
2838
2839onError:
2840 Py_XDECREF(u);
2841 Py_XDECREF(v);
2842 return -1;
2843}
2844
Guido van Rossumd57fd912000-03-10 22:53:23 +00002845/* Concat to string or Unicode object giving a new Unicode object. */
2846
2847PyObject *PyUnicode_Concat(PyObject *left,
2848 PyObject *right)
2849{
2850 PyUnicodeObject *u = NULL, *v = NULL, *w;
2851
2852 /* Coerce the two arguments */
2853 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2854 if (u == NULL)
2855 goto onError;
2856 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2857 if (v == NULL)
2858 goto onError;
2859
2860 /* Shortcuts */
2861 if (v == unicode_empty) {
2862 Py_DECREF(v);
2863 return (PyObject *)u;
2864 }
2865 if (u == unicode_empty) {
2866 Py_DECREF(u);
2867 return (PyObject *)v;
2868 }
2869
2870 /* Concat the two Unicode strings */
2871 w = _PyUnicode_New(u->length + v->length);
2872 if (w == NULL)
2873 goto onError;
2874 Py_UNICODE_COPY(w->str, u->str, u->length);
2875 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
2876
2877 Py_DECREF(u);
2878 Py_DECREF(v);
2879 return (PyObject *)w;
2880
2881onError:
2882 Py_XDECREF(u);
2883 Py_XDECREF(v);
2884 return NULL;
2885}
2886
2887static char count__doc__[] =
2888"S.count(sub[, start[, end]]) -> int\n\
2889\n\
2890Return the number of occurrences of substring sub in Unicode string\n\
2891S[start:end]. Optional arguments start and end are\n\
2892interpreted as in slice notation.";
2893
2894static PyObject *
2895unicode_count(PyUnicodeObject *self, PyObject *args)
2896{
2897 PyUnicodeObject *substring;
2898 int start = 0;
2899 int end = INT_MAX;
2900 PyObject *result;
2901
2902 if (!PyArg_ParseTuple(args, "O|ii:count", &substring, &start, &end))
2903 return NULL;
2904
2905 substring = (PyUnicodeObject *)PyUnicode_FromObject(
2906 (PyObject *)substring);
2907 if (substring == NULL)
2908 return NULL;
2909
2910 if (substring->length == 0) {
2911 Py_DECREF(substring);
2912 return PyInt_FromLong((long) 0);
2913 }
2914
2915 if (start < 0)
2916 start += self->length;
2917 if (start < 0)
2918 start = 0;
2919 if (end > self->length)
2920 end = self->length;
2921 if (end < 0)
2922 end += self->length;
2923 if (end < 0)
2924 end = 0;
2925
2926 result = PyInt_FromLong((long) count(self, start, end, substring));
2927
2928 Py_DECREF(substring);
2929 return result;
2930}
2931
2932static char encode__doc__[] =
2933"S.encode([encoding[,errors]]) -> string\n\
2934\n\
2935Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
2936errors may be given to set a different error handling scheme. Default\n\
2937is 'strict' meaning that encoding errors raise a ValueError. Other\n\
2938possible values are 'ignore' and 'replace'.";
2939
2940static PyObject *
2941unicode_encode(PyUnicodeObject *self, PyObject *args)
2942{
2943 char *encoding = NULL;
2944 char *errors = NULL;
2945 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
2946 return NULL;
2947 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
2948}
2949
2950static char expandtabs__doc__[] =
2951"S.expandtabs([tabsize]) -> unicode\n\
2952\n\
2953Return a copy of S where all tab characters are expanded using spaces.\n\
2954If tabsize is not given, a tab size of 8 characters is assumed.";
2955
2956static PyObject*
2957unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
2958{
2959 Py_UNICODE *e;
2960 Py_UNICODE *p;
2961 Py_UNICODE *q;
2962 int i, j;
2963 PyUnicodeObject *u;
2964 int tabsize = 8;
2965
2966 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
2967 return NULL;
2968
2969 /* First pass: determine size of ouput string */
2970 i = j = 0;
2971 e = self->str + self->length;
2972 for (p = self->str; p < e; p++)
2973 if (*p == '\t') {
2974 if (tabsize > 0)
2975 j += tabsize - (j % tabsize);
2976 }
2977 else {
2978 j++;
2979 if (*p == '\n' || *p == '\r') {
2980 i += j;
2981 j = 0;
2982 }
2983 }
2984
2985 /* Second pass: create output string and fill it */
2986 u = _PyUnicode_New(i + j);
2987 if (!u)
2988 return NULL;
2989
2990 j = 0;
2991 q = u->str;
2992
2993 for (p = self->str; p < e; p++)
2994 if (*p == '\t') {
2995 if (tabsize > 0) {
2996 i = tabsize - (j % tabsize);
2997 j += i;
2998 while (i--)
2999 *q++ = ' ';
3000 }
3001 }
3002 else {
3003 j++;
3004 *q++ = *p;
3005 if (*p == '\n' || *p == '\r')
3006 j = 0;
3007 }
3008
3009 return (PyObject*) u;
3010}
3011
3012static char find__doc__[] =
3013"S.find(sub [,start [,end]]) -> int\n\
3014\n\
3015Return the lowest index in S where substring sub is found,\n\
3016such that sub is contained within s[start,end]. Optional\n\
3017arguments start and end are interpreted as in slice notation.\n\
3018\n\
3019Return -1 on failure.";
3020
3021static PyObject *
3022unicode_find(PyUnicodeObject *self, PyObject *args)
3023{
3024 PyUnicodeObject *substring;
3025 int start = 0;
3026 int end = INT_MAX;
3027 PyObject *result;
3028
3029 if (!PyArg_ParseTuple(args, "O|ii:find", &substring, &start, &end))
3030 return NULL;
3031 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3032 (PyObject *)substring);
3033 if (substring == NULL)
3034 return NULL;
3035
3036 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3037
3038 Py_DECREF(substring);
3039 return result;
3040}
3041
3042static PyObject *
3043unicode_getitem(PyUnicodeObject *self, int index)
3044{
3045 if (index < 0 || index >= self->length) {
3046 PyErr_SetString(PyExc_IndexError, "string index out of range");
3047 return NULL;
3048 }
3049
3050 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3051}
3052
3053static long
3054unicode_hash(PyUnicodeObject *self)
3055{
3056 long hash;
3057 PyObject *utf8;
3058
3059 /* Since Unicode objects compare equal to their UTF-8 string
3060 counterparts, they should also use the UTF-8 strings as basis
3061 for their hash value. This is needed to assure that strings and
3062 Unicode objects behave in the same way as dictionary
3063 keys. Unfortunately, this costs some performance and also some
3064 memory if the cached UTF-8 representation is not used later
3065 on. */
3066 if (self->hash != -1)
3067 return self->hash;
3068 utf8 = utf8_string(self, NULL);
3069 if (utf8 == NULL)
3070 return -1;
3071 hash = PyObject_Hash(utf8);
3072 if (hash == -1)
3073 return -1;
3074 self->hash = hash;
3075 return hash;
3076}
3077
3078static char index__doc__[] =
3079"S.index(sub [,start [,end]]) -> int\n\
3080\n\
3081Like S.find() but raise ValueError when the substring is not found.";
3082
3083static PyObject *
3084unicode_index(PyUnicodeObject *self, PyObject *args)
3085{
3086 int result;
3087 PyUnicodeObject *substring;
3088 int start = 0;
3089 int end = INT_MAX;
3090
3091 if (!PyArg_ParseTuple(args, "O|ii:index", &substring, &start, &end))
3092 return NULL;
3093
3094 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3095 (PyObject *)substring);
3096 if (substring == NULL)
3097 return NULL;
3098
3099 result = findstring(self, substring, start, end, 1);
3100
3101 Py_DECREF(substring);
3102 if (result < 0) {
3103 PyErr_SetString(PyExc_ValueError, "substring not found");
3104 return NULL;
3105 }
3106 return PyInt_FromLong(result);
3107}
3108
3109static char islower__doc__[] =
3110"S.islower() -> int\n\
3111\n\
3112Return 1 if all cased characters in S are lowercase and there is\n\
3113at least one cased character in S, 0 otherwise.";
3114
3115static PyObject*
3116unicode_islower(PyUnicodeObject *self, PyObject *args)
3117{
3118 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3119 register const Py_UNICODE *e;
3120 int cased;
3121
3122 if (!PyArg_NoArgs(args))
3123 return NULL;
3124
3125 /* Shortcut for single character strings */
3126 if (PyUnicode_GET_SIZE(self) == 1)
3127 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3128
3129 e = p + PyUnicode_GET_SIZE(self);
3130 cased = 0;
3131 for (; p < e; p++) {
3132 register const Py_UNICODE ch = *p;
3133
3134 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3135 return PyInt_FromLong(0);
3136 else if (!cased && Py_UNICODE_ISLOWER(ch))
3137 cased = 1;
3138 }
3139 return PyInt_FromLong(cased);
3140}
3141
3142static char isupper__doc__[] =
3143"S.isupper() -> int\n\
3144\n\
3145Return 1 if all cased characters in S are uppercase and there is\n\
3146at least one cased character in S, 0 otherwise.";
3147
3148static PyObject*
3149unicode_isupper(PyUnicodeObject *self, PyObject *args)
3150{
3151 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3152 register const Py_UNICODE *e;
3153 int cased;
3154
3155 if (!PyArg_NoArgs(args))
3156 return NULL;
3157
3158 /* Shortcut for single character strings */
3159 if (PyUnicode_GET_SIZE(self) == 1)
3160 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3161
3162 e = p + PyUnicode_GET_SIZE(self);
3163 cased = 0;
3164 for (; p < e; p++) {
3165 register const Py_UNICODE ch = *p;
3166
3167 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3168 return PyInt_FromLong(0);
3169 else if (!cased && Py_UNICODE_ISUPPER(ch))
3170 cased = 1;
3171 }
3172 return PyInt_FromLong(cased);
3173}
3174
3175static char istitle__doc__[] =
3176"S.istitle() -> int\n\
3177\n\
3178Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3179may only follow uncased characters and lowercase characters only cased\n\
3180ones. Return 0 otherwise.";
3181
3182static PyObject*
3183unicode_istitle(PyUnicodeObject *self, PyObject *args)
3184{
3185 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3186 register const Py_UNICODE *e;
3187 int cased, previous_is_cased;
3188
3189 if (!PyArg_NoArgs(args))
3190 return NULL;
3191
3192 /* Shortcut for single character strings */
3193 if (PyUnicode_GET_SIZE(self) == 1)
3194 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3195 (Py_UNICODE_ISUPPER(*p) != 0));
3196
3197 e = p + PyUnicode_GET_SIZE(self);
3198 cased = 0;
3199 previous_is_cased = 0;
3200 for (; p < e; p++) {
3201 register const Py_UNICODE ch = *p;
3202
3203 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3204 if (previous_is_cased)
3205 return PyInt_FromLong(0);
3206 previous_is_cased = 1;
3207 cased = 1;
3208 }
3209 else if (Py_UNICODE_ISLOWER(ch)) {
3210 if (!previous_is_cased)
3211 return PyInt_FromLong(0);
3212 previous_is_cased = 1;
3213 cased = 1;
3214 }
3215 else
3216 previous_is_cased = 0;
3217 }
3218 return PyInt_FromLong(cased);
3219}
3220
3221static char isspace__doc__[] =
3222"S.isspace() -> int\n\
3223\n\
3224Return 1 if there are only whitespace characters in S,\n\
32250 otherwise.";
3226
3227static PyObject*
3228unicode_isspace(PyUnicodeObject *self, PyObject *args)
3229{
3230 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3231 register const Py_UNICODE *e;
3232
3233 if (!PyArg_NoArgs(args))
3234 return NULL;
3235
3236 /* Shortcut for single character strings */
3237 if (PyUnicode_GET_SIZE(self) == 1 &&
3238 Py_UNICODE_ISSPACE(*p))
3239 return PyInt_FromLong(1);
3240
3241 e = p + PyUnicode_GET_SIZE(self);
3242 for (; p < e; p++) {
3243 if (!Py_UNICODE_ISSPACE(*p))
3244 return PyInt_FromLong(0);
3245 }
3246 return PyInt_FromLong(1);
3247}
3248
3249static char isdecimal__doc__[] =
3250"S.isdecimal() -> int\n\
3251\n\
3252Return 1 if there are only decimal characters in S,\n\
32530 otherwise.";
3254
3255static PyObject*
3256unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3257{
3258 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3259 register const Py_UNICODE *e;
3260
3261 if (!PyArg_NoArgs(args))
3262 return NULL;
3263
3264 /* Shortcut for single character strings */
3265 if (PyUnicode_GET_SIZE(self) == 1 &&
3266 Py_UNICODE_ISDECIMAL(*p))
3267 return PyInt_FromLong(1);
3268
3269 e = p + PyUnicode_GET_SIZE(self);
3270 for (; p < e; p++) {
3271 if (!Py_UNICODE_ISDECIMAL(*p))
3272 return PyInt_FromLong(0);
3273 }
3274 return PyInt_FromLong(1);
3275}
3276
3277static char isdigit__doc__[] =
3278"S.isdigit() -> int\n\
3279\n\
3280Return 1 if there are only digit characters in S,\n\
32810 otherwise.";
3282
3283static PyObject*
3284unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3285{
3286 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3287 register const Py_UNICODE *e;
3288
3289 if (!PyArg_NoArgs(args))
3290 return NULL;
3291
3292 /* Shortcut for single character strings */
3293 if (PyUnicode_GET_SIZE(self) == 1 &&
3294 Py_UNICODE_ISDIGIT(*p))
3295 return PyInt_FromLong(1);
3296
3297 e = p + PyUnicode_GET_SIZE(self);
3298 for (; p < e; p++) {
3299 if (!Py_UNICODE_ISDIGIT(*p))
3300 return PyInt_FromLong(0);
3301 }
3302 return PyInt_FromLong(1);
3303}
3304
3305static char isnumeric__doc__[] =
3306"S.isnumeric() -> int\n\
3307\n\
3308Return 1 if there are only numeric characters in S,\n\
33090 otherwise.";
3310
3311static PyObject*
3312unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3313{
3314 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3315 register const Py_UNICODE *e;
3316
3317 if (!PyArg_NoArgs(args))
3318 return NULL;
3319
3320 /* Shortcut for single character strings */
3321 if (PyUnicode_GET_SIZE(self) == 1 &&
3322 Py_UNICODE_ISNUMERIC(*p))
3323 return PyInt_FromLong(1);
3324
3325 e = p + PyUnicode_GET_SIZE(self);
3326 for (; p < e; p++) {
3327 if (!Py_UNICODE_ISNUMERIC(*p))
3328 return PyInt_FromLong(0);
3329 }
3330 return PyInt_FromLong(1);
3331}
3332
3333static char join__doc__[] =
3334"S.join(sequence) -> unicode\n\
3335\n\
3336Return a string which is the concatenation of the strings in the\n\
3337sequence. The separator between elements is S.";
3338
3339static PyObject*
3340unicode_join(PyUnicodeObject *self, PyObject *args)
3341{
3342 PyObject *data;
3343 if (!PyArg_ParseTuple(args, "O:join", &data))
3344 return NULL;
3345
3346 return PyUnicode_Join((PyObject *)self, data);
3347}
3348
3349static int
3350unicode_length(PyUnicodeObject *self)
3351{
3352 return self->length;
3353}
3354
3355static char ljust__doc__[] =
3356"S.ljust(width) -> unicode\n\
3357\n\
3358Return S left justified in a Unicode string of length width. Padding is\n\
3359done using spaces.";
3360
3361static PyObject *
3362unicode_ljust(PyUnicodeObject *self, PyObject *args)
3363{
3364 int width;
3365 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3366 return NULL;
3367
3368 if (self->length >= width) {
3369 Py_INCREF(self);
3370 return (PyObject*) self;
3371 }
3372
3373 return (PyObject*) pad(self, 0, width - self->length, ' ');
3374}
3375
3376static char lower__doc__[] =
3377"S.lower() -> unicode\n\
3378\n\
3379Return a copy of the string S converted to lowercase.";
3380
3381static PyObject*
3382unicode_lower(PyUnicodeObject *self, PyObject *args)
3383{
3384 if (!PyArg_NoArgs(args))
3385 return NULL;
3386 return fixup(self, fixlower);
3387}
3388
3389static char lstrip__doc__[] =
3390"S.lstrip() -> unicode\n\
3391\n\
3392Return a copy of the string S with leading whitespace removed.";
3393
3394static PyObject *
3395unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3396{
3397 if (!PyArg_NoArgs(args))
3398 return NULL;
3399 return strip(self, 1, 0);
3400}
3401
3402static PyObject*
3403unicode_repeat(PyUnicodeObject *str, int len)
3404{
3405 PyUnicodeObject *u;
3406 Py_UNICODE *p;
3407
3408 if (len < 0)
3409 len = 0;
3410
3411 if (len == 1) {
3412 /* no repeat, return original string */
3413 Py_INCREF(str);
3414 return (PyObject*) str;
3415 }
3416
3417 u = _PyUnicode_New(len * str->length);
3418 if (!u)
3419 return NULL;
3420
3421 p = u->str;
3422
3423 while (len-- > 0) {
3424 Py_UNICODE_COPY(p, str->str, str->length);
3425 p += str->length;
3426 }
3427
3428 return (PyObject*) u;
3429}
3430
3431PyObject *PyUnicode_Replace(PyObject *obj,
3432 PyObject *subobj,
3433 PyObject *replobj,
3434 int maxcount)
3435{
3436 PyObject *self;
3437 PyObject *str1;
3438 PyObject *str2;
3439 PyObject *result;
3440
3441 self = PyUnicode_FromObject(obj);
3442 if (self == NULL)
3443 return NULL;
3444 str1 = PyUnicode_FromObject(subobj);
3445 if (str1 == NULL) {
3446 Py_DECREF(self);
3447 return NULL;
3448 }
3449 str2 = PyUnicode_FromObject(replobj);
3450 if (str2 == NULL) {
3451 Py_DECREF(self);
3452 Py_DECREF(str1);
3453 return NULL;
3454 }
3455 result = replace((PyUnicodeObject *)self,
3456 (PyUnicodeObject *)str1,
3457 (PyUnicodeObject *)str2,
3458 maxcount);
3459 Py_DECREF(self);
3460 Py_DECREF(str1);
3461 Py_DECREF(str2);
3462 return result;
3463}
3464
3465static char replace__doc__[] =
3466"S.replace (old, new[, maxsplit]) -> unicode\n\
3467\n\
3468Return a copy of S with all occurrences of substring\n\
3469old replaced by new. If the optional argument maxsplit is\n\
3470given, only the first maxsplit occurrences are replaced.";
3471
3472static PyObject*
3473unicode_replace(PyUnicodeObject *self, PyObject *args)
3474{
3475 PyUnicodeObject *str1;
3476 PyUnicodeObject *str2;
3477 int maxcount = -1;
3478 PyObject *result;
3479
3480 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
3481 return NULL;
3482 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
3483 if (str1 == NULL)
3484 return NULL;
3485 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
3486 if (str2 == NULL)
3487 return NULL;
3488
3489 result = replace(self, str1, str2, maxcount);
3490
3491 Py_DECREF(str1);
3492 Py_DECREF(str2);
3493 return result;
3494}
3495
3496static
3497PyObject *unicode_repr(PyObject *unicode)
3498{
3499 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
3500 PyUnicode_GET_SIZE(unicode),
3501 1);
3502}
3503
3504static char rfind__doc__[] =
3505"S.rfind(sub [,start [,end]]) -> int\n\
3506\n\
3507Return the highest index in S where substring sub is found,\n\
3508such that sub is contained within s[start,end]. Optional\n\
3509arguments start and end are interpreted as in slice notation.\n\
3510\n\
3511Return -1 on failure.";
3512
3513static PyObject *
3514unicode_rfind(PyUnicodeObject *self, PyObject *args)
3515{
3516 PyUnicodeObject *substring;
3517 int start = 0;
3518 int end = INT_MAX;
3519 PyObject *result;
3520
3521 if (!PyArg_ParseTuple(args, "O|ii:rfind", &substring, &start, &end))
3522 return NULL;
3523 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3524 (PyObject *)substring);
3525 if (substring == NULL)
3526 return NULL;
3527
3528 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
3529
3530 Py_DECREF(substring);
3531 return result;
3532}
3533
3534static char rindex__doc__[] =
3535"S.rindex(sub [,start [,end]]) -> int\n\
3536\n\
3537Like S.rfind() but raise ValueError when the substring is not found.";
3538
3539static PyObject *
3540unicode_rindex(PyUnicodeObject *self, PyObject *args)
3541{
3542 int result;
3543 PyUnicodeObject *substring;
3544 int start = 0;
3545 int end = INT_MAX;
3546
3547 if (!PyArg_ParseTuple(args, "O|ii:rindex", &substring, &start, &end))
3548 return NULL;
3549 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3550 (PyObject *)substring);
3551 if (substring == NULL)
3552 return NULL;
3553
3554 result = findstring(self, substring, start, end, -1);
3555
3556 Py_DECREF(substring);
3557 if (result < 0) {
3558 PyErr_SetString(PyExc_ValueError, "substring not found");
3559 return NULL;
3560 }
3561 return PyInt_FromLong(result);
3562}
3563
3564static char rjust__doc__[] =
3565"S.rjust(width) -> unicode\n\
3566\n\
3567Return S right justified in a Unicode string of length width. Padding is\n\
3568done using spaces.";
3569
3570static PyObject *
3571unicode_rjust(PyUnicodeObject *self, PyObject *args)
3572{
3573 int width;
3574 if (!PyArg_ParseTuple(args, "i:rjust", &width))
3575 return NULL;
3576
3577 if (self->length >= width) {
3578 Py_INCREF(self);
3579 return (PyObject*) self;
3580 }
3581
3582 return (PyObject*) pad(self, width - self->length, 0, ' ');
3583}
3584
3585static char rstrip__doc__[] =
3586"S.rstrip() -> unicode\n\
3587\n\
3588Return a copy of the string S with trailing whitespace removed.";
3589
3590static PyObject *
3591unicode_rstrip(PyUnicodeObject *self, PyObject *args)
3592{
3593 if (!PyArg_NoArgs(args))
3594 return NULL;
3595 return strip(self, 0, 1);
3596}
3597
3598static PyObject*
3599unicode_slice(PyUnicodeObject *self, int start, int end)
3600{
3601 /* standard clamping */
3602 if (start < 0)
3603 start = 0;
3604 if (end < 0)
3605 end = 0;
3606 if (end > self->length)
3607 end = self->length;
3608 if (start == 0 && end == self->length) {
3609 /* full slice, return original string */
3610 Py_INCREF(self);
3611 return (PyObject*) self;
3612 }
3613 if (start > end)
3614 start = end;
3615 /* copy slice */
3616 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
3617 end - start);
3618}
3619
3620PyObject *PyUnicode_Split(PyObject *s,
3621 PyObject *sep,
3622 int maxsplit)
3623{
3624 PyObject *result;
3625
3626 s = PyUnicode_FromObject(s);
3627 if (s == NULL)
3628 return NULL;
3629 if (sep != NULL) {
3630 sep = PyUnicode_FromObject(sep);
3631 if (sep == NULL) {
3632 Py_DECREF(s);
3633 return NULL;
3634 }
3635 }
3636
3637 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
3638
3639 Py_DECREF(s);
3640 Py_XDECREF(sep);
3641 return result;
3642}
3643
3644static char split__doc__[] =
3645"S.split([sep [,maxsplit]]) -> list of strings\n\
3646\n\
3647Return a list of the words in S, using sep as the\n\
3648delimiter string. If maxsplit is given, at most maxsplit\n\
3649splits are done. If sep is not specified, any whitespace string\n\
3650is a separator.";
3651
3652static PyObject*
3653unicode_split(PyUnicodeObject *self, PyObject *args)
3654{
3655 PyObject *substring = Py_None;
3656 int maxcount = -1;
3657
3658 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
3659 return NULL;
3660
3661 if (substring == Py_None)
3662 return split(self, NULL, maxcount);
3663 else if (PyUnicode_Check(substring))
3664 return split(self, (PyUnicodeObject *)substring, maxcount);
3665 else
3666 return PyUnicode_Split((PyObject *)self, substring, maxcount);
3667}
3668
3669static char splitlines__doc__[] =
3670"S.splitlines([maxsplit]]) -> list of strings\n\
3671\n\
3672Return a list of the lines in S, breaking at line boundaries.\n\
3673If maxsplit is given, at most maxsplit are done. Line breaks are not\n\
3674included in the resulting list.";
3675
3676static PyObject*
3677unicode_splitlines(PyUnicodeObject *self, PyObject *args)
3678{
3679 int maxcount = -1;
3680
3681 if (!PyArg_ParseTuple(args, "|i:splitlines", &maxcount))
3682 return NULL;
3683
3684 return PyUnicode_Splitlines((PyObject *)self, maxcount);
3685}
3686
3687static
3688PyObject *unicode_str(PyUnicodeObject *self)
3689{
3690 return PyUnicode_AsUTF8String((PyObject *)self);
3691}
3692
3693static char strip__doc__[] =
3694"S.strip() -> unicode\n\
3695\n\
3696Return a copy of S with leading and trailing whitespace removed.";
3697
3698static PyObject *
3699unicode_strip(PyUnicodeObject *self, PyObject *args)
3700{
3701 if (!PyArg_NoArgs(args))
3702 return NULL;
3703 return strip(self, 1, 1);
3704}
3705
3706static char swapcase__doc__[] =
3707"S.swapcase() -> unicode\n\
3708\n\
3709Return a copy of S with uppercase characters converted to lowercase\n\
3710and vice versa.";
3711
3712static PyObject*
3713unicode_swapcase(PyUnicodeObject *self, PyObject *args)
3714{
3715 if (!PyArg_NoArgs(args))
3716 return NULL;
3717 return fixup(self, fixswapcase);
3718}
3719
3720static char translate__doc__[] =
3721"S.translate(table) -> unicode\n\
3722\n\
3723Return a copy of the string S, where all characters have been mapped\n\
3724through the given translation table, which must be a mapping of\n\
3725Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
3726are left untouched. Characters mapped to None are deleted.";
3727
3728static PyObject*
3729unicode_translate(PyUnicodeObject *self, PyObject *args)
3730{
3731 PyObject *table;
3732
3733 if (!PyArg_ParseTuple(args, "O:translate", &table))
3734 return NULL;
3735 return PyUnicode_TranslateCharmap(self->str,
3736 self->length,
3737 table,
3738 "ignore");
3739}
3740
3741static char upper__doc__[] =
3742"S.upper() -> unicode\n\
3743\n\
3744Return a copy of S converted to uppercase.";
3745
3746static PyObject*
3747unicode_upper(PyUnicodeObject *self, PyObject *args)
3748{
3749 if (!PyArg_NoArgs(args))
3750 return NULL;
3751 return fixup(self, fixupper);
3752}
3753
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* S.zfill(width) (compiled out): left-pad self with '0' up to width
   characters via pad().  When the original string started with '+' or
   '-', that sign is moved in front of the padding.  A string that is
   already wide enough is returned unchanged (new reference).  Returns
   NULL when argument parsing fails or pad() returns NULL. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* do not touch the buffer when pad() returned NULL */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original string */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
3789
#if 0
/* Debugging helper (compiled out): return the value of
   unicode_freelist_size as a Python int.  Accepts no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    return PyArg_NoArgs(args) ? PyInt_FromLong(unicode_freelist_size)
                              : NULL;
}
#endif
3799
3800static char startswith__doc__[] =
3801"S.startswith(prefix[, start[, end]]) -> int\n\
3802\n\
3803Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
3804optional start, test S beginning at that position. With optional end, stop\n\
3805comparing S at that position.";
3806
3807static PyObject *
3808unicode_startswith(PyUnicodeObject *self,
3809 PyObject *args)
3810{
3811 PyUnicodeObject *substring;
3812 int start = 0;
3813 int end = INT_MAX;
3814 PyObject *result;
3815
3816 if (!PyArg_ParseTuple(args, "O|ii:startswith", &substring, &start, &end))
3817 return NULL;
3818 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3819 (PyObject *)substring);
3820 if (substring == NULL)
3821 return NULL;
3822
3823 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
3824
3825 Py_DECREF(substring);
3826 return result;
3827}
3828
3829
3830static char endswith__doc__[] =
3831"S.endswith(suffix[, start[, end]]) -> int\n\
3832\n\
3833Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
3834optional start, test S beginning at that position. With optional end, stop\n\
3835comparing S at that position.";
3836
3837static PyObject *
3838unicode_endswith(PyUnicodeObject *self,
3839 PyObject *args)
3840{
3841 PyUnicodeObject *substring;
3842 int start = 0;
3843 int end = INT_MAX;
3844 PyObject *result;
3845
3846 if (!PyArg_ParseTuple(args, "O|ii:endswith", &substring, &start, &end))
3847 return NULL;
3848 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3849 (PyObject *)substring);
3850 if (substring == NULL)
3851 return NULL;
3852
3853 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
3854
3855 Py_DECREF(substring);
3856 return result;
3857}
3858
3859
3860static PyMethodDef unicode_methods[] = {
3861
3862 /* Order is according to common usage: often used methods should
3863 appear first, since lookup is done sequentially. */
3864
3865 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
3866 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
3867 {"split", (PyCFunction) unicode_split, 1, split__doc__},
3868 {"join", (PyCFunction) unicode_join, 1, join__doc__},
3869 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
3870 {"title", (PyCFunction) unicode_title, 0, title__doc__},
3871 {"center", (PyCFunction) unicode_center, 1, center__doc__},
3872 {"count", (PyCFunction) unicode_count, 1, count__doc__},
3873 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
3874 {"find", (PyCFunction) unicode_find, 1, find__doc__},
3875 {"index", (PyCFunction) unicode_index, 1, index__doc__},
3876 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
3877 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
3878 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
3879/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
3880 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
3881 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
3882 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
3883 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
3884 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
3885 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
3886 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
3887 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
3888 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
3889 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
3890 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
3891 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
3892 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
3893 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
3894 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
3895 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
3896 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
3897 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
3898#if 0
3899 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
3900 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
3901#endif
3902
3903#if 0
3904 /* This one is just used for debugging the implementation. */
3905 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
3906#endif
3907
3908 {NULL, NULL}
3909};
3910
3911static PyObject *
3912unicode_getattr(PyUnicodeObject *self, char *name)
3913{
3914 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
3915}
3916
3917static PySequenceMethods unicode_as_sequence = {
3918 (inquiry) unicode_length, /* sq_length */
3919 (binaryfunc) PyUnicode_Concat, /* sq_concat */
3920 (intargfunc) unicode_repeat, /* sq_repeat */
3921 (intargfunc) unicode_getitem, /* sq_item */
3922 (intintargfunc) unicode_slice, /* sq_slice */
3923 0, /* sq_ass_item */
3924 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003925 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00003926};
3927
3928static int
3929unicode_buffer_getreadbuf(PyUnicodeObject *self,
3930 int index,
3931 const void **ptr)
3932{
3933 if (index != 0) {
3934 PyErr_SetString(PyExc_SystemError,
3935 "accessing non-existent unicode segment");
3936 return -1;
3937 }
3938 *ptr = (void *) self->str;
3939 return PyUnicode_GET_DATA_SIZE(self);
3940}
3941
3942static int
3943unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
3944 const void **ptr)
3945{
3946 PyErr_SetString(PyExc_TypeError,
3947 "cannot use unicode as modifyable buffer");
3948 return -1;
3949}
3950
3951static int
3952unicode_buffer_getsegcount(PyUnicodeObject *self,
3953 int *lenp)
3954{
3955 if (lenp)
3956 *lenp = PyUnicode_GET_DATA_SIZE(self);
3957 return 1;
3958}
3959
3960static int
3961unicode_buffer_getcharbuf(PyUnicodeObject *self,
3962 int index,
3963 const void **ptr)
3964{
3965 PyObject *str;
3966
3967 if (index != 0) {
3968 PyErr_SetString(PyExc_SystemError,
3969 "accessing non-existent unicode segment");
3970 return -1;
3971 }
3972 str = utf8_string(self, NULL);
3973 if (str == NULL)
3974 return -1;
3975 *ptr = (void *) PyString_AS_STRING(str);
3976 return PyString_GET_SIZE(str);
3977}
3978
3979/* Helpers for PyUnicode_Format() */
3980
3981static PyObject *
3982getnextarg(args, arglen, p_argidx)
3983 PyObject *args;
3984int arglen;
3985int *p_argidx;
3986{
3987 int argidx = *p_argidx;
3988 if (argidx < arglen) {
3989 (*p_argidx)++;
3990 if (arglen < 0)
3991 return args;
3992 else
3993 return PyTuple_GetItem(args, argidx);
3994 }
3995 PyErr_SetString(PyExc_TypeError,
3996 "not enough arguments for format string");
3997 return NULL;
3998}
3999
/* Conversion flag bits collected by PyUnicode_Format() while parsing a
   format specifier; the trailing comment names the flag character. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
4005
4006static
4007#ifdef HAVE_STDARG_PROTOTYPES
4008int usprintf(register Py_UNICODE *buffer, char *format, ...)
4009#else
4010int usprintf(va_alist) va_dcl
4011#endif
4012{
4013 register int i;
4014 int len;
4015 va_list va;
4016 char *charbuffer;
4017#ifdef HAVE_STDARG_PROTOTYPES
4018 va_start(va, format);
4019#else
4020 Py_UNICODE *args;
4021 char *format;
4022
4023 va_start(va);
4024 buffer = va_arg(va, Py_UNICODE *);
4025 format = va_arg(va, char *);
4026#endif
4027
4028 /* First, format the string as char array, then expand to Py_UNICODE
4029 array. */
4030 charbuffer = (char *)buffer;
4031 len = vsprintf(charbuffer, format, va);
4032 for (i = len - 1; i >= 0; i--)
4033 buffer[i] = (Py_UNICODE) charbuffer[i];
4034
4035 va_end(va);
4036 return len;
4037}
4038
4039static int
4040formatfloat(Py_UNICODE *buf,
4041 int flags,
4042 int prec,
4043 int type,
4044 PyObject *v)
4045{
4046 char fmt[20];
4047 double x;
4048
4049 x = PyFloat_AsDouble(v);
4050 if (x == -1.0 && PyErr_Occurred())
4051 return -1;
4052 if (prec < 0)
4053 prec = 6;
4054 if (prec > 50)
4055 prec = 50; /* Arbitrary limitation */
4056 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4057 type = 'g';
4058 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4059 return usprintf(buf, fmt, x);
4060}
4061
4062static int
4063formatint(Py_UNICODE *buf,
4064 int flags,
4065 int prec,
4066 int type,
4067 PyObject *v)
4068{
4069 char fmt[20];
4070 long x;
4071
4072 x = PyInt_AsLong(v);
4073 if (x == -1 && PyErr_Occurred())
4074 return -1;
4075 if (prec < 0)
4076 prec = 1;
4077 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4078 return usprintf(buf, fmt, x);
4079}
4080
4081static int
4082formatchar(Py_UNICODE *buf,
4083 PyObject *v)
4084{
4085 if (PyUnicode_Check(v))
4086 buf[0] = PyUnicode_AS_UNICODE(v)[0];
4087
4088 else if (PyString_Check(v))
4089 buf[0] = (Py_UNICODE) PyString_AS_STRING(v)[0];
4090
4091 else {
4092 /* Integer input truncated to a character */
4093 long x;
4094 x = PyInt_AsLong(v);
4095 if (x == -1 && PyErr_Occurred())
4096 return -1;
4097 buf[0] = (char) x;
4098 }
4099 buf[1] = '\0';
4100 return 1;
4101}
4102
4103PyObject *PyUnicode_Format(PyObject *format,
4104 PyObject *args)
4105{
4106 Py_UNICODE *fmt, *res;
4107 int fmtcnt, rescnt, reslen, arglen, argidx;
4108 int args_owned = 0;
4109 PyUnicodeObject *result = NULL;
4110 PyObject *dict = NULL;
4111 PyObject *uformat;
4112
4113 if (format == NULL || args == NULL) {
4114 PyErr_BadInternalCall();
4115 return NULL;
4116 }
4117 uformat = PyUnicode_FromObject(format);
4118 fmt = PyUnicode_AS_UNICODE(uformat);
4119 fmtcnt = PyUnicode_GET_SIZE(uformat);
4120
4121 reslen = rescnt = fmtcnt + 100;
4122 result = _PyUnicode_New(reslen);
4123 if (result == NULL)
4124 goto onError;
4125 res = PyUnicode_AS_UNICODE(result);
4126
4127 if (PyTuple_Check(args)) {
4128 arglen = PyTuple_Size(args);
4129 argidx = 0;
4130 }
4131 else {
4132 arglen = -1;
4133 argidx = -2;
4134 }
4135 if (args->ob_type->tp_as_mapping)
4136 dict = args;
4137
4138 while (--fmtcnt >= 0) {
4139 if (*fmt != '%') {
4140 if (--rescnt < 0) {
4141 rescnt = fmtcnt + 100;
4142 reslen += rescnt;
4143 if (_PyUnicode_Resize(result, reslen) < 0)
4144 return NULL;
4145 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4146 --rescnt;
4147 }
4148 *res++ = *fmt++;
4149 }
4150 else {
4151 /* Got a format specifier */
4152 int flags = 0;
4153 int width = -1;
4154 int prec = -1;
4155 int size = 0;
4156 Py_UNICODE c = '\0';
4157 Py_UNICODE fill;
4158 PyObject *v = NULL;
4159 PyObject *temp = NULL;
4160 Py_UNICODE *buf;
4161 Py_UNICODE sign;
4162 int len;
4163 Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
4164
4165 fmt++;
4166 if (*fmt == '(') {
4167 Py_UNICODE *keystart;
4168 int keylen;
4169 PyObject *key;
4170 int pcount = 1;
4171
4172 if (dict == NULL) {
4173 PyErr_SetString(PyExc_TypeError,
4174 "format requires a mapping");
4175 goto onError;
4176 }
4177 ++fmt;
4178 --fmtcnt;
4179 keystart = fmt;
4180 /* Skip over balanced parentheses */
4181 while (pcount > 0 && --fmtcnt >= 0) {
4182 if (*fmt == ')')
4183 --pcount;
4184 else if (*fmt == '(')
4185 ++pcount;
4186 fmt++;
4187 }
4188 keylen = fmt - keystart - 1;
4189 if (fmtcnt < 0 || pcount > 0) {
4190 PyErr_SetString(PyExc_ValueError,
4191 "incomplete format key");
4192 goto onError;
4193 }
4194 /* keys are converted to strings (using UTF-8) and
4195 then looked up since Python uses strings to hold
4196 variables names etc. in its namespaces and we
4197 wouldn't want to break common idioms. The
4198 alternative would be using Unicode objects for the
4199 lookup but u"abc" and "abc" have different hash
4200 values (on purpose). */
4201 key = PyUnicode_EncodeUTF8(keystart,
4202 keylen,
4203 NULL);
4204 if (key == NULL)
4205 goto onError;
4206 if (args_owned) {
4207 Py_DECREF(args);
4208 args_owned = 0;
4209 }
4210 args = PyObject_GetItem(dict, key);
4211 Py_DECREF(key);
4212 if (args == NULL) {
4213 goto onError;
4214 }
4215 args_owned = 1;
4216 arglen = -1;
4217 argidx = -2;
4218 }
4219 while (--fmtcnt >= 0) {
4220 switch (c = *fmt++) {
4221 case '-': flags |= F_LJUST; continue;
4222 case '+': flags |= F_SIGN; continue;
4223 case ' ': flags |= F_BLANK; continue;
4224 case '#': flags |= F_ALT; continue;
4225 case '0': flags |= F_ZERO; continue;
4226 }
4227 break;
4228 }
4229 if (c == '*') {
4230 v = getnextarg(args, arglen, &argidx);
4231 if (v == NULL)
4232 goto onError;
4233 if (!PyInt_Check(v)) {
4234 PyErr_SetString(PyExc_TypeError,
4235 "* wants int");
4236 goto onError;
4237 }
4238 width = PyInt_AsLong(v);
4239 if (width < 0) {
4240 flags |= F_LJUST;
4241 width = -width;
4242 }
4243 if (--fmtcnt >= 0)
4244 c = *fmt++;
4245 }
4246 else if (c >= '0' && c <= '9') {
4247 width = c - '0';
4248 while (--fmtcnt >= 0) {
4249 c = *fmt++;
4250 if (c < '0' || c > '9')
4251 break;
4252 if ((width*10) / 10 != width) {
4253 PyErr_SetString(PyExc_ValueError,
4254 "width too big");
4255 goto onError;
4256 }
4257 width = width*10 + (c - '0');
4258 }
4259 }
4260 if (c == '.') {
4261 prec = 0;
4262 if (--fmtcnt >= 0)
4263 c = *fmt++;
4264 if (c == '*') {
4265 v = getnextarg(args, arglen, &argidx);
4266 if (v == NULL)
4267 goto onError;
4268 if (!PyInt_Check(v)) {
4269 PyErr_SetString(PyExc_TypeError,
4270 "* wants int");
4271 goto onError;
4272 }
4273 prec = PyInt_AsLong(v);
4274 if (prec < 0)
4275 prec = 0;
4276 if (--fmtcnt >= 0)
4277 c = *fmt++;
4278 }
4279 else if (c >= '0' && c <= '9') {
4280 prec = c - '0';
4281 while (--fmtcnt >= 0) {
4282 c = Py_CHARMASK(*fmt++);
4283 if (c < '0' || c > '9')
4284 break;
4285 if ((prec*10) / 10 != prec) {
4286 PyErr_SetString(PyExc_ValueError,
4287 "prec too big");
4288 goto onError;
4289 }
4290 prec = prec*10 + (c - '0');
4291 }
4292 }
4293 } /* prec */
4294 if (fmtcnt >= 0) {
4295 if (c == 'h' || c == 'l' || c == 'L') {
4296 size = c;
4297 if (--fmtcnt >= 0)
4298 c = *fmt++;
4299 }
4300 }
4301 if (fmtcnt < 0) {
4302 PyErr_SetString(PyExc_ValueError,
4303 "incomplete format");
4304 goto onError;
4305 }
4306 if (c != '%') {
4307 v = getnextarg(args, arglen, &argidx);
4308 if (v == NULL)
4309 goto onError;
4310 }
4311 sign = 0;
4312 fill = ' ';
4313 switch (c) {
4314
4315 case '%':
4316 buf = tmpbuf;
4317 buf[0] = '%';
4318 len = 1;
4319 break;
4320
4321 case 's':
4322 case 'r':
4323 if (PyUnicode_Check(v) && c == 's') {
4324 temp = v;
4325 Py_INCREF(temp);
4326 }
4327 else {
4328 PyObject *unicode;
4329 if (c == 's')
4330 temp = PyObject_Str(v);
4331 else
4332 temp = PyObject_Repr(v);
4333 if (temp == NULL)
4334 goto onError;
4335 if (!PyString_Check(temp)) {
4336 /* XXX Note: this should never happen, since
4337 PyObject_Repr() and PyObject_Str() assure
4338 this */
4339 Py_DECREF(temp);
4340 PyErr_SetString(PyExc_TypeError,
4341 "%s argument has non-string str()");
4342 goto onError;
4343 }
4344 unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
4345 PyString_GET_SIZE(temp),
4346 "strict");
4347 Py_DECREF(temp);
4348 temp = unicode;
4349 if (temp == NULL)
4350 goto onError;
4351 }
4352 buf = PyUnicode_AS_UNICODE(temp);
4353 len = PyUnicode_GET_SIZE(temp);
4354 if (prec >= 0 && len > prec)
4355 len = prec;
4356 break;
4357
4358 case 'i':
4359 case 'd':
4360 case 'u':
4361 case 'o':
4362 case 'x':
4363 case 'X':
4364 if (c == 'i')
4365 c = 'd';
4366 buf = tmpbuf;
4367 len = formatint(buf, flags, prec, c, v);
4368 if (len < 0)
4369 goto onError;
4370 sign = (c == 'd');
4371 if (flags & F_ZERO) {
4372 fill = '0';
4373 if ((flags&F_ALT) &&
4374 (c == 'x' || c == 'X') &&
4375 buf[0] == '0' && buf[1] == c) {
4376 *res++ = *buf++;
4377 *res++ = *buf++;
4378 rescnt -= 2;
4379 len -= 2;
4380 width -= 2;
4381 if (width < 0)
4382 width = 0;
4383 }
4384 }
4385 break;
4386
4387 case 'e':
4388 case 'E':
4389 case 'f':
4390 case 'g':
4391 case 'G':
4392 buf = tmpbuf;
4393 len = formatfloat(buf, flags, prec, c, v);
4394 if (len < 0)
4395 goto onError;
4396 sign = 1;
4397 if (flags&F_ZERO)
4398 fill = '0';
4399 break;
4400
4401 case 'c':
4402 buf = tmpbuf;
4403 len = formatchar(buf, v);
4404 if (len < 0)
4405 goto onError;
4406 break;
4407
4408 default:
4409 PyErr_Format(PyExc_ValueError,
4410 "unsupported format character '%c' (0x%x)",
4411 c, c);
4412 goto onError;
4413 }
4414 if (sign) {
4415 if (*buf == '-' || *buf == '+') {
4416 sign = *buf++;
4417 len--;
4418 }
4419 else if (flags & F_SIGN)
4420 sign = '+';
4421 else if (flags & F_BLANK)
4422 sign = ' ';
4423 else
4424 sign = 0;
4425 }
4426 if (width < len)
4427 width = len;
4428 if (rescnt < width + (sign != 0)) {
4429 reslen -= rescnt;
4430 rescnt = width + fmtcnt + 100;
4431 reslen += rescnt;
4432 if (_PyUnicode_Resize(result, reslen) < 0)
4433 return NULL;
4434 res = PyUnicode_AS_UNICODE(result)
4435 + reslen - rescnt;
4436 }
4437 if (sign) {
4438 if (fill != ' ')
4439 *res++ = sign;
4440 rescnt--;
4441 if (width > len)
4442 width--;
4443 }
4444 if (width > len && !(flags & F_LJUST)) {
4445 do {
4446 --rescnt;
4447 *res++ = fill;
4448 } while (--width > len);
4449 }
4450 if (sign && fill == ' ')
4451 *res++ = sign;
4452 memcpy(res, buf, len * sizeof(Py_UNICODE));
4453 res += len;
4454 rescnt -= len;
4455 while (--width >= len) {
4456 --rescnt;
4457 *res++ = ' ';
4458 }
4459 if (dict && (argidx < arglen) && c != '%') {
4460 PyErr_SetString(PyExc_TypeError,
4461 "not all arguments converted");
4462 goto onError;
4463 }
4464 Py_XDECREF(temp);
4465 } /* '%' */
4466 } /* until end */
4467 if (argidx < arglen && !dict) {
4468 PyErr_SetString(PyExc_TypeError,
4469 "not all arguments converted");
4470 goto onError;
4471 }
4472
4473 if (args_owned) {
4474 Py_DECREF(args);
4475 }
4476 Py_DECREF(uformat);
4477 _PyUnicode_Resize(result, reslen - rescnt);
4478 return (PyObject *)result;
4479
4480 onError:
4481 Py_XDECREF(result);
4482 Py_DECREF(uformat);
4483 if (args_owned) {
4484 Py_DECREF(args);
4485 }
4486 return NULL;
4487}
4488
4489static PyBufferProcs unicode_as_buffer = {
4490 (getreadbufferproc) unicode_buffer_getreadbuf,
4491 (getwritebufferproc) unicode_buffer_getwritebuf,
4492 (getsegcountproc) unicode_buffer_getsegcount,
4493 (getcharbufferproc) unicode_buffer_getcharbuf,
4494};
4495
4496PyTypeObject PyUnicode_Type = {
4497 PyObject_HEAD_INIT(&PyType_Type)
4498 0, /* ob_size */
4499 "unicode", /* tp_name */
4500 sizeof(PyUnicodeObject), /* tp_size */
4501 0, /* tp_itemsize */
4502 /* Slots */
4503 (destructor)_PyUnicode_Free, /* tp_dealloc */
4504 0, /* tp_print */
4505 (getattrfunc)unicode_getattr, /* tp_getattr */
4506 0, /* tp_setattr */
4507 (cmpfunc) unicode_compare, /* tp_compare */
4508 (reprfunc) unicode_repr, /* tp_repr */
4509 0, /* tp_as_number */
4510 &unicode_as_sequence, /* tp_as_sequence */
4511 0, /* tp_as_mapping */
4512 (hashfunc) unicode_hash, /* tp_hash*/
4513 0, /* tp_call*/
4514 (reprfunc) unicode_str, /* tp_str */
4515 (getattrofunc) NULL, /* tp_getattro */
4516 (setattrofunc) NULL, /* tp_setattro */
4517 &unicode_as_buffer, /* tp_as_buffer */
4518 Py_TPFLAGS_DEFAULT, /* tp_flags */
4519};
4520
4521/* Initialize the Unicode implementation */
4522
4523void _PyUnicode_Init()
4524{
4525 /* Doublecheck the configuration... */
4526 if (sizeof(Py_UNICODE) != 2)
4527 Py_FatalError("Unicode configuration error: "
4528 "sizeof(Py_UNICODE) != 2 bytes");
4529
4530 unicode_empty = _PyUnicode_New(0);
4531}
4532
4533/* Finalize the Unicode implementation */
4534
4535void
4536_PyUnicode_Fini()
4537{
4538 PyUnicodeObject *u = unicode_freelist;
4539
4540 while (u != NULL) {
4541 PyUnicodeObject *v = u;
4542 u = *(PyUnicodeObject **)u;
4543 free(v);
4544 }
4545 Py_XDECREF(unicode_empty);
4546}