/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
Unicode Integration Proposal (see file Misc/unicode.txt).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.


 Original header:
 --------------------------------------------------------------------

 * Yet another Unicode string type for Python. This type supports the
 * 16-bit Basic Multilingual Plane (BMP) only.
 *
 * Note that this string class supports embedded NULL characters. End
 * of string is given by the length attribute. However, the internal
 * representation always stores a trailing NULL to make it easier to
 * use unicode strings with standard APIs.
 *
 * History:
 * 1999-01-23 fl Created
 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
 * 1999-03-06 fl Moved declarations to separate file, etc.
 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
 * 1999-08-10 fl Some minor tweaks
 *
 * Written by Fredrik Lundh, January 1999.
 *
 * Copyright (c) 1999 by Secret Labs AB.
 * Copyright (c) 1999 by Fredrik Lundh.
 *
 * fredrik@pythonware.com
 * http://www.pythonware.com
 *
 * --------------------------------------------------------------------
 * This Unicode String Type is
 *
 * Copyright (c) 1999 by Secret Labs AB
 * Copyright (c) 1999 by Fredrik Lundh
 *
 * By obtaining, using, and/or copying this software and/or its
 * associated documentation, you agree that you have read, understood,
 * and will comply with the following terms and conditions:
 *
 * Permission to use, copy, modify, and distribute this software and its
 * associated documentation for any purpose and without fee is hereby
 * granted, provided that the above copyright notice appears in all
 * copies, and that both that copyright notice and this permission notice
 * appear in supporting documentation, and that the name of Secret Labs
 * AB or the author not be used in advertising or publicity pertaining to
 * distribution of the software without specific, written prior
 * permission.
 *
 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "mymath.h"
68#include "unicodeobject.h"
69
70#if defined(HAVE_LIMITS_H)
71#include <limits.h>
72#else
73#define INT_MAX 2147483647
74#endif
75
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000076#ifdef MS_WIN32
77#include <windows.h>
78#endif
/* Limit for the Unicode object free list: the maximum number of
   deallocated PyUnicodeObject structs that _PyUnicode_Free() parks
   for later reuse by _PyUnicode_New(). */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + STAYALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off: the
   test in _PyUnicode_Free() is "length >= STAYALIVE_SIZE_LIMIT", so
   with 0 every character buffer is released.

   XXX The feature is currently turned off because there are
       apparently some lingering bugs in its implementation which I
       haven't yet been able to sort out.

*/

#define STAYALIVE_SIZE_LIMIT 0

/* Endianness switches; defaults to little endian.  Exactly one of
   BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is defined,
   depending on whether WORDS_BIGENDIAN is set. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif

/* --- Globals ------------------------------------------------------------ */

/* The empty Unicode object; returned (with a new reference) by
   _PyUnicode_New(0) once it has been set up. */
static PyUnicodeObject *unicode_empty = NULL;

/* Free list for Unicode objects.  The list is singly linked through
   the first pointer-sized word of each parked object;
   unicode_freelist_size counts the entries currently on the list. */
static PyUnicodeObject *unicode_freelist = NULL;
static int unicode_freelist_size = 0;
119
120/* --- Unicode Object ----------------------------------------------------- */
121
122static
123int _PyUnicode_Resize(register PyUnicodeObject *unicode,
124 int length)
125{
126 void *oldstr;
127
128 /* Shortcut if there's nothing to do. */
129 if (unicode->length == length)
130 return 0;
131
132 /* Resizing unicode_empty is not allowed. */
133 if (unicode == unicode_empty) {
134 PyErr_SetString(PyExc_SystemError,
135 "can't resize empty unicode object");
136 return -1;
137 }
138
139 /* We allocate one more byte to make sure the string is
140 Ux0000 terminated -- XXX is this needed ? */
141 oldstr = unicode->str;
142 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
143 if (!unicode->str) {
144 unicode->str = oldstr;
145 PyErr_NoMemory();
146 return -1;
147 }
148 unicode->str[length] = 0;
149 unicode->length = length;
150
151 /* Reset the object caches */
152 if (unicode->utf8str) {
153 Py_DECREF(unicode->utf8str);
154 unicode->utf8str = NULL;
155 }
156 unicode->hash = -1;
157
158 return 0;
159}
160
161/* We allocate one more byte to make sure the string is
162 Ux0000 terminated -- XXX is this needed ?
163
164 XXX This allocator could further be enhanced by assuring that the
165 free list never reduces its size below 1.
166
167*/
168
169static
170PyUnicodeObject *_PyUnicode_New(int length)
171{
172 register PyUnicodeObject *unicode;
173
174 /* Optimization for empty strings */
175 if (length == 0 && unicode_empty != NULL) {
176 Py_INCREF(unicode_empty);
177 return unicode_empty;
178 }
179
180 /* Unicode freelist & memory allocation */
181 if (unicode_freelist) {
182 unicode = unicode_freelist;
183 unicode_freelist = *(PyUnicodeObject **)unicode_freelist;
184 unicode_freelist_size--;
185 unicode->ob_type = &PyUnicode_Type;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000186 _Py_NewReference((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000187 if (unicode->str) {
188 if (unicode->length < length &&
189 _PyUnicode_Resize(unicode, length)) {
190 free(unicode->str);
191 PyMem_DEL(unicode);
192 return NULL;
193 }
194 }
195 else
196 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
197 }
198 else {
199 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
200 if (unicode == NULL)
201 return NULL;
202 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
203 }
204
Barry Warsaw51ac5802000-03-20 16:36:48 +0000205 if (!unicode->str)
206 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000207 unicode->str[length] = 0;
208 unicode->length = length;
209 unicode->hash = -1;
210 unicode->utf8str = NULL;
211 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000212
213 onError:
214 _Py_ForgetReference((PyObject *)unicode);
215 PyMem_DEL(unicode);
216 PyErr_NoMemory();
217 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218}
219
220static
221void _PyUnicode_Free(register PyUnicodeObject *unicode)
222{
223 Py_XDECREF(unicode->utf8str);
224 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
225 if (unicode->length >= STAYALIVE_SIZE_LIMIT) {
226 free(unicode->str);
227 unicode->str = NULL;
228 unicode->length = 0;
229 }
230 *(PyUnicodeObject **)unicode = unicode_freelist;
231 unicode_freelist = unicode;
232 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000233 }
234 else {
235 free(unicode->str);
236 PyMem_DEL(unicode);
237 }
238}
239
240PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
241 int size)
242{
243 PyUnicodeObject *unicode;
244
245 unicode = _PyUnicode_New(size);
246 if (!unicode)
247 return NULL;
248
249 /* Copy the Unicode data into the new object */
250 if (u != NULL)
251 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
252
253 return (PyObject *)unicode;
254}
255
256#ifdef HAVE_WCHAR_H
257
258PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
259 int size)
260{
261 PyUnicodeObject *unicode;
262
263 if (w == NULL) {
264 PyErr_BadInternalCall();
265 return NULL;
266 }
267
268 unicode = _PyUnicode_New(size);
269 if (!unicode)
270 return NULL;
271
272 /* Copy the wchar_t data into the new object */
273#ifdef HAVE_USABLE_WCHAR_T
274 memcpy(unicode->str, w, size * sizeof(wchar_t));
275#else
276 {
277 register Py_UNICODE *u;
278 register int i;
279 u = PyUnicode_AS_UNICODE(unicode);
280 for (i = size; i >= 0; i--)
281 *u++ = *w++;
282 }
283#endif
284
285 return (PyObject *)unicode;
286}
287
288int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
289 register wchar_t *w,
290 int size)
291{
292 if (unicode == NULL) {
293 PyErr_BadInternalCall();
294 return -1;
295 }
296 if (size > PyUnicode_GET_SIZE(unicode))
297 size = PyUnicode_GET_SIZE(unicode);
298#ifdef HAVE_USABLE_WCHAR_T
299 memcpy(w, unicode->str, size * sizeof(wchar_t));
300#else
301 {
302 register Py_UNICODE *u;
303 register int i;
304 u = PyUnicode_AS_UNICODE(unicode);
305 for (i = size; i >= 0; i--)
306 *w++ = *u++;
307 }
308#endif
309
310 return size;
311}
312
313#endif
314
315PyObject *PyUnicode_FromObject(register PyObject *obj)
316{
317 const char *s;
318 int len;
319
320 if (obj == NULL) {
321 PyErr_BadInternalCall();
322 return NULL;
323 }
324 else if (PyUnicode_Check(obj)) {
325 Py_INCREF(obj);
326 return obj;
327 }
328 else if (PyString_Check(obj)) {
329 s = PyString_AS_STRING(obj);
330 len = PyString_GET_SIZE(obj);
331 }
332 else if (PyObject_AsCharBuffer(obj, &s, &len))
333 return NULL;
334 if (len == 0) {
335 Py_INCREF(unicode_empty);
336 return (PyObject *)unicode_empty;
337 }
338 return PyUnicode_DecodeUTF8(s, len, "strict");
339}
340
341PyObject *PyUnicode_Decode(const char *s,
342 int size,
343 const char *encoding,
344 const char *errors)
345{
346 PyObject *buffer = NULL, *unicode;
347
348 /* Shortcut for the default encoding UTF-8 */
349 if (encoding == NULL ||
350 (strcmp(encoding, "utf-8") == 0))
351 return PyUnicode_DecodeUTF8(s, size, errors);
352
353 /* Decode via the codec registry */
354 buffer = PyBuffer_FromMemory((void *)s, size);
355 if (buffer == NULL)
356 goto onError;
357 unicode = PyCodec_Decode(buffer, encoding, errors);
358 if (unicode == NULL)
359 goto onError;
360 if (!PyUnicode_Check(unicode)) {
361 PyErr_Format(PyExc_TypeError,
362 "decoder did not return an unicode object (type=%s)",
363 unicode->ob_type->tp_name);
364 Py_DECREF(unicode);
365 goto onError;
366 }
367 Py_DECREF(buffer);
368 return unicode;
369
370 onError:
371 Py_XDECREF(buffer);
372 return NULL;
373}
374
375PyObject *PyUnicode_Encode(const Py_UNICODE *s,
376 int size,
377 const char *encoding,
378 const char *errors)
379{
380 PyObject *v, *unicode;
381
382 unicode = PyUnicode_FromUnicode(s, size);
383 if (unicode == NULL)
384 return NULL;
385 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
386 Py_DECREF(unicode);
387 return v;
388}
389
390PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
391 const char *encoding,
392 const char *errors)
393{
394 PyObject *v;
395
396 if (!PyUnicode_Check(unicode)) {
397 PyErr_BadArgument();
398 goto onError;
399 }
400 /* Shortcut for the default encoding UTF-8 */
401 if ((encoding == NULL ||
402 (strcmp(encoding, "utf-8") == 0)) &&
403 errors == NULL)
404 return PyUnicode_AsUTF8String(unicode);
405
406 /* Encode via the codec registry */
407 v = PyCodec_Encode(unicode, encoding, errors);
408 if (v == NULL)
409 goto onError;
410 /* XXX Should we really enforce this ? */
411 if (!PyString_Check(v)) {
412 PyErr_Format(PyExc_TypeError,
413 "encoder did not return a string object (type=%s)",
414 v->ob_type->tp_name);
415 Py_DECREF(v);
416 goto onError;
417 }
418 return v;
419
420 onError:
421 return NULL;
422}
423
424Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
425{
426 if (!PyUnicode_Check(unicode)) {
427 PyErr_BadArgument();
428 goto onError;
429 }
430 return PyUnicode_AS_UNICODE(unicode);
431
432 onError:
433 return NULL;
434}
435
436int PyUnicode_GetSize(PyObject *unicode)
437{
438 if (!PyUnicode_Check(unicode)) {
439 PyErr_BadArgument();
440 goto onError;
441 }
442 return PyUnicode_GET_SIZE(unicode);
443
444 onError:
445 return -1;
446}
447
448/* --- UTF-8 Codec -------------------------------------------------------- */
449
/* Map a UTF-8 lead byte to the total length of the sequence it
   starts.  Zero means the byte is not a legal lead byte (the
   continuation bytes 0x80-0xBF and 0xFE/0xFF).  See RFC 2279 for
   details.  The table is never modified, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte (US-ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two-byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFD: four- to six-byte sequences; 0xFE, 0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
471
472static
473int utf8_decoding_error(const char **source,
474 Py_UNICODE **dest,
475 const char *errors,
476 const char *details)
477{
478 if ((errors == NULL) ||
479 (strcmp(errors,"strict") == 0)) {
480 PyErr_Format(PyExc_UnicodeError,
481 "UTF-8 decoding error: %s",
482 details);
483 return -1;
484 }
485 else if (strcmp(errors,"ignore") == 0) {
486 (*source)++;
487 return 0;
488 }
489 else if (strcmp(errors,"replace") == 0) {
490 (*source)++;
491 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
492 (*dest)++;
493 return 0;
494 }
495 else {
496 PyErr_Format(PyExc_ValueError,
Barry Warsaw51ac5802000-03-20 16:36:48 +0000497 "UTF-8 decoding error; unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000498 errors);
499 return -1;
500 }
501}
502
503#define UTF8_ERROR(details) do { \
504 if (utf8_decoding_error(&s, &p, errors, details)) \
505 goto onError; \
506 continue; \
507} while (0)
508
509PyObject *PyUnicode_DecodeUTF8(const char *s,
510 int size,
511 const char *errors)
512{
513 int n;
514 const char *e;
515 PyUnicodeObject *unicode;
516 Py_UNICODE *p;
517
518 /* Note: size will always be longer than the resulting Unicode
519 character count */
520 unicode = _PyUnicode_New(size);
521 if (!unicode)
522 return NULL;
523 if (size == 0)
524 return (PyObject *)unicode;
525
526 /* Unpack UTF-8 encoded data */
527 p = unicode->str;
528 e = s + size;
529
530 while (s < e) {
531 register Py_UNICODE ch = (unsigned char)*s;
532
533 if (ch < 0x80) {
534 *p++ = ch;
535 s++;
536 continue;
537 }
538
539 n = utf8_code_length[ch];
540
541 if (s + n > e)
542 UTF8_ERROR("unexpected end of data");
543
544 switch (n) {
545
546 case 0:
547 UTF8_ERROR("unexpected code byte");
548 break;
549
550 case 1:
551 UTF8_ERROR("internal error");
552 break;
553
554 case 2:
555 if ((s[1] & 0xc0) != 0x80)
556 UTF8_ERROR("invalid data");
557 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
558 if (ch < 0x80)
559 UTF8_ERROR("illegal encoding");
560 else
561 *p++ = ch;
562 break;
563
564 case 3:
565 if ((s[1] & 0xc0) != 0x80 ||
566 (s[2] & 0xc0) != 0x80)
567 UTF8_ERROR("invalid data");
568 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
569 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
570 UTF8_ERROR("illegal encoding");
571 else
572 *p++ = ch;
573 break;
574
575 default:
576 /* Other sizes are only needed for UCS-4 */
577 UTF8_ERROR("unsupported Unicode code range");
578 }
579 s += n;
580 }
581
582 /* Adjust length */
583 if (_PyUnicode_Resize(unicode, p - unicode->str))
584 goto onError;
585
586 return (PyObject *)unicode;
587
588onError:
589 Py_DECREF(unicode);
590 return NULL;
591}
592
593#undef UTF8_ERROR
594
595static
596int utf8_encoding_error(const Py_UNICODE **source,
597 char **dest,
598 const char *errors,
599 const char *details)
600{
601 if ((errors == NULL) ||
602 (strcmp(errors,"strict") == 0)) {
603 PyErr_Format(PyExc_UnicodeError,
604 "UTF-8 encoding error: %s",
605 details);
606 return -1;
607 }
608 else if (strcmp(errors,"ignore") == 0) {
609 return 0;
610 }
611 else if (strcmp(errors,"replace") == 0) {
612 **dest = '?';
613 (*dest)++;
614 return 0;
615 }
616 else {
617 PyErr_Format(PyExc_ValueError,
618 "UTF-8 encoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +0000619 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000620 errors);
621 return -1;
622 }
623}
624
625PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
626 int size,
627 const char *errors)
628{
629 PyObject *v;
630 char *p;
631 char *q;
632
633 v = PyString_FromStringAndSize(NULL, 3 * size);
634 if (v == NULL)
635 return NULL;
636 if (size == 0)
637 goto done;
638
639 p = q = PyString_AS_STRING(v);
640 while (size-- > 0) {
641 Py_UNICODE ch = *s++;
642 if (ch < 0x80)
643 *p++ = (char) ch;
644 else if (ch < 0x0800) {
645 *p++ = 0xc0 | (ch >> 6);
646 *p++ = 0x80 | (ch & 0x3f);
647 } else if (0xD800 <= ch && ch <= 0xDFFF) {
648 /* These byte ranges are reserved for UTF-16 surrogate
649 bytes which the Python implementation currently does
650 not support. */
651 printf("code range problem: U+%04x\n", ch);
652 if (utf8_encoding_error(&s, &p, errors,
653 "unsupported code range"))
654 goto onError;
655 } else {
656 *p++ = 0xe0 | (ch >> 12);
657 *p++ = 0x80 | ((ch >> 6) & 0x3f);
658 *p++ = 0x80 | (ch & 0x3f);
659 }
660 }
661 *p = '\0';
662 _PyString_Resize(&v, p - q);
663
664 done:
665 return v;
666
667 onError:
668 Py_DECREF(v);
669 return NULL;
670}
671
672/* Return a Python string holding the UTF-8 encoded value of the
673 Unicode object.
674
675 The resulting string is cached in the Unicode object for subsequent
676 usage by this function. The cached version is needed to implement
677 the character buffer interface.
678
679 The refcount of the string is *not* incremented.
680
681*/
682
683static
684PyObject *utf8_string(PyUnicodeObject *self,
685 const char *errors)
686{
687 PyObject *v = self->utf8str;
688
689 if (v)
690 return v;
691 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(self),
692 PyUnicode_GET_SIZE(self),
693 errors);
694 if (v && errors == NULL)
695 self->utf8str = v;
696 return v;
697}
698
699PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
700{
701 PyObject *str;
702
703 if (!PyUnicode_Check(unicode)) {
704 PyErr_BadArgument();
705 return NULL;
706 }
707 str = utf8_string((PyUnicodeObject *)unicode, NULL);
708 if (str == NULL)
709 return NULL;
710 Py_INCREF(str);
711 return str;
712}
713
714/* --- UTF-16 Codec ------------------------------------------------------- */
715
716static
717int utf16_decoding_error(const Py_UNICODE **source,
718 Py_UNICODE **dest,
719 const char *errors,
720 const char *details)
721{
722 if ((errors == NULL) ||
723 (strcmp(errors,"strict") == 0)) {
724 PyErr_Format(PyExc_UnicodeError,
725 "UTF-16 decoding error: %s",
726 details);
727 return -1;
728 }
729 else if (strcmp(errors,"ignore") == 0) {
730 return 0;
731 }
732 else if (strcmp(errors,"replace") == 0) {
733 if (dest) {
734 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
735 (*dest)++;
736 }
737 return 0;
738 }
739 else {
740 PyErr_Format(PyExc_ValueError,
Barry Warsaw51ac5802000-03-20 16:36:48 +0000741 "UTF-16 decoding error; unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000742 errors);
743 return -1;
744 }
745}
746
747#define UTF16_ERROR(details) do { \
748 if (utf16_decoding_error(&q, &p, errors, details)) \
749 goto onError; \
750 continue; \
751} while(0)
752
753PyObject *PyUnicode_DecodeUTF16(const char *s,
754 int size,
755 const char *errors,
756 int *byteorder)
757{
758 PyUnicodeObject *unicode;
759 Py_UNICODE *p;
760 const Py_UNICODE *q, *e;
761 int bo = 0;
762
763 /* size should be an even number */
764 if (size % sizeof(Py_UNICODE) != 0) {
765 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
766 return NULL;
767 /* The remaining input chars are ignored if we fall through
768 here... */
769 }
770
771 /* Note: size will always be longer than the resulting Unicode
772 character count */
773 unicode = _PyUnicode_New(size);
774 if (!unicode)
775 return NULL;
776 if (size == 0)
777 return (PyObject *)unicode;
778
779 /* Unpack UTF-16 encoded data */
780 p = unicode->str;
781 q = (Py_UNICODE *)s;
782 e = q + (size / sizeof(Py_UNICODE));
783
784 if (byteorder)
785 bo = *byteorder;
786
787 while (q < e) {
788 register Py_UNICODE ch = *q++;
789
790 /* Check for BOM marks (U+FEFF) in the input and adjust
791 current byte order setting accordingly. Swap input
792 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
793 !) */
794#ifdef BYTEORDER_IS_LITTLE_ENDIAN
795 if (ch == 0xFEFF) {
796 bo = -1;
797 continue;
798 } else if (ch == 0xFFFE) {
799 bo = 1;
800 continue;
801 }
802 if (bo == 1)
803 ch = (ch >> 8) | (ch << 8);
804#else
805 if (ch == 0xFEFF) {
806 bo = 1;
807 continue;
808 } else if (ch == 0xFFFE) {
809 bo = -1;
810 continue;
811 }
812 if (bo == -1)
813 ch = (ch >> 8) | (ch << 8);
814#endif
815 if (ch < 0xD800 || ch > 0xDFFF) {
816 *p++ = ch;
817 continue;
818 }
819
820 /* UTF-16 code pair: */
821 if (q >= e)
822 UTF16_ERROR("unexpected end of data");
823 if (0xDC00 <= *q && *q <= 0xDFFF) {
824 q++;
825 if (0xD800 <= *q && *q <= 0xDBFF)
826 /* This is valid data (a UTF-16 surrogate pair), but
827 we are not able to store this information since our
828 Py_UNICODE type only has 16 bits... this might
829 change someday, even though it's unlikely. */
830 UTF16_ERROR("code pairs are not supported");
831 else
832 continue;
833 }
834 UTF16_ERROR("illegal encoding");
835 }
836
837 if (byteorder)
838 *byteorder = bo;
839
840 /* Adjust length */
841 if (_PyUnicode_Resize(unicode, p - unicode->str))
842 goto onError;
843
844 return (PyObject *)unicode;
845
846onError:
847 Py_DECREF(unicode);
848 return NULL;
849}
850
851#undef UTF16_ERROR
852
853PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
854 int size,
855 const char *errors,
856 int byteorder)
857{
858 PyObject *v;
859 Py_UNICODE *p;
860 char *q;
861
862 /* We don't create UTF-16 pairs... */
863 v = PyString_FromStringAndSize(NULL,
864 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
865 if (v == NULL)
866 return NULL;
867 if (size == 0)
868 goto done;
869
870 q = PyString_AS_STRING(v);
871 p = (Py_UNICODE *)q;
872
873 if (byteorder == 0)
874 *p++ = 0xFEFF;
875 if (byteorder == 0 ||
876#ifdef BYTEORDER_IS_LITTLE_ENDIAN
877 byteorder == -1
878#else
879 byteorder == 1
880#endif
881 )
882 memcpy(p, s, size * sizeof(Py_UNICODE));
883 else
884 while (size-- > 0) {
885 Py_UNICODE ch = *s++;
886 *p++ = (ch >> 8) | (ch << 8);
887 }
888 done:
889 return v;
890}
891
892PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
893{
894 if (!PyUnicode_Check(unicode)) {
895 PyErr_BadArgument();
896 return NULL;
897 }
898 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
899 PyUnicode_GET_SIZE(unicode),
900 NULL,
901 0);
902}
903
904/* --- Unicode Escape Codec ----------------------------------------------- */
905
906static
907int unicodeescape_decoding_error(const char **source,
908 unsigned int *x,
909 const char *errors,
910 const char *details)
911{
912 if ((errors == NULL) ||
913 (strcmp(errors,"strict") == 0)) {
914 PyErr_Format(PyExc_UnicodeError,
915 "Unicode-Escape decoding error: %s",
916 details);
917 return -1;
918 }
919 else if (strcmp(errors,"ignore") == 0) {
920 return 0;
921 }
922 else if (strcmp(errors,"replace") == 0) {
923 *x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
924 return 0;
925 }
926 else {
927 PyErr_Format(PyExc_ValueError,
928 "Unicode-Escape decoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +0000929 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000930 errors);
931 return -1;
932 }
933}
934
935PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
936 int size,
937 const char *errors)
938{
939 PyUnicodeObject *v;
940 Py_UNICODE *p = NULL, *buf = NULL;
941 const char *end;
942
943 /* Escaped strings will always be longer than the resulting
944 Unicode string, so we start with size here and then reduce the
945 length after conversion to the true value. */
946 v = _PyUnicode_New(size);
947 if (v == NULL)
948 goto onError;
949 if (size == 0)
950 return (PyObject *)v;
951 p = buf = PyUnicode_AS_UNICODE(v);
952 end = s + size;
953 while (s < end) {
954 unsigned char c;
955 unsigned int x;
956 int i;
957
958 /* Non-escape characters are interpreted as Unicode ordinals */
959 if (*s != '\\') {
960 *p++ = (unsigned char)*s++;
961 continue;
962 }
963
964 /* \ - Escapes */
965 s++;
966 switch (*s++) {
967
968 /* \x escapes */
969 case '\n': break;
970 case '\\': *p++ = '\\'; break;
971 case '\'': *p++ = '\''; break;
972 case '\"': *p++ = '\"'; break;
973 case 'b': *p++ = '\b'; break;
974 case 'f': *p++ = '\014'; break; /* FF */
975 case 't': *p++ = '\t'; break;
976 case 'n': *p++ = '\n'; break;
977 case 'r': *p++ = '\r'; break;
978 case 'v': *p++ = '\013'; break; /* VT */
979 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
980
981 /* \OOO (octal) escapes */
982 case '0': case '1': case '2': case '3':
983 case '4': case '5': case '6': case '7':
984 c = s[-1] - '0';
985 if ('0' <= *s && *s <= '7') {
986 c = (c<<3) + *s++ - '0';
987 if ('0' <= *s && *s <= '7')
988 c = (c<<3) + *s++ - '0';
989 }
990 *p++ = c;
991 break;
992
993 /* \xXXXX escape with 0-4 hex digits */
994 case 'x':
995 x = 0;
996 c = (unsigned char)*s;
997 if (isxdigit(c)) {
998 do {
999 x = (x<<4) & ~0xF;
1000 if ('0' <= c && c <= '9')
1001 x += c - '0';
1002 else if ('a' <= c && c <= 'f')
1003 x += 10 + c - 'a';
1004 else
1005 x += 10 + c - 'A';
1006 c = (unsigned char)*++s;
1007 } while (isxdigit(c));
1008 *p++ = x;
1009 } else {
1010 *p++ = '\\';
1011 *p++ = (unsigned char)s[-1];
1012 }
1013 break;
1014
1015 /* \uXXXX with 4 hex digits */
1016 case 'u':
1017 for (x = 0, i = 0; i < 4; i++) {
1018 c = (unsigned char)s[i];
1019 if (!isxdigit(c)) {
1020 if (unicodeescape_decoding_error(&s, &x, errors,
1021 "truncated \\uXXXX"))
1022 goto onError;
1023 i++;
1024 break;
1025 }
1026 x = (x<<4) & ~0xF;
1027 if (c >= '0' && c <= '9')
1028 x += c - '0';
1029 else if (c >= 'a' && c <= 'f')
1030 x += 10 + c - 'a';
1031 else
1032 x += 10 + c - 'A';
1033 }
1034 s += i;
1035 *p++ = x;
1036 break;
1037
1038 default:
1039 *p++ = '\\';
1040 *p++ = (unsigned char)s[-1];
1041 break;
1042 }
1043 }
1044 _PyUnicode_Resize(v, (int)(p - buf));
1045 return (PyObject *)v;
1046
1047 onError:
1048 Py_XDECREF(v);
1049 return NULL;
1050}
1051
1052/* Return a Unicode-Escape string version of the Unicode object.
1053
1054 If quotes is true, the string is enclosed in u"" or u'' quotes as
1055 appropriate.
1056
1057*/
1058
Barry Warsaw51ac5802000-03-20 16:36:48 +00001059static const Py_UNICODE *findchar(const Py_UNICODE *s,
1060 int size,
1061 Py_UNICODE ch);
1062
Guido van Rossumd57fd912000-03-10 22:53:23 +00001063static
1064PyObject *unicodeescape_string(const Py_UNICODE *s,
1065 int size,
1066 int quotes)
1067{
1068 PyObject *repr;
1069 char *p;
1070 char *q;
1071
1072 static const char *hexdigit = "0123456789ABCDEF";
1073
1074 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1075 if (repr == NULL)
1076 return NULL;
1077
1078 p = q = PyString_AS_STRING(repr);
1079
1080 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001081 *p++ = 'u';
1082 *p++ = (findchar(s, size, '\'') &&
1083 !findchar(s, size, '"')) ? '"' : '\'';
1084 }
1085 while (size-- > 0) {
1086 Py_UNICODE ch = *s++;
1087 /* Escape quotes */
1088 if (quotes && (ch == q[1] || ch == '\\')) {
1089 *p++ = '\\';
1090 *p++ = (char) ch;
1091 }
1092 /* Map 16-bit characters to '\uxxxx' */
1093 else if (ch >= 256) {
1094 *p++ = '\\';
1095 *p++ = 'u';
1096 *p++ = hexdigit[(ch >> 12) & 0xf];
1097 *p++ = hexdigit[(ch >> 8) & 0xf];
1098 *p++ = hexdigit[(ch >> 4) & 0xf];
1099 *p++ = hexdigit[ch & 15];
1100 }
1101 /* Map non-printable US ASCII to '\ooo' */
1102 else if (ch < ' ' || ch >= 128) {
1103 *p++ = '\\';
1104 *p++ = hexdigit[(ch >> 6) & 7];
1105 *p++ = hexdigit[(ch >> 3) & 7];
1106 *p++ = hexdigit[ch & 7];
1107 }
1108 /* Copy everything else as-is */
1109 else
1110 *p++ = (char) ch;
1111 }
1112 if (quotes)
1113 *p++ = q[1];
1114
1115 *p = '\0';
1116 _PyString_Resize(&repr, p - q);
1117
1118 return repr;
1119}
1120
1121PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1122 int size)
1123{
1124 return unicodeescape_string(s, size, 0);
1125}
1126
1127PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1128{
1129 if (!PyUnicode_Check(unicode)) {
1130 PyErr_BadArgument();
1131 return NULL;
1132 }
1133 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1134 PyUnicode_GET_SIZE(unicode));
1135}
1136
1137/* --- Raw Unicode Escape Codec ------------------------------------------- */
1138
1139PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1140 int size,
1141 const char *errors)
1142{
1143 PyUnicodeObject *v;
1144 Py_UNICODE *p, *buf;
1145 const char *end;
1146 const char *bs;
1147
1148 /* Escaped strings will always be longer than the resulting
1149 Unicode string, so we start with size here and then reduce the
1150 length after conversion to the true value. */
1151 v = _PyUnicode_New(size);
1152 if (v == NULL)
1153 goto onError;
1154 if (size == 0)
1155 return (PyObject *)v;
1156 p = buf = PyUnicode_AS_UNICODE(v);
1157 end = s + size;
1158 while (s < end) {
1159 unsigned char c;
1160 unsigned int x;
1161 int i;
1162
1163 /* Non-escape characters are interpreted as Unicode ordinals */
1164 if (*s != '\\') {
1165 *p++ = (unsigned char)*s++;
1166 continue;
1167 }
1168
1169 /* \u-escapes are only interpreted iff the number of leading
1170 backslashes if odd */
1171 bs = s;
1172 for (;s < end;) {
1173 if (*s != '\\')
1174 break;
1175 *p++ = (unsigned char)*s++;
1176 }
1177 if (((s - bs) & 1) == 0 ||
1178 s >= end ||
1179 *s != 'u') {
1180 continue;
1181 }
1182 p--;
1183 s++;
1184
1185 /* \uXXXX with 4 hex digits */
1186 for (x = 0, i = 0; i < 4; i++) {
1187 c = (unsigned char)s[i];
1188 if (!isxdigit(c)) {
1189 if (unicodeescape_decoding_error(&s, &x, errors,
1190 "truncated \\uXXXX"))
1191 goto onError;
1192 i++;
1193 break;
1194 }
1195 x = (x<<4) & ~0xF;
1196 if (c >= '0' && c <= '9')
1197 x += c - '0';
1198 else if (c >= 'a' && c <= 'f')
1199 x += 10 + c - 'a';
1200 else
1201 x += 10 + c - 'A';
1202 }
1203 s += i;
1204 *p++ = x;
1205 }
1206 _PyUnicode_Resize(v, (int)(p - buf));
1207 return (PyObject *)v;
1208
1209 onError:
1210 Py_XDECREF(v);
1211 return NULL;
1212}
1213
1214PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1215 int size)
1216{
1217 PyObject *repr;
1218 char *p;
1219 char *q;
1220
1221 static const char *hexdigit = "0123456789ABCDEF";
1222
1223 repr = PyString_FromStringAndSize(NULL, 6 * size);
1224 if (repr == NULL)
1225 return NULL;
1226
1227 p = q = PyString_AS_STRING(repr);
1228 while (size-- > 0) {
1229 Py_UNICODE ch = *s++;
1230 /* Map 16-bit characters to '\uxxxx' */
1231 if (ch >= 256) {
1232 *p++ = '\\';
1233 *p++ = 'u';
1234 *p++ = hexdigit[(ch >> 12) & 0xf];
1235 *p++ = hexdigit[(ch >> 8) & 0xf];
1236 *p++ = hexdigit[(ch >> 4) & 0xf];
1237 *p++ = hexdigit[ch & 15];
1238 }
1239 /* Copy everything else as-is */
1240 else
1241 *p++ = (char) ch;
1242 }
1243 *p = '\0';
1244 _PyString_Resize(&repr, p - q);
1245
1246 return repr;
1247}
1248
1249PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1250{
1251 if (!PyUnicode_Check(unicode)) {
1252 PyErr_BadArgument();
1253 return NULL;
1254 }
1255 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1256 PyUnicode_GET_SIZE(unicode));
1257}
1258
1259/* --- Latin-1 Codec ------------------------------------------------------ */
1260
1261PyObject *PyUnicode_DecodeLatin1(const char *s,
1262 int size,
1263 const char *errors)
1264{
1265 PyUnicodeObject *v;
1266 Py_UNICODE *p;
1267
1268 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1269 v = _PyUnicode_New(size);
1270 if (v == NULL)
1271 goto onError;
1272 if (size == 0)
1273 return (PyObject *)v;
1274 p = PyUnicode_AS_UNICODE(v);
1275 while (size-- > 0)
1276 *p++ = (unsigned char)*s++;
1277 return (PyObject *)v;
1278
1279 onError:
1280 Py_XDECREF(v);
1281 return NULL;
1282}
1283
1284static
1285int latin1_encoding_error(const Py_UNICODE **source,
1286 char **dest,
1287 const char *errors,
1288 const char *details)
1289{
1290 if ((errors == NULL) ||
1291 (strcmp(errors,"strict") == 0)) {
1292 PyErr_Format(PyExc_UnicodeError,
1293 "Latin-1 encoding error: %s",
1294 details);
1295 return -1;
1296 }
1297 else if (strcmp(errors,"ignore") == 0) {
1298 return 0;
1299 }
1300 else if (strcmp(errors,"replace") == 0) {
1301 **dest = '?';
1302 return 0;
1303 }
1304 else {
1305 PyErr_Format(PyExc_ValueError,
1306 "Latin-1 encoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001307 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001308 errors);
1309 return -1;
1310 }
1311}
1312
1313PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1314 int size,
1315 const char *errors)
1316{
1317 PyObject *repr;
1318 char *s;
1319 repr = PyString_FromStringAndSize(NULL, size);
1320 if (repr == NULL)
1321 return NULL;
1322
1323 s = PyString_AS_STRING(repr);
1324 while (size-- > 0) {
1325 Py_UNICODE ch = *p++;
1326 if (ch >= 256) {
1327 if (latin1_encoding_error(&p, &s, errors,
1328 "ordinal not in range(256)"))
1329 goto onError;
1330 }
1331 else
1332 *s++ = (char)ch;
1333 }
1334 return repr;
1335
1336 onError:
1337 Py_DECREF(repr);
1338 return NULL;
1339}
1340
1341PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1342{
1343 if (!PyUnicode_Check(unicode)) {
1344 PyErr_BadArgument();
1345 return NULL;
1346 }
1347 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1348 PyUnicode_GET_SIZE(unicode),
1349 NULL);
1350}
1351
1352/* --- 7-bit ASCII Codec -------------------------------------------------- */
1353
1354static
1355int ascii_decoding_error(const char **source,
1356 Py_UNICODE **dest,
1357 const char *errors,
1358 const char *details)
1359{
1360 if ((errors == NULL) ||
1361 (strcmp(errors,"strict") == 0)) {
1362 PyErr_Format(PyExc_UnicodeError,
1363 "ASCII decoding error: %s",
1364 details);
1365 return -1;
1366 }
1367 else if (strcmp(errors,"ignore") == 0) {
1368 return 0;
1369 }
1370 else if (strcmp(errors,"replace") == 0) {
1371 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1372 (*dest)++;
1373 return 0;
1374 }
1375 else {
1376 PyErr_Format(PyExc_ValueError,
1377 "ASCII decoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001378 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001379 errors);
1380 return -1;
1381 }
1382}
1383
1384PyObject *PyUnicode_DecodeASCII(const char *s,
1385 int size,
1386 const char *errors)
1387{
1388 PyUnicodeObject *v;
1389 Py_UNICODE *p;
1390
1391 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1392 v = _PyUnicode_New(size);
1393 if (v == NULL)
1394 goto onError;
1395 if (size == 0)
1396 return (PyObject *)v;
1397 p = PyUnicode_AS_UNICODE(v);
1398 while (size-- > 0) {
1399 register unsigned char c;
1400
1401 c = (unsigned char)*s++;
1402 if (c < 128)
1403 *p++ = c;
1404 else if (ascii_decoding_error(&s, &p, errors,
1405 "ordinal not in range(128)"))
1406 goto onError;
1407 }
1408 if (p - PyUnicode_AS_UNICODE(v) < size)
1409 _PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));
1410 return (PyObject *)v;
1411
1412 onError:
1413 Py_XDECREF(v);
1414 return NULL;
1415}
1416
1417static
1418int ascii_encoding_error(const Py_UNICODE **source,
1419 char **dest,
1420 const char *errors,
1421 const char *details)
1422{
1423 if ((errors == NULL) ||
1424 (strcmp(errors,"strict") == 0)) {
1425 PyErr_Format(PyExc_UnicodeError,
1426 "ASCII encoding error: %s",
1427 details);
1428 return -1;
1429 }
1430 else if (strcmp(errors,"ignore") == 0) {
1431 return 0;
1432 }
1433 else if (strcmp(errors,"replace") == 0) {
1434 **dest = '?';
1435 return 0;
1436 }
1437 else {
1438 PyErr_Format(PyExc_ValueError,
1439 "ASCII encoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001440 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001441 errors);
1442 return -1;
1443 }
1444}
1445
1446PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1447 int size,
1448 const char *errors)
1449{
1450 PyObject *repr;
1451 char *s;
1452 repr = PyString_FromStringAndSize(NULL, size);
1453 if (repr == NULL)
1454 return NULL;
1455
1456 s = PyString_AS_STRING(repr);
1457 while (size-- > 0) {
1458 Py_UNICODE ch = *p++;
1459 if (ch >= 128) {
1460 if (ascii_encoding_error(&p, &s, errors,
1461 "ordinal not in range(128)"))
1462 goto onError;
1463 }
1464 else
1465 *s++ = (char)ch;
1466 }
1467 return repr;
1468
1469 onError:
1470 Py_DECREF(repr);
1471 return NULL;
1472}
1473
1474PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1475{
1476 if (!PyUnicode_Check(unicode)) {
1477 PyErr_BadArgument();
1478 return NULL;
1479 }
1480 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1481 PyUnicode_GET_SIZE(unicode),
1482 NULL);
1483}
1484
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001485#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001486
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001487/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001488
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001489PyObject *PyUnicode_DecodeMBCS(const char *s,
1490 int size,
1491 const char *errors)
1492{
1493 PyUnicodeObject *v;
1494 Py_UNICODE *p;
1495
1496 /* First get the size of the result */
1497 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
1498 if (usize==0)
1499 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1500
1501 v = _PyUnicode_New(usize);
1502 if (v == NULL)
1503 return NULL;
1504 if (usize == 0)
1505 return (PyObject *)v;
1506 p = PyUnicode_AS_UNICODE(v);
1507 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1508 Py_DECREF(v);
1509 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1510 }
1511
1512 return (PyObject *)v;
1513}
1514
1515PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1516 int size,
1517 const char *errors)
1518{
1519 PyObject *repr;
1520 char *s;
1521
1522 /* First get the size of the result */
1523 DWORD mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
1524 if (mbcssize==0)
1525 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1526
1527 repr = PyString_FromStringAndSize(NULL, mbcssize);
1528 if (repr == NULL)
1529 return NULL;
1530 if (mbcssize==0)
1531 return repr;
1532
1533 /* Do the conversion */
1534 s = PyString_AS_STRING(repr);
1535 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1536 Py_DECREF(repr);
1537 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1538 }
1539 return repr;
1540}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001541
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001542#endif /* MS_WIN32 */
1543
Guido van Rossumd57fd912000-03-10 22:53:23 +00001544/* --- Character Mapping Codec -------------------------------------------- */
1545
1546static
1547int charmap_decoding_error(const char **source,
1548 Py_UNICODE **dest,
1549 const char *errors,
1550 const char *details)
1551{
1552 if ((errors == NULL) ||
1553 (strcmp(errors,"strict") == 0)) {
1554 PyErr_Format(PyExc_UnicodeError,
1555 "charmap decoding error: %s",
1556 details);
1557 return -1;
1558 }
1559 else if (strcmp(errors,"ignore") == 0) {
1560 return 0;
1561 }
1562 else if (strcmp(errors,"replace") == 0) {
1563 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1564 (*dest)++;
1565 return 0;
1566 }
1567 else {
1568 PyErr_Format(PyExc_ValueError,
1569 "charmap decoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001570 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001571 errors);
1572 return -1;
1573 }
1574}
1575
1576PyObject *PyUnicode_DecodeCharmap(const char *s,
1577 int size,
1578 PyObject *mapping,
1579 const char *errors)
1580{
1581 PyUnicodeObject *v;
1582 Py_UNICODE *p;
1583
1584 /* Default to Latin-1 */
1585 if (mapping == NULL)
1586 return PyUnicode_DecodeLatin1(s, size, errors);
1587
1588 v = _PyUnicode_New(size);
1589 if (v == NULL)
1590 goto onError;
1591 if (size == 0)
1592 return (PyObject *)v;
1593 p = PyUnicode_AS_UNICODE(v);
1594 while (size-- > 0) {
1595 unsigned char ch = *s++;
1596 PyObject *w, *x;
1597
1598 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1599 w = PyInt_FromLong((long)ch);
1600 if (w == NULL)
1601 goto onError;
1602 x = PyObject_GetItem(mapping, w);
1603 Py_DECREF(w);
1604 if (x == NULL) {
1605 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1606 /* No mapping found: default to Latin-1 mapping */
1607 PyErr_Clear();
1608 *p++ = (Py_UNICODE)ch;
1609 continue;
1610 }
1611 goto onError;
1612 }
1613
1614 /* Apply mapping */
1615 if (PyInt_Check(x)) {
1616 int value = PyInt_AS_LONG(x);
1617 if (value < 0 || value > 65535) {
1618 PyErr_SetString(PyExc_TypeError,
1619 "character mapping must be in range(65336)");
1620 Py_DECREF(x);
1621 goto onError;
1622 }
1623 *p++ = (Py_UNICODE)value;
1624 }
1625 else if (x == Py_None) {
1626 /* undefined mapping */
1627 if (charmap_decoding_error(&s, &p, errors,
1628 "character maps to <undefined>")) {
1629 Py_DECREF(x);
1630 goto onError;
1631 }
1632 }
1633 else if (PyUnicode_Check(x)) {
1634 if (PyUnicode_GET_SIZE(x) != 1) {
1635 /* 1-n mapping */
1636 PyErr_SetString(PyExc_NotImplementedError,
1637 "1-n mappings are currently not implemented");
1638 Py_DECREF(x);
1639 goto onError;
1640 }
1641 *p++ = *PyUnicode_AS_UNICODE(x);
1642 }
1643 else {
1644 /* wrong return value */
1645 PyErr_SetString(PyExc_TypeError,
1646 "character mapping must return integer, None or unicode");
1647 Py_DECREF(x);
1648 goto onError;
1649 }
1650 Py_DECREF(x);
1651 }
1652 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1653 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1654 goto onError;
1655 return (PyObject *)v;
1656
1657 onError:
1658 Py_XDECREF(v);
1659 return NULL;
1660}
1661
1662static
1663int charmap_encoding_error(const Py_UNICODE **source,
1664 char **dest,
1665 const char *errors,
1666 const char *details)
1667{
1668 if ((errors == NULL) ||
1669 (strcmp(errors,"strict") == 0)) {
1670 PyErr_Format(PyExc_UnicodeError,
1671 "charmap encoding error: %s",
1672 details);
1673 return -1;
1674 }
1675 else if (strcmp(errors,"ignore") == 0) {
1676 return 0;
1677 }
1678 else if (strcmp(errors,"replace") == 0) {
1679 **dest = '?';
1680 (*dest)++;
1681 return 0;
1682 }
1683 else {
1684 PyErr_Format(PyExc_ValueError,
1685 "charmap encoding error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001686 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687 errors);
1688 return -1;
1689 }
1690}
1691
1692PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
1693 int size,
1694 PyObject *mapping,
1695 const char *errors)
1696{
1697 PyObject *v;
1698 char *s;
1699
1700 /* Default to Latin-1 */
1701 if (mapping == NULL)
1702 return PyUnicode_EncodeLatin1(p, size, errors);
1703
1704 v = PyString_FromStringAndSize(NULL, size);
1705 if (v == NULL)
1706 return NULL;
1707 s = PyString_AS_STRING(v);
1708 while (size-- > 0) {
1709 Py_UNICODE ch = *p++;
1710 PyObject *w, *x;
1711
1712 /* Get mapping (Unicode ordinal -> string char, integer or None) */
1713 w = PyInt_FromLong((long)ch);
1714 if (w == NULL)
1715 goto onError;
1716 x = PyObject_GetItem(mapping, w);
1717 Py_DECREF(w);
1718 if (x == NULL) {
1719 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1720 /* No mapping found: default to Latin-1 mapping if possible */
1721 PyErr_Clear();
1722 if (ch < 256) {
1723 *s++ = (char)ch;
1724 continue;
1725 }
1726 else if (!charmap_encoding_error(&p, &s, errors,
1727 "missing character mapping"))
1728 continue;
1729 }
1730 goto onError;
1731 }
1732
1733 /* Apply mapping */
1734 if (PyInt_Check(x)) {
1735 int value = PyInt_AS_LONG(x);
1736 if (value < 0 || value > 255) {
1737 PyErr_SetString(PyExc_TypeError,
1738 "character mapping must be in range(256)");
1739 Py_DECREF(x);
1740 goto onError;
1741 }
1742 *s++ = (char)value;
1743 }
1744 else if (x == Py_None) {
1745 /* undefined mapping */
1746 if (charmap_encoding_error(&p, &s, errors,
1747 "character maps to <undefined>")) {
1748 Py_DECREF(x);
1749 goto onError;
1750 }
1751 }
1752 else if (PyString_Check(x)) {
1753 if (PyString_GET_SIZE(x) != 1) {
1754 /* 1-n mapping */
1755 PyErr_SetString(PyExc_NotImplementedError,
1756 "1-n mappings are currently not implemented");
1757 Py_DECREF(x);
1758 goto onError;
1759 }
1760 *s++ = *PyString_AS_STRING(x);
1761 }
1762 else {
1763 /* wrong return value */
1764 PyErr_SetString(PyExc_TypeError,
1765 "character mapping must return integer, None or unicode");
1766 Py_DECREF(x);
1767 goto onError;
1768 }
1769 Py_DECREF(x);
1770 }
1771 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
1772 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
1773 goto onError;
1774 return v;
1775
1776 onError:
1777 Py_DECREF(v);
1778 return NULL;
1779}
1780
1781PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
1782 PyObject *mapping)
1783{
1784 if (!PyUnicode_Check(unicode) || mapping == NULL) {
1785 PyErr_BadArgument();
1786 return NULL;
1787 }
1788 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
1789 PyUnicode_GET_SIZE(unicode),
1790 mapping,
1791 NULL);
1792}
1793
1794static
1795int translate_error(const Py_UNICODE **source,
1796 Py_UNICODE **dest,
1797 const char *errors,
1798 const char *details)
1799{
1800 if ((errors == NULL) ||
1801 (strcmp(errors,"strict") == 0)) {
1802 PyErr_Format(PyExc_UnicodeError,
1803 "translate error: %s",
1804 details);
1805 return -1;
1806 }
1807 else if (strcmp(errors,"ignore") == 0) {
1808 return 0;
1809 }
1810 else if (strcmp(errors,"replace") == 0) {
1811 **dest = '?';
1812 (*dest)++;
1813 return 0;
1814 }
1815 else {
1816 PyErr_Format(PyExc_ValueError,
1817 "translate error; "
Barry Warsaw51ac5802000-03-20 16:36:48 +00001818 "unknown error handling code: %s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001819 errors);
1820 return -1;
1821 }
1822}
1823
1824PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
1825 int size,
1826 PyObject *mapping,
1827 const char *errors)
1828{
1829 PyUnicodeObject *v;
1830 Py_UNICODE *p;
1831
1832 if (mapping == NULL) {
1833 PyErr_BadArgument();
1834 return NULL;
1835 }
1836
1837 /* Output will never be longer than input */
1838 v = _PyUnicode_New(size);
1839 if (v == NULL)
1840 goto onError;
1841 if (size == 0)
1842 goto done;
1843 p = PyUnicode_AS_UNICODE(v);
1844 while (size-- > 0) {
1845 Py_UNICODE ch = *s++;
1846 PyObject *w, *x;
1847
1848 /* Get mapping */
1849 w = PyInt_FromLong(ch);
1850 if (w == NULL)
1851 goto onError;
1852 x = PyObject_GetItem(mapping, w);
1853 Py_DECREF(w);
1854 if (x == NULL) {
1855 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1856 /* No mapping found: default to 1-1 mapping */
1857 PyErr_Clear();
1858 *p++ = ch;
1859 continue;
1860 }
1861 goto onError;
1862 }
1863
1864 /* Apply mapping */
1865 if (PyInt_Check(x))
1866 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
1867 else if (x == Py_None) {
1868 /* undefined mapping */
1869 if (translate_error(&s, &p, errors,
1870 "character maps to <undefined>")) {
1871 Py_DECREF(x);
1872 goto onError;
1873 }
1874 }
1875 else if (PyUnicode_Check(x)) {
1876 if (PyUnicode_GET_SIZE(x) != 1) {
1877 /* 1-n mapping */
1878 PyErr_SetString(PyExc_NotImplementedError,
1879 "1-n mappings are currently not implemented");
1880 Py_DECREF(x);
1881 goto onError;
1882 }
1883 *p++ = *PyUnicode_AS_UNICODE(x);
1884 }
1885 else {
1886 /* wrong return value */
1887 PyErr_SetString(PyExc_TypeError,
1888 "translate mapping must return integer, None or unicode");
1889 Py_DECREF(x);
1890 goto onError;
1891 }
1892 Py_DECREF(x);
1893 }
1894 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1895 _PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v)));
1896
1897 done:
1898 return (PyObject *)v;
1899
1900 onError:
1901 Py_XDECREF(v);
1902 return NULL;
1903}
1904
1905PyObject *PyUnicode_Translate(PyObject *str,
1906 PyObject *mapping,
1907 const char *errors)
1908{
1909 PyObject *result;
1910
1911 str = PyUnicode_FromObject(str);
1912 if (str == NULL)
1913 goto onError;
1914 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
1915 PyUnicode_GET_SIZE(str),
1916 mapping,
1917 errors);
1918 Py_DECREF(str);
1919 return result;
1920
1921 onError:
1922 Py_XDECREF(str);
1923 return NULL;
1924}
1925
1926/* --- Helpers ------------------------------------------------------------ */
1927
1928static
1929int count(PyUnicodeObject *self,
1930 int start,
1931 int end,
1932 PyUnicodeObject *substring)
1933{
1934 int count = 0;
1935
1936 end -= substring->length;
1937
1938 while (start <= end)
1939 if (Py_UNICODE_MATCH(self, start, substring)) {
1940 count++;
1941 start += substring->length;
1942 } else
1943 start++;
1944
1945 return count;
1946}
1947
1948int PyUnicode_Count(PyObject *str,
1949 PyObject *substr,
1950 int start,
1951 int end)
1952{
1953 int result;
1954
1955 str = PyUnicode_FromObject(str);
1956 if (str == NULL)
1957 return -1;
1958 substr = PyUnicode_FromObject(substr);
1959 if (substr == NULL) {
1960 Py_DECREF(substr);
1961 return -1;
1962 }
1963
1964 result = count((PyUnicodeObject *)str,
1965 start, end,
1966 (PyUnicodeObject *)substr);
1967
1968 Py_DECREF(str);
1969 Py_DECREF(substr);
1970 return result;
1971}
1972
1973static
1974int findstring(PyUnicodeObject *self,
1975 PyUnicodeObject *substring,
1976 int start,
1977 int end,
1978 int direction)
1979{
1980 if (start < 0)
1981 start += self->length;
1982 if (start < 0)
1983 start = 0;
1984
1985 if (substring->length == 0)
1986 return start;
1987
1988 if (end > self->length)
1989 end = self->length;
1990 if (end < 0)
1991 end += self->length;
1992 if (end < 0)
1993 end = 0;
1994
1995 end -= substring->length;
1996
1997 if (direction < 0) {
1998 for (; end >= start; end--)
1999 if (Py_UNICODE_MATCH(self, end, substring))
2000 return end;
2001 } else {
2002 for (; start <= end; start++)
2003 if (Py_UNICODE_MATCH(self, start, substring))
2004 return start;
2005 }
2006
2007 return -1;
2008}
2009
2010int PyUnicode_Find(PyObject *str,
2011 PyObject *substr,
2012 int start,
2013 int end,
2014 int direction)
2015{
2016 int result;
2017
2018 str = PyUnicode_FromObject(str);
2019 if (str == NULL)
2020 return -1;
2021 substr = PyUnicode_FromObject(substr);
2022 if (substr == NULL) {
2023 Py_DECREF(substr);
2024 return -1;
2025 }
2026
2027 result = findstring((PyUnicodeObject *)str,
2028 (PyUnicodeObject *)substr,
2029 start, end, direction);
2030 Py_DECREF(str);
2031 Py_DECREF(substr);
2032 return result;
2033}
2034
2035static
2036int tailmatch(PyUnicodeObject *self,
2037 PyUnicodeObject *substring,
2038 int start,
2039 int end,
2040 int direction)
2041{
2042 if (start < 0)
2043 start += self->length;
2044 if (start < 0)
2045 start = 0;
2046
2047 if (substring->length == 0)
2048 return 1;
2049
2050 if (end > self->length)
2051 end = self->length;
2052 if (end < 0)
2053 end += self->length;
2054 if (end < 0)
2055 end = 0;
2056
2057 end -= substring->length;
2058 if (end < start)
2059 return 0;
2060
2061 if (direction > 0) {
2062 if (Py_UNICODE_MATCH(self, end, substring))
2063 return 1;
2064 } else {
2065 if (Py_UNICODE_MATCH(self, start, substring))
2066 return 1;
2067 }
2068
2069 return 0;
2070}
2071
2072int PyUnicode_Tailmatch(PyObject *str,
2073 PyObject *substr,
2074 int start,
2075 int end,
2076 int direction)
2077{
2078 int result;
2079
2080 str = PyUnicode_FromObject(str);
2081 if (str == NULL)
2082 return -1;
2083 substr = PyUnicode_FromObject(substr);
2084 if (substr == NULL) {
2085 Py_DECREF(substr);
2086 return -1;
2087 }
2088
2089 result = tailmatch((PyUnicodeObject *)str,
2090 (PyUnicodeObject *)substr,
2091 start, end, direction);
2092 Py_DECREF(str);
2093 Py_DECREF(substr);
2094 return result;
2095}
2096
2097static
2098const Py_UNICODE *findchar(const Py_UNICODE *s,
2099 int size,
2100 Py_UNICODE ch)
2101{
2102 /* like wcschr, but doesn't stop at NULL characters */
2103
2104 while (size-- > 0) {
2105 if (*s == ch)
2106 return s;
2107 s++;
2108 }
2109
2110 return NULL;
2111}
2112
2113/* Apply fixfct filter to the Unicode object self and return a
2114 reference to the modified object */
2115
2116static
2117PyObject *fixup(PyUnicodeObject *self,
2118 int (*fixfct)(PyUnicodeObject *s))
2119{
2120
2121 PyUnicodeObject *u;
2122
2123 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2124 self->length);
2125 if (u == NULL)
2126 return NULL;
2127 if (!fixfct(u)) {
2128 /* fixfct should return TRUE if it modified the buffer. If
2129 FALSE, return a reference to the original buffer instead
2130 (to save space, not time) */
2131 Py_INCREF(self);
2132 Py_DECREF(u);
2133 return (PyObject*) self;
2134 }
2135 return (PyObject*) u;
2136}
2137
2138static
2139int fixupper(PyUnicodeObject *self)
2140{
2141 int len = self->length;
2142 Py_UNICODE *s = self->str;
2143 int status = 0;
2144
2145 while (len-- > 0) {
2146 register Py_UNICODE ch;
2147
2148 ch = Py_UNICODE_TOUPPER(*s);
2149 if (ch != *s) {
2150 status = 1;
2151 *s = ch;
2152 }
2153 s++;
2154 }
2155
2156 return status;
2157}
2158
2159static
2160int fixlower(PyUnicodeObject *self)
2161{
2162 int len = self->length;
2163 Py_UNICODE *s = self->str;
2164 int status = 0;
2165
2166 while (len-- > 0) {
2167 register Py_UNICODE ch;
2168
2169 ch = Py_UNICODE_TOLOWER(*s);
2170 if (ch != *s) {
2171 status = 1;
2172 *s = ch;
2173 }
2174 s++;
2175 }
2176
2177 return status;
2178}
2179
2180static
2181int fixswapcase(PyUnicodeObject *self)
2182{
2183 int len = self->length;
2184 Py_UNICODE *s = self->str;
2185 int status = 0;
2186
2187 while (len-- > 0) {
2188 if (Py_UNICODE_ISUPPER(*s)) {
2189 *s = Py_UNICODE_TOLOWER(*s);
2190 status = 1;
2191 } else if (Py_UNICODE_ISLOWER(*s)) {
2192 *s = Py_UNICODE_TOUPPER(*s);
2193 status = 1;
2194 }
2195 s++;
2196 }
2197
2198 return status;
2199}
2200
2201static
2202int fixcapitalize(PyUnicodeObject *self)
2203{
2204 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2205 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2206 return 1;
2207 }
2208 return 0;
2209}
2210
2211static
2212int fixtitle(PyUnicodeObject *self)
2213{
2214 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2215 register Py_UNICODE *e;
2216 int previous_is_cased;
2217
2218 /* Shortcut for single character strings */
2219 if (PyUnicode_GET_SIZE(self) == 1) {
2220 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2221 if (*p != ch) {
2222 *p = ch;
2223 return 1;
2224 }
2225 else
2226 return 0;
2227 }
2228
2229 e = p + PyUnicode_GET_SIZE(self);
2230 previous_is_cased = 0;
2231 for (; p < e; p++) {
2232 register const Py_UNICODE ch = *p;
2233
2234 if (previous_is_cased)
2235 *p = Py_UNICODE_TOLOWER(ch);
2236 else
2237 *p = Py_UNICODE_TOTITLE(ch);
2238
2239 if (Py_UNICODE_ISLOWER(ch) ||
2240 Py_UNICODE_ISUPPER(ch) ||
2241 Py_UNICODE_ISTITLE(ch))
2242 previous_is_cased = 1;
2243 else
2244 previous_is_cased = 0;
2245 }
2246 return 1;
2247}
2248
2249PyObject *PyUnicode_Join(PyObject *separator,
2250 PyObject *seq)
2251{
2252 Py_UNICODE *sep;
2253 int seplen;
2254 PyUnicodeObject *res = NULL;
2255 int reslen = 0;
2256 Py_UNICODE *p;
2257 int seqlen = 0;
2258 int sz = 100;
2259 int i;
2260
2261 seqlen = PySequence_Length(seq);
2262 if (seqlen < 0 && PyErr_Occurred())
2263 return NULL;
2264
2265 if (separator == NULL) {
2266 Py_UNICODE blank = ' ';
2267 sep = &blank;
2268 seplen = 1;
2269 }
2270 else {
2271 separator = PyUnicode_FromObject(separator);
2272 if (separator == NULL)
2273 return NULL;
2274 sep = PyUnicode_AS_UNICODE(separator);
2275 seplen = PyUnicode_GET_SIZE(separator);
2276 }
2277
2278 res = _PyUnicode_New(sz);
2279 if (res == NULL)
2280 goto onError;
2281 p = PyUnicode_AS_UNICODE(res);
2282 reslen = 0;
2283
2284 for (i = 0; i < seqlen; i++) {
2285 int itemlen;
2286 PyObject *item;
2287
2288 item = PySequence_GetItem(seq, i);
2289 if (item == NULL)
2290 goto onError;
2291 if (!PyUnicode_Check(item)) {
2292 PyObject *v;
2293 v = PyUnicode_FromObject(item);
2294 Py_DECREF(item);
2295 item = v;
2296 if (item == NULL)
2297 goto onError;
2298 }
2299 itemlen = PyUnicode_GET_SIZE(item);
2300 while (reslen + itemlen + seplen >= sz) {
2301 if (_PyUnicode_Resize(res, sz*2))
2302 goto onError;
2303 sz *= 2;
2304 p = PyUnicode_AS_UNICODE(res) + reslen;
2305 }
2306 if (i > 0) {
2307 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2308 p += seplen;
2309 reslen += seplen;
2310 }
2311 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2312 p += itemlen;
2313 reslen += itemlen;
2314 Py_DECREF(item);
2315 }
2316 if (_PyUnicode_Resize(res, reslen))
2317 goto onError;
2318
2319 Py_XDECREF(separator);
2320 return (PyObject *)res;
2321
2322 onError:
2323 Py_XDECREF(separator);
2324 Py_DECREF(res);
2325 return NULL;
2326}
2327
2328static
2329PyUnicodeObject *pad(PyUnicodeObject *self,
2330 int left,
2331 int right,
2332 Py_UNICODE fill)
2333{
2334 PyUnicodeObject *u;
2335
2336 if (left < 0)
2337 left = 0;
2338 if (right < 0)
2339 right = 0;
2340
2341 if (left == 0 && right == 0) {
2342 Py_INCREF(self);
2343 return self;
2344 }
2345
2346 u = _PyUnicode_New(left + self->length + right);
2347 if (u) {
2348 if (left)
2349 Py_UNICODE_FILL(u->str, fill, left);
2350 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2351 if (right)
2352 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2353 }
2354
2355 return u;
2356}
2357
/* Append the slice data[left:right] to the list being built.

   Relies on three names in the expanding function: the variables
   `str` (scratch PyObject *) and `list` (target list), and the label
   `onError`, which is jumped to if the slice cannot be created or
   appended.  Wrapped in do/while(0) so that it behaves like a single
   statement wherever it is used. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* the list now holds its own reference */                      \
        Py_DECREF(str);                                                 \
    } while (0)
2368
2369static
2370PyObject *split_whitespace(PyUnicodeObject *self,
2371 PyObject *list,
2372 int maxcount)
2373{
2374 register int i;
2375 register int j;
2376 int len = self->length;
2377 PyObject *str;
2378
2379 for (i = j = 0; i < len; ) {
2380 /* find a token */
2381 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2382 i++;
2383 j = i;
2384 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2385 i++;
2386 if (j < i) {
2387 if (maxcount-- <= 0)
2388 break;
2389 SPLIT_APPEND(self->str, j, i);
2390 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2391 i++;
2392 j = i;
2393 }
2394 }
2395 if (j < len) {
2396 SPLIT_APPEND(self->str, j, len);
2397 }
2398 return list;
2399
2400 onError:
2401 Py_DECREF(list);
2402 return NULL;
2403}
2404
2405PyObject *PyUnicode_Splitlines(PyObject *string,
2406 int maxcount)
2407{
2408 register int i;
2409 register int j;
2410 int len;
2411 PyObject *list;
2412 PyObject *str;
2413 Py_UNICODE *data;
2414
2415 string = PyUnicode_FromObject(string);
2416 if (string == NULL)
2417 return NULL;
2418 data = PyUnicode_AS_UNICODE(string);
2419 len = PyUnicode_GET_SIZE(string);
2420
2421 if (maxcount < 0)
2422 maxcount = INT_MAX;
2423
2424 list = PyList_New(0);
2425 if (!list)
2426 goto onError;
2427
2428 for (i = j = 0; i < len; ) {
2429 /* Find a line and append it */
2430 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2431 i++;
2432 if (maxcount-- <= 0)
2433 break;
2434 SPLIT_APPEND(data, j, i);
2435
2436 /* Skip the line break reading CRLF as one line break */
2437 if (i < len) {
2438 if (data[i] == '\r' && i + 1 < len &&
2439 data[i+1] == '\n')
2440 i += 2;
2441 else
2442 i++;
2443 }
2444 j = i;
2445 }
2446 if (j < len) {
2447 SPLIT_APPEND(data, j, len);
2448 }
2449
2450 Py_DECREF(string);
2451 return list;
2452
2453 onError:
2454 Py_DECREF(list);
2455 Py_DECREF(string);
2456 return NULL;
2457}
2458
2459static
2460PyObject *split_char(PyUnicodeObject *self,
2461 PyObject *list,
2462 Py_UNICODE ch,
2463 int maxcount)
2464{
2465 register int i;
2466 register int j;
2467 int len = self->length;
2468 PyObject *str;
2469
2470 for (i = j = 0; i < len; ) {
2471 if (self->str[i] == ch) {
2472 if (maxcount-- <= 0)
2473 break;
2474 SPLIT_APPEND(self->str, j, i);
2475 i = j = i + 1;
2476 } else
2477 i++;
2478 }
2479 if (j <= len) {
2480 SPLIT_APPEND(self->str, j, len);
2481 }
2482 return list;
2483
2484 onError:
2485 Py_DECREF(list);
2486 return NULL;
2487}
2488
2489static
2490PyObject *split_substring(PyUnicodeObject *self,
2491 PyObject *list,
2492 PyUnicodeObject *substring,
2493 int maxcount)
2494{
2495 register int i;
2496 register int j;
2497 int len = self->length;
2498 int sublen = substring->length;
2499 PyObject *str;
2500
2501 for (i = j = 0; i < len - sublen; ) {
2502 if (Py_UNICODE_MATCH(self, i, substring)) {
2503 if (maxcount-- <= 0)
2504 break;
2505 SPLIT_APPEND(self->str, j, i);
2506 i = j = i + sublen;
2507 } else
2508 i++;
2509 }
2510 if (j <= len) {
2511 SPLIT_APPEND(self->str, j, len);
2512 }
2513 return list;
2514
2515 onError:
2516 Py_DECREF(list);
2517 return NULL;
2518}
2519
2520#undef SPLIT_APPEND
2521
2522static
2523PyObject *split(PyUnicodeObject *self,
2524 PyUnicodeObject *substring,
2525 int maxcount)
2526{
2527 PyObject *list;
2528
2529 if (maxcount < 0)
2530 maxcount = INT_MAX;
2531
2532 list = PyList_New(0);
2533 if (!list)
2534 return NULL;
2535
2536 if (substring == NULL)
2537 return split_whitespace(self,list,maxcount);
2538
2539 else if (substring->length == 1)
2540 return split_char(self,list,substring->str[0],maxcount);
2541
2542 else if (substring->length == 0) {
2543 Py_DECREF(list);
2544 PyErr_SetString(PyExc_ValueError, "empty separator");
2545 return NULL;
2546 }
2547 else
2548 return split_substring(self,list,substring,maxcount);
2549}
2550
2551static
2552PyObject *strip(PyUnicodeObject *self,
2553 int left,
2554 int right)
2555{
2556 Py_UNICODE *p = self->str;
2557 int start = 0;
2558 int end = self->length;
2559
2560 if (left)
2561 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2562 start++;
2563
2564 if (right)
2565 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2566 end--;
2567
2568 if (start == 0 && end == self->length) {
2569 /* couldn't strip anything off, return original string */
2570 Py_INCREF(self);
2571 return (PyObject*) self;
2572 }
2573
2574 return (PyObject*) PyUnicode_FromUnicode(
2575 self->str + start,
2576 end - start
2577 );
2578}
2579
2580static
2581PyObject *replace(PyUnicodeObject *self,
2582 PyUnicodeObject *str1,
2583 PyUnicodeObject *str2,
2584 int maxcount)
2585{
2586 PyUnicodeObject *u;
2587
2588 if (maxcount < 0)
2589 maxcount = INT_MAX;
2590
2591 if (str1->length == 1 && str2->length == 1) {
2592 int i;
2593
2594 /* replace characters */
2595 if (!findchar(self->str, self->length, str1->str[0])) {
2596 /* nothing to replace, return original string */
2597 Py_INCREF(self);
2598 u = self;
2599 } else {
2600 Py_UNICODE u1 = str1->str[0];
2601 Py_UNICODE u2 = str2->str[0];
2602
2603 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
2604 self->str,
2605 self->length
2606 );
2607 if (u)
2608 for (i = 0; i < u->length; i++)
2609 if (u->str[i] == u1) {
2610 if (--maxcount < 0)
2611 break;
2612 u->str[i] = u2;
2613 }
2614 }
2615
2616 } else {
2617 int n, i;
2618 Py_UNICODE *p;
2619
2620 /* replace strings */
2621 n = count(self, 0, self->length, str1);
2622 if (n > maxcount)
2623 n = maxcount;
2624 if (n == 0) {
2625 /* nothing to replace, return original string */
2626 Py_INCREF(self);
2627 u = self;
2628 } else {
2629 u = _PyUnicode_New(
2630 self->length + n * (str2->length - str1->length));
2631 if (u) {
2632 i = 0;
2633 p = u->str;
2634 while (i <= self->length - str1->length)
2635 if (Py_UNICODE_MATCH(self, i, str1)) {
2636 /* replace string segment */
2637 Py_UNICODE_COPY(p, str2->str, str2->length);
2638 p += str2->length;
2639 i += str1->length;
2640 if (--n <= 0) {
2641 /* copy remaining part */
2642 Py_UNICODE_COPY(p, self->str+i, self->length-i);
2643 break;
2644 }
2645 } else
2646 *p++ = self->str[i++];
2647 }
2648 }
2649 }
2650
2651 return (PyObject *) u;
2652}
2653
2654/* --- Unicode Object Methods --------------------------------------------- */
2655
2656static char title__doc__[] =
2657"S.title() -> unicode\n\
2658\n\
2659Return a titlecased version of S, i.e. words start with title case\n\
2660characters, all remaining cased characters have lower case.";
2661
2662static PyObject*
2663unicode_title(PyUnicodeObject *self, PyObject *args)
2664{
2665 if (!PyArg_NoArgs(args))
2666 return NULL;
2667 return fixup(self, fixtitle);
2668}
2669
2670static char capitalize__doc__[] =
2671"S.capitalize() -> unicode\n\
2672\n\
2673Return a capitalized version of S, i.e. make the first character\n\
2674have upper case.";
2675
2676static PyObject*
2677unicode_capitalize(PyUnicodeObject *self, PyObject *args)
2678{
2679 if (!PyArg_NoArgs(args))
2680 return NULL;
2681 return fixup(self, fixcapitalize);
2682}
2683
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords() (currently compiled out): split self into words,
   run fixup(..., fixcapitalize) on every word and join the list again
   with PyUnicode_Join(NULL, list).  Returns NULL on any failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result = NULL;
    int pos = 0;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* step 1: split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* step 2: swap every list entry for its capitalized version */
    while (pos < PyList_GET_SIZE(words)) {
        PyObject *old = PyList_GET_ITEM(words, pos);
        PyObject *fixed = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (fixed == NULL)
            goto done;
        Py_DECREF(old);
        PyList_SET_ITEM(words, pos, fixed);
        pos++;
    }

    /* step 3: glue the words back together */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
2724
2725static char center__doc__[] =
2726"S.center(width) -> unicode\n\
2727\n\
2728Return S centered in a Unicode string of length width. Padding is done\n\
2729using spaces.";
2730
2731static PyObject *
2732unicode_center(PyUnicodeObject *self, PyObject *args)
2733{
2734 int marg, left;
2735 int width;
2736
2737 if (!PyArg_ParseTuple(args, "i:center", &width))
2738 return NULL;
2739
2740 if (self->length >= width) {
2741 Py_INCREF(self);
2742 return (PyObject*) self;
2743 }
2744
2745 marg = width - self->length;
2746 left = marg / 2 + (marg & width & 1);
2747
2748 return (PyObject*) pad(self, left, marg - left, ' ');
2749}
2750
2751static int
2752unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
2753{
2754 int len1, len2;
2755 Py_UNICODE *s1 = str1->str;
2756 Py_UNICODE *s2 = str2->str;
2757
2758 len1 = str1->length;
2759 len2 = str2->length;
2760
2761 while (len1 > 0 && len2 > 0) {
2762 int cmp = (*s1++) - (*s2++);
2763 if (cmp)
2764 /* This should make Christian happy! */
2765 return (cmp < 0) ? -1 : (cmp != 0);
2766 len1--, len2--;
2767 }
2768
2769 return (len1 < len2) ? -1 : (len1 != len2);
2770}
2771
2772int PyUnicode_Compare(PyObject *left,
2773 PyObject *right)
2774{
2775 PyUnicodeObject *u = NULL, *v = NULL;
2776 int result;
2777
2778 /* Coerce the two arguments */
2779 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2780 if (u == NULL)
2781 goto onError;
2782 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2783 if (v == NULL)
2784 goto onError;
2785
2786 /* Shortcut for emtpy or interned objects */
2787 if (v == u) {
2788 Py_DECREF(u);
2789 Py_DECREF(v);
2790 return 0;
2791 }
2792
2793 result = unicode_compare(u, v);
2794
2795 Py_DECREF(u);
2796 Py_DECREF(v);
2797 return result;
2798
2799onError:
2800 Py_XDECREF(u);
2801 Py_XDECREF(v);
2802 return -1;
2803}
2804
Guido van Rossum403d68b2000-03-13 15:55:09 +00002805int PyUnicode_Contains(PyObject *container,
2806 PyObject *element)
2807{
2808 PyUnicodeObject *u = NULL, *v = NULL;
2809 int result;
2810 register const Py_UNICODE *p, *e;
2811 register Py_UNICODE ch;
2812
2813 /* Coerce the two arguments */
2814 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
2815 if (u == NULL)
2816 goto onError;
2817 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
2818 if (v == NULL)
2819 goto onError;
2820
2821 /* Check v in u */
2822 if (PyUnicode_GET_SIZE(v) != 1) {
2823 PyErr_SetString(PyExc_TypeError,
2824 "string member test needs char left operand");
2825 goto onError;
2826 }
2827 ch = *PyUnicode_AS_UNICODE(v);
2828 p = PyUnicode_AS_UNICODE(u);
2829 e = p + PyUnicode_GET_SIZE(u);
2830 result = 0;
2831 while (p < e) {
2832 if (*p++ == ch) {
2833 result = 1;
2834 break;
2835 }
2836 }
2837
2838 Py_DECREF(u);
2839 Py_DECREF(v);
2840 return result;
2841
2842onError:
2843 Py_XDECREF(u);
2844 Py_XDECREF(v);
2845 return -1;
2846}
2847
Guido van Rossumd57fd912000-03-10 22:53:23 +00002848/* Concat to string or Unicode object giving a new Unicode object. */
2849
2850PyObject *PyUnicode_Concat(PyObject *left,
2851 PyObject *right)
2852{
2853 PyUnicodeObject *u = NULL, *v = NULL, *w;
2854
2855 /* Coerce the two arguments */
2856 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2857 if (u == NULL)
2858 goto onError;
2859 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2860 if (v == NULL)
2861 goto onError;
2862
2863 /* Shortcuts */
2864 if (v == unicode_empty) {
2865 Py_DECREF(v);
2866 return (PyObject *)u;
2867 }
2868 if (u == unicode_empty) {
2869 Py_DECREF(u);
2870 return (PyObject *)v;
2871 }
2872
2873 /* Concat the two Unicode strings */
2874 w = _PyUnicode_New(u->length + v->length);
2875 if (w == NULL)
2876 goto onError;
2877 Py_UNICODE_COPY(w->str, u->str, u->length);
2878 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
2879
2880 Py_DECREF(u);
2881 Py_DECREF(v);
2882 return (PyObject *)w;
2883
2884onError:
2885 Py_XDECREF(u);
2886 Py_XDECREF(v);
2887 return NULL;
2888}
2889
2890static char count__doc__[] =
2891"S.count(sub[, start[, end]]) -> int\n\
2892\n\
2893Return the number of occurrences of substring sub in Unicode string\n\
2894S[start:end]. Optional arguments start and end are\n\
2895interpreted as in slice notation.";
2896
2897static PyObject *
2898unicode_count(PyUnicodeObject *self, PyObject *args)
2899{
2900 PyUnicodeObject *substring;
2901 int start = 0;
2902 int end = INT_MAX;
2903 PyObject *result;
2904
2905 if (!PyArg_ParseTuple(args, "O|ii:count", &substring, &start, &end))
2906 return NULL;
2907
2908 substring = (PyUnicodeObject *)PyUnicode_FromObject(
2909 (PyObject *)substring);
2910 if (substring == NULL)
2911 return NULL;
2912
2913 if (substring->length == 0) {
2914 Py_DECREF(substring);
2915 return PyInt_FromLong((long) 0);
2916 }
2917
2918 if (start < 0)
2919 start += self->length;
2920 if (start < 0)
2921 start = 0;
2922 if (end > self->length)
2923 end = self->length;
2924 if (end < 0)
2925 end += self->length;
2926 if (end < 0)
2927 end = 0;
2928
2929 result = PyInt_FromLong((long) count(self, start, end, substring));
2930
2931 Py_DECREF(substring);
2932 return result;
2933}
2934
2935static char encode__doc__[] =
2936"S.encode([encoding[,errors]]) -> string\n\
2937\n\
2938Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
2939errors may be given to set a different error handling scheme. Default\n\
2940is 'strict' meaning that encoding errors raise a ValueError. Other\n\
2941possible values are 'ignore' and 'replace'.";
2942
2943static PyObject *
2944unicode_encode(PyUnicodeObject *self, PyObject *args)
2945{
2946 char *encoding = NULL;
2947 char *errors = NULL;
2948 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
2949 return NULL;
2950 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
2951}
2952
2953static char expandtabs__doc__[] =
2954"S.expandtabs([tabsize]) -> unicode\n\
2955\n\
2956Return a copy of S where all tab characters are expanded using spaces.\n\
2957If tabsize is not given, a tab size of 8 characters is assumed.";
2958
2959static PyObject*
2960unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
2961{
2962 Py_UNICODE *e;
2963 Py_UNICODE *p;
2964 Py_UNICODE *q;
2965 int i, j;
2966 PyUnicodeObject *u;
2967 int tabsize = 8;
2968
2969 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
2970 return NULL;
2971
2972 /* First pass: determine size of ouput string */
2973 i = j = 0;
2974 e = self->str + self->length;
2975 for (p = self->str; p < e; p++)
2976 if (*p == '\t') {
2977 if (tabsize > 0)
2978 j += tabsize - (j % tabsize);
2979 }
2980 else {
2981 j++;
2982 if (*p == '\n' || *p == '\r') {
2983 i += j;
2984 j = 0;
2985 }
2986 }
2987
2988 /* Second pass: create output string and fill it */
2989 u = _PyUnicode_New(i + j);
2990 if (!u)
2991 return NULL;
2992
2993 j = 0;
2994 q = u->str;
2995
2996 for (p = self->str; p < e; p++)
2997 if (*p == '\t') {
2998 if (tabsize > 0) {
2999 i = tabsize - (j % tabsize);
3000 j += i;
3001 while (i--)
3002 *q++ = ' ';
3003 }
3004 }
3005 else {
3006 j++;
3007 *q++ = *p;
3008 if (*p == '\n' || *p == '\r')
3009 j = 0;
3010 }
3011
3012 return (PyObject*) u;
3013}
3014
3015static char find__doc__[] =
3016"S.find(sub [,start [,end]]) -> int\n\
3017\n\
3018Return the lowest index in S where substring sub is found,\n\
3019such that sub is contained within s[start,end]. Optional\n\
3020arguments start and end are interpreted as in slice notation.\n\
3021\n\
3022Return -1 on failure.";
3023
3024static PyObject *
3025unicode_find(PyUnicodeObject *self, PyObject *args)
3026{
3027 PyUnicodeObject *substring;
3028 int start = 0;
3029 int end = INT_MAX;
3030 PyObject *result;
3031
3032 if (!PyArg_ParseTuple(args, "O|ii:find", &substring, &start, &end))
3033 return NULL;
3034 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3035 (PyObject *)substring);
3036 if (substring == NULL)
3037 return NULL;
3038
3039 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3040
3041 Py_DECREF(substring);
3042 return result;
3043}
3044
3045static PyObject *
3046unicode_getitem(PyUnicodeObject *self, int index)
3047{
3048 if (index < 0 || index >= self->length) {
3049 PyErr_SetString(PyExc_IndexError, "string index out of range");
3050 return NULL;
3051 }
3052
3053 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3054}
3055
3056static long
3057unicode_hash(PyUnicodeObject *self)
3058{
3059 long hash;
3060 PyObject *utf8;
3061
3062 /* Since Unicode objects compare equal to their UTF-8 string
3063 counterparts, they should also use the UTF-8 strings as basis
3064 for their hash value. This is needed to assure that strings and
3065 Unicode objects behave in the same way as dictionary
3066 keys. Unfortunately, this costs some performance and also some
3067 memory if the cached UTF-8 representation is not used later
3068 on. */
3069 if (self->hash != -1)
3070 return self->hash;
3071 utf8 = utf8_string(self, NULL);
3072 if (utf8 == NULL)
3073 return -1;
3074 hash = PyObject_Hash(utf8);
3075 if (hash == -1)
3076 return -1;
3077 self->hash = hash;
3078 return hash;
3079}
3080
3081static char index__doc__[] =
3082"S.index(sub [,start [,end]]) -> int\n\
3083\n\
3084Like S.find() but raise ValueError when the substring is not found.";
3085
3086static PyObject *
3087unicode_index(PyUnicodeObject *self, PyObject *args)
3088{
3089 int result;
3090 PyUnicodeObject *substring;
3091 int start = 0;
3092 int end = INT_MAX;
3093
3094 if (!PyArg_ParseTuple(args, "O|ii:index", &substring, &start, &end))
3095 return NULL;
3096
3097 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3098 (PyObject *)substring);
3099 if (substring == NULL)
3100 return NULL;
3101
3102 result = findstring(self, substring, start, end, 1);
3103
3104 Py_DECREF(substring);
3105 if (result < 0) {
3106 PyErr_SetString(PyExc_ValueError, "substring not found");
3107 return NULL;
3108 }
3109 return PyInt_FromLong(result);
3110}
3111
3112static char islower__doc__[] =
3113"S.islower() -> int\n\
3114\n\
3115Return 1 if all cased characters in S are lowercase and there is\n\
3116at least one cased character in S, 0 otherwise.";
3117
3118static PyObject*
3119unicode_islower(PyUnicodeObject *self, PyObject *args)
3120{
3121 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3122 register const Py_UNICODE *e;
3123 int cased;
3124
3125 if (!PyArg_NoArgs(args))
3126 return NULL;
3127
3128 /* Shortcut for single character strings */
3129 if (PyUnicode_GET_SIZE(self) == 1)
3130 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3131
3132 e = p + PyUnicode_GET_SIZE(self);
3133 cased = 0;
3134 for (; p < e; p++) {
3135 register const Py_UNICODE ch = *p;
3136
3137 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3138 return PyInt_FromLong(0);
3139 else if (!cased && Py_UNICODE_ISLOWER(ch))
3140 cased = 1;
3141 }
3142 return PyInt_FromLong(cased);
3143}
3144
3145static char isupper__doc__[] =
3146"S.isupper() -> int\n\
3147\n\
3148Return 1 if all cased characters in S are uppercase and there is\n\
3149at least one cased character in S, 0 otherwise.";
3150
3151static PyObject*
3152unicode_isupper(PyUnicodeObject *self, PyObject *args)
3153{
3154 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3155 register const Py_UNICODE *e;
3156 int cased;
3157
3158 if (!PyArg_NoArgs(args))
3159 return NULL;
3160
3161 /* Shortcut for single character strings */
3162 if (PyUnicode_GET_SIZE(self) == 1)
3163 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3164
3165 e = p + PyUnicode_GET_SIZE(self);
3166 cased = 0;
3167 for (; p < e; p++) {
3168 register const Py_UNICODE ch = *p;
3169
3170 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3171 return PyInt_FromLong(0);
3172 else if (!cased && Py_UNICODE_ISUPPER(ch))
3173 cased = 1;
3174 }
3175 return PyInt_FromLong(cased);
3176}
3177
3178static char istitle__doc__[] =
3179"S.istitle() -> int\n\
3180\n\
3181Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3182may only follow uncased characters and lowercase characters only cased\n\
3183ones. Return 0 otherwise.";
3184
3185static PyObject*
3186unicode_istitle(PyUnicodeObject *self, PyObject *args)
3187{
3188 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3189 register const Py_UNICODE *e;
3190 int cased, previous_is_cased;
3191
3192 if (!PyArg_NoArgs(args))
3193 return NULL;
3194
3195 /* Shortcut for single character strings */
3196 if (PyUnicode_GET_SIZE(self) == 1)
3197 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3198 (Py_UNICODE_ISUPPER(*p) != 0));
3199
3200 e = p + PyUnicode_GET_SIZE(self);
3201 cased = 0;
3202 previous_is_cased = 0;
3203 for (; p < e; p++) {
3204 register const Py_UNICODE ch = *p;
3205
3206 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3207 if (previous_is_cased)
3208 return PyInt_FromLong(0);
3209 previous_is_cased = 1;
3210 cased = 1;
3211 }
3212 else if (Py_UNICODE_ISLOWER(ch)) {
3213 if (!previous_is_cased)
3214 return PyInt_FromLong(0);
3215 previous_is_cased = 1;
3216 cased = 1;
3217 }
3218 else
3219 previous_is_cased = 0;
3220 }
3221 return PyInt_FromLong(cased);
3222}
3223
3224static char isspace__doc__[] =
3225"S.isspace() -> int\n\
3226\n\
3227Return 1 if there are only whitespace characters in S,\n\
32280 otherwise.";
3229
3230static PyObject*
3231unicode_isspace(PyUnicodeObject *self, PyObject *args)
3232{
3233 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3234 register const Py_UNICODE *e;
3235
3236 if (!PyArg_NoArgs(args))
3237 return NULL;
3238
3239 /* Shortcut for single character strings */
3240 if (PyUnicode_GET_SIZE(self) == 1 &&
3241 Py_UNICODE_ISSPACE(*p))
3242 return PyInt_FromLong(1);
3243
3244 e = p + PyUnicode_GET_SIZE(self);
3245 for (; p < e; p++) {
3246 if (!Py_UNICODE_ISSPACE(*p))
3247 return PyInt_FromLong(0);
3248 }
3249 return PyInt_FromLong(1);
3250}
3251
3252static char isdecimal__doc__[] =
3253"S.isdecimal() -> int\n\
3254\n\
3255Return 1 if there are only decimal characters in S,\n\
32560 otherwise.";
3257
3258static PyObject*
3259unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3260{
3261 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3262 register const Py_UNICODE *e;
3263
3264 if (!PyArg_NoArgs(args))
3265 return NULL;
3266
3267 /* Shortcut for single character strings */
3268 if (PyUnicode_GET_SIZE(self) == 1 &&
3269 Py_UNICODE_ISDECIMAL(*p))
3270 return PyInt_FromLong(1);
3271
3272 e = p + PyUnicode_GET_SIZE(self);
3273 for (; p < e; p++) {
3274 if (!Py_UNICODE_ISDECIMAL(*p))
3275 return PyInt_FromLong(0);
3276 }
3277 return PyInt_FromLong(1);
3278}
3279
3280static char isdigit__doc__[] =
3281"S.isdigit() -> int\n\
3282\n\
3283Return 1 if there are only digit characters in S,\n\
32840 otherwise.";
3285
3286static PyObject*
3287unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3288{
3289 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3290 register const Py_UNICODE *e;
3291
3292 if (!PyArg_NoArgs(args))
3293 return NULL;
3294
3295 /* Shortcut for single character strings */
3296 if (PyUnicode_GET_SIZE(self) == 1 &&
3297 Py_UNICODE_ISDIGIT(*p))
3298 return PyInt_FromLong(1);
3299
3300 e = p + PyUnicode_GET_SIZE(self);
3301 for (; p < e; p++) {
3302 if (!Py_UNICODE_ISDIGIT(*p))
3303 return PyInt_FromLong(0);
3304 }
3305 return PyInt_FromLong(1);
3306}
3307
3308static char isnumeric__doc__[] =
3309"S.isnumeric() -> int\n\
3310\n\
3311Return 1 if there are only numeric characters in S,\n\
33120 otherwise.";
3313
3314static PyObject*
3315unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3316{
3317 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3318 register const Py_UNICODE *e;
3319
3320 if (!PyArg_NoArgs(args))
3321 return NULL;
3322
3323 /* Shortcut for single character strings */
3324 if (PyUnicode_GET_SIZE(self) == 1 &&
3325 Py_UNICODE_ISNUMERIC(*p))
3326 return PyInt_FromLong(1);
3327
3328 e = p + PyUnicode_GET_SIZE(self);
3329 for (; p < e; p++) {
3330 if (!Py_UNICODE_ISNUMERIC(*p))
3331 return PyInt_FromLong(0);
3332 }
3333 return PyInt_FromLong(1);
3334}
3335
3336static char join__doc__[] =
3337"S.join(sequence) -> unicode\n\
3338\n\
3339Return a string which is the concatenation of the strings in the\n\
3340sequence. The separator between elements is S.";
3341
3342static PyObject*
3343unicode_join(PyUnicodeObject *self, PyObject *args)
3344{
3345 PyObject *data;
3346 if (!PyArg_ParseTuple(args, "O:join", &data))
3347 return NULL;
3348
3349 return PyUnicode_Join((PyObject *)self, data);
3350}
3351
3352static int
3353unicode_length(PyUnicodeObject *self)
3354{
3355 return self->length;
3356}
3357
3358static char ljust__doc__[] =
3359"S.ljust(width) -> unicode\n\
3360\n\
3361Return S left justified in a Unicode string of length width. Padding is\n\
3362done using spaces.";
3363
3364static PyObject *
3365unicode_ljust(PyUnicodeObject *self, PyObject *args)
3366{
3367 int width;
3368 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3369 return NULL;
3370
3371 if (self->length >= width) {
3372 Py_INCREF(self);
3373 return (PyObject*) self;
3374 }
3375
3376 return (PyObject*) pad(self, 0, width - self->length, ' ');
3377}
3378
3379static char lower__doc__[] =
3380"S.lower() -> unicode\n\
3381\n\
3382Return a copy of the string S converted to lowercase.";
3383
3384static PyObject*
3385unicode_lower(PyUnicodeObject *self, PyObject *args)
3386{
3387 if (!PyArg_NoArgs(args))
3388 return NULL;
3389 return fixup(self, fixlower);
3390}
3391
3392static char lstrip__doc__[] =
3393"S.lstrip() -> unicode\n\
3394\n\
3395Return a copy of the string S with leading whitespace removed.";
3396
3397static PyObject *
3398unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3399{
3400 if (!PyArg_NoArgs(args))
3401 return NULL;
3402 return strip(self, 1, 0);
3403}
3404
3405static PyObject*
3406unicode_repeat(PyUnicodeObject *str, int len)
3407{
3408 PyUnicodeObject *u;
3409 Py_UNICODE *p;
3410
3411 if (len < 0)
3412 len = 0;
3413
3414 if (len == 1) {
3415 /* no repeat, return original string */
3416 Py_INCREF(str);
3417 return (PyObject*) str;
3418 }
3419
3420 u = _PyUnicode_New(len * str->length);
3421 if (!u)
3422 return NULL;
3423
3424 p = u->str;
3425
3426 while (len-- > 0) {
3427 Py_UNICODE_COPY(p, str->str, str->length);
3428 p += str->length;
3429 }
3430
3431 return (PyObject*) u;
3432}
3433
3434PyObject *PyUnicode_Replace(PyObject *obj,
3435 PyObject *subobj,
3436 PyObject *replobj,
3437 int maxcount)
3438{
3439 PyObject *self;
3440 PyObject *str1;
3441 PyObject *str2;
3442 PyObject *result;
3443
3444 self = PyUnicode_FromObject(obj);
3445 if (self == NULL)
3446 return NULL;
3447 str1 = PyUnicode_FromObject(subobj);
3448 if (str1 == NULL) {
3449 Py_DECREF(self);
3450 return NULL;
3451 }
3452 str2 = PyUnicode_FromObject(replobj);
3453 if (str2 == NULL) {
3454 Py_DECREF(self);
3455 Py_DECREF(str1);
3456 return NULL;
3457 }
3458 result = replace((PyUnicodeObject *)self,
3459 (PyUnicodeObject *)str1,
3460 (PyUnicodeObject *)str2,
3461 maxcount);
3462 Py_DECREF(self);
3463 Py_DECREF(str1);
3464 Py_DECREF(str2);
3465 return result;
3466}
3467
3468static char replace__doc__[] =
3469"S.replace (old, new[, maxsplit]) -> unicode\n\
3470\n\
3471Return a copy of S with all occurrences of substring\n\
3472old replaced by new. If the optional argument maxsplit is\n\
3473given, only the first maxsplit occurrences are replaced.";
3474
3475static PyObject*
3476unicode_replace(PyUnicodeObject *self, PyObject *args)
3477{
3478 PyUnicodeObject *str1;
3479 PyUnicodeObject *str2;
3480 int maxcount = -1;
3481 PyObject *result;
3482
3483 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
3484 return NULL;
3485 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
3486 if (str1 == NULL)
3487 return NULL;
3488 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
3489 if (str2 == NULL)
3490 return NULL;
3491
3492 result = replace(self, str1, str2, maxcount);
3493
3494 Py_DECREF(str1);
3495 Py_DECREF(str2);
3496 return result;
3497}
3498
3499static
3500PyObject *unicode_repr(PyObject *unicode)
3501{
3502 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
3503 PyUnicode_GET_SIZE(unicode),
3504 1);
3505}
3506
3507static char rfind__doc__[] =
3508"S.rfind(sub [,start [,end]]) -> int\n\
3509\n\
3510Return the highest index in S where substring sub is found,\n\
3511such that sub is contained within s[start,end]. Optional\n\
3512arguments start and end are interpreted as in slice notation.\n\
3513\n\
3514Return -1 on failure.";
3515
3516static PyObject *
3517unicode_rfind(PyUnicodeObject *self, PyObject *args)
3518{
3519 PyUnicodeObject *substring;
3520 int start = 0;
3521 int end = INT_MAX;
3522 PyObject *result;
3523
3524 if (!PyArg_ParseTuple(args, "O|ii:rfind", &substring, &start, &end))
3525 return NULL;
3526 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3527 (PyObject *)substring);
3528 if (substring == NULL)
3529 return NULL;
3530
3531 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
3532
3533 Py_DECREF(substring);
3534 return result;
3535}
3536
3537static char rindex__doc__[] =
3538"S.rindex(sub [,start [,end]]) -> int\n\
3539\n\
3540Like S.rfind() but raise ValueError when the substring is not found.";
3541
3542static PyObject *
3543unicode_rindex(PyUnicodeObject *self, PyObject *args)
3544{
3545 int result;
3546 PyUnicodeObject *substring;
3547 int start = 0;
3548 int end = INT_MAX;
3549
3550 if (!PyArg_ParseTuple(args, "O|ii:rindex", &substring, &start, &end))
3551 return NULL;
3552 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3553 (PyObject *)substring);
3554 if (substring == NULL)
3555 return NULL;
3556
3557 result = findstring(self, substring, start, end, -1);
3558
3559 Py_DECREF(substring);
3560 if (result < 0) {
3561 PyErr_SetString(PyExc_ValueError, "substring not found");
3562 return NULL;
3563 }
3564 return PyInt_FromLong(result);
3565}
3566
3567static char rjust__doc__[] =
3568"S.rjust(width) -> unicode\n\
3569\n\
3570Return S right justified in a Unicode string of length width. Padding is\n\
3571done using spaces.";
3572
3573static PyObject *
3574unicode_rjust(PyUnicodeObject *self, PyObject *args)
3575{
3576 int width;
3577 if (!PyArg_ParseTuple(args, "i:rjust", &width))
3578 return NULL;
3579
3580 if (self->length >= width) {
3581 Py_INCREF(self);
3582 return (PyObject*) self;
3583 }
3584
3585 return (PyObject*) pad(self, width - self->length, 0, ' ');
3586}
3587
3588static char rstrip__doc__[] =
3589"S.rstrip() -> unicode\n\
3590\n\
3591Return a copy of the string S with trailing whitespace removed.";
3592
3593static PyObject *
3594unicode_rstrip(PyUnicodeObject *self, PyObject *args)
3595{
3596 if (!PyArg_NoArgs(args))
3597 return NULL;
3598 return strip(self, 0, 1);
3599}
3600
3601static PyObject*
3602unicode_slice(PyUnicodeObject *self, int start, int end)
3603{
3604 /* standard clamping */
3605 if (start < 0)
3606 start = 0;
3607 if (end < 0)
3608 end = 0;
3609 if (end > self->length)
3610 end = self->length;
3611 if (start == 0 && end == self->length) {
3612 /* full slice, return original string */
3613 Py_INCREF(self);
3614 return (PyObject*) self;
3615 }
3616 if (start > end)
3617 start = end;
3618 /* copy slice */
3619 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
3620 end - start);
3621}
3622
3623PyObject *PyUnicode_Split(PyObject *s,
3624 PyObject *sep,
3625 int maxsplit)
3626{
3627 PyObject *result;
3628
3629 s = PyUnicode_FromObject(s);
3630 if (s == NULL)
3631 return NULL;
3632 if (sep != NULL) {
3633 sep = PyUnicode_FromObject(sep);
3634 if (sep == NULL) {
3635 Py_DECREF(s);
3636 return NULL;
3637 }
3638 }
3639
3640 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
3641
3642 Py_DECREF(s);
3643 Py_XDECREF(sep);
3644 return result;
3645}
3646
3647static char split__doc__[] =
3648"S.split([sep [,maxsplit]]) -> list of strings\n\
3649\n\
3650Return a list of the words in S, using sep as the\n\
3651delimiter string. If maxsplit is given, at most maxsplit\n\
3652splits are done. If sep is not specified, any whitespace string\n\
3653is a separator.";
3654
3655static PyObject*
3656unicode_split(PyUnicodeObject *self, PyObject *args)
3657{
3658 PyObject *substring = Py_None;
3659 int maxcount = -1;
3660
3661 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
3662 return NULL;
3663
3664 if (substring == Py_None)
3665 return split(self, NULL, maxcount);
3666 else if (PyUnicode_Check(substring))
3667 return split(self, (PyUnicodeObject *)substring, maxcount);
3668 else
3669 return PyUnicode_Split((PyObject *)self, substring, maxcount);
3670}
3671
3672static char splitlines__doc__[] =
3673"S.splitlines([maxsplit]]) -> list of strings\n\
3674\n\
3675Return a list of the lines in S, breaking at line boundaries.\n\
3676If maxsplit is given, at most maxsplit are done. Line breaks are not\n\
3677included in the resulting list.";
3678
3679static PyObject*
3680unicode_splitlines(PyUnicodeObject *self, PyObject *args)
3681{
3682 int maxcount = -1;
3683
3684 if (!PyArg_ParseTuple(args, "|i:splitlines", &maxcount))
3685 return NULL;
3686
3687 return PyUnicode_Splitlines((PyObject *)self, maxcount);
3688}
3689
3690static
3691PyObject *unicode_str(PyUnicodeObject *self)
3692{
3693 return PyUnicode_AsUTF8String((PyObject *)self);
3694}
3695
3696static char strip__doc__[] =
3697"S.strip() -> unicode\n\
3698\n\
3699Return a copy of S with leading and trailing whitespace removed.";
3700
3701static PyObject *
3702unicode_strip(PyUnicodeObject *self, PyObject *args)
3703{
3704 if (!PyArg_NoArgs(args))
3705 return NULL;
3706 return strip(self, 1, 1);
3707}
3708
3709static char swapcase__doc__[] =
3710"S.swapcase() -> unicode\n\
3711\n\
3712Return a copy of S with uppercase characters converted to lowercase\n\
3713and vice versa.";
3714
3715static PyObject*
3716unicode_swapcase(PyUnicodeObject *self, PyObject *args)
3717{
3718 if (!PyArg_NoArgs(args))
3719 return NULL;
3720 return fixup(self, fixswapcase);
3721}
3722
3723static char translate__doc__[] =
3724"S.translate(table) -> unicode\n\
3725\n\
3726Return a copy of the string S, where all characters have been mapped\n\
3727through the given translation table, which must be a mapping of\n\
3728Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
3729are left untouched. Characters mapped to None are deleted.";
3730
3731static PyObject*
3732unicode_translate(PyUnicodeObject *self, PyObject *args)
3733{
3734 PyObject *table;
3735
3736 if (!PyArg_ParseTuple(args, "O:translate", &table))
3737 return NULL;
3738 return PyUnicode_TranslateCharmap(self->str,
3739 self->length,
3740 table,
3741 "ignore");
3742}
3743
3744static char upper__doc__[] =
3745"S.upper() -> unicode\n\
3746\n\
3747Return a copy of S converted to uppercase.";
3748
3749static PyObject*
3750unicode_upper(PyUnicodeObject *self, PyObject *args)
3751{
3752 if (!PyArg_NoArgs(args))
3753 return NULL;
3754 return fixup(self, fixupper);
3755}
3756
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* S.zfill(width) (currently compiled out): left-pad self with '0'
   characters up to width.  A leading '+' or '-' of the original string
   is moved in front of the padding.  A string that is already long
   enough is returned unchanged (new reference).  Returns NULL if
   argument parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');

    /* pad() can fail; do not touch u->str in that case */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original string */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
3792
#if 0
/* Debugging helper (currently compiled out): accepts no arguments and
   returns the value of unicode_freelist_size as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    PyObject *result = NULL;

    if (PyArg_NoArgs(args))
        result = PyInt_FromLong(unicode_freelist_size);
    return result;
}
#endif
3802
3803static char startswith__doc__[] =
3804"S.startswith(prefix[, start[, end]]) -> int\n\
3805\n\
3806Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
3807optional start, test S beginning at that position. With optional end, stop\n\
3808comparing S at that position.";
3809
3810static PyObject *
3811unicode_startswith(PyUnicodeObject *self,
3812 PyObject *args)
3813{
3814 PyUnicodeObject *substring;
3815 int start = 0;
3816 int end = INT_MAX;
3817 PyObject *result;
3818
3819 if (!PyArg_ParseTuple(args, "O|ii:startswith", &substring, &start, &end))
3820 return NULL;
3821 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3822 (PyObject *)substring);
3823 if (substring == NULL)
3824 return NULL;
3825
3826 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
3827
3828 Py_DECREF(substring);
3829 return result;
3830}
3831
3832
3833static char endswith__doc__[] =
3834"S.endswith(suffix[, start[, end]]) -> int\n\
3835\n\
3836Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
3837optional start, test S beginning at that position. With optional end, stop\n\
3838comparing S at that position.";
3839
3840static PyObject *
3841unicode_endswith(PyUnicodeObject *self,
3842 PyObject *args)
3843{
3844 PyUnicodeObject *substring;
3845 int start = 0;
3846 int end = INT_MAX;
3847 PyObject *result;
3848
3849 if (!PyArg_ParseTuple(args, "O|ii:endswith", &substring, &start, &end))
3850 return NULL;
3851 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3852 (PyObject *)substring);
3853 if (substring == NULL)
3854 return NULL;
3855
3856 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
3857
3858 Py_DECREF(substring);
3859 return result;
3860}
3861
3862
3863static PyMethodDef unicode_methods[] = {
3864
3865 /* Order is according to common usage: often used methods should
3866 appear first, since lookup is done sequentially. */
3867
3868 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
3869 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
3870 {"split", (PyCFunction) unicode_split, 1, split__doc__},
3871 {"join", (PyCFunction) unicode_join, 1, join__doc__},
3872 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
3873 {"title", (PyCFunction) unicode_title, 0, title__doc__},
3874 {"center", (PyCFunction) unicode_center, 1, center__doc__},
3875 {"count", (PyCFunction) unicode_count, 1, count__doc__},
3876 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
3877 {"find", (PyCFunction) unicode_find, 1, find__doc__},
3878 {"index", (PyCFunction) unicode_index, 1, index__doc__},
3879 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
3880 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
3881 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
3882/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
3883 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
3884 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
3885 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
3886 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
3887 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
3888 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
3889 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
3890 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
3891 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
3892 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
3893 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
3894 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
3895 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
3896 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
3897 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
3898 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
3899 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
3900 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
3901#if 0
3902 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
3903 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
3904#endif
3905
3906#if 0
3907 /* This one is just used for debugging the implementation. */
3908 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
3909#endif
3910
3911 {NULL, NULL}
3912};
3913
3914static PyObject *
3915unicode_getattr(PyUnicodeObject *self, char *name)
3916{
3917 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
3918}
3919
3920static PySequenceMethods unicode_as_sequence = {
3921 (inquiry) unicode_length, /* sq_length */
3922 (binaryfunc) PyUnicode_Concat, /* sq_concat */
3923 (intargfunc) unicode_repeat, /* sq_repeat */
3924 (intargfunc) unicode_getitem, /* sq_item */
3925 (intintargfunc) unicode_slice, /* sq_slice */
3926 0, /* sq_ass_item */
3927 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003928 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00003929};
3930
3931static int
3932unicode_buffer_getreadbuf(PyUnicodeObject *self,
3933 int index,
3934 const void **ptr)
3935{
3936 if (index != 0) {
3937 PyErr_SetString(PyExc_SystemError,
3938 "accessing non-existent unicode segment");
3939 return -1;
3940 }
3941 *ptr = (void *) self->str;
3942 return PyUnicode_GET_DATA_SIZE(self);
3943}
3944
3945static int
3946unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
3947 const void **ptr)
3948{
3949 PyErr_SetString(PyExc_TypeError,
3950 "cannot use unicode as modifyable buffer");
3951 return -1;
3952}
3953
3954static int
3955unicode_buffer_getsegcount(PyUnicodeObject *self,
3956 int *lenp)
3957{
3958 if (lenp)
3959 *lenp = PyUnicode_GET_DATA_SIZE(self);
3960 return 1;
3961}
3962
3963static int
3964unicode_buffer_getcharbuf(PyUnicodeObject *self,
3965 int index,
3966 const void **ptr)
3967{
3968 PyObject *str;
3969
3970 if (index != 0) {
3971 PyErr_SetString(PyExc_SystemError,
3972 "accessing non-existent unicode segment");
3973 return -1;
3974 }
3975 str = utf8_string(self, NULL);
3976 if (str == NULL)
3977 return -1;
3978 *ptr = (void *) PyString_AS_STRING(str);
3979 return PyString_GET_SIZE(str);
3980}
3981
3982/* Helpers for PyUnicode_Format() */
3983
3984static PyObject *
3985getnextarg(args, arglen, p_argidx)
3986 PyObject *args;
3987int arglen;
3988int *p_argidx;
3989{
3990 int argidx = *p_argidx;
3991 if (argidx < arglen) {
3992 (*p_argidx)++;
3993 if (arglen < 0)
3994 return args;
3995 else
3996 return PyTuple_GetItem(args, argidx);
3997 }
3998 PyErr_SetString(PyExc_TypeError,
3999 "not enough arguments for format string");
4000 return NULL;
4001}
4002
/* Bit flags collected while parsing a format specifier in
   PyUnicode_Format(): '-', '+', ' ', '#' and '0' respectively. */
#define F_LJUST 0x01
#define F_SIGN  0x02
#define F_BLANK 0x04
#define F_ALT   0x08
#define F_ZERO  0x10
4008
4009static
4010#ifdef HAVE_STDARG_PROTOTYPES
4011int usprintf(register Py_UNICODE *buffer, char *format, ...)
4012#else
4013int usprintf(va_alist) va_dcl
4014#endif
4015{
4016 register int i;
4017 int len;
4018 va_list va;
4019 char *charbuffer;
4020#ifdef HAVE_STDARG_PROTOTYPES
4021 va_start(va, format);
4022#else
4023 Py_UNICODE *args;
4024 char *format;
4025
4026 va_start(va);
4027 buffer = va_arg(va, Py_UNICODE *);
4028 format = va_arg(va, char *);
4029#endif
4030
4031 /* First, format the string as char array, then expand to Py_UNICODE
4032 array. */
4033 charbuffer = (char *)buffer;
4034 len = vsprintf(charbuffer, format, va);
4035 for (i = len - 1; i >= 0; i--)
4036 buffer[i] = (Py_UNICODE) charbuffer[i];
4037
4038 va_end(va);
4039 return len;
4040}
4041
4042static int
4043formatfloat(Py_UNICODE *buf,
4044 int flags,
4045 int prec,
4046 int type,
4047 PyObject *v)
4048{
4049 char fmt[20];
4050 double x;
4051
4052 x = PyFloat_AsDouble(v);
4053 if (x == -1.0 && PyErr_Occurred())
4054 return -1;
4055 if (prec < 0)
4056 prec = 6;
4057 if (prec > 50)
4058 prec = 50; /* Arbitrary limitation */
4059 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4060 type = 'g';
4061 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4062 return usprintf(buf, fmt, x);
4063}
4064
4065static int
4066formatint(Py_UNICODE *buf,
4067 int flags,
4068 int prec,
4069 int type,
4070 PyObject *v)
4071{
4072 char fmt[20];
4073 long x;
4074
4075 x = PyInt_AsLong(v);
4076 if (x == -1 && PyErr_Occurred())
4077 return -1;
4078 if (prec < 0)
4079 prec = 1;
4080 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4081 return usprintf(buf, fmt, x);
4082}
4083
4084static int
4085formatchar(Py_UNICODE *buf,
4086 PyObject *v)
4087{
4088 if (PyUnicode_Check(v))
4089 buf[0] = PyUnicode_AS_UNICODE(v)[0];
4090
4091 else if (PyString_Check(v))
4092 buf[0] = (Py_UNICODE) PyString_AS_STRING(v)[0];
4093
4094 else {
4095 /* Integer input truncated to a character */
4096 long x;
4097 x = PyInt_AsLong(v);
4098 if (x == -1 && PyErr_Occurred())
4099 return -1;
4100 buf[0] = (char) x;
4101 }
4102 buf[1] = '\0';
4103 return 1;
4104}
4105
4106PyObject *PyUnicode_Format(PyObject *format,
4107 PyObject *args)
4108{
4109 Py_UNICODE *fmt, *res;
4110 int fmtcnt, rescnt, reslen, arglen, argidx;
4111 int args_owned = 0;
4112 PyUnicodeObject *result = NULL;
4113 PyObject *dict = NULL;
4114 PyObject *uformat;
4115
4116 if (format == NULL || args == NULL) {
4117 PyErr_BadInternalCall();
4118 return NULL;
4119 }
4120 uformat = PyUnicode_FromObject(format);
4121 fmt = PyUnicode_AS_UNICODE(uformat);
4122 fmtcnt = PyUnicode_GET_SIZE(uformat);
4123
4124 reslen = rescnt = fmtcnt + 100;
4125 result = _PyUnicode_New(reslen);
4126 if (result == NULL)
4127 goto onError;
4128 res = PyUnicode_AS_UNICODE(result);
4129
4130 if (PyTuple_Check(args)) {
4131 arglen = PyTuple_Size(args);
4132 argidx = 0;
4133 }
4134 else {
4135 arglen = -1;
4136 argidx = -2;
4137 }
4138 if (args->ob_type->tp_as_mapping)
4139 dict = args;
4140
4141 while (--fmtcnt >= 0) {
4142 if (*fmt != '%') {
4143 if (--rescnt < 0) {
4144 rescnt = fmtcnt + 100;
4145 reslen += rescnt;
4146 if (_PyUnicode_Resize(result, reslen) < 0)
4147 return NULL;
4148 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4149 --rescnt;
4150 }
4151 *res++ = *fmt++;
4152 }
4153 else {
4154 /* Got a format specifier */
4155 int flags = 0;
4156 int width = -1;
4157 int prec = -1;
4158 int size = 0;
4159 Py_UNICODE c = '\0';
4160 Py_UNICODE fill;
4161 PyObject *v = NULL;
4162 PyObject *temp = NULL;
4163 Py_UNICODE *buf;
4164 Py_UNICODE sign;
4165 int len;
4166 Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
4167
4168 fmt++;
4169 if (*fmt == '(') {
4170 Py_UNICODE *keystart;
4171 int keylen;
4172 PyObject *key;
4173 int pcount = 1;
4174
4175 if (dict == NULL) {
4176 PyErr_SetString(PyExc_TypeError,
4177 "format requires a mapping");
4178 goto onError;
4179 }
4180 ++fmt;
4181 --fmtcnt;
4182 keystart = fmt;
4183 /* Skip over balanced parentheses */
4184 while (pcount > 0 && --fmtcnt >= 0) {
4185 if (*fmt == ')')
4186 --pcount;
4187 else if (*fmt == '(')
4188 ++pcount;
4189 fmt++;
4190 }
4191 keylen = fmt - keystart - 1;
4192 if (fmtcnt < 0 || pcount > 0) {
4193 PyErr_SetString(PyExc_ValueError,
4194 "incomplete format key");
4195 goto onError;
4196 }
4197 /* keys are converted to strings (using UTF-8) and
4198 then looked up since Python uses strings to hold
4199 variables names etc. in its namespaces and we
4200 wouldn't want to break common idioms. The
4201 alternative would be using Unicode objects for the
4202 lookup but u"abc" and "abc" have different hash
4203 values (on purpose). */
4204 key = PyUnicode_EncodeUTF8(keystart,
4205 keylen,
4206 NULL);
4207 if (key == NULL)
4208 goto onError;
4209 if (args_owned) {
4210 Py_DECREF(args);
4211 args_owned = 0;
4212 }
4213 args = PyObject_GetItem(dict, key);
4214 Py_DECREF(key);
4215 if (args == NULL) {
4216 goto onError;
4217 }
4218 args_owned = 1;
4219 arglen = -1;
4220 argidx = -2;
4221 }
4222 while (--fmtcnt >= 0) {
4223 switch (c = *fmt++) {
4224 case '-': flags |= F_LJUST; continue;
4225 case '+': flags |= F_SIGN; continue;
4226 case ' ': flags |= F_BLANK; continue;
4227 case '#': flags |= F_ALT; continue;
4228 case '0': flags |= F_ZERO; continue;
4229 }
4230 break;
4231 }
4232 if (c == '*') {
4233 v = getnextarg(args, arglen, &argidx);
4234 if (v == NULL)
4235 goto onError;
4236 if (!PyInt_Check(v)) {
4237 PyErr_SetString(PyExc_TypeError,
4238 "* wants int");
4239 goto onError;
4240 }
4241 width = PyInt_AsLong(v);
4242 if (width < 0) {
4243 flags |= F_LJUST;
4244 width = -width;
4245 }
4246 if (--fmtcnt >= 0)
4247 c = *fmt++;
4248 }
4249 else if (c >= '0' && c <= '9') {
4250 width = c - '0';
4251 while (--fmtcnt >= 0) {
4252 c = *fmt++;
4253 if (c < '0' || c > '9')
4254 break;
4255 if ((width*10) / 10 != width) {
4256 PyErr_SetString(PyExc_ValueError,
4257 "width too big");
4258 goto onError;
4259 }
4260 width = width*10 + (c - '0');
4261 }
4262 }
4263 if (c == '.') {
4264 prec = 0;
4265 if (--fmtcnt >= 0)
4266 c = *fmt++;
4267 if (c == '*') {
4268 v = getnextarg(args, arglen, &argidx);
4269 if (v == NULL)
4270 goto onError;
4271 if (!PyInt_Check(v)) {
4272 PyErr_SetString(PyExc_TypeError,
4273 "* wants int");
4274 goto onError;
4275 }
4276 prec = PyInt_AsLong(v);
4277 if (prec < 0)
4278 prec = 0;
4279 if (--fmtcnt >= 0)
4280 c = *fmt++;
4281 }
4282 else if (c >= '0' && c <= '9') {
4283 prec = c - '0';
4284 while (--fmtcnt >= 0) {
4285 c = Py_CHARMASK(*fmt++);
4286 if (c < '0' || c > '9')
4287 break;
4288 if ((prec*10) / 10 != prec) {
4289 PyErr_SetString(PyExc_ValueError,
4290 "prec too big");
4291 goto onError;
4292 }
4293 prec = prec*10 + (c - '0');
4294 }
4295 }
4296 } /* prec */
4297 if (fmtcnt >= 0) {
4298 if (c == 'h' || c == 'l' || c == 'L') {
4299 size = c;
4300 if (--fmtcnt >= 0)
4301 c = *fmt++;
4302 }
4303 }
4304 if (fmtcnt < 0) {
4305 PyErr_SetString(PyExc_ValueError,
4306 "incomplete format");
4307 goto onError;
4308 }
4309 if (c != '%') {
4310 v = getnextarg(args, arglen, &argidx);
4311 if (v == NULL)
4312 goto onError;
4313 }
4314 sign = 0;
4315 fill = ' ';
4316 switch (c) {
4317
4318 case '%':
4319 buf = tmpbuf;
4320 buf[0] = '%';
4321 len = 1;
4322 break;
4323
4324 case 's':
4325 case 'r':
4326 if (PyUnicode_Check(v) && c == 's') {
4327 temp = v;
4328 Py_INCREF(temp);
4329 }
4330 else {
4331 PyObject *unicode;
4332 if (c == 's')
4333 temp = PyObject_Str(v);
4334 else
4335 temp = PyObject_Repr(v);
4336 if (temp == NULL)
4337 goto onError;
4338 if (!PyString_Check(temp)) {
4339 /* XXX Note: this should never happen, since
4340 PyObject_Repr() and PyObject_Str() assure
4341 this */
4342 Py_DECREF(temp);
4343 PyErr_SetString(PyExc_TypeError,
4344 "%s argument has non-string str()");
4345 goto onError;
4346 }
4347 unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
4348 PyString_GET_SIZE(temp),
4349 "strict");
4350 Py_DECREF(temp);
4351 temp = unicode;
4352 if (temp == NULL)
4353 goto onError;
4354 }
4355 buf = PyUnicode_AS_UNICODE(temp);
4356 len = PyUnicode_GET_SIZE(temp);
4357 if (prec >= 0 && len > prec)
4358 len = prec;
4359 break;
4360
4361 case 'i':
4362 case 'd':
4363 case 'u':
4364 case 'o':
4365 case 'x':
4366 case 'X':
4367 if (c == 'i')
4368 c = 'd';
4369 buf = tmpbuf;
4370 len = formatint(buf, flags, prec, c, v);
4371 if (len < 0)
4372 goto onError;
4373 sign = (c == 'd');
4374 if (flags & F_ZERO) {
4375 fill = '0';
4376 if ((flags&F_ALT) &&
4377 (c == 'x' || c == 'X') &&
4378 buf[0] == '0' && buf[1] == c) {
4379 *res++ = *buf++;
4380 *res++ = *buf++;
4381 rescnt -= 2;
4382 len -= 2;
4383 width -= 2;
4384 if (width < 0)
4385 width = 0;
4386 }
4387 }
4388 break;
4389
4390 case 'e':
4391 case 'E':
4392 case 'f':
4393 case 'g':
4394 case 'G':
4395 buf = tmpbuf;
4396 len = formatfloat(buf, flags, prec, c, v);
4397 if (len < 0)
4398 goto onError;
4399 sign = 1;
4400 if (flags&F_ZERO)
4401 fill = '0';
4402 break;
4403
4404 case 'c':
4405 buf = tmpbuf;
4406 len = formatchar(buf, v);
4407 if (len < 0)
4408 goto onError;
4409 break;
4410
4411 default:
4412 PyErr_Format(PyExc_ValueError,
4413 "unsupported format character '%c' (0x%x)",
4414 c, c);
4415 goto onError;
4416 }
4417 if (sign) {
4418 if (*buf == '-' || *buf == '+') {
4419 sign = *buf++;
4420 len--;
4421 }
4422 else if (flags & F_SIGN)
4423 sign = '+';
4424 else if (flags & F_BLANK)
4425 sign = ' ';
4426 else
4427 sign = 0;
4428 }
4429 if (width < len)
4430 width = len;
4431 if (rescnt < width + (sign != 0)) {
4432 reslen -= rescnt;
4433 rescnt = width + fmtcnt + 100;
4434 reslen += rescnt;
4435 if (_PyUnicode_Resize(result, reslen) < 0)
4436 return NULL;
4437 res = PyUnicode_AS_UNICODE(result)
4438 + reslen - rescnt;
4439 }
4440 if (sign) {
4441 if (fill != ' ')
4442 *res++ = sign;
4443 rescnt--;
4444 if (width > len)
4445 width--;
4446 }
4447 if (width > len && !(flags & F_LJUST)) {
4448 do {
4449 --rescnt;
4450 *res++ = fill;
4451 } while (--width > len);
4452 }
4453 if (sign && fill == ' ')
4454 *res++ = sign;
4455 memcpy(res, buf, len * sizeof(Py_UNICODE));
4456 res += len;
4457 rescnt -= len;
4458 while (--width >= len) {
4459 --rescnt;
4460 *res++ = ' ';
4461 }
4462 if (dict && (argidx < arglen) && c != '%') {
4463 PyErr_SetString(PyExc_TypeError,
4464 "not all arguments converted");
4465 goto onError;
4466 }
4467 Py_XDECREF(temp);
4468 } /* '%' */
4469 } /* until end */
4470 if (argidx < arglen && !dict) {
4471 PyErr_SetString(PyExc_TypeError,
4472 "not all arguments converted");
4473 goto onError;
4474 }
4475
4476 if (args_owned) {
4477 Py_DECREF(args);
4478 }
4479 Py_DECREF(uformat);
4480 _PyUnicode_Resize(result, reslen - rescnt);
4481 return (PyObject *)result;
4482
4483 onError:
4484 Py_XDECREF(result);
4485 Py_DECREF(uformat);
4486 if (args_owned) {
4487 Py_DECREF(args);
4488 }
4489 return NULL;
4490}
4491
4492static PyBufferProcs unicode_as_buffer = {
4493 (getreadbufferproc) unicode_buffer_getreadbuf,
4494 (getwritebufferproc) unicode_buffer_getwritebuf,
4495 (getsegcountproc) unicode_buffer_getsegcount,
4496 (getcharbufferproc) unicode_buffer_getcharbuf,
4497};
4498
4499PyTypeObject PyUnicode_Type = {
4500 PyObject_HEAD_INIT(&PyType_Type)
4501 0, /* ob_size */
4502 "unicode", /* tp_name */
4503 sizeof(PyUnicodeObject), /* tp_size */
4504 0, /* tp_itemsize */
4505 /* Slots */
4506 (destructor)_PyUnicode_Free, /* tp_dealloc */
4507 0, /* tp_print */
4508 (getattrfunc)unicode_getattr, /* tp_getattr */
4509 0, /* tp_setattr */
4510 (cmpfunc) unicode_compare, /* tp_compare */
4511 (reprfunc) unicode_repr, /* tp_repr */
4512 0, /* tp_as_number */
4513 &unicode_as_sequence, /* tp_as_sequence */
4514 0, /* tp_as_mapping */
4515 (hashfunc) unicode_hash, /* tp_hash*/
4516 0, /* tp_call*/
4517 (reprfunc) unicode_str, /* tp_str */
4518 (getattrofunc) NULL, /* tp_getattro */
4519 (setattrofunc) NULL, /* tp_setattro */
4520 &unicode_as_buffer, /* tp_as_buffer */
4521 Py_TPFLAGS_DEFAULT, /* tp_flags */
4522};
4523
4524/* Initialize the Unicode implementation */
4525
4526void _PyUnicode_Init()
4527{
4528 /* Doublecheck the configuration... */
4529 if (sizeof(Py_UNICODE) != 2)
4530 Py_FatalError("Unicode configuration error: "
4531 "sizeof(Py_UNICODE) != 2 bytes");
4532
4533 unicode_empty = _PyUnicode_New(0);
4534}
4535
4536/* Finalize the Unicode implementation */
4537
4538void
4539_PyUnicode_Fini()
4540{
4541 PyUnicodeObject *u = unicode_freelist;
4542
4543 while (u != NULL) {
4544 PyUnicodeObject *v = u;
4545 u = *(PyUnicodeObject **)u;
4546 free(v);
4547 }
4548 Py_XDECREF(unicode_empty);
4549}