blob: f10f9ab75697b9d6544763989b2c5de7851762bb [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "mymath.h"
68#include "unicodeobject.h"
69
70#if defined(HAVE_LIMITS_H)
71#include <limits.h>
72#else
73#define INT_MAX 2147483647
74#endif
75
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000076#ifdef MS_WIN32
77#include <windows.h>
78#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000079
/* Maximum number of Unicode objects kept on the free list */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Size limit for the free list's keep-alive optimization.

   Objects placed on the free list keep their allocated Unicode
   buffer as long as their length is below this limit.  This reduces
   malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +0000102
/* Byte order selection: big endian when WORDS_BIGENDIAN is defined,
   little endian otherwise.  Exactly one of the two macros is defined. */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
110
111/* --- Globals ------------------------------------------------------------ */
112
113/* The empty Unicode object */
114static PyUnicodeObject *unicode_empty = NULL;
115
116/* Free list for Unicode objects */
117static PyUnicodeObject *unicode_freelist = NULL;
118static int unicode_freelist_size = 0;
119
120/* --- Unicode Object ----------------------------------------------------- */
121
122static
123int _PyUnicode_Resize(register PyUnicodeObject *unicode,
124 int length)
125{
126 void *oldstr;
127
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000130 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000131
132 /* Resizing unicode_empty is not allowed. */
133 if (unicode == unicode_empty) {
134 PyErr_SetString(PyExc_SystemError,
135 "can't resize empty unicode object");
136 return -1;
137 }
138
139 /* We allocate one more byte to make sure the string is
140 Ux0000 terminated -- XXX is this needed ? */
141 oldstr = unicode->str;
142 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
143 if (!unicode->str) {
144 unicode->str = oldstr;
145 PyErr_NoMemory();
146 return -1;
147 }
148 unicode->str[length] = 0;
149 unicode->length = length;
150
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000151 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000152 /* Reset the object caches */
153 if (unicode->utf8str) {
154 Py_DECREF(unicode->utf8str);
155 unicode->utf8str = NULL;
156 }
157 unicode->hash = -1;
158
159 return 0;
160}
161
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000162int PyUnicode_Resize(PyObject **unicode,
163 int length)
164{
165 PyUnicodeObject *v;
166
167 if (unicode == NULL) {
168 PyErr_BadInternalCall();
169 return -1;
170 }
171 v = (PyUnicodeObject *)*unicode;
172 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
173 PyErr_BadInternalCall();
174 return -1;
175 }
176 return _PyUnicode_Resize(v, length);
177}
178
Guido van Rossumd57fd912000-03-10 22:53:23 +0000179/* We allocate one more byte to make sure the string is
180 Ux0000 terminated -- XXX is this needed ?
181
182 XXX This allocator could further be enhanced by assuring that the
183 free list never reduces its size below 1.
184
185*/
186
187static
188PyUnicodeObject *_PyUnicode_New(int length)
189{
190 register PyUnicodeObject *unicode;
191
192 /* Optimization for empty strings */
193 if (length == 0 && unicode_empty != NULL) {
194 Py_INCREF(unicode_empty);
195 return unicode_empty;
196 }
197
198 /* Unicode freelist & memory allocation */
199 if (unicode_freelist) {
200 unicode = unicode_freelist;
201 unicode_freelist = *(PyUnicodeObject **)unicode_freelist;
202 unicode_freelist_size--;
203 unicode->ob_type = &PyUnicode_Type;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000204 _Py_NewReference((PyObject *)unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000205 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000206 /* Keep-Alive optimization: we only upsize the buffer,
207 never downsize it. */
208 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000209 _PyUnicode_Resize(unicode, length)) {
210 free(unicode->str);
211 PyMem_DEL(unicode);
212 return NULL;
213 }
214 }
215 else
216 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
217 }
218 else {
219 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
220 if (unicode == NULL)
221 return NULL;
222 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
223 }
224
Barry Warsaw51ac5802000-03-20 16:36:48 +0000225 if (!unicode->str)
226 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
230 unicode->utf8str = NULL;
231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
235 PyMem_DEL(unicode);
236 PyErr_NoMemory();
237 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000238}
239
240static
241void _PyUnicode_Free(register PyUnicodeObject *unicode)
242{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000244 /* Keep-Alive optimization */
245 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 free(unicode->str);
247 unicode->str = NULL;
248 unicode->length = 0;
249 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000250 if (unicode->utf8str) {
251 Py_DECREF(unicode->utf8str);
252 unicode->utf8str = NULL;
253 }
254 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000255 *(PyUnicodeObject **)unicode = unicode_freelist;
256 unicode_freelist = unicode;
257 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 }
259 else {
260 free(unicode->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000261 Py_XDECREF(unicode->utf8str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 PyMem_DEL(unicode);
263 }
264}
265
266PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
267 int size)
268{
269 PyUnicodeObject *unicode;
270
271 unicode = _PyUnicode_New(size);
272 if (!unicode)
273 return NULL;
274
275 /* Copy the Unicode data into the new object */
276 if (u != NULL)
277 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
278
279 return (PyObject *)unicode;
280}
281
282#ifdef HAVE_WCHAR_H
283
284PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
285 int size)
286{
287 PyUnicodeObject *unicode;
288
289 if (w == NULL) {
290 PyErr_BadInternalCall();
291 return NULL;
292 }
293
294 unicode = _PyUnicode_New(size);
295 if (!unicode)
296 return NULL;
297
298 /* Copy the wchar_t data into the new object */
299#ifdef HAVE_USABLE_WCHAR_T
300 memcpy(unicode->str, w, size * sizeof(wchar_t));
301#else
302 {
303 register Py_UNICODE *u;
304 register int i;
305 u = PyUnicode_AS_UNICODE(unicode);
306 for (i = size; i >= 0; i--)
307 *u++ = *w++;
308 }
309#endif
310
311 return (PyObject *)unicode;
312}
313
314int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
315 register wchar_t *w,
316 int size)
317{
318 if (unicode == NULL) {
319 PyErr_BadInternalCall();
320 return -1;
321 }
322 if (size > PyUnicode_GET_SIZE(unicode))
323 size = PyUnicode_GET_SIZE(unicode);
324#ifdef HAVE_USABLE_WCHAR_T
325 memcpy(w, unicode->str, size * sizeof(wchar_t));
326#else
327 {
328 register Py_UNICODE *u;
329 register int i;
330 u = PyUnicode_AS_UNICODE(unicode);
331 for (i = size; i >= 0; i--)
332 *w++ = *u++;
333 }
334#endif
335
336 return size;
337}
338
339#endif
340
341PyObject *PyUnicode_FromObject(register PyObject *obj)
342{
343 const char *s;
344 int len;
345
346 if (obj == NULL) {
347 PyErr_BadInternalCall();
348 return NULL;
349 }
350 else if (PyUnicode_Check(obj)) {
351 Py_INCREF(obj);
352 return obj;
353 }
354 else if (PyString_Check(obj)) {
355 s = PyString_AS_STRING(obj);
356 len = PyString_GET_SIZE(obj);
357 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000358 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
359 /* Overwrite the error message with something more useful in
360 case of a TypeError. */
361 if (PyErr_ExceptionMatches(PyExc_TypeError))
362 PyErr_SetString(PyExc_TypeError,
363 "coercing to Unicode: need string or charbuffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000364 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000365 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000366 if (len == 0) {
367 Py_INCREF(unicode_empty);
368 return (PyObject *)unicode_empty;
369 }
370 return PyUnicode_DecodeUTF8(s, len, "strict");
371}
372
373PyObject *PyUnicode_Decode(const char *s,
374 int size,
375 const char *encoding,
376 const char *errors)
377{
378 PyObject *buffer = NULL, *unicode;
379
380 /* Shortcut for the default encoding UTF-8 */
381 if (encoding == NULL ||
382 (strcmp(encoding, "utf-8") == 0))
383 return PyUnicode_DecodeUTF8(s, size, errors);
384
385 /* Decode via the codec registry */
386 buffer = PyBuffer_FromMemory((void *)s, size);
387 if (buffer == NULL)
388 goto onError;
389 unicode = PyCodec_Decode(buffer, encoding, errors);
390 if (unicode == NULL)
391 goto onError;
392 if (!PyUnicode_Check(unicode)) {
393 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000394 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000395 unicode->ob_type->tp_name);
396 Py_DECREF(unicode);
397 goto onError;
398 }
399 Py_DECREF(buffer);
400 return unicode;
401
402 onError:
403 Py_XDECREF(buffer);
404 return NULL;
405}
406
407PyObject *PyUnicode_Encode(const Py_UNICODE *s,
408 int size,
409 const char *encoding,
410 const char *errors)
411{
412 PyObject *v, *unicode;
413
414 unicode = PyUnicode_FromUnicode(s, size);
415 if (unicode == NULL)
416 return NULL;
417 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
418 Py_DECREF(unicode);
419 return v;
420}
421
422PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
423 const char *encoding,
424 const char *errors)
425{
426 PyObject *v;
427
428 if (!PyUnicode_Check(unicode)) {
429 PyErr_BadArgument();
430 goto onError;
431 }
432 /* Shortcut for the default encoding UTF-8 */
433 if ((encoding == NULL ||
434 (strcmp(encoding, "utf-8") == 0)) &&
435 errors == NULL)
436 return PyUnicode_AsUTF8String(unicode);
437
438 /* Encode via the codec registry */
439 v = PyCodec_Encode(unicode, encoding, errors);
440 if (v == NULL)
441 goto onError;
442 /* XXX Should we really enforce this ? */
443 if (!PyString_Check(v)) {
444 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000445 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000446 v->ob_type->tp_name);
447 Py_DECREF(v);
448 goto onError;
449 }
450 return v;
451
452 onError:
453 return NULL;
454}
455
456Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
457{
458 if (!PyUnicode_Check(unicode)) {
459 PyErr_BadArgument();
460 goto onError;
461 }
462 return PyUnicode_AS_UNICODE(unicode);
463
464 onError:
465 return NULL;
466}
467
468int PyUnicode_GetSize(PyObject *unicode)
469{
470 if (!PyUnicode_Check(unicode)) {
471 PyErr_BadArgument();
472 goto onError;
473 }
474 return PyUnicode_GET_SIZE(unicode);
475
476 onError:
477 return -1;
478}
479
480/* --- UTF-8 Codec -------------------------------------------------------- */
481
/* Map a UTF-8 lead byte to the length of the sequence it starts.
   Zero marks a byte that is not a legal lead byte.  See RFC 2279 for
   details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x00 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x10 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x30 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x50 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,    /* 0x70 */
    /* 0x80 - 0xBF: illegal as a lead byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0xA0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0xB0 */
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,    /* 0xC0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,    /* 0xD0 */
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,    /* 0xE0 */
    /* 0xF0 - 0xFD: four to six byte sequences; 0xFE, 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0     /* 0xF0 */
};
503
504static
505int utf8_decoding_error(const char **source,
506 Py_UNICODE **dest,
507 const char *errors,
508 const char *details)
509{
510 if ((errors == NULL) ||
511 (strcmp(errors,"strict") == 0)) {
512 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000513 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000514 details);
515 return -1;
516 }
517 else if (strcmp(errors,"ignore") == 0) {
518 (*source)++;
519 return 0;
520 }
521 else if (strcmp(errors,"replace") == 0) {
522 (*source)++;
523 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
524 (*dest)++;
525 return 0;
526 }
527 else {
528 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000529 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000530 errors);
531 return -1;
532 }
533}
534
535#define UTF8_ERROR(details) do { \
536 if (utf8_decoding_error(&s, &p, errors, details)) \
537 goto onError; \
538 continue; \
539} while (0)
540
541PyObject *PyUnicode_DecodeUTF8(const char *s,
542 int size,
543 const char *errors)
544{
545 int n;
546 const char *e;
547 PyUnicodeObject *unicode;
548 Py_UNICODE *p;
549
550 /* Note: size will always be longer than the resulting Unicode
551 character count */
552 unicode = _PyUnicode_New(size);
553 if (!unicode)
554 return NULL;
555 if (size == 0)
556 return (PyObject *)unicode;
557
558 /* Unpack UTF-8 encoded data */
559 p = unicode->str;
560 e = s + size;
561
562 while (s < e) {
563 register Py_UNICODE ch = (unsigned char)*s;
564
565 if (ch < 0x80) {
566 *p++ = ch;
567 s++;
568 continue;
569 }
570
571 n = utf8_code_length[ch];
572
573 if (s + n > e)
574 UTF8_ERROR("unexpected end of data");
575
576 switch (n) {
577
578 case 0:
579 UTF8_ERROR("unexpected code byte");
580 break;
581
582 case 1:
583 UTF8_ERROR("internal error");
584 break;
585
586 case 2:
587 if ((s[1] & 0xc0) != 0x80)
588 UTF8_ERROR("invalid data");
589 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
590 if (ch < 0x80)
591 UTF8_ERROR("illegal encoding");
592 else
593 *p++ = ch;
594 break;
595
596 case 3:
597 if ((s[1] & 0xc0) != 0x80 ||
598 (s[2] & 0xc0) != 0x80)
599 UTF8_ERROR("invalid data");
600 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
601 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
602 UTF8_ERROR("illegal encoding");
603 else
604 *p++ = ch;
605 break;
606
607 default:
608 /* Other sizes are only needed for UCS-4 */
609 UTF8_ERROR("unsupported Unicode code range");
610 }
611 s += n;
612 }
613
614 /* Adjust length */
615 if (_PyUnicode_Resize(unicode, p - unicode->str))
616 goto onError;
617
618 return (PyObject *)unicode;
619
620onError:
621 Py_DECREF(unicode);
622 return NULL;
623}
624
625#undef UTF8_ERROR
626
627static
628int utf8_encoding_error(const Py_UNICODE **source,
629 char **dest,
630 const char *errors,
631 const char *details)
632{
633 if ((errors == NULL) ||
634 (strcmp(errors,"strict") == 0)) {
635 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000636 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000637 details);
638 return -1;
639 }
640 else if (strcmp(errors,"ignore") == 0) {
641 return 0;
642 }
643 else if (strcmp(errors,"replace") == 0) {
644 **dest = '?';
645 (*dest)++;
646 return 0;
647 }
648 else {
649 PyErr_Format(PyExc_ValueError,
650 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000651 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000652 errors);
653 return -1;
654 }
655}
656
657PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
658 int size,
659 const char *errors)
660{
661 PyObject *v;
662 char *p;
663 char *q;
664
665 v = PyString_FromStringAndSize(NULL, 3 * size);
666 if (v == NULL)
667 return NULL;
668 if (size == 0)
669 goto done;
670
671 p = q = PyString_AS_STRING(v);
672 while (size-- > 0) {
673 Py_UNICODE ch = *s++;
674 if (ch < 0x80)
675 *p++ = (char) ch;
676 else if (ch < 0x0800) {
677 *p++ = 0xc0 | (ch >> 6);
678 *p++ = 0x80 | (ch & 0x3f);
679 } else if (0xD800 <= ch && ch <= 0xDFFF) {
680 /* These byte ranges are reserved for UTF-16 surrogate
681 bytes which the Python implementation currently does
682 not support. */
683 printf("code range problem: U+%04x\n", ch);
684 if (utf8_encoding_error(&s, &p, errors,
685 "unsupported code range"))
686 goto onError;
687 } else {
688 *p++ = 0xe0 | (ch >> 12);
689 *p++ = 0x80 | ((ch >> 6) & 0x3f);
690 *p++ = 0x80 | (ch & 0x3f);
691 }
692 }
693 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000694 if (_PyString_Resize(&v, p - q))
695 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000696
697 done:
698 return v;
699
700 onError:
701 Py_DECREF(v);
702 return NULL;
703}
704
705/* Return a Python string holding the UTF-8 encoded value of the
706 Unicode object.
707
708 The resulting string is cached in the Unicode object for subsequent
709 usage by this function. The cached version is needed to implement
710 the character buffer interface.
711
712 The refcount of the string is *not* incremented.
713
714*/
715
716static
717PyObject *utf8_string(PyUnicodeObject *self,
718 const char *errors)
719{
720 PyObject *v = self->utf8str;
721
722 if (v)
723 return v;
724 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(self),
725 PyUnicode_GET_SIZE(self),
726 errors);
727 if (v && errors == NULL)
728 self->utf8str = v;
729 return v;
730}
731
732PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
733{
734 PyObject *str;
735
736 if (!PyUnicode_Check(unicode)) {
737 PyErr_BadArgument();
738 return NULL;
739 }
740 str = utf8_string((PyUnicodeObject *)unicode, NULL);
741 if (str == NULL)
742 return NULL;
743 Py_INCREF(str);
744 return str;
745}
746
747/* --- UTF-16 Codec ------------------------------------------------------- */
748
749static
750int utf16_decoding_error(const Py_UNICODE **source,
751 Py_UNICODE **dest,
752 const char *errors,
753 const char *details)
754{
755 if ((errors == NULL) ||
756 (strcmp(errors,"strict") == 0)) {
757 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000758 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000759 details);
760 return -1;
761 }
762 else if (strcmp(errors,"ignore") == 0) {
763 return 0;
764 }
765 else if (strcmp(errors,"replace") == 0) {
766 if (dest) {
767 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
768 (*dest)++;
769 }
770 return 0;
771 }
772 else {
773 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000774 "UTF-16 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000775 errors);
776 return -1;
777 }
778}
779
780#define UTF16_ERROR(details) do { \
781 if (utf16_decoding_error(&q, &p, errors, details)) \
782 goto onError; \
783 continue; \
784} while(0)
785
786PyObject *PyUnicode_DecodeUTF16(const char *s,
787 int size,
788 const char *errors,
789 int *byteorder)
790{
791 PyUnicodeObject *unicode;
792 Py_UNICODE *p;
793 const Py_UNICODE *q, *e;
794 int bo = 0;
795
796 /* size should be an even number */
797 if (size % sizeof(Py_UNICODE) != 0) {
798 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
799 return NULL;
800 /* The remaining input chars are ignored if we fall through
801 here... */
802 }
803
804 /* Note: size will always be longer than the resulting Unicode
805 character count */
806 unicode = _PyUnicode_New(size);
807 if (!unicode)
808 return NULL;
809 if (size == 0)
810 return (PyObject *)unicode;
811
812 /* Unpack UTF-16 encoded data */
813 p = unicode->str;
814 q = (Py_UNICODE *)s;
815 e = q + (size / sizeof(Py_UNICODE));
816
817 if (byteorder)
818 bo = *byteorder;
819
820 while (q < e) {
821 register Py_UNICODE ch = *q++;
822
823 /* Check for BOM marks (U+FEFF) in the input and adjust
824 current byte order setting accordingly. Swap input
825 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
826 !) */
827#ifdef BYTEORDER_IS_LITTLE_ENDIAN
828 if (ch == 0xFEFF) {
829 bo = -1;
830 continue;
831 } else if (ch == 0xFFFE) {
832 bo = 1;
833 continue;
834 }
835 if (bo == 1)
836 ch = (ch >> 8) | (ch << 8);
837#else
838 if (ch == 0xFEFF) {
839 bo = 1;
840 continue;
841 } else if (ch == 0xFFFE) {
842 bo = -1;
843 continue;
844 }
845 if (bo == -1)
846 ch = (ch >> 8) | (ch << 8);
847#endif
848 if (ch < 0xD800 || ch > 0xDFFF) {
849 *p++ = ch;
850 continue;
851 }
852
853 /* UTF-16 code pair: */
854 if (q >= e)
855 UTF16_ERROR("unexpected end of data");
856 if (0xDC00 <= *q && *q <= 0xDFFF) {
857 q++;
858 if (0xD800 <= *q && *q <= 0xDBFF)
859 /* This is valid data (a UTF-16 surrogate pair), but
860 we are not able to store this information since our
861 Py_UNICODE type only has 16 bits... this might
862 change someday, even though it's unlikely. */
863 UTF16_ERROR("code pairs are not supported");
864 else
865 continue;
866 }
867 UTF16_ERROR("illegal encoding");
868 }
869
870 if (byteorder)
871 *byteorder = bo;
872
873 /* Adjust length */
874 if (_PyUnicode_Resize(unicode, p - unicode->str))
875 goto onError;
876
877 return (PyObject *)unicode;
878
879onError:
880 Py_DECREF(unicode);
881 return NULL;
882}
883
884#undef UTF16_ERROR
885
886PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
887 int size,
888 const char *errors,
889 int byteorder)
890{
891 PyObject *v;
892 Py_UNICODE *p;
893 char *q;
894
895 /* We don't create UTF-16 pairs... */
896 v = PyString_FromStringAndSize(NULL,
897 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
898 if (v == NULL)
899 return NULL;
900 if (size == 0)
901 goto done;
902
903 q = PyString_AS_STRING(v);
904 p = (Py_UNICODE *)q;
905
906 if (byteorder == 0)
907 *p++ = 0xFEFF;
908 if (byteorder == 0 ||
909#ifdef BYTEORDER_IS_LITTLE_ENDIAN
910 byteorder == -1
911#else
912 byteorder == 1
913#endif
914 )
915 memcpy(p, s, size * sizeof(Py_UNICODE));
916 else
917 while (size-- > 0) {
918 Py_UNICODE ch = *s++;
919 *p++ = (ch >> 8) | (ch << 8);
920 }
921 done:
922 return v;
923}
924
925PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
926{
927 if (!PyUnicode_Check(unicode)) {
928 PyErr_BadArgument();
929 return NULL;
930 }
931 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
932 PyUnicode_GET_SIZE(unicode),
933 NULL,
934 0);
935}
936
937/* --- Unicode Escape Codec ----------------------------------------------- */
938
939static
940int unicodeescape_decoding_error(const char **source,
941 unsigned int *x,
942 const char *errors,
943 const char *details)
944{
945 if ((errors == NULL) ||
946 (strcmp(errors,"strict") == 0)) {
947 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000948 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000949 details);
950 return -1;
951 }
952 else if (strcmp(errors,"ignore") == 0) {
953 return 0;
954 }
955 else if (strcmp(errors,"replace") == 0) {
956 *x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
957 return 0;
958 }
959 else {
960 PyErr_Format(PyExc_ValueError,
961 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000962 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000963 errors);
964 return -1;
965 }
966}
967
968PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
969 int size,
970 const char *errors)
971{
972 PyUnicodeObject *v;
973 Py_UNICODE *p = NULL, *buf = NULL;
974 const char *end;
975
976 /* Escaped strings will always be longer than the resulting
977 Unicode string, so we start with size here and then reduce the
978 length after conversion to the true value. */
979 v = _PyUnicode_New(size);
980 if (v == NULL)
981 goto onError;
982 if (size == 0)
983 return (PyObject *)v;
984 p = buf = PyUnicode_AS_UNICODE(v);
985 end = s + size;
986 while (s < end) {
987 unsigned char c;
988 unsigned int x;
989 int i;
990
991 /* Non-escape characters are interpreted as Unicode ordinals */
992 if (*s != '\\') {
993 *p++ = (unsigned char)*s++;
994 continue;
995 }
996
997 /* \ - Escapes */
998 s++;
999 switch (*s++) {
1000
1001 /* \x escapes */
1002 case '\n': break;
1003 case '\\': *p++ = '\\'; break;
1004 case '\'': *p++ = '\''; break;
1005 case '\"': *p++ = '\"'; break;
1006 case 'b': *p++ = '\b'; break;
1007 case 'f': *p++ = '\014'; break; /* FF */
1008 case 't': *p++ = '\t'; break;
1009 case 'n': *p++ = '\n'; break;
1010 case 'r': *p++ = '\r'; break;
1011 case 'v': *p++ = '\013'; break; /* VT */
1012 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1013
1014 /* \OOO (octal) escapes */
1015 case '0': case '1': case '2': case '3':
1016 case '4': case '5': case '6': case '7':
1017 c = s[-1] - '0';
1018 if ('0' <= *s && *s <= '7') {
1019 c = (c<<3) + *s++ - '0';
1020 if ('0' <= *s && *s <= '7')
1021 c = (c<<3) + *s++ - '0';
1022 }
1023 *p++ = c;
1024 break;
1025
1026 /* \xXXXX escape with 0-4 hex digits */
1027 case 'x':
1028 x = 0;
1029 c = (unsigned char)*s;
1030 if (isxdigit(c)) {
1031 do {
1032 x = (x<<4) & ~0xF;
1033 if ('0' <= c && c <= '9')
1034 x += c - '0';
1035 else if ('a' <= c && c <= 'f')
1036 x += 10 + c - 'a';
1037 else
1038 x += 10 + c - 'A';
1039 c = (unsigned char)*++s;
1040 } while (isxdigit(c));
1041 *p++ = x;
1042 } else {
1043 *p++ = '\\';
1044 *p++ = (unsigned char)s[-1];
1045 }
1046 break;
1047
1048 /* \uXXXX with 4 hex digits */
1049 case 'u':
1050 for (x = 0, i = 0; i < 4; i++) {
1051 c = (unsigned char)s[i];
1052 if (!isxdigit(c)) {
1053 if (unicodeescape_decoding_error(&s, &x, errors,
1054 "truncated \\uXXXX"))
1055 goto onError;
1056 i++;
1057 break;
1058 }
1059 x = (x<<4) & ~0xF;
1060 if (c >= '0' && c <= '9')
1061 x += c - '0';
1062 else if (c >= 'a' && c <= 'f')
1063 x += 10 + c - 'a';
1064 else
1065 x += 10 + c - 'A';
1066 }
1067 s += i;
1068 *p++ = x;
1069 break;
1070
1071 default:
1072 *p++ = '\\';
1073 *p++ = (unsigned char)s[-1];
1074 break;
1075 }
1076 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001077 if (_PyUnicode_Resize(v, (int)(p - buf)))
1078 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079 return (PyObject *)v;
1080
1081 onError:
1082 Py_XDECREF(v);
1083 return NULL;
1084}
1085
1086/* Return a Unicode-Escape string version of the Unicode object.
1087
1088 If quotes is true, the string is enclosed in u"" or u'' quotes as
1089 appropriate.
1090
1091*/
1092
Barry Warsaw51ac5802000-03-20 16:36:48 +00001093static const Py_UNICODE *findchar(const Py_UNICODE *s,
1094 int size,
1095 Py_UNICODE ch);
1096
Guido van Rossumd57fd912000-03-10 22:53:23 +00001097static
1098PyObject *unicodeescape_string(const Py_UNICODE *s,
1099 int size,
1100 int quotes)
1101{
1102 PyObject *repr;
1103 char *p;
1104 char *q;
1105
1106 static const char *hexdigit = "0123456789ABCDEF";
1107
1108 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1109 if (repr == NULL)
1110 return NULL;
1111
1112 p = q = PyString_AS_STRING(repr);
1113
1114 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001115 *p++ = 'u';
1116 *p++ = (findchar(s, size, '\'') &&
1117 !findchar(s, size, '"')) ? '"' : '\'';
1118 }
1119 while (size-- > 0) {
1120 Py_UNICODE ch = *s++;
1121 /* Escape quotes */
1122 if (quotes && (ch == q[1] || ch == '\\')) {
1123 *p++ = '\\';
1124 *p++ = (char) ch;
1125 }
1126 /* Map 16-bit characters to '\uxxxx' */
1127 else if (ch >= 256) {
1128 *p++ = '\\';
1129 *p++ = 'u';
1130 *p++ = hexdigit[(ch >> 12) & 0xf];
1131 *p++ = hexdigit[(ch >> 8) & 0xf];
1132 *p++ = hexdigit[(ch >> 4) & 0xf];
1133 *p++ = hexdigit[ch & 15];
1134 }
1135 /* Map non-printable US ASCII to '\ooo' */
1136 else if (ch < ' ' || ch >= 128) {
1137 *p++ = '\\';
1138 *p++ = hexdigit[(ch >> 6) & 7];
1139 *p++ = hexdigit[(ch >> 3) & 7];
1140 *p++ = hexdigit[ch & 7];
1141 }
1142 /* Copy everything else as-is */
1143 else
1144 *p++ = (char) ch;
1145 }
1146 if (quotes)
1147 *p++ = q[1];
1148
1149 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001150 if (_PyString_Resize(&repr, p - q))
1151 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001152
1153 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001154
1155 onError:
1156 Py_DECREF(repr);
1157 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001158}
1159
1160PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1161 int size)
1162{
1163 return unicodeescape_string(s, size, 0);
1164}
1165
1166PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1167{
1168 if (!PyUnicode_Check(unicode)) {
1169 PyErr_BadArgument();
1170 return NULL;
1171 }
1172 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1173 PyUnicode_GET_SIZE(unicode));
1174}
1175
1176/* --- Raw Unicode Escape Codec ------------------------------------------- */
1177
1178PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1179 int size,
1180 const char *errors)
1181{
1182 PyUnicodeObject *v;
1183 Py_UNICODE *p, *buf;
1184 const char *end;
1185 const char *bs;
1186
1187 /* Escaped strings will always be longer than the resulting
1188 Unicode string, so we start with size here and then reduce the
1189 length after conversion to the true value. */
1190 v = _PyUnicode_New(size);
1191 if (v == NULL)
1192 goto onError;
1193 if (size == 0)
1194 return (PyObject *)v;
1195 p = buf = PyUnicode_AS_UNICODE(v);
1196 end = s + size;
1197 while (s < end) {
1198 unsigned char c;
1199 unsigned int x;
1200 int i;
1201
1202 /* Non-escape characters are interpreted as Unicode ordinals */
1203 if (*s != '\\') {
1204 *p++ = (unsigned char)*s++;
1205 continue;
1206 }
1207
1208 /* \u-escapes are only interpreted iff the number of leading
1209 backslashes if odd */
1210 bs = s;
1211 for (;s < end;) {
1212 if (*s != '\\')
1213 break;
1214 *p++ = (unsigned char)*s++;
1215 }
1216 if (((s - bs) & 1) == 0 ||
1217 s >= end ||
1218 *s != 'u') {
1219 continue;
1220 }
1221 p--;
1222 s++;
1223
1224 /* \uXXXX with 4 hex digits */
1225 for (x = 0, i = 0; i < 4; i++) {
1226 c = (unsigned char)s[i];
1227 if (!isxdigit(c)) {
1228 if (unicodeescape_decoding_error(&s, &x, errors,
1229 "truncated \\uXXXX"))
1230 goto onError;
1231 i++;
1232 break;
1233 }
1234 x = (x<<4) & ~0xF;
1235 if (c >= '0' && c <= '9')
1236 x += c - '0';
1237 else if (c >= 'a' && c <= 'f')
1238 x += 10 + c - 'a';
1239 else
1240 x += 10 + c - 'A';
1241 }
1242 s += i;
1243 *p++ = x;
1244 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001245 if (_PyUnicode_Resize(v, (int)(p - buf)))
1246 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001247 return (PyObject *)v;
1248
1249 onError:
1250 Py_XDECREF(v);
1251 return NULL;
1252}
1253
1254PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1255 int size)
1256{
1257 PyObject *repr;
1258 char *p;
1259 char *q;
1260
1261 static const char *hexdigit = "0123456789ABCDEF";
1262
1263 repr = PyString_FromStringAndSize(NULL, 6 * size);
1264 if (repr == NULL)
1265 return NULL;
1266
1267 p = q = PyString_AS_STRING(repr);
1268 while (size-- > 0) {
1269 Py_UNICODE ch = *s++;
1270 /* Map 16-bit characters to '\uxxxx' */
1271 if (ch >= 256) {
1272 *p++ = '\\';
1273 *p++ = 'u';
1274 *p++ = hexdigit[(ch >> 12) & 0xf];
1275 *p++ = hexdigit[(ch >> 8) & 0xf];
1276 *p++ = hexdigit[(ch >> 4) & 0xf];
1277 *p++ = hexdigit[ch & 15];
1278 }
1279 /* Copy everything else as-is */
1280 else
1281 *p++ = (char) ch;
1282 }
1283 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001284 if (_PyString_Resize(&repr, p - q))
1285 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001286
1287 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001288
1289 onError:
1290 Py_DECREF(repr);
1291 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001292}
1293
1294PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1295{
1296 if (!PyUnicode_Check(unicode)) {
1297 PyErr_BadArgument();
1298 return NULL;
1299 }
1300 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1301 PyUnicode_GET_SIZE(unicode));
1302}
1303
1304/* --- Latin-1 Codec ------------------------------------------------------ */
1305
1306PyObject *PyUnicode_DecodeLatin1(const char *s,
1307 int size,
1308 const char *errors)
1309{
1310 PyUnicodeObject *v;
1311 Py_UNICODE *p;
1312
1313 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1314 v = _PyUnicode_New(size);
1315 if (v == NULL)
1316 goto onError;
1317 if (size == 0)
1318 return (PyObject *)v;
1319 p = PyUnicode_AS_UNICODE(v);
1320 while (size-- > 0)
1321 *p++ = (unsigned char)*s++;
1322 return (PyObject *)v;
1323
1324 onError:
1325 Py_XDECREF(v);
1326 return NULL;
1327}
1328
1329static
1330int latin1_encoding_error(const Py_UNICODE **source,
1331 char **dest,
1332 const char *errors,
1333 const char *details)
1334{
1335 if ((errors == NULL) ||
1336 (strcmp(errors,"strict") == 0)) {
1337 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001338 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001339 details);
1340 return -1;
1341 }
1342 else if (strcmp(errors,"ignore") == 0) {
1343 return 0;
1344 }
1345 else if (strcmp(errors,"replace") == 0) {
1346 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001347 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001348 return 0;
1349 }
1350 else {
1351 PyErr_Format(PyExc_ValueError,
1352 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001353 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001354 errors);
1355 return -1;
1356 }
1357}
1358
1359PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1360 int size,
1361 const char *errors)
1362{
1363 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001364 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001365 repr = PyString_FromStringAndSize(NULL, size);
1366 if (repr == NULL)
1367 return NULL;
1368
1369 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001370 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001371 while (size-- > 0) {
1372 Py_UNICODE ch = *p++;
1373 if (ch >= 256) {
1374 if (latin1_encoding_error(&p, &s, errors,
1375 "ordinal not in range(256)"))
1376 goto onError;
1377 }
1378 else
1379 *s++ = (char)ch;
1380 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001381 /* Resize if error handling skipped some characters */
1382 if (s - start < PyString_GET_SIZE(repr))
1383 if (_PyString_Resize(&repr, s - start))
1384 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001385 return repr;
1386
1387 onError:
1388 Py_DECREF(repr);
1389 return NULL;
1390}
1391
1392PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1393{
1394 if (!PyUnicode_Check(unicode)) {
1395 PyErr_BadArgument();
1396 return NULL;
1397 }
1398 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1399 PyUnicode_GET_SIZE(unicode),
1400 NULL);
1401}
1402
1403/* --- 7-bit ASCII Codec -------------------------------------------------- */
1404
1405static
1406int ascii_decoding_error(const char **source,
1407 Py_UNICODE **dest,
1408 const char *errors,
1409 const char *details)
1410{
1411 if ((errors == NULL) ||
1412 (strcmp(errors,"strict") == 0)) {
1413 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001414 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001415 details);
1416 return -1;
1417 }
1418 else if (strcmp(errors,"ignore") == 0) {
1419 return 0;
1420 }
1421 else if (strcmp(errors,"replace") == 0) {
1422 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1423 (*dest)++;
1424 return 0;
1425 }
1426 else {
1427 PyErr_Format(PyExc_ValueError,
1428 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001429 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001430 errors);
1431 return -1;
1432 }
1433}
1434
1435PyObject *PyUnicode_DecodeASCII(const char *s,
1436 int size,
1437 const char *errors)
1438{
1439 PyUnicodeObject *v;
1440 Py_UNICODE *p;
1441
1442 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1443 v = _PyUnicode_New(size);
1444 if (v == NULL)
1445 goto onError;
1446 if (size == 0)
1447 return (PyObject *)v;
1448 p = PyUnicode_AS_UNICODE(v);
1449 while (size-- > 0) {
1450 register unsigned char c;
1451
1452 c = (unsigned char)*s++;
1453 if (c < 128)
1454 *p++ = c;
1455 else if (ascii_decoding_error(&s, &p, errors,
1456 "ordinal not in range(128)"))
1457 goto onError;
1458 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001459 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1460 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1461 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001462 return (PyObject *)v;
1463
1464 onError:
1465 Py_XDECREF(v);
1466 return NULL;
1467}
1468
1469static
1470int ascii_encoding_error(const Py_UNICODE **source,
1471 char **dest,
1472 const char *errors,
1473 const char *details)
1474{
1475 if ((errors == NULL) ||
1476 (strcmp(errors,"strict") == 0)) {
1477 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001478 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001479 details);
1480 return -1;
1481 }
1482 else if (strcmp(errors,"ignore") == 0) {
1483 return 0;
1484 }
1485 else if (strcmp(errors,"replace") == 0) {
1486 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001487 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001488 return 0;
1489 }
1490 else {
1491 PyErr_Format(PyExc_ValueError,
1492 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001493 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001494 errors);
1495 return -1;
1496 }
1497}
1498
1499PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1500 int size,
1501 const char *errors)
1502{
1503 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001504 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001505 repr = PyString_FromStringAndSize(NULL, size);
1506 if (repr == NULL)
1507 return NULL;
1508
1509 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001510 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001511 while (size-- > 0) {
1512 Py_UNICODE ch = *p++;
1513 if (ch >= 128) {
1514 if (ascii_encoding_error(&p, &s, errors,
1515 "ordinal not in range(128)"))
1516 goto onError;
1517 }
1518 else
1519 *s++ = (char)ch;
1520 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001521 /* Resize if error handling skipped some characters */
1522 if (s - start < PyString_GET_SIZE(repr))
1523 if (_PyString_Resize(&repr, s - start))
1524 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001525 return repr;
1526
1527 onError:
1528 Py_DECREF(repr);
1529 return NULL;
1530}
1531
1532PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1533{
1534 if (!PyUnicode_Check(unicode)) {
1535 PyErr_BadArgument();
1536 return NULL;
1537 }
1538 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1539 PyUnicode_GET_SIZE(unicode),
1540 NULL);
1541}
1542
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001543#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001544
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001545/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001546
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001547PyObject *PyUnicode_DecodeMBCS(const char *s,
1548 int size,
1549 const char *errors)
1550{
1551 PyUnicodeObject *v;
1552 Py_UNICODE *p;
1553
1554 /* First get the size of the result */
1555 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
1556 if (usize==0)
1557 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1558
1559 v = _PyUnicode_New(usize);
1560 if (v == NULL)
1561 return NULL;
1562 if (usize == 0)
1563 return (PyObject *)v;
1564 p = PyUnicode_AS_UNICODE(v);
1565 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1566 Py_DECREF(v);
1567 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1568 }
1569
1570 return (PyObject *)v;
1571}
1572
1573PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1574 int size,
1575 const char *errors)
1576{
1577 PyObject *repr;
1578 char *s;
1579
1580 /* First get the size of the result */
1581 DWORD mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
1582 if (mbcssize==0)
1583 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1584
1585 repr = PyString_FromStringAndSize(NULL, mbcssize);
1586 if (repr == NULL)
1587 return NULL;
1588 if (mbcssize==0)
1589 return repr;
1590
1591 /* Do the conversion */
1592 s = PyString_AS_STRING(repr);
1593 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1594 Py_DECREF(repr);
1595 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1596 }
1597 return repr;
1598}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001599
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001600#endif /* MS_WIN32 */
1601
Guido van Rossumd57fd912000-03-10 22:53:23 +00001602/* --- Character Mapping Codec -------------------------------------------- */
1603
1604static
1605int charmap_decoding_error(const char **source,
1606 Py_UNICODE **dest,
1607 const char *errors,
1608 const char *details)
1609{
1610 if ((errors == NULL) ||
1611 (strcmp(errors,"strict") == 0)) {
1612 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001613 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001614 details);
1615 return -1;
1616 }
1617 else if (strcmp(errors,"ignore") == 0) {
1618 return 0;
1619 }
1620 else if (strcmp(errors,"replace") == 0) {
1621 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1622 (*dest)++;
1623 return 0;
1624 }
1625 else {
1626 PyErr_Format(PyExc_ValueError,
1627 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001628 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001629 errors);
1630 return -1;
1631 }
1632}
1633
1634PyObject *PyUnicode_DecodeCharmap(const char *s,
1635 int size,
1636 PyObject *mapping,
1637 const char *errors)
1638{
1639 PyUnicodeObject *v;
1640 Py_UNICODE *p;
1641
1642 /* Default to Latin-1 */
1643 if (mapping == NULL)
1644 return PyUnicode_DecodeLatin1(s, size, errors);
1645
1646 v = _PyUnicode_New(size);
1647 if (v == NULL)
1648 goto onError;
1649 if (size == 0)
1650 return (PyObject *)v;
1651 p = PyUnicode_AS_UNICODE(v);
1652 while (size-- > 0) {
1653 unsigned char ch = *s++;
1654 PyObject *w, *x;
1655
1656 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1657 w = PyInt_FromLong((long)ch);
1658 if (w == NULL)
1659 goto onError;
1660 x = PyObject_GetItem(mapping, w);
1661 Py_DECREF(w);
1662 if (x == NULL) {
1663 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1664 /* No mapping found: default to Latin-1 mapping */
1665 PyErr_Clear();
1666 *p++ = (Py_UNICODE)ch;
1667 continue;
1668 }
1669 goto onError;
1670 }
1671
1672 /* Apply mapping */
1673 if (PyInt_Check(x)) {
1674 int value = PyInt_AS_LONG(x);
1675 if (value < 0 || value > 65535) {
1676 PyErr_SetString(PyExc_TypeError,
1677 "character mapping must be in range(65336)");
1678 Py_DECREF(x);
1679 goto onError;
1680 }
1681 *p++ = (Py_UNICODE)value;
1682 }
1683 else if (x == Py_None) {
1684 /* undefined mapping */
1685 if (charmap_decoding_error(&s, &p, errors,
1686 "character maps to <undefined>")) {
1687 Py_DECREF(x);
1688 goto onError;
1689 }
1690 }
1691 else if (PyUnicode_Check(x)) {
1692 if (PyUnicode_GET_SIZE(x) != 1) {
1693 /* 1-n mapping */
1694 PyErr_SetString(PyExc_NotImplementedError,
1695 "1-n mappings are currently not implemented");
1696 Py_DECREF(x);
1697 goto onError;
1698 }
1699 *p++ = *PyUnicode_AS_UNICODE(x);
1700 }
1701 else {
1702 /* wrong return value */
1703 PyErr_SetString(PyExc_TypeError,
1704 "character mapping must return integer, None or unicode");
1705 Py_DECREF(x);
1706 goto onError;
1707 }
1708 Py_DECREF(x);
1709 }
1710 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1711 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1712 goto onError;
1713 return (PyObject *)v;
1714
1715 onError:
1716 Py_XDECREF(v);
1717 return NULL;
1718}
1719
1720static
1721int charmap_encoding_error(const Py_UNICODE **source,
1722 char **dest,
1723 const char *errors,
1724 const char *details)
1725{
1726 if ((errors == NULL) ||
1727 (strcmp(errors,"strict") == 0)) {
1728 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001729 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001730 details);
1731 return -1;
1732 }
1733 else if (strcmp(errors,"ignore") == 0) {
1734 return 0;
1735 }
1736 else if (strcmp(errors,"replace") == 0) {
1737 **dest = '?';
1738 (*dest)++;
1739 return 0;
1740 }
1741 else {
1742 PyErr_Format(PyExc_ValueError,
1743 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001744 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001745 errors);
1746 return -1;
1747 }
1748}
1749
1750PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
1751 int size,
1752 PyObject *mapping,
1753 const char *errors)
1754{
1755 PyObject *v;
1756 char *s;
1757
1758 /* Default to Latin-1 */
1759 if (mapping == NULL)
1760 return PyUnicode_EncodeLatin1(p, size, errors);
1761
1762 v = PyString_FromStringAndSize(NULL, size);
1763 if (v == NULL)
1764 return NULL;
1765 s = PyString_AS_STRING(v);
1766 while (size-- > 0) {
1767 Py_UNICODE ch = *p++;
1768 PyObject *w, *x;
1769
1770 /* Get mapping (Unicode ordinal -> string char, integer or None) */
1771 w = PyInt_FromLong((long)ch);
1772 if (w == NULL)
1773 goto onError;
1774 x = PyObject_GetItem(mapping, w);
1775 Py_DECREF(w);
1776 if (x == NULL) {
1777 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1778 /* No mapping found: default to Latin-1 mapping if possible */
1779 PyErr_Clear();
1780 if (ch < 256) {
1781 *s++ = (char)ch;
1782 continue;
1783 }
1784 else if (!charmap_encoding_error(&p, &s, errors,
1785 "missing character mapping"))
1786 continue;
1787 }
1788 goto onError;
1789 }
1790
1791 /* Apply mapping */
1792 if (PyInt_Check(x)) {
1793 int value = PyInt_AS_LONG(x);
1794 if (value < 0 || value > 255) {
1795 PyErr_SetString(PyExc_TypeError,
1796 "character mapping must be in range(256)");
1797 Py_DECREF(x);
1798 goto onError;
1799 }
1800 *s++ = (char)value;
1801 }
1802 else if (x == Py_None) {
1803 /* undefined mapping */
1804 if (charmap_encoding_error(&p, &s, errors,
1805 "character maps to <undefined>")) {
1806 Py_DECREF(x);
1807 goto onError;
1808 }
1809 }
1810 else if (PyString_Check(x)) {
1811 if (PyString_GET_SIZE(x) != 1) {
1812 /* 1-n mapping */
1813 PyErr_SetString(PyExc_NotImplementedError,
1814 "1-n mappings are currently not implemented");
1815 Py_DECREF(x);
1816 goto onError;
1817 }
1818 *s++ = *PyString_AS_STRING(x);
1819 }
1820 else {
1821 /* wrong return value */
1822 PyErr_SetString(PyExc_TypeError,
1823 "character mapping must return integer, None or unicode");
1824 Py_DECREF(x);
1825 goto onError;
1826 }
1827 Py_DECREF(x);
1828 }
1829 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
1830 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
1831 goto onError;
1832 return v;
1833
1834 onError:
1835 Py_DECREF(v);
1836 return NULL;
1837}
1838
1839PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
1840 PyObject *mapping)
1841{
1842 if (!PyUnicode_Check(unicode) || mapping == NULL) {
1843 PyErr_BadArgument();
1844 return NULL;
1845 }
1846 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
1847 PyUnicode_GET_SIZE(unicode),
1848 mapping,
1849 NULL);
1850}
1851
1852static
1853int translate_error(const Py_UNICODE **source,
1854 Py_UNICODE **dest,
1855 const char *errors,
1856 const char *details)
1857{
1858 if ((errors == NULL) ||
1859 (strcmp(errors,"strict") == 0)) {
1860 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001861 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001862 details);
1863 return -1;
1864 }
1865 else if (strcmp(errors,"ignore") == 0) {
1866 return 0;
1867 }
1868 else if (strcmp(errors,"replace") == 0) {
1869 **dest = '?';
1870 (*dest)++;
1871 return 0;
1872 }
1873 else {
1874 PyErr_Format(PyExc_ValueError,
1875 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001876 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001877 errors);
1878 return -1;
1879 }
1880}
1881
1882PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
1883 int size,
1884 PyObject *mapping,
1885 const char *errors)
1886{
1887 PyUnicodeObject *v;
1888 Py_UNICODE *p;
1889
1890 if (mapping == NULL) {
1891 PyErr_BadArgument();
1892 return NULL;
1893 }
1894
1895 /* Output will never be longer than input */
1896 v = _PyUnicode_New(size);
1897 if (v == NULL)
1898 goto onError;
1899 if (size == 0)
1900 goto done;
1901 p = PyUnicode_AS_UNICODE(v);
1902 while (size-- > 0) {
1903 Py_UNICODE ch = *s++;
1904 PyObject *w, *x;
1905
1906 /* Get mapping */
1907 w = PyInt_FromLong(ch);
1908 if (w == NULL)
1909 goto onError;
1910 x = PyObject_GetItem(mapping, w);
1911 Py_DECREF(w);
1912 if (x == NULL) {
1913 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1914 /* No mapping found: default to 1-1 mapping */
1915 PyErr_Clear();
1916 *p++ = ch;
1917 continue;
1918 }
1919 goto onError;
1920 }
1921
1922 /* Apply mapping */
1923 if (PyInt_Check(x))
1924 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
1925 else if (x == Py_None) {
1926 /* undefined mapping */
1927 if (translate_error(&s, &p, errors,
1928 "character maps to <undefined>")) {
1929 Py_DECREF(x);
1930 goto onError;
1931 }
1932 }
1933 else if (PyUnicode_Check(x)) {
1934 if (PyUnicode_GET_SIZE(x) != 1) {
1935 /* 1-n mapping */
1936 PyErr_SetString(PyExc_NotImplementedError,
1937 "1-n mappings are currently not implemented");
1938 Py_DECREF(x);
1939 goto onError;
1940 }
1941 *p++ = *PyUnicode_AS_UNICODE(x);
1942 }
1943 else {
1944 /* wrong return value */
1945 PyErr_SetString(PyExc_TypeError,
1946 "translate mapping must return integer, None or unicode");
1947 Py_DECREF(x);
1948 goto onError;
1949 }
1950 Py_DECREF(x);
1951 }
1952 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001953 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1954 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001955
1956 done:
1957 return (PyObject *)v;
1958
1959 onError:
1960 Py_XDECREF(v);
1961 return NULL;
1962}
1963
1964PyObject *PyUnicode_Translate(PyObject *str,
1965 PyObject *mapping,
1966 const char *errors)
1967{
1968 PyObject *result;
1969
1970 str = PyUnicode_FromObject(str);
1971 if (str == NULL)
1972 goto onError;
1973 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
1974 PyUnicode_GET_SIZE(str),
1975 mapping,
1976 errors);
1977 Py_DECREF(str);
1978 return result;
1979
1980 onError:
1981 Py_XDECREF(str);
1982 return NULL;
1983}
1984
Guido van Rossum9e896b32000-04-05 20:11:21 +00001985/* --- Decimal Encoder ---------------------------------------------------- */
1986
1987int PyUnicode_EncodeDecimal(Py_UNICODE *s,
1988 int length,
1989 char *output,
1990 const char *errors)
1991{
1992 Py_UNICODE *p, *end;
1993
1994 if (output == NULL) {
1995 PyErr_BadArgument();
1996 return -1;
1997 }
1998
1999 p = s;
2000 end = s + length;
2001 while (p < end) {
2002 register Py_UNICODE ch = *p++;
2003 int decimal;
2004
2005 if (Py_UNICODE_ISSPACE(ch)) {
2006 *output++ = ' ';
2007 continue;
2008 }
2009 decimal = Py_UNICODE_TODECIMAL(ch);
2010 if (decimal >= 0) {
2011 *output++ = '0' + decimal;
2012 continue;
2013 }
Guido van Rossumba477042000-04-06 18:18:10 +00002014 if (0 < ch && ch < 256) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002015 *output++ = ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002016 continue;
2017 }
2018 /* All other characters are considered invalid */
2019 if (errors == NULL || strcmp(errors, "strict") == 0) {
2020 PyErr_SetString(PyExc_ValueError,
2021 "invalid decimal Unicode string");
2022 goto onError;
2023 }
2024 else if (strcmp(errors, "ignore") == 0)
2025 continue;
2026 else if (strcmp(errors, "replace") == 0) {
2027 *output++ = '?';
2028 continue;
2029 }
2030 }
2031 /* 0-terminate the output string */
2032 *output++ = '\0';
2033 return 0;
2034
2035 onError:
2036 return -1;
2037}
2038
Guido van Rossumd57fd912000-03-10 22:53:23 +00002039/* --- Helpers ------------------------------------------------------------ */
2040
2041static
2042int count(PyUnicodeObject *self,
2043 int start,
2044 int end,
2045 PyUnicodeObject *substring)
2046{
2047 int count = 0;
2048
2049 end -= substring->length;
2050
2051 while (start <= end)
2052 if (Py_UNICODE_MATCH(self, start, substring)) {
2053 count++;
2054 start += substring->length;
2055 } else
2056 start++;
2057
2058 return count;
2059}
2060
2061int PyUnicode_Count(PyObject *str,
2062 PyObject *substr,
2063 int start,
2064 int end)
2065{
2066 int result;
2067
2068 str = PyUnicode_FromObject(str);
2069 if (str == NULL)
2070 return -1;
2071 substr = PyUnicode_FromObject(substr);
2072 if (substr == NULL) {
2073 Py_DECREF(substr);
2074 return -1;
2075 }
2076
2077 result = count((PyUnicodeObject *)str,
2078 start, end,
2079 (PyUnicodeObject *)substr);
2080
2081 Py_DECREF(str);
2082 Py_DECREF(substr);
2083 return result;
2084}
2085
2086static
2087int findstring(PyUnicodeObject *self,
2088 PyUnicodeObject *substring,
2089 int start,
2090 int end,
2091 int direction)
2092{
2093 if (start < 0)
2094 start += self->length;
2095 if (start < 0)
2096 start = 0;
2097
2098 if (substring->length == 0)
2099 return start;
2100
2101 if (end > self->length)
2102 end = self->length;
2103 if (end < 0)
2104 end += self->length;
2105 if (end < 0)
2106 end = 0;
2107
2108 end -= substring->length;
2109
2110 if (direction < 0) {
2111 for (; end >= start; end--)
2112 if (Py_UNICODE_MATCH(self, end, substring))
2113 return end;
2114 } else {
2115 for (; start <= end; start++)
2116 if (Py_UNICODE_MATCH(self, start, substring))
2117 return start;
2118 }
2119
2120 return -1;
2121}
2122
2123int PyUnicode_Find(PyObject *str,
2124 PyObject *substr,
2125 int start,
2126 int end,
2127 int direction)
2128{
2129 int result;
2130
2131 str = PyUnicode_FromObject(str);
2132 if (str == NULL)
2133 return -1;
2134 substr = PyUnicode_FromObject(substr);
2135 if (substr == NULL) {
2136 Py_DECREF(substr);
2137 return -1;
2138 }
2139
2140 result = findstring((PyUnicodeObject *)str,
2141 (PyUnicodeObject *)substr,
2142 start, end, direction);
2143 Py_DECREF(str);
2144 Py_DECREF(substr);
2145 return result;
2146}
2147
2148static
2149int tailmatch(PyUnicodeObject *self,
2150 PyUnicodeObject *substring,
2151 int start,
2152 int end,
2153 int direction)
2154{
2155 if (start < 0)
2156 start += self->length;
2157 if (start < 0)
2158 start = 0;
2159
2160 if (substring->length == 0)
2161 return 1;
2162
2163 if (end > self->length)
2164 end = self->length;
2165 if (end < 0)
2166 end += self->length;
2167 if (end < 0)
2168 end = 0;
2169
2170 end -= substring->length;
2171 if (end < start)
2172 return 0;
2173
2174 if (direction > 0) {
2175 if (Py_UNICODE_MATCH(self, end, substring))
2176 return 1;
2177 } else {
2178 if (Py_UNICODE_MATCH(self, start, substring))
2179 return 1;
2180 }
2181
2182 return 0;
2183}
2184
2185int PyUnicode_Tailmatch(PyObject *str,
2186 PyObject *substr,
2187 int start,
2188 int end,
2189 int direction)
2190{
2191 int result;
2192
2193 str = PyUnicode_FromObject(str);
2194 if (str == NULL)
2195 return -1;
2196 substr = PyUnicode_FromObject(substr);
2197 if (substr == NULL) {
2198 Py_DECREF(substr);
2199 return -1;
2200 }
2201
2202 result = tailmatch((PyUnicodeObject *)str,
2203 (PyUnicodeObject *)substr,
2204 start, end, direction);
2205 Py_DECREF(str);
2206 Py_DECREF(substr);
2207 return result;
2208}
2209
2210static
2211const Py_UNICODE *findchar(const Py_UNICODE *s,
2212 int size,
2213 Py_UNICODE ch)
2214{
2215 /* like wcschr, but doesn't stop at NULL characters */
2216
2217 while (size-- > 0) {
2218 if (*s == ch)
2219 return s;
2220 s++;
2221 }
2222
2223 return NULL;
2224}
2225
2226/* Apply fixfct filter to the Unicode object self and return a
2227 reference to the modified object */
2228
2229static
2230PyObject *fixup(PyUnicodeObject *self,
2231 int (*fixfct)(PyUnicodeObject *s))
2232{
2233
2234 PyUnicodeObject *u;
2235
2236 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2237 self->length);
2238 if (u == NULL)
2239 return NULL;
2240 if (!fixfct(u)) {
2241 /* fixfct should return TRUE if it modified the buffer. If
2242 FALSE, return a reference to the original buffer instead
2243 (to save space, not time) */
2244 Py_INCREF(self);
2245 Py_DECREF(u);
2246 return (PyObject*) self;
2247 }
2248 return (PyObject*) u;
2249}
2250
2251static
2252int fixupper(PyUnicodeObject *self)
2253{
2254 int len = self->length;
2255 Py_UNICODE *s = self->str;
2256 int status = 0;
2257
2258 while (len-- > 0) {
2259 register Py_UNICODE ch;
2260
2261 ch = Py_UNICODE_TOUPPER(*s);
2262 if (ch != *s) {
2263 status = 1;
2264 *s = ch;
2265 }
2266 s++;
2267 }
2268
2269 return status;
2270}
2271
2272static
2273int fixlower(PyUnicodeObject *self)
2274{
2275 int len = self->length;
2276 Py_UNICODE *s = self->str;
2277 int status = 0;
2278
2279 while (len-- > 0) {
2280 register Py_UNICODE ch;
2281
2282 ch = Py_UNICODE_TOLOWER(*s);
2283 if (ch != *s) {
2284 status = 1;
2285 *s = ch;
2286 }
2287 s++;
2288 }
2289
2290 return status;
2291}
2292
2293static
2294int fixswapcase(PyUnicodeObject *self)
2295{
2296 int len = self->length;
2297 Py_UNICODE *s = self->str;
2298 int status = 0;
2299
2300 while (len-- > 0) {
2301 if (Py_UNICODE_ISUPPER(*s)) {
2302 *s = Py_UNICODE_TOLOWER(*s);
2303 status = 1;
2304 } else if (Py_UNICODE_ISLOWER(*s)) {
2305 *s = Py_UNICODE_TOUPPER(*s);
2306 status = 1;
2307 }
2308 s++;
2309 }
2310
2311 return status;
2312}
2313
2314static
2315int fixcapitalize(PyUnicodeObject *self)
2316{
2317 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2318 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2319 return 1;
2320 }
2321 return 0;
2322}
2323
2324static
2325int fixtitle(PyUnicodeObject *self)
2326{
2327 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2328 register Py_UNICODE *e;
2329 int previous_is_cased;
2330
2331 /* Shortcut for single character strings */
2332 if (PyUnicode_GET_SIZE(self) == 1) {
2333 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2334 if (*p != ch) {
2335 *p = ch;
2336 return 1;
2337 }
2338 else
2339 return 0;
2340 }
2341
2342 e = p + PyUnicode_GET_SIZE(self);
2343 previous_is_cased = 0;
2344 for (; p < e; p++) {
2345 register const Py_UNICODE ch = *p;
2346
2347 if (previous_is_cased)
2348 *p = Py_UNICODE_TOLOWER(ch);
2349 else
2350 *p = Py_UNICODE_TOTITLE(ch);
2351
2352 if (Py_UNICODE_ISLOWER(ch) ||
2353 Py_UNICODE_ISUPPER(ch) ||
2354 Py_UNICODE_ISTITLE(ch))
2355 previous_is_cased = 1;
2356 else
2357 previous_is_cased = 0;
2358 }
2359 return 1;
2360}
2361
2362PyObject *PyUnicode_Join(PyObject *separator,
2363 PyObject *seq)
2364{
2365 Py_UNICODE *sep;
2366 int seplen;
2367 PyUnicodeObject *res = NULL;
2368 int reslen = 0;
2369 Py_UNICODE *p;
2370 int seqlen = 0;
2371 int sz = 100;
2372 int i;
2373
2374 seqlen = PySequence_Length(seq);
2375 if (seqlen < 0 && PyErr_Occurred())
2376 return NULL;
2377
2378 if (separator == NULL) {
2379 Py_UNICODE blank = ' ';
2380 sep = &blank;
2381 seplen = 1;
2382 }
2383 else {
2384 separator = PyUnicode_FromObject(separator);
2385 if (separator == NULL)
2386 return NULL;
2387 sep = PyUnicode_AS_UNICODE(separator);
2388 seplen = PyUnicode_GET_SIZE(separator);
2389 }
2390
2391 res = _PyUnicode_New(sz);
2392 if (res == NULL)
2393 goto onError;
2394 p = PyUnicode_AS_UNICODE(res);
2395 reslen = 0;
2396
2397 for (i = 0; i < seqlen; i++) {
2398 int itemlen;
2399 PyObject *item;
2400
2401 item = PySequence_GetItem(seq, i);
2402 if (item == NULL)
2403 goto onError;
2404 if (!PyUnicode_Check(item)) {
2405 PyObject *v;
2406 v = PyUnicode_FromObject(item);
2407 Py_DECREF(item);
2408 item = v;
2409 if (item == NULL)
2410 goto onError;
2411 }
2412 itemlen = PyUnicode_GET_SIZE(item);
2413 while (reslen + itemlen + seplen >= sz) {
2414 if (_PyUnicode_Resize(res, sz*2))
2415 goto onError;
2416 sz *= 2;
2417 p = PyUnicode_AS_UNICODE(res) + reslen;
2418 }
2419 if (i > 0) {
2420 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2421 p += seplen;
2422 reslen += seplen;
2423 }
2424 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2425 p += itemlen;
2426 reslen += itemlen;
2427 Py_DECREF(item);
2428 }
2429 if (_PyUnicode_Resize(res, reslen))
2430 goto onError;
2431
2432 Py_XDECREF(separator);
2433 return (PyObject *)res;
2434
2435 onError:
2436 Py_XDECREF(separator);
2437 Py_DECREF(res);
2438 return NULL;
2439}
2440
2441static
2442PyUnicodeObject *pad(PyUnicodeObject *self,
2443 int left,
2444 int right,
2445 Py_UNICODE fill)
2446{
2447 PyUnicodeObject *u;
2448
2449 if (left < 0)
2450 left = 0;
2451 if (right < 0)
2452 right = 0;
2453
2454 if (left == 0 && right == 0) {
2455 Py_INCREF(self);
2456 return self;
2457 }
2458
2459 u = _PyUnicode_New(left + self->length + right);
2460 if (u) {
2461 if (left)
2462 Py_UNICODE_FILL(u->str, fill, left);
2463 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2464 if (right)
2465 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2466 }
2467
2468 return u;
2469}
2470
/* Append the slice data[left:right] to `list` as a new Unicode object.

   Relies on the enclosing function providing a `PyObject *str` scratch
   variable, a `list` object and an `onError` label, to which control
   jumps when creating or appending the slice fails.

   Wrapped in do { } while (0) so that it expands to exactly one
   statement; arguments are parenthesized so arbitrary expressions can
   be passed safely.  Must be invoked with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2481
2482static
2483PyObject *split_whitespace(PyUnicodeObject *self,
2484 PyObject *list,
2485 int maxcount)
2486{
2487 register int i;
2488 register int j;
2489 int len = self->length;
2490 PyObject *str;
2491
2492 for (i = j = 0; i < len; ) {
2493 /* find a token */
2494 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2495 i++;
2496 j = i;
2497 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2498 i++;
2499 if (j < i) {
2500 if (maxcount-- <= 0)
2501 break;
2502 SPLIT_APPEND(self->str, j, i);
2503 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2504 i++;
2505 j = i;
2506 }
2507 }
2508 if (j < len) {
2509 SPLIT_APPEND(self->str, j, len);
2510 }
2511 return list;
2512
2513 onError:
2514 Py_DECREF(list);
2515 return NULL;
2516}
2517
2518PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002519 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002520{
2521 register int i;
2522 register int j;
2523 int len;
2524 PyObject *list;
2525 PyObject *str;
2526 Py_UNICODE *data;
2527
2528 string = PyUnicode_FromObject(string);
2529 if (string == NULL)
2530 return NULL;
2531 data = PyUnicode_AS_UNICODE(string);
2532 len = PyUnicode_GET_SIZE(string);
2533
Guido van Rossumd57fd912000-03-10 22:53:23 +00002534 list = PyList_New(0);
2535 if (!list)
2536 goto onError;
2537
2538 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002539 int eol;
2540
Guido van Rossumd57fd912000-03-10 22:53:23 +00002541 /* Find a line and append it */
2542 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2543 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002544
2545 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002546 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547 if (i < len) {
2548 if (data[i] == '\r' && i + 1 < len &&
2549 data[i+1] == '\n')
2550 i += 2;
2551 else
2552 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002553 if (keepends)
2554 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002555 }
Guido van Rossum86662912000-04-11 15:38:46 +00002556 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002557 j = i;
2558 }
2559 if (j < len) {
2560 SPLIT_APPEND(data, j, len);
2561 }
2562
2563 Py_DECREF(string);
2564 return list;
2565
2566 onError:
2567 Py_DECREF(list);
2568 Py_DECREF(string);
2569 return NULL;
2570}
2571
2572static
2573PyObject *split_char(PyUnicodeObject *self,
2574 PyObject *list,
2575 Py_UNICODE ch,
2576 int maxcount)
2577{
2578 register int i;
2579 register int j;
2580 int len = self->length;
2581 PyObject *str;
2582
2583 for (i = j = 0; i < len; ) {
2584 if (self->str[i] == ch) {
2585 if (maxcount-- <= 0)
2586 break;
2587 SPLIT_APPEND(self->str, j, i);
2588 i = j = i + 1;
2589 } else
2590 i++;
2591 }
2592 if (j <= len) {
2593 SPLIT_APPEND(self->str, j, len);
2594 }
2595 return list;
2596
2597 onError:
2598 Py_DECREF(list);
2599 return NULL;
2600}
2601
2602static
2603PyObject *split_substring(PyUnicodeObject *self,
2604 PyObject *list,
2605 PyUnicodeObject *substring,
2606 int maxcount)
2607{
2608 register int i;
2609 register int j;
2610 int len = self->length;
2611 int sublen = substring->length;
2612 PyObject *str;
2613
2614 for (i = j = 0; i < len - sublen; ) {
2615 if (Py_UNICODE_MATCH(self, i, substring)) {
2616 if (maxcount-- <= 0)
2617 break;
2618 SPLIT_APPEND(self->str, j, i);
2619 i = j = i + sublen;
2620 } else
2621 i++;
2622 }
2623 if (j <= len) {
2624 SPLIT_APPEND(self->str, j, len);
2625 }
2626 return list;
2627
2628 onError:
2629 Py_DECREF(list);
2630 return NULL;
2631}
2632
2633#undef SPLIT_APPEND
2634
2635static
2636PyObject *split(PyUnicodeObject *self,
2637 PyUnicodeObject *substring,
2638 int maxcount)
2639{
2640 PyObject *list;
2641
2642 if (maxcount < 0)
2643 maxcount = INT_MAX;
2644
2645 list = PyList_New(0);
2646 if (!list)
2647 return NULL;
2648
2649 if (substring == NULL)
2650 return split_whitespace(self,list,maxcount);
2651
2652 else if (substring->length == 1)
2653 return split_char(self,list,substring->str[0],maxcount);
2654
2655 else if (substring->length == 0) {
2656 Py_DECREF(list);
2657 PyErr_SetString(PyExc_ValueError, "empty separator");
2658 return NULL;
2659 }
2660 else
2661 return split_substring(self,list,substring,maxcount);
2662}
2663
2664static
2665PyObject *strip(PyUnicodeObject *self,
2666 int left,
2667 int right)
2668{
2669 Py_UNICODE *p = self->str;
2670 int start = 0;
2671 int end = self->length;
2672
2673 if (left)
2674 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2675 start++;
2676
2677 if (right)
2678 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2679 end--;
2680
2681 if (start == 0 && end == self->length) {
2682 /* couldn't strip anything off, return original string */
2683 Py_INCREF(self);
2684 return (PyObject*) self;
2685 }
2686
2687 return (PyObject*) PyUnicode_FromUnicode(
2688 self->str + start,
2689 end - start
2690 );
2691}
2692
2693static
2694PyObject *replace(PyUnicodeObject *self,
2695 PyUnicodeObject *str1,
2696 PyUnicodeObject *str2,
2697 int maxcount)
2698{
2699 PyUnicodeObject *u;
2700
2701 if (maxcount < 0)
2702 maxcount = INT_MAX;
2703
2704 if (str1->length == 1 && str2->length == 1) {
2705 int i;
2706
2707 /* replace characters */
2708 if (!findchar(self->str, self->length, str1->str[0])) {
2709 /* nothing to replace, return original string */
2710 Py_INCREF(self);
2711 u = self;
2712 } else {
2713 Py_UNICODE u1 = str1->str[0];
2714 Py_UNICODE u2 = str2->str[0];
2715
2716 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
2717 self->str,
2718 self->length
2719 );
2720 if (u)
2721 for (i = 0; i < u->length; i++)
2722 if (u->str[i] == u1) {
2723 if (--maxcount < 0)
2724 break;
2725 u->str[i] = u2;
2726 }
2727 }
2728
2729 } else {
2730 int n, i;
2731 Py_UNICODE *p;
2732
2733 /* replace strings */
2734 n = count(self, 0, self->length, str1);
2735 if (n > maxcount)
2736 n = maxcount;
2737 if (n == 0) {
2738 /* nothing to replace, return original string */
2739 Py_INCREF(self);
2740 u = self;
2741 } else {
2742 u = _PyUnicode_New(
2743 self->length + n * (str2->length - str1->length));
2744 if (u) {
2745 i = 0;
2746 p = u->str;
2747 while (i <= self->length - str1->length)
2748 if (Py_UNICODE_MATCH(self, i, str1)) {
2749 /* replace string segment */
2750 Py_UNICODE_COPY(p, str2->str, str2->length);
2751 p += str2->length;
2752 i += str1->length;
2753 if (--n <= 0) {
2754 /* copy remaining part */
2755 Py_UNICODE_COPY(p, self->str+i, self->length-i);
2756 break;
2757 }
2758 } else
2759 *p++ = self->str[i++];
2760 }
2761 }
2762 }
2763
2764 return (PyObject *) u;
2765}
2766
2767/* --- Unicode Object Methods --------------------------------------------- */
2768
2769static char title__doc__[] =
2770"S.title() -> unicode\n\
2771\n\
2772Return a titlecased version of S, i.e. words start with title case\n\
2773characters, all remaining cased characters have lower case.";
2774
2775static PyObject*
2776unicode_title(PyUnicodeObject *self, PyObject *args)
2777{
2778 if (!PyArg_NoArgs(args))
2779 return NULL;
2780 return fixup(self, fixtitle);
2781}
2782
2783static char capitalize__doc__[] =
2784"S.capitalize() -> unicode\n\
2785\n\
2786Return a capitalized version of S, i.e. make the first character\n\
2787have upper case.";
2788
2789static PyObject*
2790unicode_capitalize(PyUnicodeObject *self, PyObject *args)
2791{
2792 if (!PyArg_NoArgs(args))
2793 return NULL;
2794 return fixup(self, fixcapitalize);
2795}
2796
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method S.capwords() (currently compiled out): split S at whitespace,
   run every word through fixup()/fixcapitalize and join the words
   again using the default separator of PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result;
    int idx;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Break the string into its words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Replace every word by its capitalized version */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (result == NULL)
            goto done;          /* result == NULL is returned below */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
    }

    /* Glue the words back together */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
2837
2838static char center__doc__[] =
2839"S.center(width) -> unicode\n\
2840\n\
2841Return S centered in a Unicode string of length width. Padding is done\n\
2842using spaces.";
2843
2844static PyObject *
2845unicode_center(PyUnicodeObject *self, PyObject *args)
2846{
2847 int marg, left;
2848 int width;
2849
2850 if (!PyArg_ParseTuple(args, "i:center", &width))
2851 return NULL;
2852
2853 if (self->length >= width) {
2854 Py_INCREF(self);
2855 return (PyObject*) self;
2856 }
2857
2858 marg = width - self->length;
2859 left = marg / 2 + (marg & width & 1);
2860
2861 return (PyObject*) pad(self, left, marg - left, ' ');
2862}
2863
2864static int
2865unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
2866{
2867 int len1, len2;
2868 Py_UNICODE *s1 = str1->str;
2869 Py_UNICODE *s2 = str2->str;
2870
2871 len1 = str1->length;
2872 len2 = str2->length;
2873
2874 while (len1 > 0 && len2 > 0) {
2875 int cmp = (*s1++) - (*s2++);
2876 if (cmp)
2877 /* This should make Christian happy! */
2878 return (cmp < 0) ? -1 : (cmp != 0);
2879 len1--, len2--;
2880 }
2881
2882 return (len1 < len2) ? -1 : (len1 != len2);
2883}
2884
2885int PyUnicode_Compare(PyObject *left,
2886 PyObject *right)
2887{
2888 PyUnicodeObject *u = NULL, *v = NULL;
2889 int result;
2890
2891 /* Coerce the two arguments */
2892 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2893 if (u == NULL)
2894 goto onError;
2895 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2896 if (v == NULL)
2897 goto onError;
2898
2899 /* Shortcut for emtpy or interned objects */
2900 if (v == u) {
2901 Py_DECREF(u);
2902 Py_DECREF(v);
2903 return 0;
2904 }
2905
2906 result = unicode_compare(u, v);
2907
2908 Py_DECREF(u);
2909 Py_DECREF(v);
2910 return result;
2911
2912onError:
2913 Py_XDECREF(u);
2914 Py_XDECREF(v);
2915 return -1;
2916}
2917
Guido van Rossum403d68b2000-03-13 15:55:09 +00002918int PyUnicode_Contains(PyObject *container,
2919 PyObject *element)
2920{
2921 PyUnicodeObject *u = NULL, *v = NULL;
2922 int result;
2923 register const Py_UNICODE *p, *e;
2924 register Py_UNICODE ch;
2925
2926 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00002927 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
2928 if (v == NULL)
2929 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002930 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
2931 if (u == NULL) {
2932 Py_DECREF(v);
2933 goto onError;
2934 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00002935
2936 /* Check v in u */
2937 if (PyUnicode_GET_SIZE(v) != 1) {
2938 PyErr_SetString(PyExc_TypeError,
2939 "string member test needs char left operand");
2940 goto onError;
2941 }
2942 ch = *PyUnicode_AS_UNICODE(v);
2943 p = PyUnicode_AS_UNICODE(u);
2944 e = p + PyUnicode_GET_SIZE(u);
2945 result = 0;
2946 while (p < e) {
2947 if (*p++ == ch) {
2948 result = 1;
2949 break;
2950 }
2951 }
2952
2953 Py_DECREF(u);
2954 Py_DECREF(v);
2955 return result;
2956
2957onError:
2958 Py_XDECREF(u);
2959 Py_XDECREF(v);
2960 return -1;
2961}
2962
Guido van Rossumd57fd912000-03-10 22:53:23 +00002963/* Concat to string or Unicode object giving a new Unicode object. */
2964
2965PyObject *PyUnicode_Concat(PyObject *left,
2966 PyObject *right)
2967{
2968 PyUnicodeObject *u = NULL, *v = NULL, *w;
2969
2970 /* Coerce the two arguments */
2971 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2972 if (u == NULL)
2973 goto onError;
2974 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2975 if (v == NULL)
2976 goto onError;
2977
2978 /* Shortcuts */
2979 if (v == unicode_empty) {
2980 Py_DECREF(v);
2981 return (PyObject *)u;
2982 }
2983 if (u == unicode_empty) {
2984 Py_DECREF(u);
2985 return (PyObject *)v;
2986 }
2987
2988 /* Concat the two Unicode strings */
2989 w = _PyUnicode_New(u->length + v->length);
2990 if (w == NULL)
2991 goto onError;
2992 Py_UNICODE_COPY(w->str, u->str, u->length);
2993 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
2994
2995 Py_DECREF(u);
2996 Py_DECREF(v);
2997 return (PyObject *)w;
2998
2999onError:
3000 Py_XDECREF(u);
3001 Py_XDECREF(v);
3002 return NULL;
3003}
3004
3005static char count__doc__[] =
3006"S.count(sub[, start[, end]]) -> int\n\
3007\n\
3008Return the number of occurrences of substring sub in Unicode string\n\
3009S[start:end]. Optional arguments start and end are\n\
3010interpreted as in slice notation.";
3011
3012static PyObject *
3013unicode_count(PyUnicodeObject *self, PyObject *args)
3014{
3015 PyUnicodeObject *substring;
3016 int start = 0;
3017 int end = INT_MAX;
3018 PyObject *result;
3019
3020 if (!PyArg_ParseTuple(args, "O|ii:count", &substring, &start, &end))
3021 return NULL;
3022
3023 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3024 (PyObject *)substring);
3025 if (substring == NULL)
3026 return NULL;
3027
3028 if (substring->length == 0) {
3029 Py_DECREF(substring);
3030 return PyInt_FromLong((long) 0);
3031 }
3032
3033 if (start < 0)
3034 start += self->length;
3035 if (start < 0)
3036 start = 0;
3037 if (end > self->length)
3038 end = self->length;
3039 if (end < 0)
3040 end += self->length;
3041 if (end < 0)
3042 end = 0;
3043
3044 result = PyInt_FromLong((long) count(self, start, end, substring));
3045
3046 Py_DECREF(substring);
3047 return result;
3048}
3049
3050static char encode__doc__[] =
3051"S.encode([encoding[,errors]]) -> string\n\
3052\n\
3053Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
3054errors may be given to set a different error handling scheme. Default\n\
3055is 'strict' meaning that encoding errors raise a ValueError. Other\n\
3056possible values are 'ignore' and 'replace'.";
3057
3058static PyObject *
3059unicode_encode(PyUnicodeObject *self, PyObject *args)
3060{
3061 char *encoding = NULL;
3062 char *errors = NULL;
3063 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3064 return NULL;
3065 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3066}
3067
3068static char expandtabs__doc__[] =
3069"S.expandtabs([tabsize]) -> unicode\n\
3070\n\
3071Return a copy of S where all tab characters are expanded using spaces.\n\
3072If tabsize is not given, a tab size of 8 characters is assumed.";
3073
3074static PyObject*
3075unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3076{
3077 Py_UNICODE *e;
3078 Py_UNICODE *p;
3079 Py_UNICODE *q;
3080 int i, j;
3081 PyUnicodeObject *u;
3082 int tabsize = 8;
3083
3084 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3085 return NULL;
3086
3087 /* First pass: determine size of ouput string */
3088 i = j = 0;
3089 e = self->str + self->length;
3090 for (p = self->str; p < e; p++)
3091 if (*p == '\t') {
3092 if (tabsize > 0)
3093 j += tabsize - (j % tabsize);
3094 }
3095 else {
3096 j++;
3097 if (*p == '\n' || *p == '\r') {
3098 i += j;
3099 j = 0;
3100 }
3101 }
3102
3103 /* Second pass: create output string and fill it */
3104 u = _PyUnicode_New(i + j);
3105 if (!u)
3106 return NULL;
3107
3108 j = 0;
3109 q = u->str;
3110
3111 for (p = self->str; p < e; p++)
3112 if (*p == '\t') {
3113 if (tabsize > 0) {
3114 i = tabsize - (j % tabsize);
3115 j += i;
3116 while (i--)
3117 *q++ = ' ';
3118 }
3119 }
3120 else {
3121 j++;
3122 *q++ = *p;
3123 if (*p == '\n' || *p == '\r')
3124 j = 0;
3125 }
3126
3127 return (PyObject*) u;
3128}
3129
3130static char find__doc__[] =
3131"S.find(sub [,start [,end]]) -> int\n\
3132\n\
3133Return the lowest index in S where substring sub is found,\n\
3134such that sub is contained within s[start,end]. Optional\n\
3135arguments start and end are interpreted as in slice notation.\n\
3136\n\
3137Return -1 on failure.";
3138
3139static PyObject *
3140unicode_find(PyUnicodeObject *self, PyObject *args)
3141{
3142 PyUnicodeObject *substring;
3143 int start = 0;
3144 int end = INT_MAX;
3145 PyObject *result;
3146
3147 if (!PyArg_ParseTuple(args, "O|ii:find", &substring, &start, &end))
3148 return NULL;
3149 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3150 (PyObject *)substring);
3151 if (substring == NULL)
3152 return NULL;
3153
3154 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3155
3156 Py_DECREF(substring);
3157 return result;
3158}
3159
3160static PyObject *
3161unicode_getitem(PyUnicodeObject *self, int index)
3162{
3163 if (index < 0 || index >= self->length) {
3164 PyErr_SetString(PyExc_IndexError, "string index out of range");
3165 return NULL;
3166 }
3167
3168 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3169}
3170
3171static long
3172unicode_hash(PyUnicodeObject *self)
3173{
3174 long hash;
3175 PyObject *utf8;
3176
3177 /* Since Unicode objects compare equal to their UTF-8 string
3178 counterparts, they should also use the UTF-8 strings as basis
3179 for their hash value. This is needed to assure that strings and
3180 Unicode objects behave in the same way as dictionary
3181 keys. Unfortunately, this costs some performance and also some
3182 memory if the cached UTF-8 representation is not used later
3183 on. */
3184 if (self->hash != -1)
3185 return self->hash;
3186 utf8 = utf8_string(self, NULL);
3187 if (utf8 == NULL)
3188 return -1;
3189 hash = PyObject_Hash(utf8);
3190 if (hash == -1)
3191 return -1;
3192 self->hash = hash;
3193 return hash;
3194}
3195
3196static char index__doc__[] =
3197"S.index(sub [,start [,end]]) -> int\n\
3198\n\
3199Like S.find() but raise ValueError when the substring is not found.";
3200
3201static PyObject *
3202unicode_index(PyUnicodeObject *self, PyObject *args)
3203{
3204 int result;
3205 PyUnicodeObject *substring;
3206 int start = 0;
3207 int end = INT_MAX;
3208
3209 if (!PyArg_ParseTuple(args, "O|ii:index", &substring, &start, &end))
3210 return NULL;
3211
3212 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3213 (PyObject *)substring);
3214 if (substring == NULL)
3215 return NULL;
3216
3217 result = findstring(self, substring, start, end, 1);
3218
3219 Py_DECREF(substring);
3220 if (result < 0) {
3221 PyErr_SetString(PyExc_ValueError, "substring not found");
3222 return NULL;
3223 }
3224 return PyInt_FromLong(result);
3225}
3226
3227static char islower__doc__[] =
3228"S.islower() -> int\n\
3229\n\
3230Return 1 if all cased characters in S are lowercase and there is\n\
3231at least one cased character in S, 0 otherwise.";
3232
3233static PyObject*
3234unicode_islower(PyUnicodeObject *self, PyObject *args)
3235{
3236 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3237 register const Py_UNICODE *e;
3238 int cased;
3239
3240 if (!PyArg_NoArgs(args))
3241 return NULL;
3242
3243 /* Shortcut for single character strings */
3244 if (PyUnicode_GET_SIZE(self) == 1)
3245 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3246
3247 e = p + PyUnicode_GET_SIZE(self);
3248 cased = 0;
3249 for (; p < e; p++) {
3250 register const Py_UNICODE ch = *p;
3251
3252 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3253 return PyInt_FromLong(0);
3254 else if (!cased && Py_UNICODE_ISLOWER(ch))
3255 cased = 1;
3256 }
3257 return PyInt_FromLong(cased);
3258}
3259
3260static char isupper__doc__[] =
3261"S.isupper() -> int\n\
3262\n\
3263Return 1 if all cased characters in S are uppercase and there is\n\
3264at least one cased character in S, 0 otherwise.";
3265
3266static PyObject*
3267unicode_isupper(PyUnicodeObject *self, PyObject *args)
3268{
3269 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3270 register const Py_UNICODE *e;
3271 int cased;
3272
3273 if (!PyArg_NoArgs(args))
3274 return NULL;
3275
3276 /* Shortcut for single character strings */
3277 if (PyUnicode_GET_SIZE(self) == 1)
3278 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3279
3280 e = p + PyUnicode_GET_SIZE(self);
3281 cased = 0;
3282 for (; p < e; p++) {
3283 register const Py_UNICODE ch = *p;
3284
3285 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3286 return PyInt_FromLong(0);
3287 else if (!cased && Py_UNICODE_ISUPPER(ch))
3288 cased = 1;
3289 }
3290 return PyInt_FromLong(cased);
3291}
3292
3293static char istitle__doc__[] =
3294"S.istitle() -> int\n\
3295\n\
3296Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3297may only follow uncased characters and lowercase characters only cased\n\
3298ones. Return 0 otherwise.";
3299
3300static PyObject*
3301unicode_istitle(PyUnicodeObject *self, PyObject *args)
3302{
3303 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3304 register const Py_UNICODE *e;
3305 int cased, previous_is_cased;
3306
3307 if (!PyArg_NoArgs(args))
3308 return NULL;
3309
3310 /* Shortcut for single character strings */
3311 if (PyUnicode_GET_SIZE(self) == 1)
3312 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3313 (Py_UNICODE_ISUPPER(*p) != 0));
3314
3315 e = p + PyUnicode_GET_SIZE(self);
3316 cased = 0;
3317 previous_is_cased = 0;
3318 for (; p < e; p++) {
3319 register const Py_UNICODE ch = *p;
3320
3321 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3322 if (previous_is_cased)
3323 return PyInt_FromLong(0);
3324 previous_is_cased = 1;
3325 cased = 1;
3326 }
3327 else if (Py_UNICODE_ISLOWER(ch)) {
3328 if (!previous_is_cased)
3329 return PyInt_FromLong(0);
3330 previous_is_cased = 1;
3331 cased = 1;
3332 }
3333 else
3334 previous_is_cased = 0;
3335 }
3336 return PyInt_FromLong(cased);
3337}
3338
3339static char isspace__doc__[] =
3340"S.isspace() -> int\n\
3341\n\
3342Return 1 if there are only whitespace characters in S,\n\
33430 otherwise.";
3344
3345static PyObject*
3346unicode_isspace(PyUnicodeObject *self, PyObject *args)
3347{
3348 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3349 register const Py_UNICODE *e;
3350
3351 if (!PyArg_NoArgs(args))
3352 return NULL;
3353
3354 /* Shortcut for single character strings */
3355 if (PyUnicode_GET_SIZE(self) == 1 &&
3356 Py_UNICODE_ISSPACE(*p))
3357 return PyInt_FromLong(1);
3358
3359 e = p + PyUnicode_GET_SIZE(self);
3360 for (; p < e; p++) {
3361 if (!Py_UNICODE_ISSPACE(*p))
3362 return PyInt_FromLong(0);
3363 }
3364 return PyInt_FromLong(1);
3365}
3366
3367static char isdecimal__doc__[] =
3368"S.isdecimal() -> int\n\
3369\n\
3370Return 1 if there are only decimal characters in S,\n\
33710 otherwise.";
3372
3373static PyObject*
3374unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3375{
3376 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3377 register const Py_UNICODE *e;
3378
3379 if (!PyArg_NoArgs(args))
3380 return NULL;
3381
3382 /* Shortcut for single character strings */
3383 if (PyUnicode_GET_SIZE(self) == 1 &&
3384 Py_UNICODE_ISDECIMAL(*p))
3385 return PyInt_FromLong(1);
3386
3387 e = p + PyUnicode_GET_SIZE(self);
3388 for (; p < e; p++) {
3389 if (!Py_UNICODE_ISDECIMAL(*p))
3390 return PyInt_FromLong(0);
3391 }
3392 return PyInt_FromLong(1);
3393}
3394
3395static char isdigit__doc__[] =
3396"S.isdigit() -> int\n\
3397\n\
3398Return 1 if there are only digit characters in S,\n\
33990 otherwise.";
3400
3401static PyObject*
3402unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3403{
3404 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3405 register const Py_UNICODE *e;
3406
3407 if (!PyArg_NoArgs(args))
3408 return NULL;
3409
3410 /* Shortcut for single character strings */
3411 if (PyUnicode_GET_SIZE(self) == 1 &&
3412 Py_UNICODE_ISDIGIT(*p))
3413 return PyInt_FromLong(1);
3414
3415 e = p + PyUnicode_GET_SIZE(self);
3416 for (; p < e; p++) {
3417 if (!Py_UNICODE_ISDIGIT(*p))
3418 return PyInt_FromLong(0);
3419 }
3420 return PyInt_FromLong(1);
3421}
3422
3423static char isnumeric__doc__[] =
3424"S.isnumeric() -> int\n\
3425\n\
3426Return 1 if there are only numeric characters in S,\n\
34270 otherwise.";
3428
3429static PyObject*
3430unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3431{
3432 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3433 register const Py_UNICODE *e;
3434
3435 if (!PyArg_NoArgs(args))
3436 return NULL;
3437
3438 /* Shortcut for single character strings */
3439 if (PyUnicode_GET_SIZE(self) == 1 &&
3440 Py_UNICODE_ISNUMERIC(*p))
3441 return PyInt_FromLong(1);
3442
3443 e = p + PyUnicode_GET_SIZE(self);
3444 for (; p < e; p++) {
3445 if (!Py_UNICODE_ISNUMERIC(*p))
3446 return PyInt_FromLong(0);
3447 }
3448 return PyInt_FromLong(1);
3449}
3450
3451static char join__doc__[] =
3452"S.join(sequence) -> unicode\n\
3453\n\
3454Return a string which is the concatenation of the strings in the\n\
3455sequence. The separator between elements is S.";
3456
3457static PyObject*
3458unicode_join(PyUnicodeObject *self, PyObject *args)
3459{
3460 PyObject *data;
3461 if (!PyArg_ParseTuple(args, "O:join", &data))
3462 return NULL;
3463
3464 return PyUnicode_Join((PyObject *)self, data);
3465}
3466
3467static int
3468unicode_length(PyUnicodeObject *self)
3469{
3470 return self->length;
3471}
3472
3473static char ljust__doc__[] =
3474"S.ljust(width) -> unicode\n\
3475\n\
3476Return S left justified in a Unicode string of length width. Padding is\n\
3477done using spaces.";
3478
3479static PyObject *
3480unicode_ljust(PyUnicodeObject *self, PyObject *args)
3481{
3482 int width;
3483 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3484 return NULL;
3485
3486 if (self->length >= width) {
3487 Py_INCREF(self);
3488 return (PyObject*) self;
3489 }
3490
3491 return (PyObject*) pad(self, 0, width - self->length, ' ');
3492}
3493
3494static char lower__doc__[] =
3495"S.lower() -> unicode\n\
3496\n\
3497Return a copy of the string S converted to lowercase.";
3498
3499static PyObject*
3500unicode_lower(PyUnicodeObject *self, PyObject *args)
3501{
3502 if (!PyArg_NoArgs(args))
3503 return NULL;
3504 return fixup(self, fixlower);
3505}
3506
3507static char lstrip__doc__[] =
3508"S.lstrip() -> unicode\n\
3509\n\
3510Return a copy of the string S with leading whitespace removed.";
3511
3512static PyObject *
3513unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3514{
3515 if (!PyArg_NoArgs(args))
3516 return NULL;
3517 return strip(self, 1, 0);
3518}
3519
3520static PyObject*
3521unicode_repeat(PyUnicodeObject *str, int len)
3522{
3523 PyUnicodeObject *u;
3524 Py_UNICODE *p;
3525
3526 if (len < 0)
3527 len = 0;
3528
3529 if (len == 1) {
3530 /* no repeat, return original string */
3531 Py_INCREF(str);
3532 return (PyObject*) str;
3533 }
3534
3535 u = _PyUnicode_New(len * str->length);
3536 if (!u)
3537 return NULL;
3538
3539 p = u->str;
3540
3541 while (len-- > 0) {
3542 Py_UNICODE_COPY(p, str->str, str->length);
3543 p += str->length;
3544 }
3545
3546 return (PyObject*) u;
3547}
3548
3549PyObject *PyUnicode_Replace(PyObject *obj,
3550 PyObject *subobj,
3551 PyObject *replobj,
3552 int maxcount)
3553{
3554 PyObject *self;
3555 PyObject *str1;
3556 PyObject *str2;
3557 PyObject *result;
3558
3559 self = PyUnicode_FromObject(obj);
3560 if (self == NULL)
3561 return NULL;
3562 str1 = PyUnicode_FromObject(subobj);
3563 if (str1 == NULL) {
3564 Py_DECREF(self);
3565 return NULL;
3566 }
3567 str2 = PyUnicode_FromObject(replobj);
3568 if (str2 == NULL) {
3569 Py_DECREF(self);
3570 Py_DECREF(str1);
3571 return NULL;
3572 }
3573 result = replace((PyUnicodeObject *)self,
3574 (PyUnicodeObject *)str1,
3575 (PyUnicodeObject *)str2,
3576 maxcount);
3577 Py_DECREF(self);
3578 Py_DECREF(str1);
3579 Py_DECREF(str2);
3580 return result;
3581}
3582
3583static char replace__doc__[] =
3584"S.replace (old, new[, maxsplit]) -> unicode\n\
3585\n\
3586Return a copy of S with all occurrences of substring\n\
3587old replaced by new. If the optional argument maxsplit is\n\
3588given, only the first maxsplit occurrences are replaced.";
3589
3590static PyObject*
3591unicode_replace(PyUnicodeObject *self, PyObject *args)
3592{
3593 PyUnicodeObject *str1;
3594 PyUnicodeObject *str2;
3595 int maxcount = -1;
3596 PyObject *result;
3597
3598 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
3599 return NULL;
3600 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
3601 if (str1 == NULL)
3602 return NULL;
3603 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
3604 if (str2 == NULL)
3605 return NULL;
3606
3607 result = replace(self, str1, str2, maxcount);
3608
3609 Py_DECREF(str1);
3610 Py_DECREF(str2);
3611 return result;
3612}
3613
3614static
3615PyObject *unicode_repr(PyObject *unicode)
3616{
3617 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
3618 PyUnicode_GET_SIZE(unicode),
3619 1);
3620}
3621
3622static char rfind__doc__[] =
3623"S.rfind(sub [,start [,end]]) -> int\n\
3624\n\
3625Return the highest index in S where substring sub is found,\n\
3626such that sub is contained within s[start,end]. Optional\n\
3627arguments start and end are interpreted as in slice notation.\n\
3628\n\
3629Return -1 on failure.";
3630
3631static PyObject *
3632unicode_rfind(PyUnicodeObject *self, PyObject *args)
3633{
3634 PyUnicodeObject *substring;
3635 int start = 0;
3636 int end = INT_MAX;
3637 PyObject *result;
3638
3639 if (!PyArg_ParseTuple(args, "O|ii:rfind", &substring, &start, &end))
3640 return NULL;
3641 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3642 (PyObject *)substring);
3643 if (substring == NULL)
3644 return NULL;
3645
3646 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
3647
3648 Py_DECREF(substring);
3649 return result;
3650}
3651
3652static char rindex__doc__[] =
3653"S.rindex(sub [,start [,end]]) -> int\n\
3654\n\
3655Like S.rfind() but raise ValueError when the substring is not found.";
3656
3657static PyObject *
3658unicode_rindex(PyUnicodeObject *self, PyObject *args)
3659{
3660 int result;
3661 PyUnicodeObject *substring;
3662 int start = 0;
3663 int end = INT_MAX;
3664
3665 if (!PyArg_ParseTuple(args, "O|ii:rindex", &substring, &start, &end))
3666 return NULL;
3667 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3668 (PyObject *)substring);
3669 if (substring == NULL)
3670 return NULL;
3671
3672 result = findstring(self, substring, start, end, -1);
3673
3674 Py_DECREF(substring);
3675 if (result < 0) {
3676 PyErr_SetString(PyExc_ValueError, "substring not found");
3677 return NULL;
3678 }
3679 return PyInt_FromLong(result);
3680}
3681
3682static char rjust__doc__[] =
3683"S.rjust(width) -> unicode\n\
3684\n\
3685Return S right justified in a Unicode string of length width. Padding is\n\
3686done using spaces.";
3687
3688static PyObject *
3689unicode_rjust(PyUnicodeObject *self, PyObject *args)
3690{
3691 int width;
3692 if (!PyArg_ParseTuple(args, "i:rjust", &width))
3693 return NULL;
3694
3695 if (self->length >= width) {
3696 Py_INCREF(self);
3697 return (PyObject*) self;
3698 }
3699
3700 return (PyObject*) pad(self, width - self->length, 0, ' ');
3701}
3702
3703static char rstrip__doc__[] =
3704"S.rstrip() -> unicode\n\
3705\n\
3706Return a copy of the string S with trailing whitespace removed.";
3707
3708static PyObject *
3709unicode_rstrip(PyUnicodeObject *self, PyObject *args)
3710{
3711 if (!PyArg_NoArgs(args))
3712 return NULL;
3713 return strip(self, 0, 1);
3714}
3715
3716static PyObject*
3717unicode_slice(PyUnicodeObject *self, int start, int end)
3718{
3719 /* standard clamping */
3720 if (start < 0)
3721 start = 0;
3722 if (end < 0)
3723 end = 0;
3724 if (end > self->length)
3725 end = self->length;
3726 if (start == 0 && end == self->length) {
3727 /* full slice, return original string */
3728 Py_INCREF(self);
3729 return (PyObject*) self;
3730 }
3731 if (start > end)
3732 start = end;
3733 /* copy slice */
3734 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
3735 end - start);
3736}
3737
3738PyObject *PyUnicode_Split(PyObject *s,
3739 PyObject *sep,
3740 int maxsplit)
3741{
3742 PyObject *result;
3743
3744 s = PyUnicode_FromObject(s);
3745 if (s == NULL)
3746 return NULL;
3747 if (sep != NULL) {
3748 sep = PyUnicode_FromObject(sep);
3749 if (sep == NULL) {
3750 Py_DECREF(s);
3751 return NULL;
3752 }
3753 }
3754
3755 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
3756
3757 Py_DECREF(s);
3758 Py_XDECREF(sep);
3759 return result;
3760}
3761
3762static char split__doc__[] =
3763"S.split([sep [,maxsplit]]) -> list of strings\n\
3764\n\
3765Return a list of the words in S, using sep as the\n\
3766delimiter string. If maxsplit is given, at most maxsplit\n\
3767splits are done. If sep is not specified, any whitespace string\n\
3768is a separator.";
3769
3770static PyObject*
3771unicode_split(PyUnicodeObject *self, PyObject *args)
3772{
3773 PyObject *substring = Py_None;
3774 int maxcount = -1;
3775
3776 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
3777 return NULL;
3778
3779 if (substring == Py_None)
3780 return split(self, NULL, maxcount);
3781 else if (PyUnicode_Check(substring))
3782 return split(self, (PyUnicodeObject *)substring, maxcount);
3783 else
3784 return PyUnicode_Split((PyObject *)self, substring, maxcount);
3785}
3786
3787static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00003788"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789\n\
3790Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00003791Line breaks are not included in the resulting list unless keepends\n\
3792is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003793
3794static PyObject*
3795unicode_splitlines(PyUnicodeObject *self, PyObject *args)
3796{
Guido van Rossum86662912000-04-11 15:38:46 +00003797 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003798
Guido van Rossum86662912000-04-11 15:38:46 +00003799 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003800 return NULL;
3801
Guido van Rossum86662912000-04-11 15:38:46 +00003802 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003803}
3804
3805static
3806PyObject *unicode_str(PyUnicodeObject *self)
3807{
3808 return PyUnicode_AsUTF8String((PyObject *)self);
3809}
3810
3811static char strip__doc__[] =
3812"S.strip() -> unicode\n\
3813\n\
3814Return a copy of S with leading and trailing whitespace removed.";
3815
3816static PyObject *
3817unicode_strip(PyUnicodeObject *self, PyObject *args)
3818{
3819 if (!PyArg_NoArgs(args))
3820 return NULL;
3821 return strip(self, 1, 1);
3822}
3823
3824static char swapcase__doc__[] =
3825"S.swapcase() -> unicode\n\
3826\n\
3827Return a copy of S with uppercase characters converted to lowercase\n\
3828and vice versa.";
3829
3830static PyObject*
3831unicode_swapcase(PyUnicodeObject *self, PyObject *args)
3832{
3833 if (!PyArg_NoArgs(args))
3834 return NULL;
3835 return fixup(self, fixswapcase);
3836}
3837
3838static char translate__doc__[] =
3839"S.translate(table) -> unicode\n\
3840\n\
3841Return a copy of the string S, where all characters have been mapped\n\
3842through the given translation table, which must be a mapping of\n\
3843Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
3844are left untouched. Characters mapped to None are deleted.";
3845
3846static PyObject*
3847unicode_translate(PyUnicodeObject *self, PyObject *args)
3848{
3849 PyObject *table;
3850
3851 if (!PyArg_ParseTuple(args, "O:translate", &table))
3852 return NULL;
3853 return PyUnicode_TranslateCharmap(self->str,
3854 self->length,
3855 table,
3856 "ignore");
3857}
3858
3859static char upper__doc__[] =
3860"S.upper() -> unicode\n\
3861\n\
3862Return a copy of S converted to uppercase.";
3863
3864static PyObject*
3865unicode_upper(PyUnicodeObject *self, PyObject *args)
3866{
3867 if (!PyArg_NoArgs(args))
3868 return NULL;
3869 return fixup(self, fixupper);
3870}
3871
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* S.zfill(width): left-pad with '0' up to width, keeping a leading
   sign character in front of the padding.  A string that is already
   at least width long is returned unchanged (new reference to self).
   Returns NULL if the arguments cannot be parsed or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() has to allocate a new string; do not touch u->str if that
       failed (NOTE(review): assumes pad() reports failure as NULL). */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original text. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
3907
#if 0
/* Debugging aid: report the current value of unicode_freelist_size
   as a Python int.  Takes no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    return PyArg_NoArgs(args) ? PyInt_FromLong(unicode_freelist_size)
                              : NULL;
}
#endif
3917
3918static char startswith__doc__[] =
3919"S.startswith(prefix[, start[, end]]) -> int\n\
3920\n\
3921Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
3922optional start, test S beginning at that position. With optional end, stop\n\
3923comparing S at that position.";
3924
3925static PyObject *
3926unicode_startswith(PyUnicodeObject *self,
3927 PyObject *args)
3928{
3929 PyUnicodeObject *substring;
3930 int start = 0;
3931 int end = INT_MAX;
3932 PyObject *result;
3933
3934 if (!PyArg_ParseTuple(args, "O|ii:startswith", &substring, &start, &end))
3935 return NULL;
3936 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3937 (PyObject *)substring);
3938 if (substring == NULL)
3939 return NULL;
3940
3941 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
3942
3943 Py_DECREF(substring);
3944 return result;
3945}
3946
3947
3948static char endswith__doc__[] =
3949"S.endswith(suffix[, start[, end]]) -> int\n\
3950\n\
3951Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
3952optional start, test S beginning at that position. With optional end, stop\n\
3953comparing S at that position.";
3954
3955static PyObject *
3956unicode_endswith(PyUnicodeObject *self,
3957 PyObject *args)
3958{
3959 PyUnicodeObject *substring;
3960 int start = 0;
3961 int end = INT_MAX;
3962 PyObject *result;
3963
3964 if (!PyArg_ParseTuple(args, "O|ii:endswith", &substring, &start, &end))
3965 return NULL;
3966 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3967 (PyObject *)substring);
3968 if (substring == NULL)
3969 return NULL;
3970
3971 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
3972
3973 Py_DECREF(substring);
3974 return result;
3975}
3976
3977
/* Method table for Unicode objects, consulted by unicode_getattr().

   Each entry is {name, C function, flags, docstring}.  In the entries
   whose implementation is visible here, flag 1 goes with functions
   that parse their arguments via PyArg_ParseTuple() and flag 0 with
   functions that call PyArg_NoArgs().  The table ends with a
   {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    /* Disabled: the zfill implementation is compiled out (#if 0) as
       well. */
#if 0
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    {NULL, NULL}
};
4028
4029static PyObject *
4030unicode_getattr(PyUnicodeObject *self, char *name)
4031{
4032 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4033}
4034
/* Sequence protocol slots for Unicode objects.  Item and slice
   assignment are left empty (0). */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length,           /* sq_length */
    (binaryfunc) PyUnicode_Concat,      /* sq_concat */
    (intargfunc) unicode_repeat,        /* sq_repeat */
    (intargfunc) unicode_getitem,       /* sq_item */
    (intintargfunc) unicode_slice,      /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains,     /*sq_contains*/
};
4045
4046static int
4047unicode_buffer_getreadbuf(PyUnicodeObject *self,
4048 int index,
4049 const void **ptr)
4050{
4051 if (index != 0) {
4052 PyErr_SetString(PyExc_SystemError,
4053 "accessing non-existent unicode segment");
4054 return -1;
4055 }
4056 *ptr = (void *) self->str;
4057 return PyUnicode_GET_DATA_SIZE(self);
4058}
4059
4060static int
4061unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4062 const void **ptr)
4063{
4064 PyErr_SetString(PyExc_TypeError,
4065 "cannot use unicode as modifyable buffer");
4066 return -1;
4067}
4068
4069static int
4070unicode_buffer_getsegcount(PyUnicodeObject *self,
4071 int *lenp)
4072{
4073 if (lenp)
4074 *lenp = PyUnicode_GET_DATA_SIZE(self);
4075 return 1;
4076}
4077
4078static int
4079unicode_buffer_getcharbuf(PyUnicodeObject *self,
4080 int index,
4081 const void **ptr)
4082{
4083 PyObject *str;
4084
4085 if (index != 0) {
4086 PyErr_SetString(PyExc_SystemError,
4087 "accessing non-existent unicode segment");
4088 return -1;
4089 }
4090 str = utf8_string(self, NULL);
4091 if (str == NULL)
4092 return -1;
4093 *ptr = (void *) PyString_AS_STRING(str);
4094 return PyString_GET_SIZE(str);
4095}
4096
4097/* Helpers for PyUnicode_Format() */
4098
4099static PyObject *
4100getnextarg(args, arglen, p_argidx)
4101 PyObject *args;
4102int arglen;
4103int *p_argidx;
4104{
4105 int argidx = *p_argidx;
4106 if (argidx < arglen) {
4107 (*p_argidx)++;
4108 if (arglen < 0)
4109 return args;
4110 else
4111 return PyTuple_GetItem(args, argidx);
4112 }
4113 PyErr_SetString(PyExc_TypeError,
4114 "not enough arguments for format string");
4115 return NULL;
4116}
4117
/* Bit flags collected by PyUnicode_Format() from the characters that
   follow '%' in a conversion specifier. */
#define F_LJUST (1<<0)  /* '-' */
#define F_SIGN  (1<<1)  /* '+' */
#define F_BLANK (1<<2)  /* ' ' */
#define F_ALT   (1<<3)  /* '#' */
#define F_ZERO  (1<<4)  /* '0' */
4123
4124static
4125#ifdef HAVE_STDARG_PROTOTYPES
4126int usprintf(register Py_UNICODE *buffer, char *format, ...)
4127#else
4128int usprintf(va_alist) va_dcl
4129#endif
4130{
4131 register int i;
4132 int len;
4133 va_list va;
4134 char *charbuffer;
4135#ifdef HAVE_STDARG_PROTOTYPES
4136 va_start(va, format);
4137#else
4138 Py_UNICODE *args;
4139 char *format;
4140
4141 va_start(va);
4142 buffer = va_arg(va, Py_UNICODE *);
4143 format = va_arg(va, char *);
4144#endif
4145
4146 /* First, format the string as char array, then expand to Py_UNICODE
4147 array. */
4148 charbuffer = (char *)buffer;
4149 len = vsprintf(charbuffer, format, va);
4150 for (i = len - 1; i >= 0; i--)
4151 buffer[i] = (Py_UNICODE) charbuffer[i];
4152
4153 va_end(va);
4154 return len;
4155}
4156
4157static int
4158formatfloat(Py_UNICODE *buf,
4159 int flags,
4160 int prec,
4161 int type,
4162 PyObject *v)
4163{
4164 char fmt[20];
4165 double x;
4166
4167 x = PyFloat_AsDouble(v);
4168 if (x == -1.0 && PyErr_Occurred())
4169 return -1;
4170 if (prec < 0)
4171 prec = 6;
4172 if (prec > 50)
4173 prec = 50; /* Arbitrary limitation */
4174 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4175 type = 'g';
4176 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4177 return usprintf(buf, fmt, x);
4178}
4179
4180static int
4181formatint(Py_UNICODE *buf,
4182 int flags,
4183 int prec,
4184 int type,
4185 PyObject *v)
4186{
4187 char fmt[20];
4188 long x;
4189
4190 x = PyInt_AsLong(v);
4191 if (x == -1 && PyErr_Occurred())
4192 return -1;
4193 if (prec < 0)
4194 prec = 1;
4195 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4196 return usprintf(buf, fmt, x);
4197}
4198
4199static int
4200formatchar(Py_UNICODE *buf,
4201 PyObject *v)
4202{
4203 if (PyUnicode_Check(v))
4204 buf[0] = PyUnicode_AS_UNICODE(v)[0];
4205
4206 else if (PyString_Check(v))
4207 buf[0] = (Py_UNICODE) PyString_AS_STRING(v)[0];
4208
4209 else {
4210 /* Integer input truncated to a character */
4211 long x;
4212 x = PyInt_AsLong(v);
4213 if (x == -1 && PyErr_Occurred())
4214 return -1;
4215 buf[0] = (char) x;
4216 }
4217 buf[1] = '\0';
4218 return 1;
4219}
4220
4221PyObject *PyUnicode_Format(PyObject *format,
4222 PyObject *args)
4223{
4224 Py_UNICODE *fmt, *res;
4225 int fmtcnt, rescnt, reslen, arglen, argidx;
4226 int args_owned = 0;
4227 PyUnicodeObject *result = NULL;
4228 PyObject *dict = NULL;
4229 PyObject *uformat;
4230
4231 if (format == NULL || args == NULL) {
4232 PyErr_BadInternalCall();
4233 return NULL;
4234 }
4235 uformat = PyUnicode_FromObject(format);
4236 fmt = PyUnicode_AS_UNICODE(uformat);
4237 fmtcnt = PyUnicode_GET_SIZE(uformat);
4238
4239 reslen = rescnt = fmtcnt + 100;
4240 result = _PyUnicode_New(reslen);
4241 if (result == NULL)
4242 goto onError;
4243 res = PyUnicode_AS_UNICODE(result);
4244
4245 if (PyTuple_Check(args)) {
4246 arglen = PyTuple_Size(args);
4247 argidx = 0;
4248 }
4249 else {
4250 arglen = -1;
4251 argidx = -2;
4252 }
4253 if (args->ob_type->tp_as_mapping)
4254 dict = args;
4255
4256 while (--fmtcnt >= 0) {
4257 if (*fmt != '%') {
4258 if (--rescnt < 0) {
4259 rescnt = fmtcnt + 100;
4260 reslen += rescnt;
4261 if (_PyUnicode_Resize(result, reslen) < 0)
4262 return NULL;
4263 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4264 --rescnt;
4265 }
4266 *res++ = *fmt++;
4267 }
4268 else {
4269 /* Got a format specifier */
4270 int flags = 0;
4271 int width = -1;
4272 int prec = -1;
4273 int size = 0;
4274 Py_UNICODE c = '\0';
4275 Py_UNICODE fill;
4276 PyObject *v = NULL;
4277 PyObject *temp = NULL;
4278 Py_UNICODE *buf;
4279 Py_UNICODE sign;
4280 int len;
4281 Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
4282
4283 fmt++;
4284 if (*fmt == '(') {
4285 Py_UNICODE *keystart;
4286 int keylen;
4287 PyObject *key;
4288 int pcount = 1;
4289
4290 if (dict == NULL) {
4291 PyErr_SetString(PyExc_TypeError,
4292 "format requires a mapping");
4293 goto onError;
4294 }
4295 ++fmt;
4296 --fmtcnt;
4297 keystart = fmt;
4298 /* Skip over balanced parentheses */
4299 while (pcount > 0 && --fmtcnt >= 0) {
4300 if (*fmt == ')')
4301 --pcount;
4302 else if (*fmt == '(')
4303 ++pcount;
4304 fmt++;
4305 }
4306 keylen = fmt - keystart - 1;
4307 if (fmtcnt < 0 || pcount > 0) {
4308 PyErr_SetString(PyExc_ValueError,
4309 "incomplete format key");
4310 goto onError;
4311 }
4312 /* keys are converted to strings (using UTF-8) and
4313 then looked up since Python uses strings to hold
4314 variables names etc. in its namespaces and we
4315 wouldn't want to break common idioms. The
4316 alternative would be using Unicode objects for the
4317 lookup but u"abc" and "abc" have different hash
4318 values (on purpose). */
4319 key = PyUnicode_EncodeUTF8(keystart,
4320 keylen,
4321 NULL);
4322 if (key == NULL)
4323 goto onError;
4324 if (args_owned) {
4325 Py_DECREF(args);
4326 args_owned = 0;
4327 }
4328 args = PyObject_GetItem(dict, key);
4329 Py_DECREF(key);
4330 if (args == NULL) {
4331 goto onError;
4332 }
4333 args_owned = 1;
4334 arglen = -1;
4335 argidx = -2;
4336 }
4337 while (--fmtcnt >= 0) {
4338 switch (c = *fmt++) {
4339 case '-': flags |= F_LJUST; continue;
4340 case '+': flags |= F_SIGN; continue;
4341 case ' ': flags |= F_BLANK; continue;
4342 case '#': flags |= F_ALT; continue;
4343 case '0': flags |= F_ZERO; continue;
4344 }
4345 break;
4346 }
4347 if (c == '*') {
4348 v = getnextarg(args, arglen, &argidx);
4349 if (v == NULL)
4350 goto onError;
4351 if (!PyInt_Check(v)) {
4352 PyErr_SetString(PyExc_TypeError,
4353 "* wants int");
4354 goto onError;
4355 }
4356 width = PyInt_AsLong(v);
4357 if (width < 0) {
4358 flags |= F_LJUST;
4359 width = -width;
4360 }
4361 if (--fmtcnt >= 0)
4362 c = *fmt++;
4363 }
4364 else if (c >= '0' && c <= '9') {
4365 width = c - '0';
4366 while (--fmtcnt >= 0) {
4367 c = *fmt++;
4368 if (c < '0' || c > '9')
4369 break;
4370 if ((width*10) / 10 != width) {
4371 PyErr_SetString(PyExc_ValueError,
4372 "width too big");
4373 goto onError;
4374 }
4375 width = width*10 + (c - '0');
4376 }
4377 }
4378 if (c == '.') {
4379 prec = 0;
4380 if (--fmtcnt >= 0)
4381 c = *fmt++;
4382 if (c == '*') {
4383 v = getnextarg(args, arglen, &argidx);
4384 if (v == NULL)
4385 goto onError;
4386 if (!PyInt_Check(v)) {
4387 PyErr_SetString(PyExc_TypeError,
4388 "* wants int");
4389 goto onError;
4390 }
4391 prec = PyInt_AsLong(v);
4392 if (prec < 0)
4393 prec = 0;
4394 if (--fmtcnt >= 0)
4395 c = *fmt++;
4396 }
4397 else if (c >= '0' && c <= '9') {
4398 prec = c - '0';
4399 while (--fmtcnt >= 0) {
4400 c = Py_CHARMASK(*fmt++);
4401 if (c < '0' || c > '9')
4402 break;
4403 if ((prec*10) / 10 != prec) {
4404 PyErr_SetString(PyExc_ValueError,
4405 "prec too big");
4406 goto onError;
4407 }
4408 prec = prec*10 + (c - '0');
4409 }
4410 }
4411 } /* prec */
4412 if (fmtcnt >= 0) {
4413 if (c == 'h' || c == 'l' || c == 'L') {
4414 size = c;
4415 if (--fmtcnt >= 0)
4416 c = *fmt++;
4417 }
4418 }
4419 if (fmtcnt < 0) {
4420 PyErr_SetString(PyExc_ValueError,
4421 "incomplete format");
4422 goto onError;
4423 }
4424 if (c != '%') {
4425 v = getnextarg(args, arglen, &argidx);
4426 if (v == NULL)
4427 goto onError;
4428 }
4429 sign = 0;
4430 fill = ' ';
4431 switch (c) {
4432
4433 case '%':
4434 buf = tmpbuf;
4435 buf[0] = '%';
4436 len = 1;
4437 break;
4438
4439 case 's':
4440 case 'r':
4441 if (PyUnicode_Check(v) && c == 's') {
4442 temp = v;
4443 Py_INCREF(temp);
4444 }
4445 else {
4446 PyObject *unicode;
4447 if (c == 's')
4448 temp = PyObject_Str(v);
4449 else
4450 temp = PyObject_Repr(v);
4451 if (temp == NULL)
4452 goto onError;
4453 if (!PyString_Check(temp)) {
4454 /* XXX Note: this should never happen, since
4455 PyObject_Repr() and PyObject_Str() assure
4456 this */
4457 Py_DECREF(temp);
4458 PyErr_SetString(PyExc_TypeError,
4459 "%s argument has non-string str()");
4460 goto onError;
4461 }
4462 unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
4463 PyString_GET_SIZE(temp),
4464 "strict");
4465 Py_DECREF(temp);
4466 temp = unicode;
4467 if (temp == NULL)
4468 goto onError;
4469 }
4470 buf = PyUnicode_AS_UNICODE(temp);
4471 len = PyUnicode_GET_SIZE(temp);
4472 if (prec >= 0 && len > prec)
4473 len = prec;
4474 break;
4475
4476 case 'i':
4477 case 'd':
4478 case 'u':
4479 case 'o':
4480 case 'x':
4481 case 'X':
4482 if (c == 'i')
4483 c = 'd';
4484 buf = tmpbuf;
4485 len = formatint(buf, flags, prec, c, v);
4486 if (len < 0)
4487 goto onError;
4488 sign = (c == 'd');
4489 if (flags & F_ZERO) {
4490 fill = '0';
4491 if ((flags&F_ALT) &&
4492 (c == 'x' || c == 'X') &&
4493 buf[0] == '0' && buf[1] == c) {
4494 *res++ = *buf++;
4495 *res++ = *buf++;
4496 rescnt -= 2;
4497 len -= 2;
4498 width -= 2;
4499 if (width < 0)
4500 width = 0;
4501 }
4502 }
4503 break;
4504
4505 case 'e':
4506 case 'E':
4507 case 'f':
4508 case 'g':
4509 case 'G':
4510 buf = tmpbuf;
4511 len = formatfloat(buf, flags, prec, c, v);
4512 if (len < 0)
4513 goto onError;
4514 sign = 1;
4515 if (flags&F_ZERO)
4516 fill = '0';
4517 break;
4518
4519 case 'c':
4520 buf = tmpbuf;
4521 len = formatchar(buf, v);
4522 if (len < 0)
4523 goto onError;
4524 break;
4525
4526 default:
4527 PyErr_Format(PyExc_ValueError,
4528 "unsupported format character '%c' (0x%x)",
4529 c, c);
4530 goto onError;
4531 }
4532 if (sign) {
4533 if (*buf == '-' || *buf == '+') {
4534 sign = *buf++;
4535 len--;
4536 }
4537 else if (flags & F_SIGN)
4538 sign = '+';
4539 else if (flags & F_BLANK)
4540 sign = ' ';
4541 else
4542 sign = 0;
4543 }
4544 if (width < len)
4545 width = len;
4546 if (rescnt < width + (sign != 0)) {
4547 reslen -= rescnt;
4548 rescnt = width + fmtcnt + 100;
4549 reslen += rescnt;
4550 if (_PyUnicode_Resize(result, reslen) < 0)
4551 return NULL;
4552 res = PyUnicode_AS_UNICODE(result)
4553 + reslen - rescnt;
4554 }
4555 if (sign) {
4556 if (fill != ' ')
4557 *res++ = sign;
4558 rescnt--;
4559 if (width > len)
4560 width--;
4561 }
4562 if (width > len && !(flags & F_LJUST)) {
4563 do {
4564 --rescnt;
4565 *res++ = fill;
4566 } while (--width > len);
4567 }
4568 if (sign && fill == ' ')
4569 *res++ = sign;
4570 memcpy(res, buf, len * sizeof(Py_UNICODE));
4571 res += len;
4572 rescnt -= len;
4573 while (--width >= len) {
4574 --rescnt;
4575 *res++ = ' ';
4576 }
4577 if (dict && (argidx < arglen) && c != '%') {
4578 PyErr_SetString(PyExc_TypeError,
4579 "not all arguments converted");
4580 goto onError;
4581 }
4582 Py_XDECREF(temp);
4583 } /* '%' */
4584 } /* until end */
4585 if (argidx < arglen && !dict) {
4586 PyErr_SetString(PyExc_TypeError,
4587 "not all arguments converted");
4588 goto onError;
4589 }
4590
4591 if (args_owned) {
4592 Py_DECREF(args);
4593 }
4594 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004595 if (_PyUnicode_Resize(result, reslen - rescnt))
4596 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004597 return (PyObject *)result;
4598
4599 onError:
4600 Py_XDECREF(result);
4601 Py_DECREF(uformat);
4602 if (args_owned) {
4603 Py_DECREF(args);
4604 }
4605 return NULL;
4606}
4607
/* Buffer protocol slots for Unicode objects: read access to the raw
   Py_UNICODE data, no write access, a single segment, and a char
   buffer backed by utf8_string(). */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
4614
/* Type object for Unicode strings.  Wires the slot functions defined
   in this file into the type; slots given as 0 / NULL are not
   implemented by this type. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0,                                  /* ob_size */
    "unicode",                          /* tp_name */
    sizeof(PyUnicodeObject),            /* tp_size */
    0,                                  /* tp_itemsize */
    /* Slots */
    (destructor)_PyUnicode_Free,        /* tp_dealloc */
    0,                                  /* tp_print */
    (getattrfunc)unicode_getattr,       /* tp_getattr */
    0,                                  /* tp_setattr */
    (cmpfunc) unicode_compare,          /* tp_compare */
    (reprfunc) unicode_repr,            /* tp_repr */
    0,                                  /* tp_as_number */
    &unicode_as_sequence,               /* tp_as_sequence */
    0,                                  /* tp_as_mapping */
    (hashfunc) unicode_hash,            /* tp_hash*/
    0,                                  /* tp_call*/
    (reprfunc) unicode_str,             /* tp_str */
    (getattrofunc) NULL,                /* tp_getattro */
    (setattrofunc) NULL,                /* tp_setattro */
    &unicode_as_buffer,                 /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT,                 /* tp_flags */
};
4639
4640/* Initialize the Unicode implementation */
4641
4642void _PyUnicode_Init()
4643{
4644 /* Doublecheck the configuration... */
4645 if (sizeof(Py_UNICODE) != 2)
4646 Py_FatalError("Unicode configuration error: "
4647 "sizeof(Py_UNICODE) != 2 bytes");
4648
4649 unicode_empty = _PyUnicode_New(0);
4650}
4651
4652/* Finalize the Unicode implementation */
4653
4654void
4655_PyUnicode_Fini()
4656{
4657 PyUnicodeObject *u = unicode_freelist;
4658
4659 while (u != NULL) {
4660 PyUnicodeObject *v = u;
4661 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004662 if (v->str)
4663 free(v->str);
4664 Py_XDECREF(v->utf8str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004665 free(v);
4666 }
4667 Py_XDECREF(unicode_empty);
4668}