blob: 14866ab0526bef7d29a3ded3948dfada9c9d90bb [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "mymath.h"
68#include "unicodeobject.h"
69
70#if defined(HAVE_LIMITS_H)
71#include <limits.h>
72#else
73#define INT_MAX 2147483647
74#endif
75
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000076#ifdef MS_WIN32
77#include <windows.h>
78#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000079
Guido van Rossumd57fd912000-03-10 22:53:23 +000080/* Limit for the Unicode object free list */
81
82#define MAX_UNICODE_FREELIST_SIZE 1024
83
/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/
100
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000101#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +0000102
103/* Endianness switches; defaults to little endian */
104
105#ifdef WORDS_BIGENDIAN
106# define BYTEORDER_IS_BIG_ENDIAN
107#else
108# define BYTEORDER_IS_LITTLE_ENDIAN
109#endif
110
111/* --- Globals ------------------------------------------------------------ */
112
113/* The empty Unicode object */
114static PyUnicodeObject *unicode_empty = NULL;
115
116/* Free list for Unicode objects */
117static PyUnicodeObject *unicode_freelist = NULL;
118static int unicode_freelist_size = 0;
119
120/* --- Unicode Object ----------------------------------------------------- */
121
122static
123int _PyUnicode_Resize(register PyUnicodeObject *unicode,
124 int length)
125{
126 void *oldstr;
127
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000130 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000131
132 /* Resizing unicode_empty is not allowed. */
133 if (unicode == unicode_empty) {
134 PyErr_SetString(PyExc_SystemError,
135 "can't resize empty unicode object");
136 return -1;
137 }
138
139 /* We allocate one more byte to make sure the string is
140 Ux0000 terminated -- XXX is this needed ? */
141 oldstr = unicode->str;
142 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
143 if (!unicode->str) {
144 unicode->str = oldstr;
145 PyErr_NoMemory();
146 return -1;
147 }
148 unicode->str[length] = 0;
149 unicode->length = length;
150
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000151 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000152 /* Reset the object caches */
153 if (unicode->utf8str) {
154 Py_DECREF(unicode->utf8str);
155 unicode->utf8str = NULL;
156 }
157 unicode->hash = -1;
158
159 return 0;
160}
161
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000162int PyUnicode_Resize(PyObject **unicode,
163 int length)
164{
165 PyUnicodeObject *v;
166
167 if (unicode == NULL) {
168 PyErr_BadInternalCall();
169 return -1;
170 }
171 v = (PyUnicodeObject *)*unicode;
172 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
173 PyErr_BadInternalCall();
174 return -1;
175 }
176 return _PyUnicode_Resize(v, length);
177}
178
Guido van Rossumd57fd912000-03-10 22:53:23 +0000179/* We allocate one more byte to make sure the string is
180 Ux0000 terminated -- XXX is this needed ?
181
182 XXX This allocator could further be enhanced by assuring that the
183 free list never reduces its size below 1.
184
185*/
186
187static
188PyUnicodeObject *_PyUnicode_New(int length)
189{
190 register PyUnicodeObject *unicode;
191
192 /* Optimization for empty strings */
193 if (length == 0 && unicode_empty != NULL) {
194 Py_INCREF(unicode_empty);
195 return unicode_empty;
196 }
197
198 /* Unicode freelist & memory allocation */
199 if (unicode_freelist) {
200 unicode = unicode_freelist;
201 unicode_freelist = *(PyUnicodeObject **)unicode_freelist;
202 unicode_freelist_size--;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000203 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000205 /* Keep-Alive optimization: we only upsize the buffer,
206 never downsize it. */
207 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000209 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 }
212 }
213 else
214 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
215 }
216 else {
217 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
218 if (unicode == NULL)
219 return NULL;
220 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
221 }
222
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000223 if (!unicode->str) {
224 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000225 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
230 unicode->utf8str = NULL;
231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000235 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
240void _PyUnicode_Free(register PyUnicodeObject *unicode)
241{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000243 /* Keep-Alive optimization */
244 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000245 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 unicode->str = NULL;
247 unicode->length = 0;
248 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000249 if (unicode->utf8str) {
250 Py_DECREF(unicode->utf8str);
251 unicode->utf8str = NULL;
252 }
253 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 *(PyUnicodeObject **)unicode = unicode_freelist;
255 unicode_freelist = unicode;
256 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 }
258 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000259 PyMem_DEL(unicode->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000260 Py_XDECREF(unicode->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000261 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 }
263}
264
265PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
266 int size)
267{
268 PyUnicodeObject *unicode;
269
270 unicode = _PyUnicode_New(size);
271 if (!unicode)
272 return NULL;
273
274 /* Copy the Unicode data into the new object */
275 if (u != NULL)
276 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
277
278 return (PyObject *)unicode;
279}
280
281#ifdef HAVE_WCHAR_H
282
283PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
284 int size)
285{
286 PyUnicodeObject *unicode;
287
288 if (w == NULL) {
289 PyErr_BadInternalCall();
290 return NULL;
291 }
292
293 unicode = _PyUnicode_New(size);
294 if (!unicode)
295 return NULL;
296
297 /* Copy the wchar_t data into the new object */
298#ifdef HAVE_USABLE_WCHAR_T
299 memcpy(unicode->str, w, size * sizeof(wchar_t));
300#else
301 {
302 register Py_UNICODE *u;
303 register int i;
304 u = PyUnicode_AS_UNICODE(unicode);
305 for (i = size; i >= 0; i--)
306 *u++ = *w++;
307 }
308#endif
309
310 return (PyObject *)unicode;
311}
312
313int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
314 register wchar_t *w,
315 int size)
316{
317 if (unicode == NULL) {
318 PyErr_BadInternalCall();
319 return -1;
320 }
321 if (size > PyUnicode_GET_SIZE(unicode))
322 size = PyUnicode_GET_SIZE(unicode);
323#ifdef HAVE_USABLE_WCHAR_T
324 memcpy(w, unicode->str, size * sizeof(wchar_t));
325#else
326 {
327 register Py_UNICODE *u;
328 register int i;
329 u = PyUnicode_AS_UNICODE(unicode);
330 for (i = size; i >= 0; i--)
331 *w++ = *u++;
332 }
333#endif
334
335 return size;
336}
337
338#endif
339
340PyObject *PyUnicode_FromObject(register PyObject *obj)
341{
342 const char *s;
343 int len;
344
345 if (obj == NULL) {
346 PyErr_BadInternalCall();
347 return NULL;
348 }
349 else if (PyUnicode_Check(obj)) {
350 Py_INCREF(obj);
351 return obj;
352 }
353 else if (PyString_Check(obj)) {
354 s = PyString_AS_STRING(obj);
355 len = PyString_GET_SIZE(obj);
356 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000357 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
358 /* Overwrite the error message with something more useful in
359 case of a TypeError. */
360 if (PyErr_ExceptionMatches(PyExc_TypeError))
361 PyErr_SetString(PyExc_TypeError,
362 "coercing to Unicode: need string or charbuffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000364 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 if (len == 0) {
366 Py_INCREF(unicode_empty);
367 return (PyObject *)unicode_empty;
368 }
369 return PyUnicode_DecodeUTF8(s, len, "strict");
370}
371
372PyObject *PyUnicode_Decode(const char *s,
373 int size,
374 const char *encoding,
375 const char *errors)
376{
377 PyObject *buffer = NULL, *unicode;
378
379 /* Shortcut for the default encoding UTF-8 */
380 if (encoding == NULL ||
381 (strcmp(encoding, "utf-8") == 0))
382 return PyUnicode_DecodeUTF8(s, size, errors);
383
384 /* Decode via the codec registry */
385 buffer = PyBuffer_FromMemory((void *)s, size);
386 if (buffer == NULL)
387 goto onError;
388 unicode = PyCodec_Decode(buffer, encoding, errors);
389 if (unicode == NULL)
390 goto onError;
391 if (!PyUnicode_Check(unicode)) {
392 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000393 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394 unicode->ob_type->tp_name);
395 Py_DECREF(unicode);
396 goto onError;
397 }
398 Py_DECREF(buffer);
399 return unicode;
400
401 onError:
402 Py_XDECREF(buffer);
403 return NULL;
404}
405
406PyObject *PyUnicode_Encode(const Py_UNICODE *s,
407 int size,
408 const char *encoding,
409 const char *errors)
410{
411 PyObject *v, *unicode;
412
413 unicode = PyUnicode_FromUnicode(s, size);
414 if (unicode == NULL)
415 return NULL;
416 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
417 Py_DECREF(unicode);
418 return v;
419}
420
421PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
422 const char *encoding,
423 const char *errors)
424{
425 PyObject *v;
426
427 if (!PyUnicode_Check(unicode)) {
428 PyErr_BadArgument();
429 goto onError;
430 }
431 /* Shortcut for the default encoding UTF-8 */
432 if ((encoding == NULL ||
433 (strcmp(encoding, "utf-8") == 0)) &&
434 errors == NULL)
435 return PyUnicode_AsUTF8String(unicode);
436
437 /* Encode via the codec registry */
438 v = PyCodec_Encode(unicode, encoding, errors);
439 if (v == NULL)
440 goto onError;
441 /* XXX Should we really enforce this ? */
442 if (!PyString_Check(v)) {
443 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000444 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445 v->ob_type->tp_name);
446 Py_DECREF(v);
447 goto onError;
448 }
449 return v;
450
451 onError:
452 return NULL;
453}
454
455Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
456{
457 if (!PyUnicode_Check(unicode)) {
458 PyErr_BadArgument();
459 goto onError;
460 }
461 return PyUnicode_AS_UNICODE(unicode);
462
463 onError:
464 return NULL;
465}
466
467int PyUnicode_GetSize(PyObject *unicode)
468{
469 if (!PyUnicode_Check(unicode)) {
470 PyErr_BadArgument();
471 goto onError;
472 }
473 return PyUnicode_GET_SIZE(unicode);
474
475 onError:
476 return -1;
477}
478
479/* --- UTF-8 Codec -------------------------------------------------------- */
480
/* Map a UTF-8 lead byte to the total length of its sequence.  Zero
   marks a byte that cannot start a sequence.  See RFC 2279. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six byte sequences; 0xFE and 0xFF
       are illegal */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0
};
502
503static
504int utf8_decoding_error(const char **source,
505 Py_UNICODE **dest,
506 const char *errors,
507 const char *details)
508{
509 if ((errors == NULL) ||
510 (strcmp(errors,"strict") == 0)) {
511 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000512 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000513 details);
514 return -1;
515 }
516 else if (strcmp(errors,"ignore") == 0) {
517 (*source)++;
518 return 0;
519 }
520 else if (strcmp(errors,"replace") == 0) {
521 (*source)++;
522 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
523 (*dest)++;
524 return 0;
525 }
526 else {
527 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000528 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000529 errors);
530 return -1;
531 }
532}
533
534#define UTF8_ERROR(details) do { \
535 if (utf8_decoding_error(&s, &p, errors, details)) \
536 goto onError; \
537 continue; \
538} while (0)
539
540PyObject *PyUnicode_DecodeUTF8(const char *s,
541 int size,
542 const char *errors)
543{
544 int n;
545 const char *e;
546 PyUnicodeObject *unicode;
547 Py_UNICODE *p;
548
549 /* Note: size will always be longer than the resulting Unicode
550 character count */
551 unicode = _PyUnicode_New(size);
552 if (!unicode)
553 return NULL;
554 if (size == 0)
555 return (PyObject *)unicode;
556
557 /* Unpack UTF-8 encoded data */
558 p = unicode->str;
559 e = s + size;
560
561 while (s < e) {
562 register Py_UNICODE ch = (unsigned char)*s;
563
564 if (ch < 0x80) {
565 *p++ = ch;
566 s++;
567 continue;
568 }
569
570 n = utf8_code_length[ch];
571
572 if (s + n > e)
573 UTF8_ERROR("unexpected end of data");
574
575 switch (n) {
576
577 case 0:
578 UTF8_ERROR("unexpected code byte");
579 break;
580
581 case 1:
582 UTF8_ERROR("internal error");
583 break;
584
585 case 2:
586 if ((s[1] & 0xc0) != 0x80)
587 UTF8_ERROR("invalid data");
588 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
589 if (ch < 0x80)
590 UTF8_ERROR("illegal encoding");
591 else
592 *p++ = ch;
593 break;
594
595 case 3:
596 if ((s[1] & 0xc0) != 0x80 ||
597 (s[2] & 0xc0) != 0x80)
598 UTF8_ERROR("invalid data");
599 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
600 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
601 UTF8_ERROR("illegal encoding");
602 else
603 *p++ = ch;
604 break;
605
606 default:
607 /* Other sizes are only needed for UCS-4 */
608 UTF8_ERROR("unsupported Unicode code range");
609 }
610 s += n;
611 }
612
613 /* Adjust length */
614 if (_PyUnicode_Resize(unicode, p - unicode->str))
615 goto onError;
616
617 return (PyObject *)unicode;
618
619onError:
620 Py_DECREF(unicode);
621 return NULL;
622}
623
624#undef UTF8_ERROR
625
626static
627int utf8_encoding_error(const Py_UNICODE **source,
628 char **dest,
629 const char *errors,
630 const char *details)
631{
632 if ((errors == NULL) ||
633 (strcmp(errors,"strict") == 0)) {
634 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000635 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636 details);
637 return -1;
638 }
639 else if (strcmp(errors,"ignore") == 0) {
640 return 0;
641 }
642 else if (strcmp(errors,"replace") == 0) {
643 **dest = '?';
644 (*dest)++;
645 return 0;
646 }
647 else {
648 PyErr_Format(PyExc_ValueError,
649 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000650 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 errors);
652 return -1;
653 }
654}
655
656PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
657 int size,
658 const char *errors)
659{
660 PyObject *v;
661 char *p;
662 char *q;
663
664 v = PyString_FromStringAndSize(NULL, 3 * size);
665 if (v == NULL)
666 return NULL;
667 if (size == 0)
668 goto done;
669
670 p = q = PyString_AS_STRING(v);
671 while (size-- > 0) {
672 Py_UNICODE ch = *s++;
673 if (ch < 0x80)
674 *p++ = (char) ch;
675 else if (ch < 0x0800) {
676 *p++ = 0xc0 | (ch >> 6);
677 *p++ = 0x80 | (ch & 0x3f);
678 } else if (0xD800 <= ch && ch <= 0xDFFF) {
679 /* These byte ranges are reserved for UTF-16 surrogate
680 bytes which the Python implementation currently does
681 not support. */
682 printf("code range problem: U+%04x\n", ch);
683 if (utf8_encoding_error(&s, &p, errors,
684 "unsupported code range"))
685 goto onError;
686 } else {
687 *p++ = 0xe0 | (ch >> 12);
688 *p++ = 0x80 | ((ch >> 6) & 0x3f);
689 *p++ = 0x80 | (ch & 0x3f);
690 }
691 }
692 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000693 if (_PyString_Resize(&v, p - q))
694 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000695
696 done:
697 return v;
698
699 onError:
700 Py_DECREF(v);
701 return NULL;
702}
703
704/* Return a Python string holding the UTF-8 encoded value of the
705 Unicode object.
706
707 The resulting string is cached in the Unicode object for subsequent
708 usage by this function. The cached version is needed to implement
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000709 the character buffer interface and will live (at least) as long as
710 the Unicode object itself.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000711
712 The refcount of the string is *not* incremented.
713
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000714 *** Exported for internal use by the interpreter only !!! ***
715
Guido van Rossumd57fd912000-03-10 22:53:23 +0000716*/
717
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000718PyObject *_PyUnicode_AsUTF8String(PyObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000719 const char *errors)
720{
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000721 PyObject *v = ((PyUnicodeObject *)unicode)->utf8str;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000722
723 if (v)
724 return v;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000725 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
726 PyUnicode_GET_SIZE(unicode),
Guido van Rossumd57fd912000-03-10 22:53:23 +0000727 errors);
728 if (v && errors == NULL)
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000729 ((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000730 return v;
731}
732
733PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
734{
735 PyObject *str;
736
737 if (!PyUnicode_Check(unicode)) {
738 PyErr_BadArgument();
739 return NULL;
740 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000741 str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000742 if (str == NULL)
743 return NULL;
744 Py_INCREF(str);
745 return str;
746}
747
748/* --- UTF-16 Codec ------------------------------------------------------- */
749
750static
751int utf16_decoding_error(const Py_UNICODE **source,
752 Py_UNICODE **dest,
753 const char *errors,
754 const char *details)
755{
756 if ((errors == NULL) ||
757 (strcmp(errors,"strict") == 0)) {
758 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000759 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000760 details);
761 return -1;
762 }
763 else if (strcmp(errors,"ignore") == 0) {
764 return 0;
765 }
766 else if (strcmp(errors,"replace") == 0) {
767 if (dest) {
768 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
769 (*dest)++;
770 }
771 return 0;
772 }
773 else {
774 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000775 "UTF-16 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000776 errors);
777 return -1;
778 }
779}
780
781#define UTF16_ERROR(details) do { \
782 if (utf16_decoding_error(&q, &p, errors, details)) \
783 goto onError; \
784 continue; \
785} while(0)
786
787PyObject *PyUnicode_DecodeUTF16(const char *s,
788 int size,
789 const char *errors,
790 int *byteorder)
791{
792 PyUnicodeObject *unicode;
793 Py_UNICODE *p;
794 const Py_UNICODE *q, *e;
795 int bo = 0;
796
797 /* size should be an even number */
798 if (size % sizeof(Py_UNICODE) != 0) {
799 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
800 return NULL;
801 /* The remaining input chars are ignored if we fall through
802 here... */
803 }
804
805 /* Note: size will always be longer than the resulting Unicode
806 character count */
807 unicode = _PyUnicode_New(size);
808 if (!unicode)
809 return NULL;
810 if (size == 0)
811 return (PyObject *)unicode;
812
813 /* Unpack UTF-16 encoded data */
814 p = unicode->str;
815 q = (Py_UNICODE *)s;
816 e = q + (size / sizeof(Py_UNICODE));
817
818 if (byteorder)
819 bo = *byteorder;
820
821 while (q < e) {
822 register Py_UNICODE ch = *q++;
823
824 /* Check for BOM marks (U+FEFF) in the input and adjust
825 current byte order setting accordingly. Swap input
826 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
827 !) */
828#ifdef BYTEORDER_IS_LITTLE_ENDIAN
829 if (ch == 0xFEFF) {
830 bo = -1;
831 continue;
832 } else if (ch == 0xFFFE) {
833 bo = 1;
834 continue;
835 }
836 if (bo == 1)
837 ch = (ch >> 8) | (ch << 8);
838#else
839 if (ch == 0xFEFF) {
840 bo = 1;
841 continue;
842 } else if (ch == 0xFFFE) {
843 bo = -1;
844 continue;
845 }
846 if (bo == -1)
847 ch = (ch >> 8) | (ch << 8);
848#endif
849 if (ch < 0xD800 || ch > 0xDFFF) {
850 *p++ = ch;
851 continue;
852 }
853
854 /* UTF-16 code pair: */
855 if (q >= e)
856 UTF16_ERROR("unexpected end of data");
857 if (0xDC00 <= *q && *q <= 0xDFFF) {
858 q++;
859 if (0xD800 <= *q && *q <= 0xDBFF)
860 /* This is valid data (a UTF-16 surrogate pair), but
861 we are not able to store this information since our
862 Py_UNICODE type only has 16 bits... this might
863 change someday, even though it's unlikely. */
864 UTF16_ERROR("code pairs are not supported");
865 else
866 continue;
867 }
868 UTF16_ERROR("illegal encoding");
869 }
870
871 if (byteorder)
872 *byteorder = bo;
873
874 /* Adjust length */
875 if (_PyUnicode_Resize(unicode, p - unicode->str))
876 goto onError;
877
878 return (PyObject *)unicode;
879
880onError:
881 Py_DECREF(unicode);
882 return NULL;
883}
884
885#undef UTF16_ERROR
886
887PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
888 int size,
889 const char *errors,
890 int byteorder)
891{
892 PyObject *v;
893 Py_UNICODE *p;
894 char *q;
895
896 /* We don't create UTF-16 pairs... */
897 v = PyString_FromStringAndSize(NULL,
898 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
899 if (v == NULL)
900 return NULL;
901 if (size == 0)
902 goto done;
903
904 q = PyString_AS_STRING(v);
905 p = (Py_UNICODE *)q;
906
907 if (byteorder == 0)
908 *p++ = 0xFEFF;
909 if (byteorder == 0 ||
910#ifdef BYTEORDER_IS_LITTLE_ENDIAN
911 byteorder == -1
912#else
913 byteorder == 1
914#endif
915 )
916 memcpy(p, s, size * sizeof(Py_UNICODE));
917 else
918 while (size-- > 0) {
919 Py_UNICODE ch = *s++;
920 *p++ = (ch >> 8) | (ch << 8);
921 }
922 done:
923 return v;
924}
925
926PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
927{
928 if (!PyUnicode_Check(unicode)) {
929 PyErr_BadArgument();
930 return NULL;
931 }
932 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
933 PyUnicode_GET_SIZE(unicode),
934 NULL,
935 0);
936}
937
938/* --- Unicode Escape Codec ----------------------------------------------- */
939
940static
941int unicodeescape_decoding_error(const char **source,
942 unsigned int *x,
943 const char *errors,
944 const char *details)
945{
946 if ((errors == NULL) ||
947 (strcmp(errors,"strict") == 0)) {
948 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000949 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000950 details);
951 return -1;
952 }
953 else if (strcmp(errors,"ignore") == 0) {
954 return 0;
955 }
956 else if (strcmp(errors,"replace") == 0) {
957 *x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
958 return 0;
959 }
960 else {
961 PyErr_Format(PyExc_ValueError,
962 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000963 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000964 errors);
965 return -1;
966 }
967}
968
969PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
970 int size,
971 const char *errors)
972{
973 PyUnicodeObject *v;
974 Py_UNICODE *p = NULL, *buf = NULL;
975 const char *end;
976
977 /* Escaped strings will always be longer than the resulting
978 Unicode string, so we start with size here and then reduce the
979 length after conversion to the true value. */
980 v = _PyUnicode_New(size);
981 if (v == NULL)
982 goto onError;
983 if (size == 0)
984 return (PyObject *)v;
985 p = buf = PyUnicode_AS_UNICODE(v);
986 end = s + size;
987 while (s < end) {
988 unsigned char c;
989 unsigned int x;
990 int i;
991
992 /* Non-escape characters are interpreted as Unicode ordinals */
993 if (*s != '\\') {
994 *p++ = (unsigned char)*s++;
995 continue;
996 }
997
998 /* \ - Escapes */
999 s++;
1000 switch (*s++) {
1001
1002 /* \x escapes */
1003 case '\n': break;
1004 case '\\': *p++ = '\\'; break;
1005 case '\'': *p++ = '\''; break;
1006 case '\"': *p++ = '\"'; break;
1007 case 'b': *p++ = '\b'; break;
1008 case 'f': *p++ = '\014'; break; /* FF */
1009 case 't': *p++ = '\t'; break;
1010 case 'n': *p++ = '\n'; break;
1011 case 'r': *p++ = '\r'; break;
1012 case 'v': *p++ = '\013'; break; /* VT */
1013 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1014
1015 /* \OOO (octal) escapes */
1016 case '0': case '1': case '2': case '3':
1017 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001018 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001019 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001020 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001021 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001022 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001024 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001025 break;
1026
1027 /* \xXXXX escape with 0-4 hex digits */
1028 case 'x':
1029 x = 0;
1030 c = (unsigned char)*s;
1031 if (isxdigit(c)) {
1032 do {
1033 x = (x<<4) & ~0xF;
1034 if ('0' <= c && c <= '9')
1035 x += c - '0';
1036 else if ('a' <= c && c <= 'f')
1037 x += 10 + c - 'a';
1038 else
1039 x += 10 + c - 'A';
1040 c = (unsigned char)*++s;
1041 } while (isxdigit(c));
1042 *p++ = x;
1043 } else {
1044 *p++ = '\\';
1045 *p++ = (unsigned char)s[-1];
1046 }
1047 break;
1048
1049 /* \uXXXX with 4 hex digits */
1050 case 'u':
1051 for (x = 0, i = 0; i < 4; i++) {
1052 c = (unsigned char)s[i];
1053 if (!isxdigit(c)) {
1054 if (unicodeescape_decoding_error(&s, &x, errors,
1055 "truncated \\uXXXX"))
1056 goto onError;
1057 i++;
1058 break;
1059 }
1060 x = (x<<4) & ~0xF;
1061 if (c >= '0' && c <= '9')
1062 x += c - '0';
1063 else if (c >= 'a' && c <= 'f')
1064 x += 10 + c - 'a';
1065 else
1066 x += 10 + c - 'A';
1067 }
1068 s += i;
1069 *p++ = x;
1070 break;
1071
1072 default:
1073 *p++ = '\\';
1074 *p++ = (unsigned char)s[-1];
1075 break;
1076 }
1077 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001078 if (_PyUnicode_Resize(v, (int)(p - buf)))
1079 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001080 return (PyObject *)v;
1081
1082 onError:
1083 Py_XDECREF(v);
1084 return NULL;
1085}
1086
1087/* Return a Unicode-Escape string version of the Unicode object.
1088
1089 If quotes is true, the string is enclosed in u"" or u'' quotes as
1090 appropriate.
1091
1092*/
1093
Barry Warsaw51ac5802000-03-20 16:36:48 +00001094static const Py_UNICODE *findchar(const Py_UNICODE *s,
1095 int size,
1096 Py_UNICODE ch);
1097
Guido van Rossumd57fd912000-03-10 22:53:23 +00001098static
1099PyObject *unicodeescape_string(const Py_UNICODE *s,
1100 int size,
1101 int quotes)
1102{
1103 PyObject *repr;
1104 char *p;
1105 char *q;
1106
1107 static const char *hexdigit = "0123456789ABCDEF";
1108
1109 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1110 if (repr == NULL)
1111 return NULL;
1112
1113 p = q = PyString_AS_STRING(repr);
1114
1115 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116 *p++ = 'u';
1117 *p++ = (findchar(s, size, '\'') &&
1118 !findchar(s, size, '"')) ? '"' : '\'';
1119 }
1120 while (size-- > 0) {
1121 Py_UNICODE ch = *s++;
1122 /* Escape quotes */
1123 if (quotes && (ch == q[1] || ch == '\\')) {
1124 *p++ = '\\';
1125 *p++ = (char) ch;
1126 }
1127 /* Map 16-bit characters to '\uxxxx' */
1128 else if (ch >= 256) {
1129 *p++ = '\\';
1130 *p++ = 'u';
1131 *p++ = hexdigit[(ch >> 12) & 0xf];
1132 *p++ = hexdigit[(ch >> 8) & 0xf];
1133 *p++ = hexdigit[(ch >> 4) & 0xf];
1134 *p++ = hexdigit[ch & 15];
1135 }
1136 /* Map non-printable US ASCII to '\ooo' */
1137 else if (ch < ' ' || ch >= 128) {
1138 *p++ = '\\';
1139 *p++ = hexdigit[(ch >> 6) & 7];
1140 *p++ = hexdigit[(ch >> 3) & 7];
1141 *p++ = hexdigit[ch & 7];
1142 }
1143 /* Copy everything else as-is */
1144 else
1145 *p++ = (char) ch;
1146 }
1147 if (quotes)
1148 *p++ = q[1];
1149
1150 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001151 if (_PyString_Resize(&repr, p - q))
1152 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153
1154 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001155
1156 onError:
1157 Py_DECREF(repr);
1158 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159}
1160
1161PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1162 int size)
1163{
1164 return unicodeescape_string(s, size, 0);
1165}
1166
1167PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1168{
1169 if (!PyUnicode_Check(unicode)) {
1170 PyErr_BadArgument();
1171 return NULL;
1172 }
1173 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1174 PyUnicode_GET_SIZE(unicode));
1175}
1176
1177/* --- Raw Unicode Escape Codec ------------------------------------------- */
1178
1179PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1180 int size,
1181 const char *errors)
1182{
1183 PyUnicodeObject *v;
1184 Py_UNICODE *p, *buf;
1185 const char *end;
1186 const char *bs;
1187
1188 /* Escaped strings will always be longer than the resulting
1189 Unicode string, so we start with size here and then reduce the
1190 length after conversion to the true value. */
1191 v = _PyUnicode_New(size);
1192 if (v == NULL)
1193 goto onError;
1194 if (size == 0)
1195 return (PyObject *)v;
1196 p = buf = PyUnicode_AS_UNICODE(v);
1197 end = s + size;
1198 while (s < end) {
1199 unsigned char c;
1200 unsigned int x;
1201 int i;
1202
1203 /* Non-escape characters are interpreted as Unicode ordinals */
1204 if (*s != '\\') {
1205 *p++ = (unsigned char)*s++;
1206 continue;
1207 }
1208
1209 /* \u-escapes are only interpreted iff the number of leading
1210 backslashes if odd */
1211 bs = s;
1212 for (;s < end;) {
1213 if (*s != '\\')
1214 break;
1215 *p++ = (unsigned char)*s++;
1216 }
1217 if (((s - bs) & 1) == 0 ||
1218 s >= end ||
1219 *s != 'u') {
1220 continue;
1221 }
1222 p--;
1223 s++;
1224
1225 /* \uXXXX with 4 hex digits */
1226 for (x = 0, i = 0; i < 4; i++) {
1227 c = (unsigned char)s[i];
1228 if (!isxdigit(c)) {
1229 if (unicodeescape_decoding_error(&s, &x, errors,
1230 "truncated \\uXXXX"))
1231 goto onError;
1232 i++;
1233 break;
1234 }
1235 x = (x<<4) & ~0xF;
1236 if (c >= '0' && c <= '9')
1237 x += c - '0';
1238 else if (c >= 'a' && c <= 'f')
1239 x += 10 + c - 'a';
1240 else
1241 x += 10 + c - 'A';
1242 }
1243 s += i;
1244 *p++ = x;
1245 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001246 if (_PyUnicode_Resize(v, (int)(p - buf)))
1247 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 return (PyObject *)v;
1249
1250 onError:
1251 Py_XDECREF(v);
1252 return NULL;
1253}
1254
1255PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1256 int size)
1257{
1258 PyObject *repr;
1259 char *p;
1260 char *q;
1261
1262 static const char *hexdigit = "0123456789ABCDEF";
1263
1264 repr = PyString_FromStringAndSize(NULL, 6 * size);
1265 if (repr == NULL)
1266 return NULL;
1267
1268 p = q = PyString_AS_STRING(repr);
1269 while (size-- > 0) {
1270 Py_UNICODE ch = *s++;
1271 /* Map 16-bit characters to '\uxxxx' */
1272 if (ch >= 256) {
1273 *p++ = '\\';
1274 *p++ = 'u';
1275 *p++ = hexdigit[(ch >> 12) & 0xf];
1276 *p++ = hexdigit[(ch >> 8) & 0xf];
1277 *p++ = hexdigit[(ch >> 4) & 0xf];
1278 *p++ = hexdigit[ch & 15];
1279 }
1280 /* Copy everything else as-is */
1281 else
1282 *p++ = (char) ch;
1283 }
1284 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001285 if (_PyString_Resize(&repr, p - q))
1286 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001287
1288 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001289
1290 onError:
1291 Py_DECREF(repr);
1292 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293}
1294
1295PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1296{
1297 if (!PyUnicode_Check(unicode)) {
1298 PyErr_BadArgument();
1299 return NULL;
1300 }
1301 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1302 PyUnicode_GET_SIZE(unicode));
1303}
1304
1305/* --- Latin-1 Codec ------------------------------------------------------ */
1306
1307PyObject *PyUnicode_DecodeLatin1(const char *s,
1308 int size,
1309 const char *errors)
1310{
1311 PyUnicodeObject *v;
1312 Py_UNICODE *p;
1313
1314 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1315 v = _PyUnicode_New(size);
1316 if (v == NULL)
1317 goto onError;
1318 if (size == 0)
1319 return (PyObject *)v;
1320 p = PyUnicode_AS_UNICODE(v);
1321 while (size-- > 0)
1322 *p++ = (unsigned char)*s++;
1323 return (PyObject *)v;
1324
1325 onError:
1326 Py_XDECREF(v);
1327 return NULL;
1328}
1329
1330static
1331int latin1_encoding_error(const Py_UNICODE **source,
1332 char **dest,
1333 const char *errors,
1334 const char *details)
1335{
1336 if ((errors == NULL) ||
1337 (strcmp(errors,"strict") == 0)) {
1338 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001339 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001340 details);
1341 return -1;
1342 }
1343 else if (strcmp(errors,"ignore") == 0) {
1344 return 0;
1345 }
1346 else if (strcmp(errors,"replace") == 0) {
1347 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001348 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349 return 0;
1350 }
1351 else {
1352 PyErr_Format(PyExc_ValueError,
1353 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001354 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001355 errors);
1356 return -1;
1357 }
1358}
1359
1360PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1361 int size,
1362 const char *errors)
1363{
1364 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001365 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366 repr = PyString_FromStringAndSize(NULL, size);
1367 if (repr == NULL)
1368 return NULL;
1369
1370 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001371 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001372 while (size-- > 0) {
1373 Py_UNICODE ch = *p++;
1374 if (ch >= 256) {
1375 if (latin1_encoding_error(&p, &s, errors,
1376 "ordinal not in range(256)"))
1377 goto onError;
1378 }
1379 else
1380 *s++ = (char)ch;
1381 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001382 /* Resize if error handling skipped some characters */
1383 if (s - start < PyString_GET_SIZE(repr))
1384 if (_PyString_Resize(&repr, s - start))
1385 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001386 return repr;
1387
1388 onError:
1389 Py_DECREF(repr);
1390 return NULL;
1391}
1392
1393PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1394{
1395 if (!PyUnicode_Check(unicode)) {
1396 PyErr_BadArgument();
1397 return NULL;
1398 }
1399 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1400 PyUnicode_GET_SIZE(unicode),
1401 NULL);
1402}
1403
1404/* --- 7-bit ASCII Codec -------------------------------------------------- */
1405
1406static
1407int ascii_decoding_error(const char **source,
1408 Py_UNICODE **dest,
1409 const char *errors,
1410 const char *details)
1411{
1412 if ((errors == NULL) ||
1413 (strcmp(errors,"strict") == 0)) {
1414 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001415 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001416 details);
1417 return -1;
1418 }
1419 else if (strcmp(errors,"ignore") == 0) {
1420 return 0;
1421 }
1422 else if (strcmp(errors,"replace") == 0) {
1423 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1424 (*dest)++;
1425 return 0;
1426 }
1427 else {
1428 PyErr_Format(PyExc_ValueError,
1429 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001430 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001431 errors);
1432 return -1;
1433 }
1434}
1435
1436PyObject *PyUnicode_DecodeASCII(const char *s,
1437 int size,
1438 const char *errors)
1439{
1440 PyUnicodeObject *v;
1441 Py_UNICODE *p;
1442
1443 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1444 v = _PyUnicode_New(size);
1445 if (v == NULL)
1446 goto onError;
1447 if (size == 0)
1448 return (PyObject *)v;
1449 p = PyUnicode_AS_UNICODE(v);
1450 while (size-- > 0) {
1451 register unsigned char c;
1452
1453 c = (unsigned char)*s++;
1454 if (c < 128)
1455 *p++ = c;
1456 else if (ascii_decoding_error(&s, &p, errors,
1457 "ordinal not in range(128)"))
1458 goto onError;
1459 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001460 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1461 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1462 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001463 return (PyObject *)v;
1464
1465 onError:
1466 Py_XDECREF(v);
1467 return NULL;
1468}
1469
1470static
1471int ascii_encoding_error(const Py_UNICODE **source,
1472 char **dest,
1473 const char *errors,
1474 const char *details)
1475{
1476 if ((errors == NULL) ||
1477 (strcmp(errors,"strict") == 0)) {
1478 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001479 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480 details);
1481 return -1;
1482 }
1483 else if (strcmp(errors,"ignore") == 0) {
1484 return 0;
1485 }
1486 else if (strcmp(errors,"replace") == 0) {
1487 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001488 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001489 return 0;
1490 }
1491 else {
1492 PyErr_Format(PyExc_ValueError,
1493 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001494 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001495 errors);
1496 return -1;
1497 }
1498}
1499
1500PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1501 int size,
1502 const char *errors)
1503{
1504 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001505 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506 repr = PyString_FromStringAndSize(NULL, size);
1507 if (repr == NULL)
1508 return NULL;
1509
1510 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001511 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001512 while (size-- > 0) {
1513 Py_UNICODE ch = *p++;
1514 if (ch >= 128) {
1515 if (ascii_encoding_error(&p, &s, errors,
1516 "ordinal not in range(128)"))
1517 goto onError;
1518 }
1519 else
1520 *s++ = (char)ch;
1521 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001522 /* Resize if error handling skipped some characters */
1523 if (s - start < PyString_GET_SIZE(repr))
1524 if (_PyString_Resize(&repr, s - start))
1525 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 return repr;
1527
1528 onError:
1529 Py_DECREF(repr);
1530 return NULL;
1531}
1532
1533PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1534{
1535 if (!PyUnicode_Check(unicode)) {
1536 PyErr_BadArgument();
1537 return NULL;
1538 }
1539 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1540 PyUnicode_GET_SIZE(unicode),
1541 NULL);
1542}
1543
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001544#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001545
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001546/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001547
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001548PyObject *PyUnicode_DecodeMBCS(const char *s,
1549 int size,
1550 const char *errors)
1551{
1552 PyUnicodeObject *v;
1553 Py_UNICODE *p;
1554
1555 /* First get the size of the result */
1556 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001557 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001558 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1559
1560 v = _PyUnicode_New(usize);
1561 if (v == NULL)
1562 return NULL;
1563 if (usize == 0)
1564 return (PyObject *)v;
1565 p = PyUnicode_AS_UNICODE(v);
1566 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1567 Py_DECREF(v);
1568 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1569 }
1570
1571 return (PyObject *)v;
1572}
1573
1574PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1575 int size,
1576 const char *errors)
1577{
1578 PyObject *repr;
1579 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001580 DWORD mbcssize;
1581
1582 /* If there are no characters, bail now! */
1583 if (size==0)
1584 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001585
1586 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001587 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001588 if (mbcssize==0)
1589 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1590
1591 repr = PyString_FromStringAndSize(NULL, mbcssize);
1592 if (repr == NULL)
1593 return NULL;
1594 if (mbcssize==0)
1595 return repr;
1596
1597 /* Do the conversion */
1598 s = PyString_AS_STRING(repr);
1599 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1600 Py_DECREF(repr);
1601 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1602 }
1603 return repr;
1604}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001605
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001606#endif /* MS_WIN32 */
1607
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608/* --- Character Mapping Codec -------------------------------------------- */
1609
1610static
1611int charmap_decoding_error(const char **source,
1612 Py_UNICODE **dest,
1613 const char *errors,
1614 const char *details)
1615{
1616 if ((errors == NULL) ||
1617 (strcmp(errors,"strict") == 0)) {
1618 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001619 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620 details);
1621 return -1;
1622 }
1623 else if (strcmp(errors,"ignore") == 0) {
1624 return 0;
1625 }
1626 else if (strcmp(errors,"replace") == 0) {
1627 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1628 (*dest)++;
1629 return 0;
1630 }
1631 else {
1632 PyErr_Format(PyExc_ValueError,
1633 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001634 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001635 errors);
1636 return -1;
1637 }
1638}
1639
1640PyObject *PyUnicode_DecodeCharmap(const char *s,
1641 int size,
1642 PyObject *mapping,
1643 const char *errors)
1644{
1645 PyUnicodeObject *v;
1646 Py_UNICODE *p;
1647
1648 /* Default to Latin-1 */
1649 if (mapping == NULL)
1650 return PyUnicode_DecodeLatin1(s, size, errors);
1651
1652 v = _PyUnicode_New(size);
1653 if (v == NULL)
1654 goto onError;
1655 if (size == 0)
1656 return (PyObject *)v;
1657 p = PyUnicode_AS_UNICODE(v);
1658 while (size-- > 0) {
1659 unsigned char ch = *s++;
1660 PyObject *w, *x;
1661
1662 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1663 w = PyInt_FromLong((long)ch);
1664 if (w == NULL)
1665 goto onError;
1666 x = PyObject_GetItem(mapping, w);
1667 Py_DECREF(w);
1668 if (x == NULL) {
1669 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1670 /* No mapping found: default to Latin-1 mapping */
1671 PyErr_Clear();
1672 *p++ = (Py_UNICODE)ch;
1673 continue;
1674 }
1675 goto onError;
1676 }
1677
1678 /* Apply mapping */
1679 if (PyInt_Check(x)) {
1680 int value = PyInt_AS_LONG(x);
1681 if (value < 0 || value > 65535) {
1682 PyErr_SetString(PyExc_TypeError,
1683 "character mapping must be in range(65336)");
1684 Py_DECREF(x);
1685 goto onError;
1686 }
1687 *p++ = (Py_UNICODE)value;
1688 }
1689 else if (x == Py_None) {
1690 /* undefined mapping */
1691 if (charmap_decoding_error(&s, &p, errors,
1692 "character maps to <undefined>")) {
1693 Py_DECREF(x);
1694 goto onError;
1695 }
1696 }
1697 else if (PyUnicode_Check(x)) {
1698 if (PyUnicode_GET_SIZE(x) != 1) {
1699 /* 1-n mapping */
1700 PyErr_SetString(PyExc_NotImplementedError,
1701 "1-n mappings are currently not implemented");
1702 Py_DECREF(x);
1703 goto onError;
1704 }
1705 *p++ = *PyUnicode_AS_UNICODE(x);
1706 }
1707 else {
1708 /* wrong return value */
1709 PyErr_SetString(PyExc_TypeError,
1710 "character mapping must return integer, None or unicode");
1711 Py_DECREF(x);
1712 goto onError;
1713 }
1714 Py_DECREF(x);
1715 }
1716 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1717 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1718 goto onError;
1719 return (PyObject *)v;
1720
1721 onError:
1722 Py_XDECREF(v);
1723 return NULL;
1724}
1725
1726static
1727int charmap_encoding_error(const Py_UNICODE **source,
1728 char **dest,
1729 const char *errors,
1730 const char *details)
1731{
1732 if ((errors == NULL) ||
1733 (strcmp(errors,"strict") == 0)) {
1734 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001735 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001736 details);
1737 return -1;
1738 }
1739 else if (strcmp(errors,"ignore") == 0) {
1740 return 0;
1741 }
1742 else if (strcmp(errors,"replace") == 0) {
1743 **dest = '?';
1744 (*dest)++;
1745 return 0;
1746 }
1747 else {
1748 PyErr_Format(PyExc_ValueError,
1749 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001750 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 errors);
1752 return -1;
1753 }
1754}
1755
1756PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
1757 int size,
1758 PyObject *mapping,
1759 const char *errors)
1760{
1761 PyObject *v;
1762 char *s;
1763
1764 /* Default to Latin-1 */
1765 if (mapping == NULL)
1766 return PyUnicode_EncodeLatin1(p, size, errors);
1767
1768 v = PyString_FromStringAndSize(NULL, size);
1769 if (v == NULL)
1770 return NULL;
1771 s = PyString_AS_STRING(v);
1772 while (size-- > 0) {
1773 Py_UNICODE ch = *p++;
1774 PyObject *w, *x;
1775
1776 /* Get mapping (Unicode ordinal -> string char, integer or None) */
1777 w = PyInt_FromLong((long)ch);
1778 if (w == NULL)
1779 goto onError;
1780 x = PyObject_GetItem(mapping, w);
1781 Py_DECREF(w);
1782 if (x == NULL) {
1783 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1784 /* No mapping found: default to Latin-1 mapping if possible */
1785 PyErr_Clear();
1786 if (ch < 256) {
1787 *s++ = (char)ch;
1788 continue;
1789 }
1790 else if (!charmap_encoding_error(&p, &s, errors,
1791 "missing character mapping"))
1792 continue;
1793 }
1794 goto onError;
1795 }
1796
1797 /* Apply mapping */
1798 if (PyInt_Check(x)) {
1799 int value = PyInt_AS_LONG(x);
1800 if (value < 0 || value > 255) {
1801 PyErr_SetString(PyExc_TypeError,
1802 "character mapping must be in range(256)");
1803 Py_DECREF(x);
1804 goto onError;
1805 }
1806 *s++ = (char)value;
1807 }
1808 else if (x == Py_None) {
1809 /* undefined mapping */
1810 if (charmap_encoding_error(&p, &s, errors,
1811 "character maps to <undefined>")) {
1812 Py_DECREF(x);
1813 goto onError;
1814 }
1815 }
1816 else if (PyString_Check(x)) {
1817 if (PyString_GET_SIZE(x) != 1) {
1818 /* 1-n mapping */
1819 PyErr_SetString(PyExc_NotImplementedError,
1820 "1-n mappings are currently not implemented");
1821 Py_DECREF(x);
1822 goto onError;
1823 }
1824 *s++ = *PyString_AS_STRING(x);
1825 }
1826 else {
1827 /* wrong return value */
1828 PyErr_SetString(PyExc_TypeError,
1829 "character mapping must return integer, None or unicode");
1830 Py_DECREF(x);
1831 goto onError;
1832 }
1833 Py_DECREF(x);
1834 }
1835 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
1836 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
1837 goto onError;
1838 return v;
1839
1840 onError:
1841 Py_DECREF(v);
1842 return NULL;
1843}
1844
1845PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
1846 PyObject *mapping)
1847{
1848 if (!PyUnicode_Check(unicode) || mapping == NULL) {
1849 PyErr_BadArgument();
1850 return NULL;
1851 }
1852 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
1853 PyUnicode_GET_SIZE(unicode),
1854 mapping,
1855 NULL);
1856}
1857
1858static
1859int translate_error(const Py_UNICODE **source,
1860 Py_UNICODE **dest,
1861 const char *errors,
1862 const char *details)
1863{
1864 if ((errors == NULL) ||
1865 (strcmp(errors,"strict") == 0)) {
1866 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001867 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868 details);
1869 return -1;
1870 }
1871 else if (strcmp(errors,"ignore") == 0) {
1872 return 0;
1873 }
1874 else if (strcmp(errors,"replace") == 0) {
1875 **dest = '?';
1876 (*dest)++;
1877 return 0;
1878 }
1879 else {
1880 PyErr_Format(PyExc_ValueError,
1881 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001882 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883 errors);
1884 return -1;
1885 }
1886}
1887
1888PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
1889 int size,
1890 PyObject *mapping,
1891 const char *errors)
1892{
1893 PyUnicodeObject *v;
1894 Py_UNICODE *p;
1895
1896 if (mapping == NULL) {
1897 PyErr_BadArgument();
1898 return NULL;
1899 }
1900
1901 /* Output will never be longer than input */
1902 v = _PyUnicode_New(size);
1903 if (v == NULL)
1904 goto onError;
1905 if (size == 0)
1906 goto done;
1907 p = PyUnicode_AS_UNICODE(v);
1908 while (size-- > 0) {
1909 Py_UNICODE ch = *s++;
1910 PyObject *w, *x;
1911
1912 /* Get mapping */
1913 w = PyInt_FromLong(ch);
1914 if (w == NULL)
1915 goto onError;
1916 x = PyObject_GetItem(mapping, w);
1917 Py_DECREF(w);
1918 if (x == NULL) {
1919 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1920 /* No mapping found: default to 1-1 mapping */
1921 PyErr_Clear();
1922 *p++ = ch;
1923 continue;
1924 }
1925 goto onError;
1926 }
1927
1928 /* Apply mapping */
1929 if (PyInt_Check(x))
1930 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
1931 else if (x == Py_None) {
1932 /* undefined mapping */
1933 if (translate_error(&s, &p, errors,
1934 "character maps to <undefined>")) {
1935 Py_DECREF(x);
1936 goto onError;
1937 }
1938 }
1939 else if (PyUnicode_Check(x)) {
1940 if (PyUnicode_GET_SIZE(x) != 1) {
1941 /* 1-n mapping */
1942 PyErr_SetString(PyExc_NotImplementedError,
1943 "1-n mappings are currently not implemented");
1944 Py_DECREF(x);
1945 goto onError;
1946 }
1947 *p++ = *PyUnicode_AS_UNICODE(x);
1948 }
1949 else {
1950 /* wrong return value */
1951 PyErr_SetString(PyExc_TypeError,
1952 "translate mapping must return integer, None or unicode");
1953 Py_DECREF(x);
1954 goto onError;
1955 }
1956 Py_DECREF(x);
1957 }
1958 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001959 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1960 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001961
1962 done:
1963 return (PyObject *)v;
1964
1965 onError:
1966 Py_XDECREF(v);
1967 return NULL;
1968}
1969
1970PyObject *PyUnicode_Translate(PyObject *str,
1971 PyObject *mapping,
1972 const char *errors)
1973{
1974 PyObject *result;
1975
1976 str = PyUnicode_FromObject(str);
1977 if (str == NULL)
1978 goto onError;
1979 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
1980 PyUnicode_GET_SIZE(str),
1981 mapping,
1982 errors);
1983 Py_DECREF(str);
1984 return result;
1985
1986 onError:
1987 Py_XDECREF(str);
1988 return NULL;
1989}
1990
Guido van Rossum9e896b32000-04-05 20:11:21 +00001991/* --- Decimal Encoder ---------------------------------------------------- */
1992
1993int PyUnicode_EncodeDecimal(Py_UNICODE *s,
1994 int length,
1995 char *output,
1996 const char *errors)
1997{
1998 Py_UNICODE *p, *end;
1999
2000 if (output == NULL) {
2001 PyErr_BadArgument();
2002 return -1;
2003 }
2004
2005 p = s;
2006 end = s + length;
2007 while (p < end) {
2008 register Py_UNICODE ch = *p++;
2009 int decimal;
2010
2011 if (Py_UNICODE_ISSPACE(ch)) {
2012 *output++ = ' ';
2013 continue;
2014 }
2015 decimal = Py_UNICODE_TODECIMAL(ch);
2016 if (decimal >= 0) {
2017 *output++ = '0' + decimal;
2018 continue;
2019 }
Guido van Rossumba477042000-04-06 18:18:10 +00002020 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002021 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002022 continue;
2023 }
2024 /* All other characters are considered invalid */
2025 if (errors == NULL || strcmp(errors, "strict") == 0) {
2026 PyErr_SetString(PyExc_ValueError,
2027 "invalid decimal Unicode string");
2028 goto onError;
2029 }
2030 else if (strcmp(errors, "ignore") == 0)
2031 continue;
2032 else if (strcmp(errors, "replace") == 0) {
2033 *output++ = '?';
2034 continue;
2035 }
2036 }
2037 /* 0-terminate the output string */
2038 *output++ = '\0';
2039 return 0;
2040
2041 onError:
2042 return -1;
2043}
2044
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045/* --- Helpers ------------------------------------------------------------ */
2046
2047static
2048int count(PyUnicodeObject *self,
2049 int start,
2050 int end,
2051 PyUnicodeObject *substring)
2052{
2053 int count = 0;
2054
2055 end -= substring->length;
2056
2057 while (start <= end)
2058 if (Py_UNICODE_MATCH(self, start, substring)) {
2059 count++;
2060 start += substring->length;
2061 } else
2062 start++;
2063
2064 return count;
2065}
2066
2067int PyUnicode_Count(PyObject *str,
2068 PyObject *substr,
2069 int start,
2070 int end)
2071{
2072 int result;
2073
2074 str = PyUnicode_FromObject(str);
2075 if (str == NULL)
2076 return -1;
2077 substr = PyUnicode_FromObject(substr);
2078 if (substr == NULL) {
2079 Py_DECREF(substr);
2080 return -1;
2081 }
2082
2083 result = count((PyUnicodeObject *)str,
2084 start, end,
2085 (PyUnicodeObject *)substr);
2086
2087 Py_DECREF(str);
2088 Py_DECREF(substr);
2089 return result;
2090}
2091
2092static
2093int findstring(PyUnicodeObject *self,
2094 PyUnicodeObject *substring,
2095 int start,
2096 int end,
2097 int direction)
2098{
2099 if (start < 0)
2100 start += self->length;
2101 if (start < 0)
2102 start = 0;
2103
2104 if (substring->length == 0)
2105 return start;
2106
2107 if (end > self->length)
2108 end = self->length;
2109 if (end < 0)
2110 end += self->length;
2111 if (end < 0)
2112 end = 0;
2113
2114 end -= substring->length;
2115
2116 if (direction < 0) {
2117 for (; end >= start; end--)
2118 if (Py_UNICODE_MATCH(self, end, substring))
2119 return end;
2120 } else {
2121 for (; start <= end; start++)
2122 if (Py_UNICODE_MATCH(self, start, substring))
2123 return start;
2124 }
2125
2126 return -1;
2127}
2128
2129int PyUnicode_Find(PyObject *str,
2130 PyObject *substr,
2131 int start,
2132 int end,
2133 int direction)
2134{
2135 int result;
2136
2137 str = PyUnicode_FromObject(str);
2138 if (str == NULL)
2139 return -1;
2140 substr = PyUnicode_FromObject(substr);
2141 if (substr == NULL) {
2142 Py_DECREF(substr);
2143 return -1;
2144 }
2145
2146 result = findstring((PyUnicodeObject *)str,
2147 (PyUnicodeObject *)substr,
2148 start, end, direction);
2149 Py_DECREF(str);
2150 Py_DECREF(substr);
2151 return result;
2152}
2153
2154static
2155int tailmatch(PyUnicodeObject *self,
2156 PyUnicodeObject *substring,
2157 int start,
2158 int end,
2159 int direction)
2160{
2161 if (start < 0)
2162 start += self->length;
2163 if (start < 0)
2164 start = 0;
2165
2166 if (substring->length == 0)
2167 return 1;
2168
2169 if (end > self->length)
2170 end = self->length;
2171 if (end < 0)
2172 end += self->length;
2173 if (end < 0)
2174 end = 0;
2175
2176 end -= substring->length;
2177 if (end < start)
2178 return 0;
2179
2180 if (direction > 0) {
2181 if (Py_UNICODE_MATCH(self, end, substring))
2182 return 1;
2183 } else {
2184 if (Py_UNICODE_MATCH(self, start, substring))
2185 return 1;
2186 }
2187
2188 return 0;
2189}
2190
2191int PyUnicode_Tailmatch(PyObject *str,
2192 PyObject *substr,
2193 int start,
2194 int end,
2195 int direction)
2196{
2197 int result;
2198
2199 str = PyUnicode_FromObject(str);
2200 if (str == NULL)
2201 return -1;
2202 substr = PyUnicode_FromObject(substr);
2203 if (substr == NULL) {
2204 Py_DECREF(substr);
2205 return -1;
2206 }
2207
2208 result = tailmatch((PyUnicodeObject *)str,
2209 (PyUnicodeObject *)substr,
2210 start, end, direction);
2211 Py_DECREF(str);
2212 Py_DECREF(substr);
2213 return result;
2214}
2215
2216static
2217const Py_UNICODE *findchar(const Py_UNICODE *s,
2218 int size,
2219 Py_UNICODE ch)
2220{
2221 /* like wcschr, but doesn't stop at NULL characters */
2222
2223 while (size-- > 0) {
2224 if (*s == ch)
2225 return s;
2226 s++;
2227 }
2228
2229 return NULL;
2230}
2231
2232/* Apply fixfct filter to the Unicode object self and return a
2233 reference to the modified object */
2234
2235static
2236PyObject *fixup(PyUnicodeObject *self,
2237 int (*fixfct)(PyUnicodeObject *s))
2238{
2239
2240 PyUnicodeObject *u;
2241
2242 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2243 self->length);
2244 if (u == NULL)
2245 return NULL;
2246 if (!fixfct(u)) {
2247 /* fixfct should return TRUE if it modified the buffer. If
2248 FALSE, return a reference to the original buffer instead
2249 (to save space, not time) */
2250 Py_INCREF(self);
2251 Py_DECREF(u);
2252 return (PyObject*) self;
2253 }
2254 return (PyObject*) u;
2255}
2256
2257static
2258int fixupper(PyUnicodeObject *self)
2259{
2260 int len = self->length;
2261 Py_UNICODE *s = self->str;
2262 int status = 0;
2263
2264 while (len-- > 0) {
2265 register Py_UNICODE ch;
2266
2267 ch = Py_UNICODE_TOUPPER(*s);
2268 if (ch != *s) {
2269 status = 1;
2270 *s = ch;
2271 }
2272 s++;
2273 }
2274
2275 return status;
2276}
2277
2278static
2279int fixlower(PyUnicodeObject *self)
2280{
2281 int len = self->length;
2282 Py_UNICODE *s = self->str;
2283 int status = 0;
2284
2285 while (len-- > 0) {
2286 register Py_UNICODE ch;
2287
2288 ch = Py_UNICODE_TOLOWER(*s);
2289 if (ch != *s) {
2290 status = 1;
2291 *s = ch;
2292 }
2293 s++;
2294 }
2295
2296 return status;
2297}
2298
2299static
2300int fixswapcase(PyUnicodeObject *self)
2301{
2302 int len = self->length;
2303 Py_UNICODE *s = self->str;
2304 int status = 0;
2305
2306 while (len-- > 0) {
2307 if (Py_UNICODE_ISUPPER(*s)) {
2308 *s = Py_UNICODE_TOLOWER(*s);
2309 status = 1;
2310 } else if (Py_UNICODE_ISLOWER(*s)) {
2311 *s = Py_UNICODE_TOUPPER(*s);
2312 status = 1;
2313 }
2314 s++;
2315 }
2316
2317 return status;
2318}
2319
2320static
2321int fixcapitalize(PyUnicodeObject *self)
2322{
2323 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2324 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2325 return 1;
2326 }
2327 return 0;
2328}
2329
2330static
2331int fixtitle(PyUnicodeObject *self)
2332{
2333 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2334 register Py_UNICODE *e;
2335 int previous_is_cased;
2336
2337 /* Shortcut for single character strings */
2338 if (PyUnicode_GET_SIZE(self) == 1) {
2339 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2340 if (*p != ch) {
2341 *p = ch;
2342 return 1;
2343 }
2344 else
2345 return 0;
2346 }
2347
2348 e = p + PyUnicode_GET_SIZE(self);
2349 previous_is_cased = 0;
2350 for (; p < e; p++) {
2351 register const Py_UNICODE ch = *p;
2352
2353 if (previous_is_cased)
2354 *p = Py_UNICODE_TOLOWER(ch);
2355 else
2356 *p = Py_UNICODE_TOTITLE(ch);
2357
2358 if (Py_UNICODE_ISLOWER(ch) ||
2359 Py_UNICODE_ISUPPER(ch) ||
2360 Py_UNICODE_ISTITLE(ch))
2361 previous_is_cased = 1;
2362 else
2363 previous_is_cased = 0;
2364 }
2365 return 1;
2366}
2367
2368PyObject *PyUnicode_Join(PyObject *separator,
2369 PyObject *seq)
2370{
2371 Py_UNICODE *sep;
2372 int seplen;
2373 PyUnicodeObject *res = NULL;
2374 int reslen = 0;
2375 Py_UNICODE *p;
2376 int seqlen = 0;
2377 int sz = 100;
2378 int i;
2379
2380 seqlen = PySequence_Length(seq);
2381 if (seqlen < 0 && PyErr_Occurred())
2382 return NULL;
2383
2384 if (separator == NULL) {
2385 Py_UNICODE blank = ' ';
2386 sep = &blank;
2387 seplen = 1;
2388 }
2389 else {
2390 separator = PyUnicode_FromObject(separator);
2391 if (separator == NULL)
2392 return NULL;
2393 sep = PyUnicode_AS_UNICODE(separator);
2394 seplen = PyUnicode_GET_SIZE(separator);
2395 }
2396
2397 res = _PyUnicode_New(sz);
2398 if (res == NULL)
2399 goto onError;
2400 p = PyUnicode_AS_UNICODE(res);
2401 reslen = 0;
2402
2403 for (i = 0; i < seqlen; i++) {
2404 int itemlen;
2405 PyObject *item;
2406
2407 item = PySequence_GetItem(seq, i);
2408 if (item == NULL)
2409 goto onError;
2410 if (!PyUnicode_Check(item)) {
2411 PyObject *v;
2412 v = PyUnicode_FromObject(item);
2413 Py_DECREF(item);
2414 item = v;
2415 if (item == NULL)
2416 goto onError;
2417 }
2418 itemlen = PyUnicode_GET_SIZE(item);
2419 while (reslen + itemlen + seplen >= sz) {
2420 if (_PyUnicode_Resize(res, sz*2))
2421 goto onError;
2422 sz *= 2;
2423 p = PyUnicode_AS_UNICODE(res) + reslen;
2424 }
2425 if (i > 0) {
2426 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2427 p += seplen;
2428 reslen += seplen;
2429 }
2430 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2431 p += itemlen;
2432 reslen += itemlen;
2433 Py_DECREF(item);
2434 }
2435 if (_PyUnicode_Resize(res, reslen))
2436 goto onError;
2437
2438 Py_XDECREF(separator);
2439 return (PyObject *)res;
2440
2441 onError:
2442 Py_XDECREF(separator);
2443 Py_DECREF(res);
2444 return NULL;
2445}
2446
2447static
2448PyUnicodeObject *pad(PyUnicodeObject *self,
2449 int left,
2450 int right,
2451 Py_UNICODE fill)
2452{
2453 PyUnicodeObject *u;
2454
2455 if (left < 0)
2456 left = 0;
2457 if (right < 0)
2458 right = 0;
2459
2460 if (left == 0 && right == 0) {
2461 Py_INCREF(self);
2462 return self;
2463 }
2464
2465 u = _PyUnicode_New(left + self->length + right);
2466 if (u) {
2467 if (left)
2468 Py_UNICODE_FILL(u->str, fill, left);
2469 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2470 if (right)
2471 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2472 }
2473
2474 return u;
2475}
2476
/* Append the slice data[left:right] to `list` as a new Unicode object.

   The macro relies on names supplied by the enclosing function: the
   target `list`, a scratch `PyObject *str`, and an `onError` label that
   is jumped to when creating or appending the slice fails.

   The body is wrapped in do { ... } while (0) so that the macro behaves
   as a single statement, and the arguments are parenthesized so that
   arbitrary expressions can be passed safely. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2487
2488static
2489PyObject *split_whitespace(PyUnicodeObject *self,
2490 PyObject *list,
2491 int maxcount)
2492{
2493 register int i;
2494 register int j;
2495 int len = self->length;
2496 PyObject *str;
2497
2498 for (i = j = 0; i < len; ) {
2499 /* find a token */
2500 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2501 i++;
2502 j = i;
2503 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2504 i++;
2505 if (j < i) {
2506 if (maxcount-- <= 0)
2507 break;
2508 SPLIT_APPEND(self->str, j, i);
2509 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2510 i++;
2511 j = i;
2512 }
2513 }
2514 if (j < len) {
2515 SPLIT_APPEND(self->str, j, len);
2516 }
2517 return list;
2518
2519 onError:
2520 Py_DECREF(list);
2521 return NULL;
2522}
2523
2524PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002525 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526{
2527 register int i;
2528 register int j;
2529 int len;
2530 PyObject *list;
2531 PyObject *str;
2532 Py_UNICODE *data;
2533
2534 string = PyUnicode_FromObject(string);
2535 if (string == NULL)
2536 return NULL;
2537 data = PyUnicode_AS_UNICODE(string);
2538 len = PyUnicode_GET_SIZE(string);
2539
Guido van Rossumd57fd912000-03-10 22:53:23 +00002540 list = PyList_New(0);
2541 if (!list)
2542 goto onError;
2543
2544 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002545 int eol;
2546
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547 /* Find a line and append it */
2548 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2549 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550
2551 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002552 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553 if (i < len) {
2554 if (data[i] == '\r' && i + 1 < len &&
2555 data[i+1] == '\n')
2556 i += 2;
2557 else
2558 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002559 if (keepends)
2560 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002561 }
Guido van Rossum86662912000-04-11 15:38:46 +00002562 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563 j = i;
2564 }
2565 if (j < len) {
2566 SPLIT_APPEND(data, j, len);
2567 }
2568
2569 Py_DECREF(string);
2570 return list;
2571
2572 onError:
2573 Py_DECREF(list);
2574 Py_DECREF(string);
2575 return NULL;
2576}
2577
2578static
2579PyObject *split_char(PyUnicodeObject *self,
2580 PyObject *list,
2581 Py_UNICODE ch,
2582 int maxcount)
2583{
2584 register int i;
2585 register int j;
2586 int len = self->length;
2587 PyObject *str;
2588
2589 for (i = j = 0; i < len; ) {
2590 if (self->str[i] == ch) {
2591 if (maxcount-- <= 0)
2592 break;
2593 SPLIT_APPEND(self->str, j, i);
2594 i = j = i + 1;
2595 } else
2596 i++;
2597 }
2598 if (j <= len) {
2599 SPLIT_APPEND(self->str, j, len);
2600 }
2601 return list;
2602
2603 onError:
2604 Py_DECREF(list);
2605 return NULL;
2606}
2607
2608static
2609PyObject *split_substring(PyUnicodeObject *self,
2610 PyObject *list,
2611 PyUnicodeObject *substring,
2612 int maxcount)
2613{
2614 register int i;
2615 register int j;
2616 int len = self->length;
2617 int sublen = substring->length;
2618 PyObject *str;
2619
2620 for (i = j = 0; i < len - sublen; ) {
2621 if (Py_UNICODE_MATCH(self, i, substring)) {
2622 if (maxcount-- <= 0)
2623 break;
2624 SPLIT_APPEND(self->str, j, i);
2625 i = j = i + sublen;
2626 } else
2627 i++;
2628 }
2629 if (j <= len) {
2630 SPLIT_APPEND(self->str, j, len);
2631 }
2632 return list;
2633
2634 onError:
2635 Py_DECREF(list);
2636 return NULL;
2637}
2638
2639#undef SPLIT_APPEND
2640
2641static
2642PyObject *split(PyUnicodeObject *self,
2643 PyUnicodeObject *substring,
2644 int maxcount)
2645{
2646 PyObject *list;
2647
2648 if (maxcount < 0)
2649 maxcount = INT_MAX;
2650
2651 list = PyList_New(0);
2652 if (!list)
2653 return NULL;
2654
2655 if (substring == NULL)
2656 return split_whitespace(self,list,maxcount);
2657
2658 else if (substring->length == 1)
2659 return split_char(self,list,substring->str[0],maxcount);
2660
2661 else if (substring->length == 0) {
2662 Py_DECREF(list);
2663 PyErr_SetString(PyExc_ValueError, "empty separator");
2664 return NULL;
2665 }
2666 else
2667 return split_substring(self,list,substring,maxcount);
2668}
2669
2670static
2671PyObject *strip(PyUnicodeObject *self,
2672 int left,
2673 int right)
2674{
2675 Py_UNICODE *p = self->str;
2676 int start = 0;
2677 int end = self->length;
2678
2679 if (left)
2680 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2681 start++;
2682
2683 if (right)
2684 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2685 end--;
2686
2687 if (start == 0 && end == self->length) {
2688 /* couldn't strip anything off, return original string */
2689 Py_INCREF(self);
2690 return (PyObject*) self;
2691 }
2692
2693 return (PyObject*) PyUnicode_FromUnicode(
2694 self->str + start,
2695 end - start
2696 );
2697}
2698
2699static
2700PyObject *replace(PyUnicodeObject *self,
2701 PyUnicodeObject *str1,
2702 PyUnicodeObject *str2,
2703 int maxcount)
2704{
2705 PyUnicodeObject *u;
2706
2707 if (maxcount < 0)
2708 maxcount = INT_MAX;
2709
2710 if (str1->length == 1 && str2->length == 1) {
2711 int i;
2712
2713 /* replace characters */
2714 if (!findchar(self->str, self->length, str1->str[0])) {
2715 /* nothing to replace, return original string */
2716 Py_INCREF(self);
2717 u = self;
2718 } else {
2719 Py_UNICODE u1 = str1->str[0];
2720 Py_UNICODE u2 = str2->str[0];
2721
2722 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
2723 self->str,
2724 self->length
2725 );
2726 if (u)
2727 for (i = 0; i < u->length; i++)
2728 if (u->str[i] == u1) {
2729 if (--maxcount < 0)
2730 break;
2731 u->str[i] = u2;
2732 }
2733 }
2734
2735 } else {
2736 int n, i;
2737 Py_UNICODE *p;
2738
2739 /* replace strings */
2740 n = count(self, 0, self->length, str1);
2741 if (n > maxcount)
2742 n = maxcount;
2743 if (n == 0) {
2744 /* nothing to replace, return original string */
2745 Py_INCREF(self);
2746 u = self;
2747 } else {
2748 u = _PyUnicode_New(
2749 self->length + n * (str2->length - str1->length));
2750 if (u) {
2751 i = 0;
2752 p = u->str;
2753 while (i <= self->length - str1->length)
2754 if (Py_UNICODE_MATCH(self, i, str1)) {
2755 /* replace string segment */
2756 Py_UNICODE_COPY(p, str2->str, str2->length);
2757 p += str2->length;
2758 i += str1->length;
2759 if (--n <= 0) {
2760 /* copy remaining part */
2761 Py_UNICODE_COPY(p, self->str+i, self->length-i);
2762 break;
2763 }
2764 } else
2765 *p++ = self->str[i++];
2766 }
2767 }
2768 }
2769
2770 return (PyObject *) u;
2771}
2772
2773/* --- Unicode Object Methods --------------------------------------------- */
2774
2775static char title__doc__[] =
2776"S.title() -> unicode\n\
2777\n\
2778Return a titlecased version of S, i.e. words start with title case\n\
2779characters, all remaining cased characters have lower case.";
2780
2781static PyObject*
2782unicode_title(PyUnicodeObject *self, PyObject *args)
2783{
2784 if (!PyArg_NoArgs(args))
2785 return NULL;
2786 return fixup(self, fixtitle);
2787}
2788
2789static char capitalize__doc__[] =
2790"S.capitalize() -> unicode\n\
2791\n\
2792Return a capitalized version of S, i.e. make the first character\n\
2793have upper case.";
2794
2795static PyObject*
2796unicode_capitalize(PyUnicodeObject *self, PyObject *args)
2797{
2798 if (!PyArg_NoArgs(args))
2799 return NULL;
2800 return fixup(self, fixcapitalize);
2801}
2802
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method S.capwords() (currently compiled out): split self on
   whitespace, capitalize every word, and join the words again with a
   single space. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result = NULL;
    int k;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list slots in place */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *old = PyList_GET_ITEM(words, k);
        PyObject *fixed = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (fixed == NULL)
            goto done;
        Py_DECREF(old);
        PyList_SET_ITEM(words, k, fixed);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
2843
2844static char center__doc__[] =
2845"S.center(width) -> unicode\n\
2846\n\
2847Return S centered in a Unicode string of length width. Padding is done\n\
2848using spaces.";
2849
2850static PyObject *
2851unicode_center(PyUnicodeObject *self, PyObject *args)
2852{
2853 int marg, left;
2854 int width;
2855
2856 if (!PyArg_ParseTuple(args, "i:center", &width))
2857 return NULL;
2858
2859 if (self->length >= width) {
2860 Py_INCREF(self);
2861 return (PyObject*) self;
2862 }
2863
2864 marg = width - self->length;
2865 left = marg / 2 + (marg & width & 1);
2866
2867 return (PyObject*) pad(self, left, marg - left, ' ');
2868}
2869
2870static int
2871unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
2872{
2873 int len1, len2;
2874 Py_UNICODE *s1 = str1->str;
2875 Py_UNICODE *s2 = str2->str;
2876
2877 len1 = str1->length;
2878 len2 = str2->length;
2879
2880 while (len1 > 0 && len2 > 0) {
2881 int cmp = (*s1++) - (*s2++);
2882 if (cmp)
2883 /* This should make Christian happy! */
2884 return (cmp < 0) ? -1 : (cmp != 0);
2885 len1--, len2--;
2886 }
2887
2888 return (len1 < len2) ? -1 : (len1 != len2);
2889}
2890
2891int PyUnicode_Compare(PyObject *left,
2892 PyObject *right)
2893{
2894 PyUnicodeObject *u = NULL, *v = NULL;
2895 int result;
2896
2897 /* Coerce the two arguments */
2898 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2899 if (u == NULL)
2900 goto onError;
2901 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2902 if (v == NULL)
2903 goto onError;
2904
2905 /* Shortcut for emtpy or interned objects */
2906 if (v == u) {
2907 Py_DECREF(u);
2908 Py_DECREF(v);
2909 return 0;
2910 }
2911
2912 result = unicode_compare(u, v);
2913
2914 Py_DECREF(u);
2915 Py_DECREF(v);
2916 return result;
2917
2918onError:
2919 Py_XDECREF(u);
2920 Py_XDECREF(v);
2921 return -1;
2922}
2923
Guido van Rossum403d68b2000-03-13 15:55:09 +00002924int PyUnicode_Contains(PyObject *container,
2925 PyObject *element)
2926{
2927 PyUnicodeObject *u = NULL, *v = NULL;
2928 int result;
2929 register const Py_UNICODE *p, *e;
2930 register Py_UNICODE ch;
2931
2932 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00002933 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
2934 if (v == NULL)
2935 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002936 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
2937 if (u == NULL) {
2938 Py_DECREF(v);
2939 goto onError;
2940 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00002941
2942 /* Check v in u */
2943 if (PyUnicode_GET_SIZE(v) != 1) {
2944 PyErr_SetString(PyExc_TypeError,
2945 "string member test needs char left operand");
2946 goto onError;
2947 }
2948 ch = *PyUnicode_AS_UNICODE(v);
2949 p = PyUnicode_AS_UNICODE(u);
2950 e = p + PyUnicode_GET_SIZE(u);
2951 result = 0;
2952 while (p < e) {
2953 if (*p++ == ch) {
2954 result = 1;
2955 break;
2956 }
2957 }
2958
2959 Py_DECREF(u);
2960 Py_DECREF(v);
2961 return result;
2962
2963onError:
2964 Py_XDECREF(u);
2965 Py_XDECREF(v);
2966 return -1;
2967}
2968
Guido van Rossumd57fd912000-03-10 22:53:23 +00002969/* Concat to string or Unicode object giving a new Unicode object. */
2970
2971PyObject *PyUnicode_Concat(PyObject *left,
2972 PyObject *right)
2973{
2974 PyUnicodeObject *u = NULL, *v = NULL, *w;
2975
2976 /* Coerce the two arguments */
2977 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2978 if (u == NULL)
2979 goto onError;
2980 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2981 if (v == NULL)
2982 goto onError;
2983
2984 /* Shortcuts */
2985 if (v == unicode_empty) {
2986 Py_DECREF(v);
2987 return (PyObject *)u;
2988 }
2989 if (u == unicode_empty) {
2990 Py_DECREF(u);
2991 return (PyObject *)v;
2992 }
2993
2994 /* Concat the two Unicode strings */
2995 w = _PyUnicode_New(u->length + v->length);
2996 if (w == NULL)
2997 goto onError;
2998 Py_UNICODE_COPY(w->str, u->str, u->length);
2999 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3000
3001 Py_DECREF(u);
3002 Py_DECREF(v);
3003 return (PyObject *)w;
3004
3005onError:
3006 Py_XDECREF(u);
3007 Py_XDECREF(v);
3008 return NULL;
3009}
3010
3011static char count__doc__[] =
3012"S.count(sub[, start[, end]]) -> int\n\
3013\n\
3014Return the number of occurrences of substring sub in Unicode string\n\
3015S[start:end]. Optional arguments start and end are\n\
3016interpreted as in slice notation.";
3017
3018static PyObject *
3019unicode_count(PyUnicodeObject *self, PyObject *args)
3020{
3021 PyUnicodeObject *substring;
3022 int start = 0;
3023 int end = INT_MAX;
3024 PyObject *result;
3025
3026 if (!PyArg_ParseTuple(args, "O|ii:count", &substring, &start, &end))
3027 return NULL;
3028
3029 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3030 (PyObject *)substring);
3031 if (substring == NULL)
3032 return NULL;
3033
3034 if (substring->length == 0) {
3035 Py_DECREF(substring);
3036 return PyInt_FromLong((long) 0);
3037 }
3038
3039 if (start < 0)
3040 start += self->length;
3041 if (start < 0)
3042 start = 0;
3043 if (end > self->length)
3044 end = self->length;
3045 if (end < 0)
3046 end += self->length;
3047 if (end < 0)
3048 end = 0;
3049
3050 result = PyInt_FromLong((long) count(self, start, end, substring));
3051
3052 Py_DECREF(substring);
3053 return result;
3054}
3055
3056static char encode__doc__[] =
3057"S.encode([encoding[,errors]]) -> string\n\
3058\n\
3059Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
3060errors may be given to set a different error handling scheme. Default\n\
3061is 'strict' meaning that encoding errors raise a ValueError. Other\n\
3062possible values are 'ignore' and 'replace'.";
3063
3064static PyObject *
3065unicode_encode(PyUnicodeObject *self, PyObject *args)
3066{
3067 char *encoding = NULL;
3068 char *errors = NULL;
3069 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3070 return NULL;
3071 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3072}
3073
3074static char expandtabs__doc__[] =
3075"S.expandtabs([tabsize]) -> unicode\n\
3076\n\
3077Return a copy of S where all tab characters are expanded using spaces.\n\
3078If tabsize is not given, a tab size of 8 characters is assumed.";
3079
3080static PyObject*
3081unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3082{
3083 Py_UNICODE *e;
3084 Py_UNICODE *p;
3085 Py_UNICODE *q;
3086 int i, j;
3087 PyUnicodeObject *u;
3088 int tabsize = 8;
3089
3090 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3091 return NULL;
3092
3093 /* First pass: determine size of ouput string */
3094 i = j = 0;
3095 e = self->str + self->length;
3096 for (p = self->str; p < e; p++)
3097 if (*p == '\t') {
3098 if (tabsize > 0)
3099 j += tabsize - (j % tabsize);
3100 }
3101 else {
3102 j++;
3103 if (*p == '\n' || *p == '\r') {
3104 i += j;
3105 j = 0;
3106 }
3107 }
3108
3109 /* Second pass: create output string and fill it */
3110 u = _PyUnicode_New(i + j);
3111 if (!u)
3112 return NULL;
3113
3114 j = 0;
3115 q = u->str;
3116
3117 for (p = self->str; p < e; p++)
3118 if (*p == '\t') {
3119 if (tabsize > 0) {
3120 i = tabsize - (j % tabsize);
3121 j += i;
3122 while (i--)
3123 *q++ = ' ';
3124 }
3125 }
3126 else {
3127 j++;
3128 *q++ = *p;
3129 if (*p == '\n' || *p == '\r')
3130 j = 0;
3131 }
3132
3133 return (PyObject*) u;
3134}
3135
3136static char find__doc__[] =
3137"S.find(sub [,start [,end]]) -> int\n\
3138\n\
3139Return the lowest index in S where substring sub is found,\n\
3140such that sub is contained within s[start,end]. Optional\n\
3141arguments start and end are interpreted as in slice notation.\n\
3142\n\
3143Return -1 on failure.";
3144
3145static PyObject *
3146unicode_find(PyUnicodeObject *self, PyObject *args)
3147{
3148 PyUnicodeObject *substring;
3149 int start = 0;
3150 int end = INT_MAX;
3151 PyObject *result;
3152
3153 if (!PyArg_ParseTuple(args, "O|ii:find", &substring, &start, &end))
3154 return NULL;
3155 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3156 (PyObject *)substring);
3157 if (substring == NULL)
3158 return NULL;
3159
3160 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3161
3162 Py_DECREF(substring);
3163 return result;
3164}
3165
3166static PyObject *
3167unicode_getitem(PyUnicodeObject *self, int index)
3168{
3169 if (index < 0 || index >= self->length) {
3170 PyErr_SetString(PyExc_IndexError, "string index out of range");
3171 return NULL;
3172 }
3173
3174 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3175}
3176
3177static long
3178unicode_hash(PyUnicodeObject *self)
3179{
3180 long hash;
3181 PyObject *utf8;
3182
3183 /* Since Unicode objects compare equal to their UTF-8 string
3184 counterparts, they should also use the UTF-8 strings as basis
3185 for their hash value. This is needed to assure that strings and
3186 Unicode objects behave in the same way as dictionary
3187 keys. Unfortunately, this costs some performance and also some
3188 memory if the cached UTF-8 representation is not used later
3189 on. */
3190 if (self->hash != -1)
3191 return self->hash;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00003192 utf8 = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003193 if (utf8 == NULL)
3194 return -1;
3195 hash = PyObject_Hash(utf8);
3196 if (hash == -1)
3197 return -1;
3198 self->hash = hash;
3199 return hash;
3200}
3201
3202static char index__doc__[] =
3203"S.index(sub [,start [,end]]) -> int\n\
3204\n\
3205Like S.find() but raise ValueError when the substring is not found.";
3206
3207static PyObject *
3208unicode_index(PyUnicodeObject *self, PyObject *args)
3209{
3210 int result;
3211 PyUnicodeObject *substring;
3212 int start = 0;
3213 int end = INT_MAX;
3214
3215 if (!PyArg_ParseTuple(args, "O|ii:index", &substring, &start, &end))
3216 return NULL;
3217
3218 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3219 (PyObject *)substring);
3220 if (substring == NULL)
3221 return NULL;
3222
3223 result = findstring(self, substring, start, end, 1);
3224
3225 Py_DECREF(substring);
3226 if (result < 0) {
3227 PyErr_SetString(PyExc_ValueError, "substring not found");
3228 return NULL;
3229 }
3230 return PyInt_FromLong(result);
3231}
3232
3233static char islower__doc__[] =
3234"S.islower() -> int\n\
3235\n\
3236Return 1 if all cased characters in S are lowercase and there is\n\
3237at least one cased character in S, 0 otherwise.";
3238
3239static PyObject*
3240unicode_islower(PyUnicodeObject *self, PyObject *args)
3241{
3242 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3243 register const Py_UNICODE *e;
3244 int cased;
3245
3246 if (!PyArg_NoArgs(args))
3247 return NULL;
3248
3249 /* Shortcut for single character strings */
3250 if (PyUnicode_GET_SIZE(self) == 1)
3251 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3252
3253 e = p + PyUnicode_GET_SIZE(self);
3254 cased = 0;
3255 for (; p < e; p++) {
3256 register const Py_UNICODE ch = *p;
3257
3258 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3259 return PyInt_FromLong(0);
3260 else if (!cased && Py_UNICODE_ISLOWER(ch))
3261 cased = 1;
3262 }
3263 return PyInt_FromLong(cased);
3264}
3265
3266static char isupper__doc__[] =
3267"S.isupper() -> int\n\
3268\n\
3269Return 1 if all cased characters in S are uppercase and there is\n\
3270at least one cased character in S, 0 otherwise.";
3271
3272static PyObject*
3273unicode_isupper(PyUnicodeObject *self, PyObject *args)
3274{
3275 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3276 register const Py_UNICODE *e;
3277 int cased;
3278
3279 if (!PyArg_NoArgs(args))
3280 return NULL;
3281
3282 /* Shortcut for single character strings */
3283 if (PyUnicode_GET_SIZE(self) == 1)
3284 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3285
3286 e = p + PyUnicode_GET_SIZE(self);
3287 cased = 0;
3288 for (; p < e; p++) {
3289 register const Py_UNICODE ch = *p;
3290
3291 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3292 return PyInt_FromLong(0);
3293 else if (!cased && Py_UNICODE_ISUPPER(ch))
3294 cased = 1;
3295 }
3296 return PyInt_FromLong(cased);
3297}
3298
3299static char istitle__doc__[] =
3300"S.istitle() -> int\n\
3301\n\
3302Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3303may only follow uncased characters and lowercase characters only cased\n\
3304ones. Return 0 otherwise.";
3305
3306static PyObject*
3307unicode_istitle(PyUnicodeObject *self, PyObject *args)
3308{
3309 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3310 register const Py_UNICODE *e;
3311 int cased, previous_is_cased;
3312
3313 if (!PyArg_NoArgs(args))
3314 return NULL;
3315
3316 /* Shortcut for single character strings */
3317 if (PyUnicode_GET_SIZE(self) == 1)
3318 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3319 (Py_UNICODE_ISUPPER(*p) != 0));
3320
3321 e = p + PyUnicode_GET_SIZE(self);
3322 cased = 0;
3323 previous_is_cased = 0;
3324 for (; p < e; p++) {
3325 register const Py_UNICODE ch = *p;
3326
3327 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3328 if (previous_is_cased)
3329 return PyInt_FromLong(0);
3330 previous_is_cased = 1;
3331 cased = 1;
3332 }
3333 else if (Py_UNICODE_ISLOWER(ch)) {
3334 if (!previous_is_cased)
3335 return PyInt_FromLong(0);
3336 previous_is_cased = 1;
3337 cased = 1;
3338 }
3339 else
3340 previous_is_cased = 0;
3341 }
3342 return PyInt_FromLong(cased);
3343}
3344
3345static char isspace__doc__[] =
3346"S.isspace() -> int\n\
3347\n\
3348Return 1 if there are only whitespace characters in S,\n\
33490 otherwise.";
3350
3351static PyObject*
3352unicode_isspace(PyUnicodeObject *self, PyObject *args)
3353{
3354 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3355 register const Py_UNICODE *e;
3356
3357 if (!PyArg_NoArgs(args))
3358 return NULL;
3359
3360 /* Shortcut for single character strings */
3361 if (PyUnicode_GET_SIZE(self) == 1 &&
3362 Py_UNICODE_ISSPACE(*p))
3363 return PyInt_FromLong(1);
3364
3365 e = p + PyUnicode_GET_SIZE(self);
3366 for (; p < e; p++) {
3367 if (!Py_UNICODE_ISSPACE(*p))
3368 return PyInt_FromLong(0);
3369 }
3370 return PyInt_FromLong(1);
3371}
3372
3373static char isdecimal__doc__[] =
3374"S.isdecimal() -> int\n\
3375\n\
3376Return 1 if there are only decimal characters in S,\n\
33770 otherwise.";
3378
3379static PyObject*
3380unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3381{
3382 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3383 register const Py_UNICODE *e;
3384
3385 if (!PyArg_NoArgs(args))
3386 return NULL;
3387
3388 /* Shortcut for single character strings */
3389 if (PyUnicode_GET_SIZE(self) == 1 &&
3390 Py_UNICODE_ISDECIMAL(*p))
3391 return PyInt_FromLong(1);
3392
3393 e = p + PyUnicode_GET_SIZE(self);
3394 for (; p < e; p++) {
3395 if (!Py_UNICODE_ISDECIMAL(*p))
3396 return PyInt_FromLong(0);
3397 }
3398 return PyInt_FromLong(1);
3399}
3400
3401static char isdigit__doc__[] =
3402"S.isdigit() -> int\n\
3403\n\
3404Return 1 if there are only digit characters in S,\n\
34050 otherwise.";
3406
3407static PyObject*
3408unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3409{
3410 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3411 register const Py_UNICODE *e;
3412
3413 if (!PyArg_NoArgs(args))
3414 return NULL;
3415
3416 /* Shortcut for single character strings */
3417 if (PyUnicode_GET_SIZE(self) == 1 &&
3418 Py_UNICODE_ISDIGIT(*p))
3419 return PyInt_FromLong(1);
3420
3421 e = p + PyUnicode_GET_SIZE(self);
3422 for (; p < e; p++) {
3423 if (!Py_UNICODE_ISDIGIT(*p))
3424 return PyInt_FromLong(0);
3425 }
3426 return PyInt_FromLong(1);
3427}
3428
3429static char isnumeric__doc__[] =
3430"S.isnumeric() -> int\n\
3431\n\
3432Return 1 if there are only numeric characters in S,\n\
34330 otherwise.";
3434
3435static PyObject*
3436unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3437{
3438 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3439 register const Py_UNICODE *e;
3440
3441 if (!PyArg_NoArgs(args))
3442 return NULL;
3443
3444 /* Shortcut for single character strings */
3445 if (PyUnicode_GET_SIZE(self) == 1 &&
3446 Py_UNICODE_ISNUMERIC(*p))
3447 return PyInt_FromLong(1);
3448
3449 e = p + PyUnicode_GET_SIZE(self);
3450 for (; p < e; p++) {
3451 if (!Py_UNICODE_ISNUMERIC(*p))
3452 return PyInt_FromLong(0);
3453 }
3454 return PyInt_FromLong(1);
3455}
3456
3457static char join__doc__[] =
3458"S.join(sequence) -> unicode\n\
3459\n\
3460Return a string which is the concatenation of the strings in the\n\
3461sequence. The separator between elements is S.";
3462
3463static PyObject*
3464unicode_join(PyUnicodeObject *self, PyObject *args)
3465{
3466 PyObject *data;
3467 if (!PyArg_ParseTuple(args, "O:join", &data))
3468 return NULL;
3469
3470 return PyUnicode_Join((PyObject *)self, data);
3471}
3472
3473static int
3474unicode_length(PyUnicodeObject *self)
3475{
3476 return self->length;
3477}
3478
3479static char ljust__doc__[] =
3480"S.ljust(width) -> unicode\n\
3481\n\
3482Return S left justified in a Unicode string of length width. Padding is\n\
3483done using spaces.";
3484
3485static PyObject *
3486unicode_ljust(PyUnicodeObject *self, PyObject *args)
3487{
3488 int width;
3489 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3490 return NULL;
3491
3492 if (self->length >= width) {
3493 Py_INCREF(self);
3494 return (PyObject*) self;
3495 }
3496
3497 return (PyObject*) pad(self, 0, width - self->length, ' ');
3498}
3499
3500static char lower__doc__[] =
3501"S.lower() -> unicode\n\
3502\n\
3503Return a copy of the string S converted to lowercase.";
3504
3505static PyObject*
3506unicode_lower(PyUnicodeObject *self, PyObject *args)
3507{
3508 if (!PyArg_NoArgs(args))
3509 return NULL;
3510 return fixup(self, fixlower);
3511}
3512
3513static char lstrip__doc__[] =
3514"S.lstrip() -> unicode\n\
3515\n\
3516Return a copy of the string S with leading whitespace removed.";
3517
3518static PyObject *
3519unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3520{
3521 if (!PyArg_NoArgs(args))
3522 return NULL;
3523 return strip(self, 1, 0);
3524}
3525
3526static PyObject*
3527unicode_repeat(PyUnicodeObject *str, int len)
3528{
3529 PyUnicodeObject *u;
3530 Py_UNICODE *p;
3531
3532 if (len < 0)
3533 len = 0;
3534
3535 if (len == 1) {
3536 /* no repeat, return original string */
3537 Py_INCREF(str);
3538 return (PyObject*) str;
3539 }
3540
3541 u = _PyUnicode_New(len * str->length);
3542 if (!u)
3543 return NULL;
3544
3545 p = u->str;
3546
3547 while (len-- > 0) {
3548 Py_UNICODE_COPY(p, str->str, str->length);
3549 p += str->length;
3550 }
3551
3552 return (PyObject*) u;
3553}
3554
3555PyObject *PyUnicode_Replace(PyObject *obj,
3556 PyObject *subobj,
3557 PyObject *replobj,
3558 int maxcount)
3559{
3560 PyObject *self;
3561 PyObject *str1;
3562 PyObject *str2;
3563 PyObject *result;
3564
3565 self = PyUnicode_FromObject(obj);
3566 if (self == NULL)
3567 return NULL;
3568 str1 = PyUnicode_FromObject(subobj);
3569 if (str1 == NULL) {
3570 Py_DECREF(self);
3571 return NULL;
3572 }
3573 str2 = PyUnicode_FromObject(replobj);
3574 if (str2 == NULL) {
3575 Py_DECREF(self);
3576 Py_DECREF(str1);
3577 return NULL;
3578 }
3579 result = replace((PyUnicodeObject *)self,
3580 (PyUnicodeObject *)str1,
3581 (PyUnicodeObject *)str2,
3582 maxcount);
3583 Py_DECREF(self);
3584 Py_DECREF(str1);
3585 Py_DECREF(str2);
3586 return result;
3587}
3588
3589static char replace__doc__[] =
3590"S.replace (old, new[, maxsplit]) -> unicode\n\
3591\n\
3592Return a copy of S with all occurrences of substring\n\
3593old replaced by new. If the optional argument maxsplit is\n\
3594given, only the first maxsplit occurrences are replaced.";
3595
3596static PyObject*
3597unicode_replace(PyUnicodeObject *self, PyObject *args)
3598{
3599 PyUnicodeObject *str1;
3600 PyUnicodeObject *str2;
3601 int maxcount = -1;
3602 PyObject *result;
3603
3604 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
3605 return NULL;
3606 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
3607 if (str1 == NULL)
3608 return NULL;
3609 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
3610 if (str2 == NULL)
3611 return NULL;
3612
3613 result = replace(self, str1, str2, maxcount);
3614
3615 Py_DECREF(str1);
3616 Py_DECREF(str2);
3617 return result;
3618}
3619
3620static
3621PyObject *unicode_repr(PyObject *unicode)
3622{
3623 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
3624 PyUnicode_GET_SIZE(unicode),
3625 1);
3626}
3627
3628static char rfind__doc__[] =
3629"S.rfind(sub [,start [,end]]) -> int\n\
3630\n\
3631Return the highest index in S where substring sub is found,\n\
3632such that sub is contained within s[start,end]. Optional\n\
3633arguments start and end are interpreted as in slice notation.\n\
3634\n\
3635Return -1 on failure.";
3636
3637static PyObject *
3638unicode_rfind(PyUnicodeObject *self, PyObject *args)
3639{
3640 PyUnicodeObject *substring;
3641 int start = 0;
3642 int end = INT_MAX;
3643 PyObject *result;
3644
3645 if (!PyArg_ParseTuple(args, "O|ii:rfind", &substring, &start, &end))
3646 return NULL;
3647 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3648 (PyObject *)substring);
3649 if (substring == NULL)
3650 return NULL;
3651
3652 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
3653
3654 Py_DECREF(substring);
3655 return result;
3656}
3657
3658static char rindex__doc__[] =
3659"S.rindex(sub [,start [,end]]) -> int\n\
3660\n\
3661Like S.rfind() but raise ValueError when the substring is not found.";
3662
3663static PyObject *
3664unicode_rindex(PyUnicodeObject *self, PyObject *args)
3665{
3666 int result;
3667 PyUnicodeObject *substring;
3668 int start = 0;
3669 int end = INT_MAX;
3670
3671 if (!PyArg_ParseTuple(args, "O|ii:rindex", &substring, &start, &end))
3672 return NULL;
3673 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3674 (PyObject *)substring);
3675 if (substring == NULL)
3676 return NULL;
3677
3678 result = findstring(self, substring, start, end, -1);
3679
3680 Py_DECREF(substring);
3681 if (result < 0) {
3682 PyErr_SetString(PyExc_ValueError, "substring not found");
3683 return NULL;
3684 }
3685 return PyInt_FromLong(result);
3686}
3687
3688static char rjust__doc__[] =
3689"S.rjust(width) -> unicode\n\
3690\n\
3691Return S right justified in a Unicode string of length width. Padding is\n\
3692done using spaces.";
3693
3694static PyObject *
3695unicode_rjust(PyUnicodeObject *self, PyObject *args)
3696{
3697 int width;
3698 if (!PyArg_ParseTuple(args, "i:rjust", &width))
3699 return NULL;
3700
3701 if (self->length >= width) {
3702 Py_INCREF(self);
3703 return (PyObject*) self;
3704 }
3705
3706 return (PyObject*) pad(self, width - self->length, 0, ' ');
3707}
3708
3709static char rstrip__doc__[] =
3710"S.rstrip() -> unicode\n\
3711\n\
3712Return a copy of the string S with trailing whitespace removed.";
3713
3714static PyObject *
3715unicode_rstrip(PyUnicodeObject *self, PyObject *args)
3716{
3717 if (!PyArg_NoArgs(args))
3718 return NULL;
3719 return strip(self, 0, 1);
3720}
3721
3722static PyObject*
3723unicode_slice(PyUnicodeObject *self, int start, int end)
3724{
3725 /* standard clamping */
3726 if (start < 0)
3727 start = 0;
3728 if (end < 0)
3729 end = 0;
3730 if (end > self->length)
3731 end = self->length;
3732 if (start == 0 && end == self->length) {
3733 /* full slice, return original string */
3734 Py_INCREF(self);
3735 return (PyObject*) self;
3736 }
3737 if (start > end)
3738 start = end;
3739 /* copy slice */
3740 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
3741 end - start);
3742}
3743
3744PyObject *PyUnicode_Split(PyObject *s,
3745 PyObject *sep,
3746 int maxsplit)
3747{
3748 PyObject *result;
3749
3750 s = PyUnicode_FromObject(s);
3751 if (s == NULL)
3752 return NULL;
3753 if (sep != NULL) {
3754 sep = PyUnicode_FromObject(sep);
3755 if (sep == NULL) {
3756 Py_DECREF(s);
3757 return NULL;
3758 }
3759 }
3760
3761 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
3762
3763 Py_DECREF(s);
3764 Py_XDECREF(sep);
3765 return result;
3766}
3767
3768static char split__doc__[] =
3769"S.split([sep [,maxsplit]]) -> list of strings\n\
3770\n\
3771Return a list of the words in S, using sep as the\n\
3772delimiter string. If maxsplit is given, at most maxsplit\n\
3773splits are done. If sep is not specified, any whitespace string\n\
3774is a separator.";
3775
3776static PyObject*
3777unicode_split(PyUnicodeObject *self, PyObject *args)
3778{
3779 PyObject *substring = Py_None;
3780 int maxcount = -1;
3781
3782 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
3783 return NULL;
3784
3785 if (substring == Py_None)
3786 return split(self, NULL, maxcount);
3787 else if (PyUnicode_Check(substring))
3788 return split(self, (PyUnicodeObject *)substring, maxcount);
3789 else
3790 return PyUnicode_Split((PyObject *)self, substring, maxcount);
3791}
3792
3793static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00003794"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00003795\n\
3796Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00003797Line breaks are not included in the resulting list unless keepends\n\
3798is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799
3800static PyObject*
3801unicode_splitlines(PyUnicodeObject *self, PyObject *args)
3802{
Guido van Rossum86662912000-04-11 15:38:46 +00003803 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804
Guido van Rossum86662912000-04-11 15:38:46 +00003805 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003806 return NULL;
3807
Guido van Rossum86662912000-04-11 15:38:46 +00003808 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809}
3810
3811static
3812PyObject *unicode_str(PyUnicodeObject *self)
3813{
3814 return PyUnicode_AsUTF8String((PyObject *)self);
3815}
3816
3817static char strip__doc__[] =
3818"S.strip() -> unicode\n\
3819\n\
3820Return a copy of S with leading and trailing whitespace removed.";
3821
3822static PyObject *
3823unicode_strip(PyUnicodeObject *self, PyObject *args)
3824{
3825 if (!PyArg_NoArgs(args))
3826 return NULL;
3827 return strip(self, 1, 1);
3828}
3829
3830static char swapcase__doc__[] =
3831"S.swapcase() -> unicode\n\
3832\n\
3833Return a copy of S with uppercase characters converted to lowercase\n\
3834and vice versa.";
3835
3836static PyObject*
3837unicode_swapcase(PyUnicodeObject *self, PyObject *args)
3838{
3839 if (!PyArg_NoArgs(args))
3840 return NULL;
3841 return fixup(self, fixswapcase);
3842}
3843
3844static char translate__doc__[] =
3845"S.translate(table) -> unicode\n\
3846\n\
3847Return a copy of the string S, where all characters have been mapped\n\
3848through the given translation table, which must be a mapping of\n\
3849Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
3850are left untouched. Characters mapped to None are deleted.";
3851
3852static PyObject*
3853unicode_translate(PyUnicodeObject *self, PyObject *args)
3854{
3855 PyObject *table;
3856
3857 if (!PyArg_ParseTuple(args, "O:translate", &table))
3858 return NULL;
3859 return PyUnicode_TranslateCharmap(self->str,
3860 self->length,
3861 table,
3862 "ignore");
3863}
3864
3865static char upper__doc__[] =
3866"S.upper() -> unicode\n\
3867\n\
3868Return a copy of S converted to uppercase.";
3869
3870static PyObject*
3871unicode_upper(PyUnicodeObject *self, PyObject *args)
3872{
3873 if (!PyArg_NoArgs(args))
3874 return NULL;
3875 return fixup(self, fixupper);
3876}
3877
#if 0
static char zfill__doc__[] =
    "S.zfill(width) -> unicode\n"
    "\n"
    "Pad a numeric string x with zeros on the left, to fill a field\n"
    "of the specified width. The string x is never truncated.";

/* Method S.zfill(width) -- currently compiled out.

   When the string is already at least `width` long, `self` is returned
   with a new reference.  Otherwise the string is left-padded with '0'
   via pad(), and a leading '+' or '-' of the original text is moved in
   front of the padding.  Returns NULL if `args` fails to parse or if
   pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* The result is indexed below, so a failed pad() must be caught
       here rather than dereferenced. */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
3913
#if 0
/* Debugging aid, currently compiled out: returns the value of
   unicode_freelist_size as a Python int.  Accepts no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    return PyArg_NoArgs(args) ? PyInt_FromLong(unicode_freelist_size)
                              : NULL;
}
#endif
3923
3924static char startswith__doc__[] =
3925"S.startswith(prefix[, start[, end]]) -> int\n\
3926\n\
3927Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
3928optional start, test S beginning at that position. With optional end, stop\n\
3929comparing S at that position.";
3930
3931static PyObject *
3932unicode_startswith(PyUnicodeObject *self,
3933 PyObject *args)
3934{
3935 PyUnicodeObject *substring;
3936 int start = 0;
3937 int end = INT_MAX;
3938 PyObject *result;
3939
3940 if (!PyArg_ParseTuple(args, "O|ii:startswith", &substring, &start, &end))
3941 return NULL;
3942 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3943 (PyObject *)substring);
3944 if (substring == NULL)
3945 return NULL;
3946
3947 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
3948
3949 Py_DECREF(substring);
3950 return result;
3951}
3952
3953
3954static char endswith__doc__[] =
3955"S.endswith(suffix[, start[, end]]) -> int\n\
3956\n\
3957Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
3958optional start, test S beginning at that position. With optional end, stop\n\
3959comparing S at that position.";
3960
3961static PyObject *
3962unicode_endswith(PyUnicodeObject *self,
3963 PyObject *args)
3964{
3965 PyUnicodeObject *substring;
3966 int start = 0;
3967 int end = INT_MAX;
3968 PyObject *result;
3969
3970 if (!PyArg_ParseTuple(args, "O|ii:endswith", &substring, &start, &end))
3971 return NULL;
3972 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3973 (PyObject *)substring);
3974 if (substring == NULL)
3975 return NULL;
3976
3977 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
3978
3979 Py_DECREF(substring);
3980 return result;
3981}
3982
3983
3984static PyMethodDef unicode_methods[] = {
3985
3986 /* Order is according to common usage: often used methods should
3987 appear first, since lookup is done sequentially. */
3988
3989 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
3990 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
3991 {"split", (PyCFunction) unicode_split, 1, split__doc__},
3992 {"join", (PyCFunction) unicode_join, 1, join__doc__},
3993 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
3994 {"title", (PyCFunction) unicode_title, 0, title__doc__},
3995 {"center", (PyCFunction) unicode_center, 1, center__doc__},
3996 {"count", (PyCFunction) unicode_count, 1, count__doc__},
3997 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
3998 {"find", (PyCFunction) unicode_find, 1, find__doc__},
3999 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4000 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4001 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4002 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4003/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4004 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4005 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4006 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4007 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4008 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4009 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4010 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4011 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4012 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4013 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4014 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4015 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4016 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4017 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4018 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4019 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4020 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4021 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
4022#if 0
4023 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4024 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4025#endif
4026
4027#if 0
4028 /* This one is just used for debugging the implementation. */
4029 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4030#endif
4031
4032 {NULL, NULL}
4033};
4034
4035static PyObject *
4036unicode_getattr(PyUnicodeObject *self, char *name)
4037{
4038 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4039}
4040
4041static PySequenceMethods unicode_as_sequence = {
4042 (inquiry) unicode_length, /* sq_length */
4043 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4044 (intargfunc) unicode_repeat, /* sq_repeat */
4045 (intargfunc) unicode_getitem, /* sq_item */
4046 (intintargfunc) unicode_slice, /* sq_slice */
4047 0, /* sq_ass_item */
4048 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004049 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004050};
4051
4052static int
4053unicode_buffer_getreadbuf(PyUnicodeObject *self,
4054 int index,
4055 const void **ptr)
4056{
4057 if (index != 0) {
4058 PyErr_SetString(PyExc_SystemError,
4059 "accessing non-existent unicode segment");
4060 return -1;
4061 }
4062 *ptr = (void *) self->str;
4063 return PyUnicode_GET_DATA_SIZE(self);
4064}
4065
4066static int
4067unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4068 const void **ptr)
4069{
4070 PyErr_SetString(PyExc_TypeError,
4071 "cannot use unicode as modifyable buffer");
4072 return -1;
4073}
4074
4075static int
4076unicode_buffer_getsegcount(PyUnicodeObject *self,
4077 int *lenp)
4078{
4079 if (lenp)
4080 *lenp = PyUnicode_GET_DATA_SIZE(self);
4081 return 1;
4082}
4083
4084static int
4085unicode_buffer_getcharbuf(PyUnicodeObject *self,
4086 int index,
4087 const void **ptr)
4088{
4089 PyObject *str;
4090
4091 if (index != 0) {
4092 PyErr_SetString(PyExc_SystemError,
4093 "accessing non-existent unicode segment");
4094 return -1;
4095 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +00004096 str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004097 if (str == NULL)
4098 return -1;
4099 *ptr = (void *) PyString_AS_STRING(str);
4100 return PyString_GET_SIZE(str);
4101}
4102
4103/* Helpers for PyUnicode_Format() */
4104
4105static PyObject *
4106getnextarg(args, arglen, p_argidx)
4107 PyObject *args;
4108int arglen;
4109int *p_argidx;
4110{
4111 int argidx = *p_argidx;
4112 if (argidx < arglen) {
4113 (*p_argidx)++;
4114 if (arglen < 0)
4115 return args;
4116 else
4117 return PyTuple_GetItem(args, argidx);
4118 }
4119 PyErr_SetString(PyExc_TypeError,
4120 "not enough arguments for format string");
4121 return NULL;
4122}
4123
/* Bit flags collected while parsing a format specifier; each is set
   by the flag character shown. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
4129
4130static
4131#ifdef HAVE_STDARG_PROTOTYPES
4132int usprintf(register Py_UNICODE *buffer, char *format, ...)
4133#else
4134int usprintf(va_alist) va_dcl
4135#endif
4136{
4137 register int i;
4138 int len;
4139 va_list va;
4140 char *charbuffer;
4141#ifdef HAVE_STDARG_PROTOTYPES
4142 va_start(va, format);
4143#else
4144 Py_UNICODE *args;
4145 char *format;
4146
4147 va_start(va);
4148 buffer = va_arg(va, Py_UNICODE *);
4149 format = va_arg(va, char *);
4150#endif
4151
4152 /* First, format the string as char array, then expand to Py_UNICODE
4153 array. */
4154 charbuffer = (char *)buffer;
4155 len = vsprintf(charbuffer, format, va);
4156 for (i = len - 1; i >= 0; i--)
4157 buffer[i] = (Py_UNICODE) charbuffer[i];
4158
4159 va_end(va);
4160 return len;
4161}
4162
4163static int
4164formatfloat(Py_UNICODE *buf,
4165 int flags,
4166 int prec,
4167 int type,
4168 PyObject *v)
4169{
4170 char fmt[20];
4171 double x;
4172
4173 x = PyFloat_AsDouble(v);
4174 if (x == -1.0 && PyErr_Occurred())
4175 return -1;
4176 if (prec < 0)
4177 prec = 6;
4178 if (prec > 50)
4179 prec = 50; /* Arbitrary limitation */
4180 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4181 type = 'g';
4182 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4183 return usprintf(buf, fmt, x);
4184}
4185
4186static int
4187formatint(Py_UNICODE *buf,
4188 int flags,
4189 int prec,
4190 int type,
4191 PyObject *v)
4192{
4193 char fmt[20];
4194 long x;
4195
4196 x = PyInt_AsLong(v);
4197 if (x == -1 && PyErr_Occurred())
4198 return -1;
4199 if (prec < 0)
4200 prec = 1;
4201 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4202 return usprintf(buf, fmt, x);
4203}
4204
4205static int
4206formatchar(Py_UNICODE *buf,
4207 PyObject *v)
4208{
4209 if (PyUnicode_Check(v))
4210 buf[0] = PyUnicode_AS_UNICODE(v)[0];
4211
4212 else if (PyString_Check(v))
4213 buf[0] = (Py_UNICODE) PyString_AS_STRING(v)[0];
4214
4215 else {
4216 /* Integer input truncated to a character */
4217 long x;
4218 x = PyInt_AsLong(v);
4219 if (x == -1 && PyErr_Occurred())
4220 return -1;
4221 buf[0] = (char) x;
4222 }
4223 buf[1] = '\0';
4224 return 1;
4225}
4226
4227PyObject *PyUnicode_Format(PyObject *format,
4228 PyObject *args)
4229{
4230 Py_UNICODE *fmt, *res;
4231 int fmtcnt, rescnt, reslen, arglen, argidx;
4232 int args_owned = 0;
4233 PyUnicodeObject *result = NULL;
4234 PyObject *dict = NULL;
4235 PyObject *uformat;
4236
4237 if (format == NULL || args == NULL) {
4238 PyErr_BadInternalCall();
4239 return NULL;
4240 }
4241 uformat = PyUnicode_FromObject(format);
4242 fmt = PyUnicode_AS_UNICODE(uformat);
4243 fmtcnt = PyUnicode_GET_SIZE(uformat);
4244
4245 reslen = rescnt = fmtcnt + 100;
4246 result = _PyUnicode_New(reslen);
4247 if (result == NULL)
4248 goto onError;
4249 res = PyUnicode_AS_UNICODE(result);
4250
4251 if (PyTuple_Check(args)) {
4252 arglen = PyTuple_Size(args);
4253 argidx = 0;
4254 }
4255 else {
4256 arglen = -1;
4257 argidx = -2;
4258 }
4259 if (args->ob_type->tp_as_mapping)
4260 dict = args;
4261
4262 while (--fmtcnt >= 0) {
4263 if (*fmt != '%') {
4264 if (--rescnt < 0) {
4265 rescnt = fmtcnt + 100;
4266 reslen += rescnt;
4267 if (_PyUnicode_Resize(result, reslen) < 0)
4268 return NULL;
4269 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4270 --rescnt;
4271 }
4272 *res++ = *fmt++;
4273 }
4274 else {
4275 /* Got a format specifier */
4276 int flags = 0;
4277 int width = -1;
4278 int prec = -1;
4279 int size = 0;
4280 Py_UNICODE c = '\0';
4281 Py_UNICODE fill;
4282 PyObject *v = NULL;
4283 PyObject *temp = NULL;
4284 Py_UNICODE *buf;
4285 Py_UNICODE sign;
4286 int len;
4287 Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
4288
4289 fmt++;
4290 if (*fmt == '(') {
4291 Py_UNICODE *keystart;
4292 int keylen;
4293 PyObject *key;
4294 int pcount = 1;
4295
4296 if (dict == NULL) {
4297 PyErr_SetString(PyExc_TypeError,
4298 "format requires a mapping");
4299 goto onError;
4300 }
4301 ++fmt;
4302 --fmtcnt;
4303 keystart = fmt;
4304 /* Skip over balanced parentheses */
4305 while (pcount > 0 && --fmtcnt >= 0) {
4306 if (*fmt == ')')
4307 --pcount;
4308 else if (*fmt == '(')
4309 ++pcount;
4310 fmt++;
4311 }
4312 keylen = fmt - keystart - 1;
4313 if (fmtcnt < 0 || pcount > 0) {
4314 PyErr_SetString(PyExc_ValueError,
4315 "incomplete format key");
4316 goto onError;
4317 }
4318 /* keys are converted to strings (using UTF-8) and
4319 then looked up since Python uses strings to hold
4320 variables names etc. in its namespaces and we
4321 wouldn't want to break common idioms. The
4322 alternative would be using Unicode objects for the
4323 lookup but u"abc" and "abc" have different hash
4324 values (on purpose). */
4325 key = PyUnicode_EncodeUTF8(keystart,
4326 keylen,
4327 NULL);
4328 if (key == NULL)
4329 goto onError;
4330 if (args_owned) {
4331 Py_DECREF(args);
4332 args_owned = 0;
4333 }
4334 args = PyObject_GetItem(dict, key);
4335 Py_DECREF(key);
4336 if (args == NULL) {
4337 goto onError;
4338 }
4339 args_owned = 1;
4340 arglen = -1;
4341 argidx = -2;
4342 }
4343 while (--fmtcnt >= 0) {
4344 switch (c = *fmt++) {
4345 case '-': flags |= F_LJUST; continue;
4346 case '+': flags |= F_SIGN; continue;
4347 case ' ': flags |= F_BLANK; continue;
4348 case '#': flags |= F_ALT; continue;
4349 case '0': flags |= F_ZERO; continue;
4350 }
4351 break;
4352 }
4353 if (c == '*') {
4354 v = getnextarg(args, arglen, &argidx);
4355 if (v == NULL)
4356 goto onError;
4357 if (!PyInt_Check(v)) {
4358 PyErr_SetString(PyExc_TypeError,
4359 "* wants int");
4360 goto onError;
4361 }
4362 width = PyInt_AsLong(v);
4363 if (width < 0) {
4364 flags |= F_LJUST;
4365 width = -width;
4366 }
4367 if (--fmtcnt >= 0)
4368 c = *fmt++;
4369 }
4370 else if (c >= '0' && c <= '9') {
4371 width = c - '0';
4372 while (--fmtcnt >= 0) {
4373 c = *fmt++;
4374 if (c < '0' || c > '9')
4375 break;
4376 if ((width*10) / 10 != width) {
4377 PyErr_SetString(PyExc_ValueError,
4378 "width too big");
4379 goto onError;
4380 }
4381 width = width*10 + (c - '0');
4382 }
4383 }
4384 if (c == '.') {
4385 prec = 0;
4386 if (--fmtcnt >= 0)
4387 c = *fmt++;
4388 if (c == '*') {
4389 v = getnextarg(args, arglen, &argidx);
4390 if (v == NULL)
4391 goto onError;
4392 if (!PyInt_Check(v)) {
4393 PyErr_SetString(PyExc_TypeError,
4394 "* wants int");
4395 goto onError;
4396 }
4397 prec = PyInt_AsLong(v);
4398 if (prec < 0)
4399 prec = 0;
4400 if (--fmtcnt >= 0)
4401 c = *fmt++;
4402 }
4403 else if (c >= '0' && c <= '9') {
4404 prec = c - '0';
4405 while (--fmtcnt >= 0) {
4406 c = Py_CHARMASK(*fmt++);
4407 if (c < '0' || c > '9')
4408 break;
4409 if ((prec*10) / 10 != prec) {
4410 PyErr_SetString(PyExc_ValueError,
4411 "prec too big");
4412 goto onError;
4413 }
4414 prec = prec*10 + (c - '0');
4415 }
4416 }
4417 } /* prec */
4418 if (fmtcnt >= 0) {
4419 if (c == 'h' || c == 'l' || c == 'L') {
4420 size = c;
4421 if (--fmtcnt >= 0)
4422 c = *fmt++;
4423 }
4424 }
4425 if (fmtcnt < 0) {
4426 PyErr_SetString(PyExc_ValueError,
4427 "incomplete format");
4428 goto onError;
4429 }
4430 if (c != '%') {
4431 v = getnextarg(args, arglen, &argidx);
4432 if (v == NULL)
4433 goto onError;
4434 }
4435 sign = 0;
4436 fill = ' ';
4437 switch (c) {
4438
4439 case '%':
4440 buf = tmpbuf;
4441 buf[0] = '%';
4442 len = 1;
4443 break;
4444
4445 case 's':
4446 case 'r':
4447 if (PyUnicode_Check(v) && c == 's') {
4448 temp = v;
4449 Py_INCREF(temp);
4450 }
4451 else {
4452 PyObject *unicode;
4453 if (c == 's')
4454 temp = PyObject_Str(v);
4455 else
4456 temp = PyObject_Repr(v);
4457 if (temp == NULL)
4458 goto onError;
4459 if (!PyString_Check(temp)) {
4460 /* XXX Note: this should never happen, since
4461 PyObject_Repr() and PyObject_Str() assure
4462 this */
4463 Py_DECREF(temp);
4464 PyErr_SetString(PyExc_TypeError,
4465 "%s argument has non-string str()");
4466 goto onError;
4467 }
4468 unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
4469 PyString_GET_SIZE(temp),
4470 "strict");
4471 Py_DECREF(temp);
4472 temp = unicode;
4473 if (temp == NULL)
4474 goto onError;
4475 }
4476 buf = PyUnicode_AS_UNICODE(temp);
4477 len = PyUnicode_GET_SIZE(temp);
4478 if (prec >= 0 && len > prec)
4479 len = prec;
4480 break;
4481
4482 case 'i':
4483 case 'd':
4484 case 'u':
4485 case 'o':
4486 case 'x':
4487 case 'X':
4488 if (c == 'i')
4489 c = 'd';
4490 buf = tmpbuf;
4491 len = formatint(buf, flags, prec, c, v);
4492 if (len < 0)
4493 goto onError;
4494 sign = (c == 'd');
4495 if (flags & F_ZERO) {
4496 fill = '0';
4497 if ((flags&F_ALT) &&
4498 (c == 'x' || c == 'X') &&
4499 buf[0] == '0' && buf[1] == c) {
4500 *res++ = *buf++;
4501 *res++ = *buf++;
4502 rescnt -= 2;
4503 len -= 2;
4504 width -= 2;
4505 if (width < 0)
4506 width = 0;
4507 }
4508 }
4509 break;
4510
4511 case 'e':
4512 case 'E':
4513 case 'f':
4514 case 'g':
4515 case 'G':
4516 buf = tmpbuf;
4517 len = formatfloat(buf, flags, prec, c, v);
4518 if (len < 0)
4519 goto onError;
4520 sign = 1;
4521 if (flags&F_ZERO)
4522 fill = '0';
4523 break;
4524
4525 case 'c':
4526 buf = tmpbuf;
4527 len = formatchar(buf, v);
4528 if (len < 0)
4529 goto onError;
4530 break;
4531
4532 default:
4533 PyErr_Format(PyExc_ValueError,
4534 "unsupported format character '%c' (0x%x)",
4535 c, c);
4536 goto onError;
4537 }
4538 if (sign) {
4539 if (*buf == '-' || *buf == '+') {
4540 sign = *buf++;
4541 len--;
4542 }
4543 else if (flags & F_SIGN)
4544 sign = '+';
4545 else if (flags & F_BLANK)
4546 sign = ' ';
4547 else
4548 sign = 0;
4549 }
4550 if (width < len)
4551 width = len;
4552 if (rescnt < width + (sign != 0)) {
4553 reslen -= rescnt;
4554 rescnt = width + fmtcnt + 100;
4555 reslen += rescnt;
4556 if (_PyUnicode_Resize(result, reslen) < 0)
4557 return NULL;
4558 res = PyUnicode_AS_UNICODE(result)
4559 + reslen - rescnt;
4560 }
4561 if (sign) {
4562 if (fill != ' ')
4563 *res++ = sign;
4564 rescnt--;
4565 if (width > len)
4566 width--;
4567 }
4568 if (width > len && !(flags & F_LJUST)) {
4569 do {
4570 --rescnt;
4571 *res++ = fill;
4572 } while (--width > len);
4573 }
4574 if (sign && fill == ' ')
4575 *res++ = sign;
4576 memcpy(res, buf, len * sizeof(Py_UNICODE));
4577 res += len;
4578 rescnt -= len;
4579 while (--width >= len) {
4580 --rescnt;
4581 *res++ = ' ';
4582 }
4583 if (dict && (argidx < arglen) && c != '%') {
4584 PyErr_SetString(PyExc_TypeError,
4585 "not all arguments converted");
4586 goto onError;
4587 }
4588 Py_XDECREF(temp);
4589 } /* '%' */
4590 } /* until end */
4591 if (argidx < arglen && !dict) {
4592 PyErr_SetString(PyExc_TypeError,
4593 "not all arguments converted");
4594 goto onError;
4595 }
4596
4597 if (args_owned) {
4598 Py_DECREF(args);
4599 }
4600 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004601 if (_PyUnicode_Resize(result, reslen - rescnt))
4602 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004603 return (PyObject *)result;
4604
4605 onError:
4606 Py_XDECREF(result);
4607 Py_DECREF(uformat);
4608 if (args_owned) {
4609 Py_DECREF(args);
4610 }
4611 return NULL;
4612}
4613
4614static PyBufferProcs unicode_as_buffer = {
4615 (getreadbufferproc) unicode_buffer_getreadbuf,
4616 (getwritebufferproc) unicode_buffer_getwritebuf,
4617 (getsegcountproc) unicode_buffer_getsegcount,
4618 (getcharbufferproc) unicode_buffer_getcharbuf,
4619};
4620
4621PyTypeObject PyUnicode_Type = {
4622 PyObject_HEAD_INIT(&PyType_Type)
4623 0, /* ob_size */
4624 "unicode", /* tp_name */
4625 sizeof(PyUnicodeObject), /* tp_size */
4626 0, /* tp_itemsize */
4627 /* Slots */
4628 (destructor)_PyUnicode_Free, /* tp_dealloc */
4629 0, /* tp_print */
4630 (getattrfunc)unicode_getattr, /* tp_getattr */
4631 0, /* tp_setattr */
4632 (cmpfunc) unicode_compare, /* tp_compare */
4633 (reprfunc) unicode_repr, /* tp_repr */
4634 0, /* tp_as_number */
4635 &unicode_as_sequence, /* tp_as_sequence */
4636 0, /* tp_as_mapping */
4637 (hashfunc) unicode_hash, /* tp_hash*/
4638 0, /* tp_call*/
4639 (reprfunc) unicode_str, /* tp_str */
4640 (getattrofunc) NULL, /* tp_getattro */
4641 (setattrofunc) NULL, /* tp_setattro */
4642 &unicode_as_buffer, /* tp_as_buffer */
4643 Py_TPFLAGS_DEFAULT, /* tp_flags */
4644};
4645
4646/* Initialize the Unicode implementation */
4647
4648void _PyUnicode_Init()
4649{
4650 /* Doublecheck the configuration... */
4651 if (sizeof(Py_UNICODE) != 2)
4652 Py_FatalError("Unicode configuration error: "
4653 "sizeof(Py_UNICODE) != 2 bytes");
4654
4655 unicode_empty = _PyUnicode_New(0);
4656}
4657
4658/* Finalize the Unicode implementation */
4659
4660void
4661_PyUnicode_Fini()
4662{
4663 PyUnicodeObject *u = unicode_freelist;
4664
4665 while (u != NULL) {
4666 PyUnicodeObject *v = u;
4667 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004668 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00004669 PyMem_DEL(v->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004670 Py_XDECREF(v->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +00004671 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004672 }
4673 Py_XDECREF(unicode_empty);
4674}