blob: 6cb1ea83ffc54eff99f6d07461a1465a7b4c4393 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "mymath.h"
68#include "unicodeobject.h"
69
70#if defined(HAVE_LIMITS_H)
71#include <limits.h>
72#else
73#define INT_MAX 2147483647
74#endif
75
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000076#ifdef MS_WIN32
77#include <windows.h>
78#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000079
/* Upper bound on the number of Unicode objects parked on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Keep-alive threshold for objects sitting on the free list.

   A Unicode object whose length is below this limit keeps its
   character buffer while it is on the free list, which saves malloc()
   overhead for small Unicode objects.

   In the worst case this leaves MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the feature off.

   Note: this is an experimental feature.  If Unicode objects cause
   core dumps, turn it off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is set */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
110
111/* --- Globals ------------------------------------------------------------ */
112
113/* The empty Unicode object */
114static PyUnicodeObject *unicode_empty = NULL;
115
116/* Free list for Unicode objects */
117static PyUnicodeObject *unicode_freelist = NULL;
118static int unicode_freelist_size = 0;
119
120/* --- Unicode Object ----------------------------------------------------- */
121
122static
123int _PyUnicode_Resize(register PyUnicodeObject *unicode,
124 int length)
125{
126 void *oldstr;
127
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000130 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000131
132 /* Resizing unicode_empty is not allowed. */
133 if (unicode == unicode_empty) {
134 PyErr_SetString(PyExc_SystemError,
135 "can't resize empty unicode object");
136 return -1;
137 }
138
139 /* We allocate one more byte to make sure the string is
140 Ux0000 terminated -- XXX is this needed ? */
141 oldstr = unicode->str;
142 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
143 if (!unicode->str) {
144 unicode->str = oldstr;
145 PyErr_NoMemory();
146 return -1;
147 }
148 unicode->str[length] = 0;
149 unicode->length = length;
150
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000151 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000152 /* Reset the object caches */
153 if (unicode->utf8str) {
154 Py_DECREF(unicode->utf8str);
155 unicode->utf8str = NULL;
156 }
157 unicode->hash = -1;
158
159 return 0;
160}
161
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000162int PyUnicode_Resize(PyObject **unicode,
163 int length)
164{
165 PyUnicodeObject *v;
166
167 if (unicode == NULL) {
168 PyErr_BadInternalCall();
169 return -1;
170 }
171 v = (PyUnicodeObject *)*unicode;
172 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
173 PyErr_BadInternalCall();
174 return -1;
175 }
176 return _PyUnicode_Resize(v, length);
177}
178
Guido van Rossumd57fd912000-03-10 22:53:23 +0000179/* We allocate one more byte to make sure the string is
180 Ux0000 terminated -- XXX is this needed ?
181
182 XXX This allocator could further be enhanced by assuring that the
183 free list never reduces its size below 1.
184
185*/
186
187static
188PyUnicodeObject *_PyUnicode_New(int length)
189{
190 register PyUnicodeObject *unicode;
191
192 /* Optimization for empty strings */
193 if (length == 0 && unicode_empty != NULL) {
194 Py_INCREF(unicode_empty);
195 return unicode_empty;
196 }
197
198 /* Unicode freelist & memory allocation */
199 if (unicode_freelist) {
200 unicode = unicode_freelist;
201 unicode_freelist = *(PyUnicodeObject **)unicode_freelist;
202 unicode_freelist_size--;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000203 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000205 /* Keep-Alive optimization: we only upsize the buffer,
206 never downsize it. */
207 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000209 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 }
212 }
213 else
214 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
215 }
216 else {
217 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
218 if (unicode == NULL)
219 return NULL;
220 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
221 }
222
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000223 if (!unicode->str) {
224 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000225 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
230 unicode->utf8str = NULL;
231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000235 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
240void _PyUnicode_Free(register PyUnicodeObject *unicode)
241{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000243 /* Keep-Alive optimization */
244 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000245 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 unicode->str = NULL;
247 unicode->length = 0;
248 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000249 if (unicode->utf8str) {
250 Py_DECREF(unicode->utf8str);
251 unicode->utf8str = NULL;
252 }
253 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 *(PyUnicodeObject **)unicode = unicode_freelist;
255 unicode_freelist = unicode;
256 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 }
258 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000259 PyMem_DEL(unicode->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000260 Py_XDECREF(unicode->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000261 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 }
263}
264
265PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
266 int size)
267{
268 PyUnicodeObject *unicode;
269
270 unicode = _PyUnicode_New(size);
271 if (!unicode)
272 return NULL;
273
274 /* Copy the Unicode data into the new object */
275 if (u != NULL)
276 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
277
278 return (PyObject *)unicode;
279}
280
281#ifdef HAVE_WCHAR_H
282
283PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
284 int size)
285{
286 PyUnicodeObject *unicode;
287
288 if (w == NULL) {
289 PyErr_BadInternalCall();
290 return NULL;
291 }
292
293 unicode = _PyUnicode_New(size);
294 if (!unicode)
295 return NULL;
296
297 /* Copy the wchar_t data into the new object */
298#ifdef HAVE_USABLE_WCHAR_T
299 memcpy(unicode->str, w, size * sizeof(wchar_t));
300#else
301 {
302 register Py_UNICODE *u;
303 register int i;
304 u = PyUnicode_AS_UNICODE(unicode);
305 for (i = size; i >= 0; i--)
306 *u++ = *w++;
307 }
308#endif
309
310 return (PyObject *)unicode;
311}
312
313int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
314 register wchar_t *w,
315 int size)
316{
317 if (unicode == NULL) {
318 PyErr_BadInternalCall();
319 return -1;
320 }
321 if (size > PyUnicode_GET_SIZE(unicode))
322 size = PyUnicode_GET_SIZE(unicode);
323#ifdef HAVE_USABLE_WCHAR_T
324 memcpy(w, unicode->str, size * sizeof(wchar_t));
325#else
326 {
327 register Py_UNICODE *u;
328 register int i;
329 u = PyUnicode_AS_UNICODE(unicode);
330 for (i = size; i >= 0; i--)
331 *w++ = *u++;
332 }
333#endif
334
335 return size;
336}
337
338#endif
339
340PyObject *PyUnicode_FromObject(register PyObject *obj)
341{
342 const char *s;
343 int len;
344
345 if (obj == NULL) {
346 PyErr_BadInternalCall();
347 return NULL;
348 }
349 else if (PyUnicode_Check(obj)) {
350 Py_INCREF(obj);
351 return obj;
352 }
353 else if (PyString_Check(obj)) {
354 s = PyString_AS_STRING(obj);
355 len = PyString_GET_SIZE(obj);
356 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000357 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
358 /* Overwrite the error message with something more useful in
359 case of a TypeError. */
360 if (PyErr_ExceptionMatches(PyExc_TypeError))
361 PyErr_SetString(PyExc_TypeError,
362 "coercing to Unicode: need string or charbuffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000364 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 if (len == 0) {
366 Py_INCREF(unicode_empty);
367 return (PyObject *)unicode_empty;
368 }
369 return PyUnicode_DecodeUTF8(s, len, "strict");
370}
371
372PyObject *PyUnicode_Decode(const char *s,
373 int size,
374 const char *encoding,
375 const char *errors)
376{
377 PyObject *buffer = NULL, *unicode;
378
379 /* Shortcut for the default encoding UTF-8 */
380 if (encoding == NULL ||
381 (strcmp(encoding, "utf-8") == 0))
382 return PyUnicode_DecodeUTF8(s, size, errors);
383
384 /* Decode via the codec registry */
385 buffer = PyBuffer_FromMemory((void *)s, size);
386 if (buffer == NULL)
387 goto onError;
388 unicode = PyCodec_Decode(buffer, encoding, errors);
389 if (unicode == NULL)
390 goto onError;
391 if (!PyUnicode_Check(unicode)) {
392 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000393 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394 unicode->ob_type->tp_name);
395 Py_DECREF(unicode);
396 goto onError;
397 }
398 Py_DECREF(buffer);
399 return unicode;
400
401 onError:
402 Py_XDECREF(buffer);
403 return NULL;
404}
405
406PyObject *PyUnicode_Encode(const Py_UNICODE *s,
407 int size,
408 const char *encoding,
409 const char *errors)
410{
411 PyObject *v, *unicode;
412
413 unicode = PyUnicode_FromUnicode(s, size);
414 if (unicode == NULL)
415 return NULL;
416 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
417 Py_DECREF(unicode);
418 return v;
419}
420
421PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
422 const char *encoding,
423 const char *errors)
424{
425 PyObject *v;
426
427 if (!PyUnicode_Check(unicode)) {
428 PyErr_BadArgument();
429 goto onError;
430 }
431 /* Shortcut for the default encoding UTF-8 */
432 if ((encoding == NULL ||
433 (strcmp(encoding, "utf-8") == 0)) &&
434 errors == NULL)
435 return PyUnicode_AsUTF8String(unicode);
436
437 /* Encode via the codec registry */
438 v = PyCodec_Encode(unicode, encoding, errors);
439 if (v == NULL)
440 goto onError;
441 /* XXX Should we really enforce this ? */
442 if (!PyString_Check(v)) {
443 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000444 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445 v->ob_type->tp_name);
446 Py_DECREF(v);
447 goto onError;
448 }
449 return v;
450
451 onError:
452 return NULL;
453}
454
455Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
456{
457 if (!PyUnicode_Check(unicode)) {
458 PyErr_BadArgument();
459 goto onError;
460 }
461 return PyUnicode_AS_UNICODE(unicode);
462
463 onError:
464 return NULL;
465}
466
467int PyUnicode_GetSize(PyObject *unicode)
468{
469 if (!PyUnicode_Check(unicode)) {
470 PyErr_BadArgument();
471 goto onError;
472 }
473 return PyUnicode_GET_SIZE(unicode);
474
475 onError:
476 return -1;
477}
478
479/* --- UTF-8 Codec -------------------------------------------------------- */
480
/* Map a UTF-8 encoded prefix byte to the length of the sequence it
   starts.  Zero means the byte is an illegal prefix.  See RFC 2279
   for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFF: four, five and six bytes; 0xFE and 0xFF illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
502
503static
504int utf8_decoding_error(const char **source,
505 Py_UNICODE **dest,
506 const char *errors,
507 const char *details)
508{
509 if ((errors == NULL) ||
510 (strcmp(errors,"strict") == 0)) {
511 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000512 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000513 details);
514 return -1;
515 }
516 else if (strcmp(errors,"ignore") == 0) {
517 (*source)++;
518 return 0;
519 }
520 else if (strcmp(errors,"replace") == 0) {
521 (*source)++;
522 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
523 (*dest)++;
524 return 0;
525 }
526 else {
527 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000528 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000529 errors);
530 return -1;
531 }
532}
533
534#define UTF8_ERROR(details) do { \
535 if (utf8_decoding_error(&s, &p, errors, details)) \
536 goto onError; \
537 continue; \
538} while (0)
539
540PyObject *PyUnicode_DecodeUTF8(const char *s,
541 int size,
542 const char *errors)
543{
544 int n;
545 const char *e;
546 PyUnicodeObject *unicode;
547 Py_UNICODE *p;
548
549 /* Note: size will always be longer than the resulting Unicode
550 character count */
551 unicode = _PyUnicode_New(size);
552 if (!unicode)
553 return NULL;
554 if (size == 0)
555 return (PyObject *)unicode;
556
557 /* Unpack UTF-8 encoded data */
558 p = unicode->str;
559 e = s + size;
560
561 while (s < e) {
562 register Py_UNICODE ch = (unsigned char)*s;
563
564 if (ch < 0x80) {
565 *p++ = ch;
566 s++;
567 continue;
568 }
569
570 n = utf8_code_length[ch];
571
572 if (s + n > e)
573 UTF8_ERROR("unexpected end of data");
574
575 switch (n) {
576
577 case 0:
578 UTF8_ERROR("unexpected code byte");
579 break;
580
581 case 1:
582 UTF8_ERROR("internal error");
583 break;
584
585 case 2:
586 if ((s[1] & 0xc0) != 0x80)
587 UTF8_ERROR("invalid data");
588 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
589 if (ch < 0x80)
590 UTF8_ERROR("illegal encoding");
591 else
592 *p++ = ch;
593 break;
594
595 case 3:
596 if ((s[1] & 0xc0) != 0x80 ||
597 (s[2] & 0xc0) != 0x80)
598 UTF8_ERROR("invalid data");
599 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
600 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
601 UTF8_ERROR("illegal encoding");
602 else
603 *p++ = ch;
604 break;
605
606 default:
607 /* Other sizes are only needed for UCS-4 */
608 UTF8_ERROR("unsupported Unicode code range");
609 }
610 s += n;
611 }
612
613 /* Adjust length */
614 if (_PyUnicode_Resize(unicode, p - unicode->str))
615 goto onError;
616
617 return (PyObject *)unicode;
618
619onError:
620 Py_DECREF(unicode);
621 return NULL;
622}
623
624#undef UTF8_ERROR
625
626static
627int utf8_encoding_error(const Py_UNICODE **source,
628 char **dest,
629 const char *errors,
630 const char *details)
631{
632 if ((errors == NULL) ||
633 (strcmp(errors,"strict") == 0)) {
634 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000635 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636 details);
637 return -1;
638 }
639 else if (strcmp(errors,"ignore") == 0) {
640 return 0;
641 }
642 else if (strcmp(errors,"replace") == 0) {
643 **dest = '?';
644 (*dest)++;
645 return 0;
646 }
647 else {
648 PyErr_Format(PyExc_ValueError,
649 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000650 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 errors);
652 return -1;
653 }
654}
655
656PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
657 int size,
658 const char *errors)
659{
660 PyObject *v;
661 char *p;
662 char *q;
663
664 v = PyString_FromStringAndSize(NULL, 3 * size);
665 if (v == NULL)
666 return NULL;
667 if (size == 0)
668 goto done;
669
670 p = q = PyString_AS_STRING(v);
671 while (size-- > 0) {
672 Py_UNICODE ch = *s++;
673 if (ch < 0x80)
674 *p++ = (char) ch;
675 else if (ch < 0x0800) {
676 *p++ = 0xc0 | (ch >> 6);
677 *p++ = 0x80 | (ch & 0x3f);
678 } else if (0xD800 <= ch && ch <= 0xDFFF) {
679 /* These byte ranges are reserved for UTF-16 surrogate
680 bytes which the Python implementation currently does
681 not support. */
682 printf("code range problem: U+%04x\n", ch);
683 if (utf8_encoding_error(&s, &p, errors,
684 "unsupported code range"))
685 goto onError;
686 } else {
687 *p++ = 0xe0 | (ch >> 12);
688 *p++ = 0x80 | ((ch >> 6) & 0x3f);
689 *p++ = 0x80 | (ch & 0x3f);
690 }
691 }
692 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000693 if (_PyString_Resize(&v, p - q))
694 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000695
696 done:
697 return v;
698
699 onError:
700 Py_DECREF(v);
701 return NULL;
702}
703
704/* Return a Python string holding the UTF-8 encoded value of the
705 Unicode object.
706
707 The resulting string is cached in the Unicode object for subsequent
708 usage by this function. The cached version is needed to implement
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000709 the character buffer interface and will live (at least) as long as
710 the Unicode object itself.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000711
712 The refcount of the string is *not* incremented.
713
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000714 *** Exported for internal use by the interpreter only !!! ***
715
Guido van Rossumd57fd912000-03-10 22:53:23 +0000716*/
717
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000718PyObject *_PyUnicode_AsUTF8String(PyObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000719 const char *errors)
720{
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000721 PyObject *v = ((PyUnicodeObject *)unicode)->utf8str;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000722
723 if (v)
724 return v;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000725 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
726 PyUnicode_GET_SIZE(unicode),
Guido van Rossumd57fd912000-03-10 22:53:23 +0000727 errors);
728 if (v && errors == NULL)
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000729 ((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000730 return v;
731}
732
733PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
734{
735 PyObject *str;
736
737 if (!PyUnicode_Check(unicode)) {
738 PyErr_BadArgument();
739 return NULL;
740 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000741 str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000742 if (str == NULL)
743 return NULL;
744 Py_INCREF(str);
745 return str;
746}
747
748/* --- UTF-16 Codec ------------------------------------------------------- */
749
750static
751int utf16_decoding_error(const Py_UNICODE **source,
752 Py_UNICODE **dest,
753 const char *errors,
754 const char *details)
755{
756 if ((errors == NULL) ||
757 (strcmp(errors,"strict") == 0)) {
758 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000759 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000760 details);
761 return -1;
762 }
763 else if (strcmp(errors,"ignore") == 0) {
764 return 0;
765 }
766 else if (strcmp(errors,"replace") == 0) {
767 if (dest) {
768 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
769 (*dest)++;
770 }
771 return 0;
772 }
773 else {
774 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000775 "UTF-16 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000776 errors);
777 return -1;
778 }
779}
780
781#define UTF16_ERROR(details) do { \
782 if (utf16_decoding_error(&q, &p, errors, details)) \
783 goto onError; \
784 continue; \
785} while(0)
786
787PyObject *PyUnicode_DecodeUTF16(const char *s,
788 int size,
789 const char *errors,
790 int *byteorder)
791{
792 PyUnicodeObject *unicode;
793 Py_UNICODE *p;
794 const Py_UNICODE *q, *e;
795 int bo = 0;
796
797 /* size should be an even number */
798 if (size % sizeof(Py_UNICODE) != 0) {
799 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
800 return NULL;
801 /* The remaining input chars are ignored if we fall through
802 here... */
803 }
804
805 /* Note: size will always be longer than the resulting Unicode
806 character count */
807 unicode = _PyUnicode_New(size);
808 if (!unicode)
809 return NULL;
810 if (size == 0)
811 return (PyObject *)unicode;
812
813 /* Unpack UTF-16 encoded data */
814 p = unicode->str;
815 q = (Py_UNICODE *)s;
816 e = q + (size / sizeof(Py_UNICODE));
817
818 if (byteorder)
819 bo = *byteorder;
820
821 while (q < e) {
822 register Py_UNICODE ch = *q++;
823
824 /* Check for BOM marks (U+FEFF) in the input and adjust
825 current byte order setting accordingly. Swap input
826 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
827 !) */
828#ifdef BYTEORDER_IS_LITTLE_ENDIAN
829 if (ch == 0xFEFF) {
830 bo = -1;
831 continue;
832 } else if (ch == 0xFFFE) {
833 bo = 1;
834 continue;
835 }
836 if (bo == 1)
837 ch = (ch >> 8) | (ch << 8);
838#else
839 if (ch == 0xFEFF) {
840 bo = 1;
841 continue;
842 } else if (ch == 0xFFFE) {
843 bo = -1;
844 continue;
845 }
846 if (bo == -1)
847 ch = (ch >> 8) | (ch << 8);
848#endif
849 if (ch < 0xD800 || ch > 0xDFFF) {
850 *p++ = ch;
851 continue;
852 }
853
854 /* UTF-16 code pair: */
855 if (q >= e)
856 UTF16_ERROR("unexpected end of data");
857 if (0xDC00 <= *q && *q <= 0xDFFF) {
858 q++;
859 if (0xD800 <= *q && *q <= 0xDBFF)
860 /* This is valid data (a UTF-16 surrogate pair), but
861 we are not able to store this information since our
862 Py_UNICODE type only has 16 bits... this might
863 change someday, even though it's unlikely. */
864 UTF16_ERROR("code pairs are not supported");
865 else
866 continue;
867 }
868 UTF16_ERROR("illegal encoding");
869 }
870
871 if (byteorder)
872 *byteorder = bo;
873
874 /* Adjust length */
875 if (_PyUnicode_Resize(unicode, p - unicode->str))
876 goto onError;
877
878 return (PyObject *)unicode;
879
880onError:
881 Py_DECREF(unicode);
882 return NULL;
883}
884
885#undef UTF16_ERROR
886
887PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
888 int size,
889 const char *errors,
890 int byteorder)
891{
892 PyObject *v;
893 Py_UNICODE *p;
894 char *q;
895
896 /* We don't create UTF-16 pairs... */
897 v = PyString_FromStringAndSize(NULL,
898 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
899 if (v == NULL)
900 return NULL;
901 if (size == 0)
902 goto done;
903
904 q = PyString_AS_STRING(v);
905 p = (Py_UNICODE *)q;
906
907 if (byteorder == 0)
908 *p++ = 0xFEFF;
909 if (byteorder == 0 ||
910#ifdef BYTEORDER_IS_LITTLE_ENDIAN
911 byteorder == -1
912#else
913 byteorder == 1
914#endif
915 )
916 memcpy(p, s, size * sizeof(Py_UNICODE));
917 else
918 while (size-- > 0) {
919 Py_UNICODE ch = *s++;
920 *p++ = (ch >> 8) | (ch << 8);
921 }
922 done:
923 return v;
924}
925
926PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
927{
928 if (!PyUnicode_Check(unicode)) {
929 PyErr_BadArgument();
930 return NULL;
931 }
932 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
933 PyUnicode_GET_SIZE(unicode),
934 NULL,
935 0);
936}
937
938/* --- Unicode Escape Codec ----------------------------------------------- */
939
940static
941int unicodeescape_decoding_error(const char **source,
942 unsigned int *x,
943 const char *errors,
944 const char *details)
945{
946 if ((errors == NULL) ||
947 (strcmp(errors,"strict") == 0)) {
948 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000949 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000950 details);
951 return -1;
952 }
953 else if (strcmp(errors,"ignore") == 0) {
954 return 0;
955 }
956 else if (strcmp(errors,"replace") == 0) {
957 *x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
958 return 0;
959 }
960 else {
961 PyErr_Format(PyExc_ValueError,
962 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000963 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000964 errors);
965 return -1;
966 }
967}
968
969PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
970 int size,
971 const char *errors)
972{
973 PyUnicodeObject *v;
974 Py_UNICODE *p = NULL, *buf = NULL;
975 const char *end;
976
977 /* Escaped strings will always be longer than the resulting
978 Unicode string, so we start with size here and then reduce the
979 length after conversion to the true value. */
980 v = _PyUnicode_New(size);
981 if (v == NULL)
982 goto onError;
983 if (size == 0)
984 return (PyObject *)v;
985 p = buf = PyUnicode_AS_UNICODE(v);
986 end = s + size;
987 while (s < end) {
988 unsigned char c;
989 unsigned int x;
990 int i;
991
992 /* Non-escape characters are interpreted as Unicode ordinals */
993 if (*s != '\\') {
994 *p++ = (unsigned char)*s++;
995 continue;
996 }
997
998 /* \ - Escapes */
999 s++;
1000 switch (*s++) {
1001
1002 /* \x escapes */
1003 case '\n': break;
1004 case '\\': *p++ = '\\'; break;
1005 case '\'': *p++ = '\''; break;
1006 case '\"': *p++ = '\"'; break;
1007 case 'b': *p++ = '\b'; break;
1008 case 'f': *p++ = '\014'; break; /* FF */
1009 case 't': *p++ = '\t'; break;
1010 case 'n': *p++ = '\n'; break;
1011 case 'r': *p++ = '\r'; break;
1012 case 'v': *p++ = '\013'; break; /* VT */
1013 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1014
1015 /* \OOO (octal) escapes */
1016 case '0': case '1': case '2': case '3':
1017 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001018 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001019 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001020 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001021 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001022 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001024 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001025 break;
1026
1027 /* \xXXXX escape with 0-4 hex digits */
1028 case 'x':
1029 x = 0;
1030 c = (unsigned char)*s;
1031 if (isxdigit(c)) {
1032 do {
1033 x = (x<<4) & ~0xF;
1034 if ('0' <= c && c <= '9')
1035 x += c - '0';
1036 else if ('a' <= c && c <= 'f')
1037 x += 10 + c - 'a';
1038 else
1039 x += 10 + c - 'A';
1040 c = (unsigned char)*++s;
1041 } while (isxdigit(c));
1042 *p++ = x;
1043 } else {
1044 *p++ = '\\';
1045 *p++ = (unsigned char)s[-1];
1046 }
1047 break;
1048
1049 /* \uXXXX with 4 hex digits */
1050 case 'u':
1051 for (x = 0, i = 0; i < 4; i++) {
1052 c = (unsigned char)s[i];
1053 if (!isxdigit(c)) {
1054 if (unicodeescape_decoding_error(&s, &x, errors,
1055 "truncated \\uXXXX"))
1056 goto onError;
1057 i++;
1058 break;
1059 }
1060 x = (x<<4) & ~0xF;
1061 if (c >= '0' && c <= '9')
1062 x += c - '0';
1063 else if (c >= 'a' && c <= 'f')
1064 x += 10 + c - 'a';
1065 else
1066 x += 10 + c - 'A';
1067 }
1068 s += i;
1069 *p++ = x;
1070 break;
1071
1072 default:
1073 *p++ = '\\';
1074 *p++ = (unsigned char)s[-1];
1075 break;
1076 }
1077 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001078 if (_PyUnicode_Resize(v, (int)(p - buf)))
1079 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001080 return (PyObject *)v;
1081
1082 onError:
1083 Py_XDECREF(v);
1084 return NULL;
1085}
1086
1087/* Return a Unicode-Escape string version of the Unicode object.
1088
1089 If quotes is true, the string is enclosed in u"" or u'' quotes as
1090 appropriate.
1091
1092*/
1093
Barry Warsaw51ac5802000-03-20 16:36:48 +00001094static const Py_UNICODE *findchar(const Py_UNICODE *s,
1095 int size,
1096 Py_UNICODE ch);
1097
Guido van Rossumd57fd912000-03-10 22:53:23 +00001098static
1099PyObject *unicodeescape_string(const Py_UNICODE *s,
1100 int size,
1101 int quotes)
1102{
1103 PyObject *repr;
1104 char *p;
1105 char *q;
1106
1107 static const char *hexdigit = "0123456789ABCDEF";
1108
1109 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1110 if (repr == NULL)
1111 return NULL;
1112
1113 p = q = PyString_AS_STRING(repr);
1114
1115 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116 *p++ = 'u';
1117 *p++ = (findchar(s, size, '\'') &&
1118 !findchar(s, size, '"')) ? '"' : '\'';
1119 }
1120 while (size-- > 0) {
1121 Py_UNICODE ch = *s++;
1122 /* Escape quotes */
1123 if (quotes && (ch == q[1] || ch == '\\')) {
1124 *p++ = '\\';
1125 *p++ = (char) ch;
1126 }
1127 /* Map 16-bit characters to '\uxxxx' */
1128 else if (ch >= 256) {
1129 *p++ = '\\';
1130 *p++ = 'u';
1131 *p++ = hexdigit[(ch >> 12) & 0xf];
1132 *p++ = hexdigit[(ch >> 8) & 0xf];
1133 *p++ = hexdigit[(ch >> 4) & 0xf];
1134 *p++ = hexdigit[ch & 15];
1135 }
1136 /* Map non-printable US ASCII to '\ooo' */
1137 else if (ch < ' ' || ch >= 128) {
1138 *p++ = '\\';
1139 *p++ = hexdigit[(ch >> 6) & 7];
1140 *p++ = hexdigit[(ch >> 3) & 7];
1141 *p++ = hexdigit[ch & 7];
1142 }
1143 /* Copy everything else as-is */
1144 else
1145 *p++ = (char) ch;
1146 }
1147 if (quotes)
1148 *p++ = q[1];
1149
1150 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001151 if (_PyString_Resize(&repr, p - q))
1152 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153
1154 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001155
1156 onError:
1157 Py_DECREF(repr);
1158 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159}
1160
1161PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1162 int size)
1163{
1164 return unicodeescape_string(s, size, 0);
1165}
1166
1167PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1168{
1169 if (!PyUnicode_Check(unicode)) {
1170 PyErr_BadArgument();
1171 return NULL;
1172 }
1173 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1174 PyUnicode_GET_SIZE(unicode));
1175}
1176
1177/* --- Raw Unicode Escape Codec ------------------------------------------- */
1178
1179PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1180 int size,
1181 const char *errors)
1182{
1183 PyUnicodeObject *v;
1184 Py_UNICODE *p, *buf;
1185 const char *end;
1186 const char *bs;
1187
1188 /* Escaped strings will always be longer than the resulting
1189 Unicode string, so we start with size here and then reduce the
1190 length after conversion to the true value. */
1191 v = _PyUnicode_New(size);
1192 if (v == NULL)
1193 goto onError;
1194 if (size == 0)
1195 return (PyObject *)v;
1196 p = buf = PyUnicode_AS_UNICODE(v);
1197 end = s + size;
1198 while (s < end) {
1199 unsigned char c;
1200 unsigned int x;
1201 int i;
1202
1203 /* Non-escape characters are interpreted as Unicode ordinals */
1204 if (*s != '\\') {
1205 *p++ = (unsigned char)*s++;
1206 continue;
1207 }
1208
1209 /* \u-escapes are only interpreted iff the number of leading
1210 backslashes if odd */
1211 bs = s;
1212 for (;s < end;) {
1213 if (*s != '\\')
1214 break;
1215 *p++ = (unsigned char)*s++;
1216 }
1217 if (((s - bs) & 1) == 0 ||
1218 s >= end ||
1219 *s != 'u') {
1220 continue;
1221 }
1222 p--;
1223 s++;
1224
1225 /* \uXXXX with 4 hex digits */
1226 for (x = 0, i = 0; i < 4; i++) {
1227 c = (unsigned char)s[i];
1228 if (!isxdigit(c)) {
1229 if (unicodeescape_decoding_error(&s, &x, errors,
1230 "truncated \\uXXXX"))
1231 goto onError;
1232 i++;
1233 break;
1234 }
1235 x = (x<<4) & ~0xF;
1236 if (c >= '0' && c <= '9')
1237 x += c - '0';
1238 else if (c >= 'a' && c <= 'f')
1239 x += 10 + c - 'a';
1240 else
1241 x += 10 + c - 'A';
1242 }
1243 s += i;
1244 *p++ = x;
1245 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001246 if (_PyUnicode_Resize(v, (int)(p - buf)))
1247 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 return (PyObject *)v;
1249
1250 onError:
1251 Py_XDECREF(v);
1252 return NULL;
1253}
1254
1255PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1256 int size)
1257{
1258 PyObject *repr;
1259 char *p;
1260 char *q;
1261
1262 static const char *hexdigit = "0123456789ABCDEF";
1263
1264 repr = PyString_FromStringAndSize(NULL, 6 * size);
1265 if (repr == NULL)
1266 return NULL;
1267
1268 p = q = PyString_AS_STRING(repr);
1269 while (size-- > 0) {
1270 Py_UNICODE ch = *s++;
1271 /* Map 16-bit characters to '\uxxxx' */
1272 if (ch >= 256) {
1273 *p++ = '\\';
1274 *p++ = 'u';
1275 *p++ = hexdigit[(ch >> 12) & 0xf];
1276 *p++ = hexdigit[(ch >> 8) & 0xf];
1277 *p++ = hexdigit[(ch >> 4) & 0xf];
1278 *p++ = hexdigit[ch & 15];
1279 }
1280 /* Copy everything else as-is */
1281 else
1282 *p++ = (char) ch;
1283 }
1284 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001285 if (_PyString_Resize(&repr, p - q))
1286 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001287
1288 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001289
1290 onError:
1291 Py_DECREF(repr);
1292 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293}
1294
1295PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1296{
1297 if (!PyUnicode_Check(unicode)) {
1298 PyErr_BadArgument();
1299 return NULL;
1300 }
1301 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1302 PyUnicode_GET_SIZE(unicode));
1303}
1304
1305/* --- Latin-1 Codec ------------------------------------------------------ */
1306
1307PyObject *PyUnicode_DecodeLatin1(const char *s,
1308 int size,
1309 const char *errors)
1310{
1311 PyUnicodeObject *v;
1312 Py_UNICODE *p;
1313
1314 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1315 v = _PyUnicode_New(size);
1316 if (v == NULL)
1317 goto onError;
1318 if (size == 0)
1319 return (PyObject *)v;
1320 p = PyUnicode_AS_UNICODE(v);
1321 while (size-- > 0)
1322 *p++ = (unsigned char)*s++;
1323 return (PyObject *)v;
1324
1325 onError:
1326 Py_XDECREF(v);
1327 return NULL;
1328}
1329
1330static
1331int latin1_encoding_error(const Py_UNICODE **source,
1332 char **dest,
1333 const char *errors,
1334 const char *details)
1335{
1336 if ((errors == NULL) ||
1337 (strcmp(errors,"strict") == 0)) {
1338 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001339 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001340 details);
1341 return -1;
1342 }
1343 else if (strcmp(errors,"ignore") == 0) {
1344 return 0;
1345 }
1346 else if (strcmp(errors,"replace") == 0) {
1347 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001348 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349 return 0;
1350 }
1351 else {
1352 PyErr_Format(PyExc_ValueError,
1353 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001354 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001355 errors);
1356 return -1;
1357 }
1358}
1359
1360PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1361 int size,
1362 const char *errors)
1363{
1364 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001365 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366 repr = PyString_FromStringAndSize(NULL, size);
1367 if (repr == NULL)
1368 return NULL;
1369
1370 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001371 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001372 while (size-- > 0) {
1373 Py_UNICODE ch = *p++;
1374 if (ch >= 256) {
1375 if (latin1_encoding_error(&p, &s, errors,
1376 "ordinal not in range(256)"))
1377 goto onError;
1378 }
1379 else
1380 *s++ = (char)ch;
1381 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001382 /* Resize if error handling skipped some characters */
1383 if (s - start < PyString_GET_SIZE(repr))
1384 if (_PyString_Resize(&repr, s - start))
1385 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001386 return repr;
1387
1388 onError:
1389 Py_DECREF(repr);
1390 return NULL;
1391}
1392
1393PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1394{
1395 if (!PyUnicode_Check(unicode)) {
1396 PyErr_BadArgument();
1397 return NULL;
1398 }
1399 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1400 PyUnicode_GET_SIZE(unicode),
1401 NULL);
1402}
1403
1404/* --- 7-bit ASCII Codec -------------------------------------------------- */
1405
1406static
1407int ascii_decoding_error(const char **source,
1408 Py_UNICODE **dest,
1409 const char *errors,
1410 const char *details)
1411{
1412 if ((errors == NULL) ||
1413 (strcmp(errors,"strict") == 0)) {
1414 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001415 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001416 details);
1417 return -1;
1418 }
1419 else if (strcmp(errors,"ignore") == 0) {
1420 return 0;
1421 }
1422 else if (strcmp(errors,"replace") == 0) {
1423 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1424 (*dest)++;
1425 return 0;
1426 }
1427 else {
1428 PyErr_Format(PyExc_ValueError,
1429 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001430 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001431 errors);
1432 return -1;
1433 }
1434}
1435
1436PyObject *PyUnicode_DecodeASCII(const char *s,
1437 int size,
1438 const char *errors)
1439{
1440 PyUnicodeObject *v;
1441 Py_UNICODE *p;
1442
1443 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1444 v = _PyUnicode_New(size);
1445 if (v == NULL)
1446 goto onError;
1447 if (size == 0)
1448 return (PyObject *)v;
1449 p = PyUnicode_AS_UNICODE(v);
1450 while (size-- > 0) {
1451 register unsigned char c;
1452
1453 c = (unsigned char)*s++;
1454 if (c < 128)
1455 *p++ = c;
1456 else if (ascii_decoding_error(&s, &p, errors,
1457 "ordinal not in range(128)"))
1458 goto onError;
1459 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001460 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1461 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1462 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001463 return (PyObject *)v;
1464
1465 onError:
1466 Py_XDECREF(v);
1467 return NULL;
1468}
1469
1470static
1471int ascii_encoding_error(const Py_UNICODE **source,
1472 char **dest,
1473 const char *errors,
1474 const char *details)
1475{
1476 if ((errors == NULL) ||
1477 (strcmp(errors,"strict") == 0)) {
1478 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001479 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480 details);
1481 return -1;
1482 }
1483 else if (strcmp(errors,"ignore") == 0) {
1484 return 0;
1485 }
1486 else if (strcmp(errors,"replace") == 0) {
1487 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001488 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001489 return 0;
1490 }
1491 else {
1492 PyErr_Format(PyExc_ValueError,
1493 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001494 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001495 errors);
1496 return -1;
1497 }
1498}
1499
1500PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1501 int size,
1502 const char *errors)
1503{
1504 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001505 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506 repr = PyString_FromStringAndSize(NULL, size);
1507 if (repr == NULL)
1508 return NULL;
1509
1510 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001511 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001512 while (size-- > 0) {
1513 Py_UNICODE ch = *p++;
1514 if (ch >= 128) {
1515 if (ascii_encoding_error(&p, &s, errors,
1516 "ordinal not in range(128)"))
1517 goto onError;
1518 }
1519 else
1520 *s++ = (char)ch;
1521 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001522 /* Resize if error handling skipped some characters */
1523 if (s - start < PyString_GET_SIZE(repr))
1524 if (_PyString_Resize(&repr, s - start))
1525 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 return repr;
1527
1528 onError:
1529 Py_DECREF(repr);
1530 return NULL;
1531}
1532
1533PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1534{
1535 if (!PyUnicode_Check(unicode)) {
1536 PyErr_BadArgument();
1537 return NULL;
1538 }
1539 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1540 PyUnicode_GET_SIZE(unicode),
1541 NULL);
1542}
1543
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001544#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001545
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001546/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001547
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001548PyObject *PyUnicode_DecodeMBCS(const char *s,
1549 int size,
1550 const char *errors)
1551{
1552 PyUnicodeObject *v;
1553 Py_UNICODE *p;
1554
1555 /* First get the size of the result */
1556 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum4e751c32000-05-03 12:27:22 +00001557 if (usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001558 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1559
1560 v = _PyUnicode_New(usize);
1561 if (v == NULL)
1562 return NULL;
1563 if (usize == 0)
1564 return (PyObject *)v;
1565 p = PyUnicode_AS_UNICODE(v);
1566 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1567 Py_DECREF(v);
1568 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1569 }
1570
1571 return (PyObject *)v;
1572}
1573
1574PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1575 int size,
1576 const char *errors)
1577{
1578 PyObject *repr;
1579 char *s;
1580
1581 /* First get the size of the result */
Guido van Rossum4e751c32000-05-03 12:27:22 +00001582 DWORD mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001583 if (mbcssize==0)
1584 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1585
1586 repr = PyString_FromStringAndSize(NULL, mbcssize);
1587 if (repr == NULL)
1588 return NULL;
1589 if (mbcssize==0)
1590 return repr;
1591
1592 /* Do the conversion */
1593 s = PyString_AS_STRING(repr);
1594 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1595 Py_DECREF(repr);
1596 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1597 }
1598 return repr;
1599}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001600
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001601#endif /* MS_WIN32 */
1602
Guido van Rossumd57fd912000-03-10 22:53:23 +00001603/* --- Character Mapping Codec -------------------------------------------- */
1604
1605static
1606int charmap_decoding_error(const char **source,
1607 Py_UNICODE **dest,
1608 const char *errors,
1609 const char *details)
1610{
1611 if ((errors == NULL) ||
1612 (strcmp(errors,"strict") == 0)) {
1613 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001614 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001615 details);
1616 return -1;
1617 }
1618 else if (strcmp(errors,"ignore") == 0) {
1619 return 0;
1620 }
1621 else if (strcmp(errors,"replace") == 0) {
1622 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1623 (*dest)++;
1624 return 0;
1625 }
1626 else {
1627 PyErr_Format(PyExc_ValueError,
1628 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001629 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001630 errors);
1631 return -1;
1632 }
1633}
1634
1635PyObject *PyUnicode_DecodeCharmap(const char *s,
1636 int size,
1637 PyObject *mapping,
1638 const char *errors)
1639{
1640 PyUnicodeObject *v;
1641 Py_UNICODE *p;
1642
1643 /* Default to Latin-1 */
1644 if (mapping == NULL)
1645 return PyUnicode_DecodeLatin1(s, size, errors);
1646
1647 v = _PyUnicode_New(size);
1648 if (v == NULL)
1649 goto onError;
1650 if (size == 0)
1651 return (PyObject *)v;
1652 p = PyUnicode_AS_UNICODE(v);
1653 while (size-- > 0) {
1654 unsigned char ch = *s++;
1655 PyObject *w, *x;
1656
1657 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1658 w = PyInt_FromLong((long)ch);
1659 if (w == NULL)
1660 goto onError;
1661 x = PyObject_GetItem(mapping, w);
1662 Py_DECREF(w);
1663 if (x == NULL) {
1664 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1665 /* No mapping found: default to Latin-1 mapping */
1666 PyErr_Clear();
1667 *p++ = (Py_UNICODE)ch;
1668 continue;
1669 }
1670 goto onError;
1671 }
1672
1673 /* Apply mapping */
1674 if (PyInt_Check(x)) {
1675 int value = PyInt_AS_LONG(x);
1676 if (value < 0 || value > 65535) {
1677 PyErr_SetString(PyExc_TypeError,
1678 "character mapping must be in range(65336)");
1679 Py_DECREF(x);
1680 goto onError;
1681 }
1682 *p++ = (Py_UNICODE)value;
1683 }
1684 else if (x == Py_None) {
1685 /* undefined mapping */
1686 if (charmap_decoding_error(&s, &p, errors,
1687 "character maps to <undefined>")) {
1688 Py_DECREF(x);
1689 goto onError;
1690 }
1691 }
1692 else if (PyUnicode_Check(x)) {
1693 if (PyUnicode_GET_SIZE(x) != 1) {
1694 /* 1-n mapping */
1695 PyErr_SetString(PyExc_NotImplementedError,
1696 "1-n mappings are currently not implemented");
1697 Py_DECREF(x);
1698 goto onError;
1699 }
1700 *p++ = *PyUnicode_AS_UNICODE(x);
1701 }
1702 else {
1703 /* wrong return value */
1704 PyErr_SetString(PyExc_TypeError,
1705 "character mapping must return integer, None or unicode");
1706 Py_DECREF(x);
1707 goto onError;
1708 }
1709 Py_DECREF(x);
1710 }
1711 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1712 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1713 goto onError;
1714 return (PyObject *)v;
1715
1716 onError:
1717 Py_XDECREF(v);
1718 return NULL;
1719}
1720
1721static
1722int charmap_encoding_error(const Py_UNICODE **source,
1723 char **dest,
1724 const char *errors,
1725 const char *details)
1726{
1727 if ((errors == NULL) ||
1728 (strcmp(errors,"strict") == 0)) {
1729 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001730 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001731 details);
1732 return -1;
1733 }
1734 else if (strcmp(errors,"ignore") == 0) {
1735 return 0;
1736 }
1737 else if (strcmp(errors,"replace") == 0) {
1738 **dest = '?';
1739 (*dest)++;
1740 return 0;
1741 }
1742 else {
1743 PyErr_Format(PyExc_ValueError,
1744 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001745 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001746 errors);
1747 return -1;
1748 }
1749}
1750
1751PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
1752 int size,
1753 PyObject *mapping,
1754 const char *errors)
1755{
1756 PyObject *v;
1757 char *s;
1758
1759 /* Default to Latin-1 */
1760 if (mapping == NULL)
1761 return PyUnicode_EncodeLatin1(p, size, errors);
1762
1763 v = PyString_FromStringAndSize(NULL, size);
1764 if (v == NULL)
1765 return NULL;
1766 s = PyString_AS_STRING(v);
1767 while (size-- > 0) {
1768 Py_UNICODE ch = *p++;
1769 PyObject *w, *x;
1770
1771 /* Get mapping (Unicode ordinal -> string char, integer or None) */
1772 w = PyInt_FromLong((long)ch);
1773 if (w == NULL)
1774 goto onError;
1775 x = PyObject_GetItem(mapping, w);
1776 Py_DECREF(w);
1777 if (x == NULL) {
1778 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1779 /* No mapping found: default to Latin-1 mapping if possible */
1780 PyErr_Clear();
1781 if (ch < 256) {
1782 *s++ = (char)ch;
1783 continue;
1784 }
1785 else if (!charmap_encoding_error(&p, &s, errors,
1786 "missing character mapping"))
1787 continue;
1788 }
1789 goto onError;
1790 }
1791
1792 /* Apply mapping */
1793 if (PyInt_Check(x)) {
1794 int value = PyInt_AS_LONG(x);
1795 if (value < 0 || value > 255) {
1796 PyErr_SetString(PyExc_TypeError,
1797 "character mapping must be in range(256)");
1798 Py_DECREF(x);
1799 goto onError;
1800 }
1801 *s++ = (char)value;
1802 }
1803 else if (x == Py_None) {
1804 /* undefined mapping */
1805 if (charmap_encoding_error(&p, &s, errors,
1806 "character maps to <undefined>")) {
1807 Py_DECREF(x);
1808 goto onError;
1809 }
1810 }
1811 else if (PyString_Check(x)) {
1812 if (PyString_GET_SIZE(x) != 1) {
1813 /* 1-n mapping */
1814 PyErr_SetString(PyExc_NotImplementedError,
1815 "1-n mappings are currently not implemented");
1816 Py_DECREF(x);
1817 goto onError;
1818 }
1819 *s++ = *PyString_AS_STRING(x);
1820 }
1821 else {
1822 /* wrong return value */
1823 PyErr_SetString(PyExc_TypeError,
1824 "character mapping must return integer, None or unicode");
1825 Py_DECREF(x);
1826 goto onError;
1827 }
1828 Py_DECREF(x);
1829 }
1830 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
1831 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
1832 goto onError;
1833 return v;
1834
1835 onError:
1836 Py_DECREF(v);
1837 return NULL;
1838}
1839
1840PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
1841 PyObject *mapping)
1842{
1843 if (!PyUnicode_Check(unicode) || mapping == NULL) {
1844 PyErr_BadArgument();
1845 return NULL;
1846 }
1847 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
1848 PyUnicode_GET_SIZE(unicode),
1849 mapping,
1850 NULL);
1851}
1852
1853static
1854int translate_error(const Py_UNICODE **source,
1855 Py_UNICODE **dest,
1856 const char *errors,
1857 const char *details)
1858{
1859 if ((errors == NULL) ||
1860 (strcmp(errors,"strict") == 0)) {
1861 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001862 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001863 details);
1864 return -1;
1865 }
1866 else if (strcmp(errors,"ignore") == 0) {
1867 return 0;
1868 }
1869 else if (strcmp(errors,"replace") == 0) {
1870 **dest = '?';
1871 (*dest)++;
1872 return 0;
1873 }
1874 else {
1875 PyErr_Format(PyExc_ValueError,
1876 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001877 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001878 errors);
1879 return -1;
1880 }
1881}
1882
1883PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
1884 int size,
1885 PyObject *mapping,
1886 const char *errors)
1887{
1888 PyUnicodeObject *v;
1889 Py_UNICODE *p;
1890
1891 if (mapping == NULL) {
1892 PyErr_BadArgument();
1893 return NULL;
1894 }
1895
1896 /* Output will never be longer than input */
1897 v = _PyUnicode_New(size);
1898 if (v == NULL)
1899 goto onError;
1900 if (size == 0)
1901 goto done;
1902 p = PyUnicode_AS_UNICODE(v);
1903 while (size-- > 0) {
1904 Py_UNICODE ch = *s++;
1905 PyObject *w, *x;
1906
1907 /* Get mapping */
1908 w = PyInt_FromLong(ch);
1909 if (w == NULL)
1910 goto onError;
1911 x = PyObject_GetItem(mapping, w);
1912 Py_DECREF(w);
1913 if (x == NULL) {
1914 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1915 /* No mapping found: default to 1-1 mapping */
1916 PyErr_Clear();
1917 *p++ = ch;
1918 continue;
1919 }
1920 goto onError;
1921 }
1922
1923 /* Apply mapping */
1924 if (PyInt_Check(x))
1925 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
1926 else if (x == Py_None) {
1927 /* undefined mapping */
1928 if (translate_error(&s, &p, errors,
1929 "character maps to <undefined>")) {
1930 Py_DECREF(x);
1931 goto onError;
1932 }
1933 }
1934 else if (PyUnicode_Check(x)) {
1935 if (PyUnicode_GET_SIZE(x) != 1) {
1936 /* 1-n mapping */
1937 PyErr_SetString(PyExc_NotImplementedError,
1938 "1-n mappings are currently not implemented");
1939 Py_DECREF(x);
1940 goto onError;
1941 }
1942 *p++ = *PyUnicode_AS_UNICODE(x);
1943 }
1944 else {
1945 /* wrong return value */
1946 PyErr_SetString(PyExc_TypeError,
1947 "translate mapping must return integer, None or unicode");
1948 Py_DECREF(x);
1949 goto onError;
1950 }
1951 Py_DECREF(x);
1952 }
1953 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001954 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1955 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001956
1957 done:
1958 return (PyObject *)v;
1959
1960 onError:
1961 Py_XDECREF(v);
1962 return NULL;
1963}
1964
1965PyObject *PyUnicode_Translate(PyObject *str,
1966 PyObject *mapping,
1967 const char *errors)
1968{
1969 PyObject *result;
1970
1971 str = PyUnicode_FromObject(str);
1972 if (str == NULL)
1973 goto onError;
1974 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
1975 PyUnicode_GET_SIZE(str),
1976 mapping,
1977 errors);
1978 Py_DECREF(str);
1979 return result;
1980
1981 onError:
1982 Py_XDECREF(str);
1983 return NULL;
1984}
1985
Guido van Rossum9e896b32000-04-05 20:11:21 +00001986/* --- Decimal Encoder ---------------------------------------------------- */
1987
1988int PyUnicode_EncodeDecimal(Py_UNICODE *s,
1989 int length,
1990 char *output,
1991 const char *errors)
1992{
1993 Py_UNICODE *p, *end;
1994
1995 if (output == NULL) {
1996 PyErr_BadArgument();
1997 return -1;
1998 }
1999
2000 p = s;
2001 end = s + length;
2002 while (p < end) {
2003 register Py_UNICODE ch = *p++;
2004 int decimal;
2005
2006 if (Py_UNICODE_ISSPACE(ch)) {
2007 *output++ = ' ';
2008 continue;
2009 }
2010 decimal = Py_UNICODE_TODECIMAL(ch);
2011 if (decimal >= 0) {
2012 *output++ = '0' + decimal;
2013 continue;
2014 }
Guido van Rossumba477042000-04-06 18:18:10 +00002015 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002016 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002017 continue;
2018 }
2019 /* All other characters are considered invalid */
2020 if (errors == NULL || strcmp(errors, "strict") == 0) {
2021 PyErr_SetString(PyExc_ValueError,
2022 "invalid decimal Unicode string");
2023 goto onError;
2024 }
2025 else if (strcmp(errors, "ignore") == 0)
2026 continue;
2027 else if (strcmp(errors, "replace") == 0) {
2028 *output++ = '?';
2029 continue;
2030 }
2031 }
2032 /* 0-terminate the output string */
2033 *output++ = '\0';
2034 return 0;
2035
2036 onError:
2037 return -1;
2038}
2039
Guido van Rossumd57fd912000-03-10 22:53:23 +00002040/* --- Helpers ------------------------------------------------------------ */
2041
2042static
2043int count(PyUnicodeObject *self,
2044 int start,
2045 int end,
2046 PyUnicodeObject *substring)
2047{
2048 int count = 0;
2049
2050 end -= substring->length;
2051
2052 while (start <= end)
2053 if (Py_UNICODE_MATCH(self, start, substring)) {
2054 count++;
2055 start += substring->length;
2056 } else
2057 start++;
2058
2059 return count;
2060}
2061
2062int PyUnicode_Count(PyObject *str,
2063 PyObject *substr,
2064 int start,
2065 int end)
2066{
2067 int result;
2068
2069 str = PyUnicode_FromObject(str);
2070 if (str == NULL)
2071 return -1;
2072 substr = PyUnicode_FromObject(substr);
2073 if (substr == NULL) {
2074 Py_DECREF(substr);
2075 return -1;
2076 }
2077
2078 result = count((PyUnicodeObject *)str,
2079 start, end,
2080 (PyUnicodeObject *)substr);
2081
2082 Py_DECREF(str);
2083 Py_DECREF(substr);
2084 return result;
2085}
2086
2087static
2088int findstring(PyUnicodeObject *self,
2089 PyUnicodeObject *substring,
2090 int start,
2091 int end,
2092 int direction)
2093{
2094 if (start < 0)
2095 start += self->length;
2096 if (start < 0)
2097 start = 0;
2098
2099 if (substring->length == 0)
2100 return start;
2101
2102 if (end > self->length)
2103 end = self->length;
2104 if (end < 0)
2105 end += self->length;
2106 if (end < 0)
2107 end = 0;
2108
2109 end -= substring->length;
2110
2111 if (direction < 0) {
2112 for (; end >= start; end--)
2113 if (Py_UNICODE_MATCH(self, end, substring))
2114 return end;
2115 } else {
2116 for (; start <= end; start++)
2117 if (Py_UNICODE_MATCH(self, start, substring))
2118 return start;
2119 }
2120
2121 return -1;
2122}
2123
2124int PyUnicode_Find(PyObject *str,
2125 PyObject *substr,
2126 int start,
2127 int end,
2128 int direction)
2129{
2130 int result;
2131
2132 str = PyUnicode_FromObject(str);
2133 if (str == NULL)
2134 return -1;
2135 substr = PyUnicode_FromObject(substr);
2136 if (substr == NULL) {
2137 Py_DECREF(substr);
2138 return -1;
2139 }
2140
2141 result = findstring((PyUnicodeObject *)str,
2142 (PyUnicodeObject *)substr,
2143 start, end, direction);
2144 Py_DECREF(str);
2145 Py_DECREF(substr);
2146 return result;
2147}
2148
2149static
2150int tailmatch(PyUnicodeObject *self,
2151 PyUnicodeObject *substring,
2152 int start,
2153 int end,
2154 int direction)
2155{
2156 if (start < 0)
2157 start += self->length;
2158 if (start < 0)
2159 start = 0;
2160
2161 if (substring->length == 0)
2162 return 1;
2163
2164 if (end > self->length)
2165 end = self->length;
2166 if (end < 0)
2167 end += self->length;
2168 if (end < 0)
2169 end = 0;
2170
2171 end -= substring->length;
2172 if (end < start)
2173 return 0;
2174
2175 if (direction > 0) {
2176 if (Py_UNICODE_MATCH(self, end, substring))
2177 return 1;
2178 } else {
2179 if (Py_UNICODE_MATCH(self, start, substring))
2180 return 1;
2181 }
2182
2183 return 0;
2184}
2185
2186int PyUnicode_Tailmatch(PyObject *str,
2187 PyObject *substr,
2188 int start,
2189 int end,
2190 int direction)
2191{
2192 int result;
2193
2194 str = PyUnicode_FromObject(str);
2195 if (str == NULL)
2196 return -1;
2197 substr = PyUnicode_FromObject(substr);
2198 if (substr == NULL) {
2199 Py_DECREF(substr);
2200 return -1;
2201 }
2202
2203 result = tailmatch((PyUnicodeObject *)str,
2204 (PyUnicodeObject *)substr,
2205 start, end, direction);
2206 Py_DECREF(str);
2207 Py_DECREF(substr);
2208 return result;
2209}
2210
2211static
2212const Py_UNICODE *findchar(const Py_UNICODE *s,
2213 int size,
2214 Py_UNICODE ch)
2215{
2216 /* like wcschr, but doesn't stop at NULL characters */
2217
2218 while (size-- > 0) {
2219 if (*s == ch)
2220 return s;
2221 s++;
2222 }
2223
2224 return NULL;
2225}
2226
2227/* Apply fixfct filter to the Unicode object self and return a
2228 reference to the modified object */
2229
2230static
2231PyObject *fixup(PyUnicodeObject *self,
2232 int (*fixfct)(PyUnicodeObject *s))
2233{
2234
2235 PyUnicodeObject *u;
2236
2237 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2238 self->length);
2239 if (u == NULL)
2240 return NULL;
2241 if (!fixfct(u)) {
2242 /* fixfct should return TRUE if it modified the buffer. If
2243 FALSE, return a reference to the original buffer instead
2244 (to save space, not time) */
2245 Py_INCREF(self);
2246 Py_DECREF(u);
2247 return (PyObject*) self;
2248 }
2249 return (PyObject*) u;
2250}
2251
2252static
2253int fixupper(PyUnicodeObject *self)
2254{
2255 int len = self->length;
2256 Py_UNICODE *s = self->str;
2257 int status = 0;
2258
2259 while (len-- > 0) {
2260 register Py_UNICODE ch;
2261
2262 ch = Py_UNICODE_TOUPPER(*s);
2263 if (ch != *s) {
2264 status = 1;
2265 *s = ch;
2266 }
2267 s++;
2268 }
2269
2270 return status;
2271}
2272
2273static
2274int fixlower(PyUnicodeObject *self)
2275{
2276 int len = self->length;
2277 Py_UNICODE *s = self->str;
2278 int status = 0;
2279
2280 while (len-- > 0) {
2281 register Py_UNICODE ch;
2282
2283 ch = Py_UNICODE_TOLOWER(*s);
2284 if (ch != *s) {
2285 status = 1;
2286 *s = ch;
2287 }
2288 s++;
2289 }
2290
2291 return status;
2292}
2293
2294static
2295int fixswapcase(PyUnicodeObject *self)
2296{
2297 int len = self->length;
2298 Py_UNICODE *s = self->str;
2299 int status = 0;
2300
2301 while (len-- > 0) {
2302 if (Py_UNICODE_ISUPPER(*s)) {
2303 *s = Py_UNICODE_TOLOWER(*s);
2304 status = 1;
2305 } else if (Py_UNICODE_ISLOWER(*s)) {
2306 *s = Py_UNICODE_TOUPPER(*s);
2307 status = 1;
2308 }
2309 s++;
2310 }
2311
2312 return status;
2313}
2314
2315static
2316int fixcapitalize(PyUnicodeObject *self)
2317{
2318 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2319 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2320 return 1;
2321 }
2322 return 0;
2323}
2324
2325static
2326int fixtitle(PyUnicodeObject *self)
2327{
2328 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2329 register Py_UNICODE *e;
2330 int previous_is_cased;
2331
2332 /* Shortcut for single character strings */
2333 if (PyUnicode_GET_SIZE(self) == 1) {
2334 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2335 if (*p != ch) {
2336 *p = ch;
2337 return 1;
2338 }
2339 else
2340 return 0;
2341 }
2342
2343 e = p + PyUnicode_GET_SIZE(self);
2344 previous_is_cased = 0;
2345 for (; p < e; p++) {
2346 register const Py_UNICODE ch = *p;
2347
2348 if (previous_is_cased)
2349 *p = Py_UNICODE_TOLOWER(ch);
2350 else
2351 *p = Py_UNICODE_TOTITLE(ch);
2352
2353 if (Py_UNICODE_ISLOWER(ch) ||
2354 Py_UNICODE_ISUPPER(ch) ||
2355 Py_UNICODE_ISTITLE(ch))
2356 previous_is_cased = 1;
2357 else
2358 previous_is_cased = 0;
2359 }
2360 return 1;
2361}
2362
2363PyObject *PyUnicode_Join(PyObject *separator,
2364 PyObject *seq)
2365{
2366 Py_UNICODE *sep;
2367 int seplen;
2368 PyUnicodeObject *res = NULL;
2369 int reslen = 0;
2370 Py_UNICODE *p;
2371 int seqlen = 0;
2372 int sz = 100;
2373 int i;
2374
2375 seqlen = PySequence_Length(seq);
2376 if (seqlen < 0 && PyErr_Occurred())
2377 return NULL;
2378
2379 if (separator == NULL) {
2380 Py_UNICODE blank = ' ';
2381 sep = &blank;
2382 seplen = 1;
2383 }
2384 else {
2385 separator = PyUnicode_FromObject(separator);
2386 if (separator == NULL)
2387 return NULL;
2388 sep = PyUnicode_AS_UNICODE(separator);
2389 seplen = PyUnicode_GET_SIZE(separator);
2390 }
2391
2392 res = _PyUnicode_New(sz);
2393 if (res == NULL)
2394 goto onError;
2395 p = PyUnicode_AS_UNICODE(res);
2396 reslen = 0;
2397
2398 for (i = 0; i < seqlen; i++) {
2399 int itemlen;
2400 PyObject *item;
2401
2402 item = PySequence_GetItem(seq, i);
2403 if (item == NULL)
2404 goto onError;
2405 if (!PyUnicode_Check(item)) {
2406 PyObject *v;
2407 v = PyUnicode_FromObject(item);
2408 Py_DECREF(item);
2409 item = v;
2410 if (item == NULL)
2411 goto onError;
2412 }
2413 itemlen = PyUnicode_GET_SIZE(item);
2414 while (reslen + itemlen + seplen >= sz) {
2415 if (_PyUnicode_Resize(res, sz*2))
2416 goto onError;
2417 sz *= 2;
2418 p = PyUnicode_AS_UNICODE(res) + reslen;
2419 }
2420 if (i > 0) {
2421 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2422 p += seplen;
2423 reslen += seplen;
2424 }
2425 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2426 p += itemlen;
2427 reslen += itemlen;
2428 Py_DECREF(item);
2429 }
2430 if (_PyUnicode_Resize(res, reslen))
2431 goto onError;
2432
2433 Py_XDECREF(separator);
2434 return (PyObject *)res;
2435
2436 onError:
2437 Py_XDECREF(separator);
2438 Py_DECREF(res);
2439 return NULL;
2440}
2441
2442static
2443PyUnicodeObject *pad(PyUnicodeObject *self,
2444 int left,
2445 int right,
2446 Py_UNICODE fill)
2447{
2448 PyUnicodeObject *u;
2449
2450 if (left < 0)
2451 left = 0;
2452 if (right < 0)
2453 right = 0;
2454
2455 if (left == 0 && right == 0) {
2456 Py_INCREF(self);
2457 return self;
2458 }
2459
2460 u = _PyUnicode_New(left + self->length + right);
2461 if (u) {
2462 if (left)
2463 Py_UNICODE_FILL(u->str, fill, left);
2464 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2465 if (right)
2466 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2467 }
2468
2469 return u;
2470}
2471
/* Append the slice data[left:right] to list as a new Unicode object.

   This expands inside the calling function, which must provide:
     - a PyObject *str scratch variable,
     - the target list in a variable named list,
     - an onError label, jumped to when creating or appending the
       slice fails.
   The list keeps its own reference; the temporary one is released.
   left is evaluated twice, so pass an expression without side
   effects.  Wrapped in do { } while (0) so that the macro behaves as
   one statement; use it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2482
2483static
2484PyObject *split_whitespace(PyUnicodeObject *self,
2485 PyObject *list,
2486 int maxcount)
2487{
2488 register int i;
2489 register int j;
2490 int len = self->length;
2491 PyObject *str;
2492
2493 for (i = j = 0; i < len; ) {
2494 /* find a token */
2495 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2496 i++;
2497 j = i;
2498 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2499 i++;
2500 if (j < i) {
2501 if (maxcount-- <= 0)
2502 break;
2503 SPLIT_APPEND(self->str, j, i);
2504 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2505 i++;
2506 j = i;
2507 }
2508 }
2509 if (j < len) {
2510 SPLIT_APPEND(self->str, j, len);
2511 }
2512 return list;
2513
2514 onError:
2515 Py_DECREF(list);
2516 return NULL;
2517}
2518
2519PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002520 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002521{
2522 register int i;
2523 register int j;
2524 int len;
2525 PyObject *list;
2526 PyObject *str;
2527 Py_UNICODE *data;
2528
2529 string = PyUnicode_FromObject(string);
2530 if (string == NULL)
2531 return NULL;
2532 data = PyUnicode_AS_UNICODE(string);
2533 len = PyUnicode_GET_SIZE(string);
2534
Guido van Rossumd57fd912000-03-10 22:53:23 +00002535 list = PyList_New(0);
2536 if (!list)
2537 goto onError;
2538
2539 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002540 int eol;
2541
Guido van Rossumd57fd912000-03-10 22:53:23 +00002542 /* Find a line and append it */
2543 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2544 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002545
2546 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002547 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002548 if (i < len) {
2549 if (data[i] == '\r' && i + 1 < len &&
2550 data[i+1] == '\n')
2551 i += 2;
2552 else
2553 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002554 if (keepends)
2555 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002556 }
Guido van Rossum86662912000-04-11 15:38:46 +00002557 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002558 j = i;
2559 }
2560 if (j < len) {
2561 SPLIT_APPEND(data, j, len);
2562 }
2563
2564 Py_DECREF(string);
2565 return list;
2566
2567 onError:
2568 Py_DECREF(list);
2569 Py_DECREF(string);
2570 return NULL;
2571}
2572
2573static
2574PyObject *split_char(PyUnicodeObject *self,
2575 PyObject *list,
2576 Py_UNICODE ch,
2577 int maxcount)
2578{
2579 register int i;
2580 register int j;
2581 int len = self->length;
2582 PyObject *str;
2583
2584 for (i = j = 0; i < len; ) {
2585 if (self->str[i] == ch) {
2586 if (maxcount-- <= 0)
2587 break;
2588 SPLIT_APPEND(self->str, j, i);
2589 i = j = i + 1;
2590 } else
2591 i++;
2592 }
2593 if (j <= len) {
2594 SPLIT_APPEND(self->str, j, len);
2595 }
2596 return list;
2597
2598 onError:
2599 Py_DECREF(list);
2600 return NULL;
2601}
2602
2603static
2604PyObject *split_substring(PyUnicodeObject *self,
2605 PyObject *list,
2606 PyUnicodeObject *substring,
2607 int maxcount)
2608{
2609 register int i;
2610 register int j;
2611 int len = self->length;
2612 int sublen = substring->length;
2613 PyObject *str;
2614
2615 for (i = j = 0; i < len - sublen; ) {
2616 if (Py_UNICODE_MATCH(self, i, substring)) {
2617 if (maxcount-- <= 0)
2618 break;
2619 SPLIT_APPEND(self->str, j, i);
2620 i = j = i + sublen;
2621 } else
2622 i++;
2623 }
2624 if (j <= len) {
2625 SPLIT_APPEND(self->str, j, len);
2626 }
2627 return list;
2628
2629 onError:
2630 Py_DECREF(list);
2631 return NULL;
2632}
2633
2634#undef SPLIT_APPEND
2635
2636static
2637PyObject *split(PyUnicodeObject *self,
2638 PyUnicodeObject *substring,
2639 int maxcount)
2640{
2641 PyObject *list;
2642
2643 if (maxcount < 0)
2644 maxcount = INT_MAX;
2645
2646 list = PyList_New(0);
2647 if (!list)
2648 return NULL;
2649
2650 if (substring == NULL)
2651 return split_whitespace(self,list,maxcount);
2652
2653 else if (substring->length == 1)
2654 return split_char(self,list,substring->str[0],maxcount);
2655
2656 else if (substring->length == 0) {
2657 Py_DECREF(list);
2658 PyErr_SetString(PyExc_ValueError, "empty separator");
2659 return NULL;
2660 }
2661 else
2662 return split_substring(self,list,substring,maxcount);
2663}
2664
2665static
2666PyObject *strip(PyUnicodeObject *self,
2667 int left,
2668 int right)
2669{
2670 Py_UNICODE *p = self->str;
2671 int start = 0;
2672 int end = self->length;
2673
2674 if (left)
2675 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2676 start++;
2677
2678 if (right)
2679 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2680 end--;
2681
2682 if (start == 0 && end == self->length) {
2683 /* couldn't strip anything off, return original string */
2684 Py_INCREF(self);
2685 return (PyObject*) self;
2686 }
2687
2688 return (PyObject*) PyUnicode_FromUnicode(
2689 self->str + start,
2690 end - start
2691 );
2692}
2693
2694static
2695PyObject *replace(PyUnicodeObject *self,
2696 PyUnicodeObject *str1,
2697 PyUnicodeObject *str2,
2698 int maxcount)
2699{
2700 PyUnicodeObject *u;
2701
2702 if (maxcount < 0)
2703 maxcount = INT_MAX;
2704
2705 if (str1->length == 1 && str2->length == 1) {
2706 int i;
2707
2708 /* replace characters */
2709 if (!findchar(self->str, self->length, str1->str[0])) {
2710 /* nothing to replace, return original string */
2711 Py_INCREF(self);
2712 u = self;
2713 } else {
2714 Py_UNICODE u1 = str1->str[0];
2715 Py_UNICODE u2 = str2->str[0];
2716
2717 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
2718 self->str,
2719 self->length
2720 );
2721 if (u)
2722 for (i = 0; i < u->length; i++)
2723 if (u->str[i] == u1) {
2724 if (--maxcount < 0)
2725 break;
2726 u->str[i] = u2;
2727 }
2728 }
2729
2730 } else {
2731 int n, i;
2732 Py_UNICODE *p;
2733
2734 /* replace strings */
2735 n = count(self, 0, self->length, str1);
2736 if (n > maxcount)
2737 n = maxcount;
2738 if (n == 0) {
2739 /* nothing to replace, return original string */
2740 Py_INCREF(self);
2741 u = self;
2742 } else {
2743 u = _PyUnicode_New(
2744 self->length + n * (str2->length - str1->length));
2745 if (u) {
2746 i = 0;
2747 p = u->str;
2748 while (i <= self->length - str1->length)
2749 if (Py_UNICODE_MATCH(self, i, str1)) {
2750 /* replace string segment */
2751 Py_UNICODE_COPY(p, str2->str, str2->length);
2752 p += str2->length;
2753 i += str1->length;
2754 if (--n <= 0) {
2755 /* copy remaining part */
2756 Py_UNICODE_COPY(p, self->str+i, self->length-i);
2757 break;
2758 }
2759 } else
2760 *p++ = self->str[i++];
2761 }
2762 }
2763 }
2764
2765 return (PyObject *) u;
2766}
2767
2768/* --- Unicode Object Methods --------------------------------------------- */
2769
2770static char title__doc__[] =
2771"S.title() -> unicode\n\
2772\n\
2773Return a titlecased version of S, i.e. words start with title case\n\
2774characters, all remaining cased characters have lower case.";
2775
2776static PyObject*
2777unicode_title(PyUnicodeObject *self, PyObject *args)
2778{
2779 if (!PyArg_NoArgs(args))
2780 return NULL;
2781 return fixup(self, fixtitle);
2782}
2783
2784static char capitalize__doc__[] =
2785"S.capitalize() -> unicode\n\
2786\n\
2787Return a capitalized version of S, i.e. make the first character\n\
2788have upper case.";
2789
2790static PyObject*
2791unicode_capitalize(PyUnicodeObject *self, PyObject *args)
2792{
2793 if (!PyArg_NoArgs(args))
2794 return NULL;
2795 return fixup(self, fixcapitalize);
2796}
2797
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords() (currently compiled out): split at whitespace,
   capitalize every word and join the words with single blanks. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result = NULL;
    int idx;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
2838
2839static char center__doc__[] =
2840"S.center(width) -> unicode\n\
2841\n\
2842Return S centered in a Unicode string of length width. Padding is done\n\
2843using spaces.";
2844
2845static PyObject *
2846unicode_center(PyUnicodeObject *self, PyObject *args)
2847{
2848 int marg, left;
2849 int width;
2850
2851 if (!PyArg_ParseTuple(args, "i:center", &width))
2852 return NULL;
2853
2854 if (self->length >= width) {
2855 Py_INCREF(self);
2856 return (PyObject*) self;
2857 }
2858
2859 marg = width - self->length;
2860 left = marg / 2 + (marg & width & 1);
2861
2862 return (PyObject*) pad(self, left, marg - left, ' ');
2863}
2864
2865static int
2866unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
2867{
2868 int len1, len2;
2869 Py_UNICODE *s1 = str1->str;
2870 Py_UNICODE *s2 = str2->str;
2871
2872 len1 = str1->length;
2873 len2 = str2->length;
2874
2875 while (len1 > 0 && len2 > 0) {
2876 int cmp = (*s1++) - (*s2++);
2877 if (cmp)
2878 /* This should make Christian happy! */
2879 return (cmp < 0) ? -1 : (cmp != 0);
2880 len1--, len2--;
2881 }
2882
2883 return (len1 < len2) ? -1 : (len1 != len2);
2884}
2885
2886int PyUnicode_Compare(PyObject *left,
2887 PyObject *right)
2888{
2889 PyUnicodeObject *u = NULL, *v = NULL;
2890 int result;
2891
2892 /* Coerce the two arguments */
2893 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2894 if (u == NULL)
2895 goto onError;
2896 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2897 if (v == NULL)
2898 goto onError;
2899
2900 /* Shortcut for emtpy or interned objects */
2901 if (v == u) {
2902 Py_DECREF(u);
2903 Py_DECREF(v);
2904 return 0;
2905 }
2906
2907 result = unicode_compare(u, v);
2908
2909 Py_DECREF(u);
2910 Py_DECREF(v);
2911 return result;
2912
2913onError:
2914 Py_XDECREF(u);
2915 Py_XDECREF(v);
2916 return -1;
2917}
2918
Guido van Rossum403d68b2000-03-13 15:55:09 +00002919int PyUnicode_Contains(PyObject *container,
2920 PyObject *element)
2921{
2922 PyUnicodeObject *u = NULL, *v = NULL;
2923 int result;
2924 register const Py_UNICODE *p, *e;
2925 register Py_UNICODE ch;
2926
2927 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00002928 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
2929 if (v == NULL)
2930 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002931 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
2932 if (u == NULL) {
2933 Py_DECREF(v);
2934 goto onError;
2935 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00002936
2937 /* Check v in u */
2938 if (PyUnicode_GET_SIZE(v) != 1) {
2939 PyErr_SetString(PyExc_TypeError,
2940 "string member test needs char left operand");
2941 goto onError;
2942 }
2943 ch = *PyUnicode_AS_UNICODE(v);
2944 p = PyUnicode_AS_UNICODE(u);
2945 e = p + PyUnicode_GET_SIZE(u);
2946 result = 0;
2947 while (p < e) {
2948 if (*p++ == ch) {
2949 result = 1;
2950 break;
2951 }
2952 }
2953
2954 Py_DECREF(u);
2955 Py_DECREF(v);
2956 return result;
2957
2958onError:
2959 Py_XDECREF(u);
2960 Py_XDECREF(v);
2961 return -1;
2962}
2963
Guido van Rossumd57fd912000-03-10 22:53:23 +00002964/* Concat to string or Unicode object giving a new Unicode object. */
2965
2966PyObject *PyUnicode_Concat(PyObject *left,
2967 PyObject *right)
2968{
2969 PyUnicodeObject *u = NULL, *v = NULL, *w;
2970
2971 /* Coerce the two arguments */
2972 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2973 if (u == NULL)
2974 goto onError;
2975 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2976 if (v == NULL)
2977 goto onError;
2978
2979 /* Shortcuts */
2980 if (v == unicode_empty) {
2981 Py_DECREF(v);
2982 return (PyObject *)u;
2983 }
2984 if (u == unicode_empty) {
2985 Py_DECREF(u);
2986 return (PyObject *)v;
2987 }
2988
2989 /* Concat the two Unicode strings */
2990 w = _PyUnicode_New(u->length + v->length);
2991 if (w == NULL)
2992 goto onError;
2993 Py_UNICODE_COPY(w->str, u->str, u->length);
2994 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
2995
2996 Py_DECREF(u);
2997 Py_DECREF(v);
2998 return (PyObject *)w;
2999
3000onError:
3001 Py_XDECREF(u);
3002 Py_XDECREF(v);
3003 return NULL;
3004}
3005
3006static char count__doc__[] =
3007"S.count(sub[, start[, end]]) -> int\n\
3008\n\
3009Return the number of occurrences of substring sub in Unicode string\n\
3010S[start:end]. Optional arguments start and end are\n\
3011interpreted as in slice notation.";
3012
3013static PyObject *
3014unicode_count(PyUnicodeObject *self, PyObject *args)
3015{
3016 PyUnicodeObject *substring;
3017 int start = 0;
3018 int end = INT_MAX;
3019 PyObject *result;
3020
3021 if (!PyArg_ParseTuple(args, "O|ii:count", &substring, &start, &end))
3022 return NULL;
3023
3024 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3025 (PyObject *)substring);
3026 if (substring == NULL)
3027 return NULL;
3028
3029 if (substring->length == 0) {
3030 Py_DECREF(substring);
3031 return PyInt_FromLong((long) 0);
3032 }
3033
3034 if (start < 0)
3035 start += self->length;
3036 if (start < 0)
3037 start = 0;
3038 if (end > self->length)
3039 end = self->length;
3040 if (end < 0)
3041 end += self->length;
3042 if (end < 0)
3043 end = 0;
3044
3045 result = PyInt_FromLong((long) count(self, start, end, substring));
3046
3047 Py_DECREF(substring);
3048 return result;
3049}
3050
3051static char encode__doc__[] =
3052"S.encode([encoding[,errors]]) -> string\n\
3053\n\
3054Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
3055errors may be given to set a different error handling scheme. Default\n\
3056is 'strict' meaning that encoding errors raise a ValueError. Other\n\
3057possible values are 'ignore' and 'replace'.";
3058
3059static PyObject *
3060unicode_encode(PyUnicodeObject *self, PyObject *args)
3061{
3062 char *encoding = NULL;
3063 char *errors = NULL;
3064 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3065 return NULL;
3066 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3067}
3068
3069static char expandtabs__doc__[] =
3070"S.expandtabs([tabsize]) -> unicode\n\
3071\n\
3072Return a copy of S where all tab characters are expanded using spaces.\n\
3073If tabsize is not given, a tab size of 8 characters is assumed.";
3074
3075static PyObject*
3076unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3077{
3078 Py_UNICODE *e;
3079 Py_UNICODE *p;
3080 Py_UNICODE *q;
3081 int i, j;
3082 PyUnicodeObject *u;
3083 int tabsize = 8;
3084
3085 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3086 return NULL;
3087
3088 /* First pass: determine size of ouput string */
3089 i = j = 0;
3090 e = self->str + self->length;
3091 for (p = self->str; p < e; p++)
3092 if (*p == '\t') {
3093 if (tabsize > 0)
3094 j += tabsize - (j % tabsize);
3095 }
3096 else {
3097 j++;
3098 if (*p == '\n' || *p == '\r') {
3099 i += j;
3100 j = 0;
3101 }
3102 }
3103
3104 /* Second pass: create output string and fill it */
3105 u = _PyUnicode_New(i + j);
3106 if (!u)
3107 return NULL;
3108
3109 j = 0;
3110 q = u->str;
3111
3112 for (p = self->str; p < e; p++)
3113 if (*p == '\t') {
3114 if (tabsize > 0) {
3115 i = tabsize - (j % tabsize);
3116 j += i;
3117 while (i--)
3118 *q++ = ' ';
3119 }
3120 }
3121 else {
3122 j++;
3123 *q++ = *p;
3124 if (*p == '\n' || *p == '\r')
3125 j = 0;
3126 }
3127
3128 return (PyObject*) u;
3129}
3130
3131static char find__doc__[] =
3132"S.find(sub [,start [,end]]) -> int\n\
3133\n\
3134Return the lowest index in S where substring sub is found,\n\
3135such that sub is contained within s[start,end]. Optional\n\
3136arguments start and end are interpreted as in slice notation.\n\
3137\n\
3138Return -1 on failure.";
3139
3140static PyObject *
3141unicode_find(PyUnicodeObject *self, PyObject *args)
3142{
3143 PyUnicodeObject *substring;
3144 int start = 0;
3145 int end = INT_MAX;
3146 PyObject *result;
3147
3148 if (!PyArg_ParseTuple(args, "O|ii:find", &substring, &start, &end))
3149 return NULL;
3150 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3151 (PyObject *)substring);
3152 if (substring == NULL)
3153 return NULL;
3154
3155 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3156
3157 Py_DECREF(substring);
3158 return result;
3159}
3160
3161static PyObject *
3162unicode_getitem(PyUnicodeObject *self, int index)
3163{
3164 if (index < 0 || index >= self->length) {
3165 PyErr_SetString(PyExc_IndexError, "string index out of range");
3166 return NULL;
3167 }
3168
3169 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3170}
3171
3172static long
3173unicode_hash(PyUnicodeObject *self)
3174{
3175 long hash;
3176 PyObject *utf8;
3177
3178 /* Since Unicode objects compare equal to their UTF-8 string
3179 counterparts, they should also use the UTF-8 strings as basis
3180 for their hash value. This is needed to assure that strings and
3181 Unicode objects behave in the same way as dictionary
3182 keys. Unfortunately, this costs some performance and also some
3183 memory if the cached UTF-8 representation is not used later
3184 on. */
3185 if (self->hash != -1)
3186 return self->hash;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00003187 utf8 = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003188 if (utf8 == NULL)
3189 return -1;
3190 hash = PyObject_Hash(utf8);
3191 if (hash == -1)
3192 return -1;
3193 self->hash = hash;
3194 return hash;
3195}
3196
3197static char index__doc__[] =
3198"S.index(sub [,start [,end]]) -> int\n\
3199\n\
3200Like S.find() but raise ValueError when the substring is not found.";
3201
3202static PyObject *
3203unicode_index(PyUnicodeObject *self, PyObject *args)
3204{
3205 int result;
3206 PyUnicodeObject *substring;
3207 int start = 0;
3208 int end = INT_MAX;
3209
3210 if (!PyArg_ParseTuple(args, "O|ii:index", &substring, &start, &end))
3211 return NULL;
3212
3213 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3214 (PyObject *)substring);
3215 if (substring == NULL)
3216 return NULL;
3217
3218 result = findstring(self, substring, start, end, 1);
3219
3220 Py_DECREF(substring);
3221 if (result < 0) {
3222 PyErr_SetString(PyExc_ValueError, "substring not found");
3223 return NULL;
3224 }
3225 return PyInt_FromLong(result);
3226}
3227
3228static char islower__doc__[] =
3229"S.islower() -> int\n\
3230\n\
3231Return 1 if all cased characters in S are lowercase and there is\n\
3232at least one cased character in S, 0 otherwise.";
3233
3234static PyObject*
3235unicode_islower(PyUnicodeObject *self, PyObject *args)
3236{
3237 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3238 register const Py_UNICODE *e;
3239 int cased;
3240
3241 if (!PyArg_NoArgs(args))
3242 return NULL;
3243
3244 /* Shortcut for single character strings */
3245 if (PyUnicode_GET_SIZE(self) == 1)
3246 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3247
3248 e = p + PyUnicode_GET_SIZE(self);
3249 cased = 0;
3250 for (; p < e; p++) {
3251 register const Py_UNICODE ch = *p;
3252
3253 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3254 return PyInt_FromLong(0);
3255 else if (!cased && Py_UNICODE_ISLOWER(ch))
3256 cased = 1;
3257 }
3258 return PyInt_FromLong(cased);
3259}
3260
3261static char isupper__doc__[] =
3262"S.isupper() -> int\n\
3263\n\
3264Return 1 if all cased characters in S are uppercase and there is\n\
3265at least one cased character in S, 0 otherwise.";
3266
3267static PyObject*
3268unicode_isupper(PyUnicodeObject *self, PyObject *args)
3269{
3270 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3271 register const Py_UNICODE *e;
3272 int cased;
3273
3274 if (!PyArg_NoArgs(args))
3275 return NULL;
3276
3277 /* Shortcut for single character strings */
3278 if (PyUnicode_GET_SIZE(self) == 1)
3279 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3280
3281 e = p + PyUnicode_GET_SIZE(self);
3282 cased = 0;
3283 for (; p < e; p++) {
3284 register const Py_UNICODE ch = *p;
3285
3286 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3287 return PyInt_FromLong(0);
3288 else if (!cased && Py_UNICODE_ISUPPER(ch))
3289 cased = 1;
3290 }
3291 return PyInt_FromLong(cased);
3292}
3293
3294static char istitle__doc__[] =
3295"S.istitle() -> int\n\
3296\n\
3297Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3298may only follow uncased characters and lowercase characters only cased\n\
3299ones. Return 0 otherwise.";
3300
3301static PyObject*
3302unicode_istitle(PyUnicodeObject *self, PyObject *args)
3303{
3304 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3305 register const Py_UNICODE *e;
3306 int cased, previous_is_cased;
3307
3308 if (!PyArg_NoArgs(args))
3309 return NULL;
3310
3311 /* Shortcut for single character strings */
3312 if (PyUnicode_GET_SIZE(self) == 1)
3313 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3314 (Py_UNICODE_ISUPPER(*p) != 0));
3315
3316 e = p + PyUnicode_GET_SIZE(self);
3317 cased = 0;
3318 previous_is_cased = 0;
3319 for (; p < e; p++) {
3320 register const Py_UNICODE ch = *p;
3321
3322 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3323 if (previous_is_cased)
3324 return PyInt_FromLong(0);
3325 previous_is_cased = 1;
3326 cased = 1;
3327 }
3328 else if (Py_UNICODE_ISLOWER(ch)) {
3329 if (!previous_is_cased)
3330 return PyInt_FromLong(0);
3331 previous_is_cased = 1;
3332 cased = 1;
3333 }
3334 else
3335 previous_is_cased = 0;
3336 }
3337 return PyInt_FromLong(cased);
3338}
3339
3340static char isspace__doc__[] =
3341"S.isspace() -> int\n\
3342\n\
3343Return 1 if there are only whitespace characters in S,\n\
33440 otherwise.";
3345
3346static PyObject*
3347unicode_isspace(PyUnicodeObject *self, PyObject *args)
3348{
3349 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3350 register const Py_UNICODE *e;
3351
3352 if (!PyArg_NoArgs(args))
3353 return NULL;
3354
3355 /* Shortcut for single character strings */
3356 if (PyUnicode_GET_SIZE(self) == 1 &&
3357 Py_UNICODE_ISSPACE(*p))
3358 return PyInt_FromLong(1);
3359
3360 e = p + PyUnicode_GET_SIZE(self);
3361 for (; p < e; p++) {
3362 if (!Py_UNICODE_ISSPACE(*p))
3363 return PyInt_FromLong(0);
3364 }
3365 return PyInt_FromLong(1);
3366}
3367
3368static char isdecimal__doc__[] =
3369"S.isdecimal() -> int\n\
3370\n\
3371Return 1 if there are only decimal characters in S,\n\
33720 otherwise.";
3373
3374static PyObject*
3375unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3376{
3377 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3378 register const Py_UNICODE *e;
3379
3380 if (!PyArg_NoArgs(args))
3381 return NULL;
3382
3383 /* Shortcut for single character strings */
3384 if (PyUnicode_GET_SIZE(self) == 1 &&
3385 Py_UNICODE_ISDECIMAL(*p))
3386 return PyInt_FromLong(1);
3387
3388 e = p + PyUnicode_GET_SIZE(self);
3389 for (; p < e; p++) {
3390 if (!Py_UNICODE_ISDECIMAL(*p))
3391 return PyInt_FromLong(0);
3392 }
3393 return PyInt_FromLong(1);
3394}
3395
3396static char isdigit__doc__[] =
3397"S.isdigit() -> int\n\
3398\n\
3399Return 1 if there are only digit characters in S,\n\
34000 otherwise.";
3401
3402static PyObject*
3403unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3404{
3405 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3406 register const Py_UNICODE *e;
3407
3408 if (!PyArg_NoArgs(args))
3409 return NULL;
3410
3411 /* Shortcut for single character strings */
3412 if (PyUnicode_GET_SIZE(self) == 1 &&
3413 Py_UNICODE_ISDIGIT(*p))
3414 return PyInt_FromLong(1);
3415
3416 e = p + PyUnicode_GET_SIZE(self);
3417 for (; p < e; p++) {
3418 if (!Py_UNICODE_ISDIGIT(*p))
3419 return PyInt_FromLong(0);
3420 }
3421 return PyInt_FromLong(1);
3422}
3423
3424static char isnumeric__doc__[] =
3425"S.isnumeric() -> int\n\
3426\n\
3427Return 1 if there are only numeric characters in S,\n\
34280 otherwise.";
3429
3430static PyObject*
3431unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3432{
3433 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3434 register const Py_UNICODE *e;
3435
3436 if (!PyArg_NoArgs(args))
3437 return NULL;
3438
3439 /* Shortcut for single character strings */
3440 if (PyUnicode_GET_SIZE(self) == 1 &&
3441 Py_UNICODE_ISNUMERIC(*p))
3442 return PyInt_FromLong(1);
3443
3444 e = p + PyUnicode_GET_SIZE(self);
3445 for (; p < e; p++) {
3446 if (!Py_UNICODE_ISNUMERIC(*p))
3447 return PyInt_FromLong(0);
3448 }
3449 return PyInt_FromLong(1);
3450}
3451
3452static char join__doc__[] =
3453"S.join(sequence) -> unicode\n\
3454\n\
3455Return a string which is the concatenation of the strings in the\n\
3456sequence. The separator between elements is S.";
3457
3458static PyObject*
3459unicode_join(PyUnicodeObject *self, PyObject *args)
3460{
3461 PyObject *data;
3462 if (!PyArg_ParseTuple(args, "O:join", &data))
3463 return NULL;
3464
3465 return PyUnicode_Join((PyObject *)self, data);
3466}
3467
3468static int
3469unicode_length(PyUnicodeObject *self)
3470{
3471 return self->length;
3472}
3473
3474static char ljust__doc__[] =
3475"S.ljust(width) -> unicode\n\
3476\n\
3477Return S left justified in a Unicode string of length width. Padding is\n\
3478done using spaces.";
3479
3480static PyObject *
3481unicode_ljust(PyUnicodeObject *self, PyObject *args)
3482{
3483 int width;
3484 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3485 return NULL;
3486
3487 if (self->length >= width) {
3488 Py_INCREF(self);
3489 return (PyObject*) self;
3490 }
3491
3492 return (PyObject*) pad(self, 0, width - self->length, ' ');
3493}
3494
3495static char lower__doc__[] =
3496"S.lower() -> unicode\n\
3497\n\
3498Return a copy of the string S converted to lowercase.";
3499
3500static PyObject*
3501unicode_lower(PyUnicodeObject *self, PyObject *args)
3502{
3503 if (!PyArg_NoArgs(args))
3504 return NULL;
3505 return fixup(self, fixlower);
3506}
3507
3508static char lstrip__doc__[] =
3509"S.lstrip() -> unicode\n\
3510\n\
3511Return a copy of the string S with leading whitespace removed.";
3512
3513static PyObject *
3514unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3515{
3516 if (!PyArg_NoArgs(args))
3517 return NULL;
3518 return strip(self, 1, 0);
3519}
3520
3521static PyObject*
3522unicode_repeat(PyUnicodeObject *str, int len)
3523{
3524 PyUnicodeObject *u;
3525 Py_UNICODE *p;
3526
3527 if (len < 0)
3528 len = 0;
3529
3530 if (len == 1) {
3531 /* no repeat, return original string */
3532 Py_INCREF(str);
3533 return (PyObject*) str;
3534 }
3535
3536 u = _PyUnicode_New(len * str->length);
3537 if (!u)
3538 return NULL;
3539
3540 p = u->str;
3541
3542 while (len-- > 0) {
3543 Py_UNICODE_COPY(p, str->str, str->length);
3544 p += str->length;
3545 }
3546
3547 return (PyObject*) u;
3548}
3549
3550PyObject *PyUnicode_Replace(PyObject *obj,
3551 PyObject *subobj,
3552 PyObject *replobj,
3553 int maxcount)
3554{
3555 PyObject *self;
3556 PyObject *str1;
3557 PyObject *str2;
3558 PyObject *result;
3559
3560 self = PyUnicode_FromObject(obj);
3561 if (self == NULL)
3562 return NULL;
3563 str1 = PyUnicode_FromObject(subobj);
3564 if (str1 == NULL) {
3565 Py_DECREF(self);
3566 return NULL;
3567 }
3568 str2 = PyUnicode_FromObject(replobj);
3569 if (str2 == NULL) {
3570 Py_DECREF(self);
3571 Py_DECREF(str1);
3572 return NULL;
3573 }
3574 result = replace((PyUnicodeObject *)self,
3575 (PyUnicodeObject *)str1,
3576 (PyUnicodeObject *)str2,
3577 maxcount);
3578 Py_DECREF(self);
3579 Py_DECREF(str1);
3580 Py_DECREF(str2);
3581 return result;
3582}
3583
3584static char replace__doc__[] =
3585"S.replace (old, new[, maxsplit]) -> unicode\n\
3586\n\
3587Return a copy of S with all occurrences of substring\n\
3588old replaced by new. If the optional argument maxsplit is\n\
3589given, only the first maxsplit occurrences are replaced.";
3590
3591static PyObject*
3592unicode_replace(PyUnicodeObject *self, PyObject *args)
3593{
3594 PyUnicodeObject *str1;
3595 PyUnicodeObject *str2;
3596 int maxcount = -1;
3597 PyObject *result;
3598
3599 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
3600 return NULL;
3601 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
3602 if (str1 == NULL)
3603 return NULL;
3604 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
3605 if (str2 == NULL)
3606 return NULL;
3607
3608 result = replace(self, str1, str2, maxcount);
3609
3610 Py_DECREF(str1);
3611 Py_DECREF(str2);
3612 return result;
3613}
3614
3615static
3616PyObject *unicode_repr(PyObject *unicode)
3617{
3618 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
3619 PyUnicode_GET_SIZE(unicode),
3620 1);
3621}
3622
3623static char rfind__doc__[] =
3624"S.rfind(sub [,start [,end]]) -> int\n\
3625\n\
3626Return the highest index in S where substring sub is found,\n\
3627such that sub is contained within s[start,end]. Optional\n\
3628arguments start and end are interpreted as in slice notation.\n\
3629\n\
3630Return -1 on failure.";
3631
3632static PyObject *
3633unicode_rfind(PyUnicodeObject *self, PyObject *args)
3634{
3635 PyUnicodeObject *substring;
3636 int start = 0;
3637 int end = INT_MAX;
3638 PyObject *result;
3639
3640 if (!PyArg_ParseTuple(args, "O|ii:rfind", &substring, &start, &end))
3641 return NULL;
3642 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3643 (PyObject *)substring);
3644 if (substring == NULL)
3645 return NULL;
3646
3647 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
3648
3649 Py_DECREF(substring);
3650 return result;
3651}
3652
3653static char rindex__doc__[] =
3654"S.rindex(sub [,start [,end]]) -> int\n\
3655\n\
3656Like S.rfind() but raise ValueError when the substring is not found.";
3657
3658static PyObject *
3659unicode_rindex(PyUnicodeObject *self, PyObject *args)
3660{
3661 int result;
3662 PyUnicodeObject *substring;
3663 int start = 0;
3664 int end = INT_MAX;
3665
3666 if (!PyArg_ParseTuple(args, "O|ii:rindex", &substring, &start, &end))
3667 return NULL;
3668 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3669 (PyObject *)substring);
3670 if (substring == NULL)
3671 return NULL;
3672
3673 result = findstring(self, substring, start, end, -1);
3674
3675 Py_DECREF(substring);
3676 if (result < 0) {
3677 PyErr_SetString(PyExc_ValueError, "substring not found");
3678 return NULL;
3679 }
3680 return PyInt_FromLong(result);
3681}
3682
3683static char rjust__doc__[] =
3684"S.rjust(width) -> unicode\n\
3685\n\
3686Return S right justified in a Unicode string of length width. Padding is\n\
3687done using spaces.";
3688
3689static PyObject *
3690unicode_rjust(PyUnicodeObject *self, PyObject *args)
3691{
3692 int width;
3693 if (!PyArg_ParseTuple(args, "i:rjust", &width))
3694 return NULL;
3695
3696 if (self->length >= width) {
3697 Py_INCREF(self);
3698 return (PyObject*) self;
3699 }
3700
3701 return (PyObject*) pad(self, width - self->length, 0, ' ');
3702}
3703
3704static char rstrip__doc__[] =
3705"S.rstrip() -> unicode\n\
3706\n\
3707Return a copy of the string S with trailing whitespace removed.";
3708
3709static PyObject *
3710unicode_rstrip(PyUnicodeObject *self, PyObject *args)
3711{
3712 if (!PyArg_NoArgs(args))
3713 return NULL;
3714 return strip(self, 0, 1);
3715}
3716
3717static PyObject*
3718unicode_slice(PyUnicodeObject *self, int start, int end)
3719{
3720 /* standard clamping */
3721 if (start < 0)
3722 start = 0;
3723 if (end < 0)
3724 end = 0;
3725 if (end > self->length)
3726 end = self->length;
3727 if (start == 0 && end == self->length) {
3728 /* full slice, return original string */
3729 Py_INCREF(self);
3730 return (PyObject*) self;
3731 }
3732 if (start > end)
3733 start = end;
3734 /* copy slice */
3735 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
3736 end - start);
3737}
3738
3739PyObject *PyUnicode_Split(PyObject *s,
3740 PyObject *sep,
3741 int maxsplit)
3742{
3743 PyObject *result;
3744
3745 s = PyUnicode_FromObject(s);
3746 if (s == NULL)
3747 return NULL;
3748 if (sep != NULL) {
3749 sep = PyUnicode_FromObject(sep);
3750 if (sep == NULL) {
3751 Py_DECREF(s);
3752 return NULL;
3753 }
3754 }
3755
3756 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
3757
3758 Py_DECREF(s);
3759 Py_XDECREF(sep);
3760 return result;
3761}
3762
3763static char split__doc__[] =
3764"S.split([sep [,maxsplit]]) -> list of strings\n\
3765\n\
3766Return a list of the words in S, using sep as the\n\
3767delimiter string. If maxsplit is given, at most maxsplit\n\
3768splits are done. If sep is not specified, any whitespace string\n\
3769is a separator.";
3770
3771static PyObject*
3772unicode_split(PyUnicodeObject *self, PyObject *args)
3773{
3774 PyObject *substring = Py_None;
3775 int maxcount = -1;
3776
3777 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
3778 return NULL;
3779
3780 if (substring == Py_None)
3781 return split(self, NULL, maxcount);
3782 else if (PyUnicode_Check(substring))
3783 return split(self, (PyUnicodeObject *)substring, maxcount);
3784 else
3785 return PyUnicode_Split((PyObject *)self, substring, maxcount);
3786}
3787
3788static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00003789"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00003790\n\
3791Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00003792Line breaks are not included in the resulting list unless keepends\n\
3793is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003794
3795static PyObject*
3796unicode_splitlines(PyUnicodeObject *self, PyObject *args)
3797{
Guido van Rossum86662912000-04-11 15:38:46 +00003798 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003799
Guido van Rossum86662912000-04-11 15:38:46 +00003800 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003801 return NULL;
3802
Guido van Rossum86662912000-04-11 15:38:46 +00003803 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804}
3805
3806static
3807PyObject *unicode_str(PyUnicodeObject *self)
3808{
3809 return PyUnicode_AsUTF8String((PyObject *)self);
3810}
3811
3812static char strip__doc__[] =
3813"S.strip() -> unicode\n\
3814\n\
3815Return a copy of S with leading and trailing whitespace removed.";
3816
3817static PyObject *
3818unicode_strip(PyUnicodeObject *self, PyObject *args)
3819{
3820 if (!PyArg_NoArgs(args))
3821 return NULL;
3822 return strip(self, 1, 1);
3823}
3824
3825static char swapcase__doc__[] =
3826"S.swapcase() -> unicode\n\
3827\n\
3828Return a copy of S with uppercase characters converted to lowercase\n\
3829and vice versa.";
3830
3831static PyObject*
3832unicode_swapcase(PyUnicodeObject *self, PyObject *args)
3833{
3834 if (!PyArg_NoArgs(args))
3835 return NULL;
3836 return fixup(self, fixswapcase);
3837}
3838
3839static char translate__doc__[] =
3840"S.translate(table) -> unicode\n\
3841\n\
3842Return a copy of the string S, where all characters have been mapped\n\
3843through the given translation table, which must be a mapping of\n\
3844Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
3845are left untouched. Characters mapped to None are deleted.";
3846
3847static PyObject*
3848unicode_translate(PyUnicodeObject *self, PyObject *args)
3849{
3850 PyObject *table;
3851
3852 if (!PyArg_ParseTuple(args, "O:translate", &table))
3853 return NULL;
3854 return PyUnicode_TranslateCharmap(self->str,
3855 self->length,
3856 table,
3857 "ignore");
3858}
3859
3860static char upper__doc__[] =
3861"S.upper() -> unicode\n\
3862\n\
3863Return a copy of S converted to uppercase.";
3864
3865static PyObject*
3866unicode_upper(PyUnicodeObject *self, PyObject *args)
3867{
3868 if (!PyArg_NoArgs(args))
3869 return NULL;
3870 return fixup(self, fixupper);
3871}
3872
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* Method wrapper for S.zfill() (currently compiled out).

   Strings that already have at least width characters are returned
   unchanged (new reference).  Otherwise the string is left-padded with
   '0' via pad(); if the original text began with '+' or '-', the sign is
   moved in front of the padding. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad() is defined outside this part of the file; do not touch
       u->str if it reported failure by returning NULL. */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original text */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
3908
#if 0
/* Debugging aid (compiled out): report unicode_freelist_size as a
   Python int.  Takes no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    return PyArg_NoArgs(args) ? PyInt_FromLong(unicode_freelist_size)
                              : NULL;
}
#endif
3918
3919static char startswith__doc__[] =
3920"S.startswith(prefix[, start[, end]]) -> int\n\
3921\n\
3922Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
3923optional start, test S beginning at that position. With optional end, stop\n\
3924comparing S at that position.";
3925
3926static PyObject *
3927unicode_startswith(PyUnicodeObject *self,
3928 PyObject *args)
3929{
3930 PyUnicodeObject *substring;
3931 int start = 0;
3932 int end = INT_MAX;
3933 PyObject *result;
3934
3935 if (!PyArg_ParseTuple(args, "O|ii:startswith", &substring, &start, &end))
3936 return NULL;
3937 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3938 (PyObject *)substring);
3939 if (substring == NULL)
3940 return NULL;
3941
3942 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
3943
3944 Py_DECREF(substring);
3945 return result;
3946}
3947
3948
3949static char endswith__doc__[] =
3950"S.endswith(suffix[, start[, end]]) -> int\n\
3951\n\
3952Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
3953optional start, test S beginning at that position. With optional end, stop\n\
3954comparing S at that position.";
3955
3956static PyObject *
3957unicode_endswith(PyUnicodeObject *self,
3958 PyObject *args)
3959{
3960 PyUnicodeObject *substring;
3961 int start = 0;
3962 int end = INT_MAX;
3963 PyObject *result;
3964
3965 if (!PyArg_ParseTuple(args, "O|ii:endswith", &substring, &start, &end))
3966 return NULL;
3967 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3968 (PyObject *)substring);
3969 if (substring == NULL)
3970 return NULL;
3971
3972 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
3973
3974 Py_DECREF(substring);
3975 return result;
3976}
3977
3978
/* Method table for Unicode objects; unicode_getattr() searches it with
   Py_FindMethod().  Each entry is {name, C function, flag, docstring}.
   For the methods defined in this part of the file, flag 1 goes with
   implementations that parse their arguments with PyArg_ParseTuple()
   and flag 0 with those that use PyArg_NoArgs().  The table ends with a
   {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
#if 0
    /* Disabled entries; unicode_zfill itself is compiled out as well. */
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    {NULL, NULL}
};
4029
4030static PyObject *
4031unicode_getattr(PyUnicodeObject *self, char *name)
4032{
4033 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4034}
4035
/* Sequence protocol table, installed as tp_as_sequence of
   PyUnicode_Type.  Both assignment slots are 0: no item or slice
   assignment handler is provided. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, /* sq_length */
    (binaryfunc) PyUnicode_Concat, /* sq_concat */
    (intargfunc) unicode_repeat, /* sq_repeat */
    (intargfunc) unicode_getitem, /* sq_item */
    (intintargfunc) unicode_slice, /* sq_slice */
    0, /* sq_ass_item */
    0, /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, /*sq_contains*/
};
4046
4047static int
4048unicode_buffer_getreadbuf(PyUnicodeObject *self,
4049 int index,
4050 const void **ptr)
4051{
4052 if (index != 0) {
4053 PyErr_SetString(PyExc_SystemError,
4054 "accessing non-existent unicode segment");
4055 return -1;
4056 }
4057 *ptr = (void *) self->str;
4058 return PyUnicode_GET_DATA_SIZE(self);
4059}
4060
4061static int
4062unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4063 const void **ptr)
4064{
4065 PyErr_SetString(PyExc_TypeError,
4066 "cannot use unicode as modifyable buffer");
4067 return -1;
4068}
4069
4070static int
4071unicode_buffer_getsegcount(PyUnicodeObject *self,
4072 int *lenp)
4073{
4074 if (lenp)
4075 *lenp = PyUnicode_GET_DATA_SIZE(self);
4076 return 1;
4077}
4078
4079static int
4080unicode_buffer_getcharbuf(PyUnicodeObject *self,
4081 int index,
4082 const void **ptr)
4083{
4084 PyObject *str;
4085
4086 if (index != 0) {
4087 PyErr_SetString(PyExc_SystemError,
4088 "accessing non-existent unicode segment");
4089 return -1;
4090 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +00004091 str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004092 if (str == NULL)
4093 return -1;
4094 *ptr = (void *) PyString_AS_STRING(str);
4095 return PyString_GET_SIZE(str);
4096}
4097
4098/* Helpers for PyUnicode_Format() */
4099
4100static PyObject *
4101getnextarg(args, arglen, p_argidx)
4102 PyObject *args;
4103int arglen;
4104int *p_argidx;
4105{
4106 int argidx = *p_argidx;
4107 if (argidx < arglen) {
4108 (*p_argidx)++;
4109 if (arglen < 0)
4110 return args;
4111 else
4112 return PyTuple_GetItem(args, argidx);
4113 }
4114 PyErr_SetString(PyExc_TypeError,
4115 "not enough arguments for format string");
4116 return NULL;
4117}
4118
/* Bit flags collected by PyUnicode_Format() while it scans the flag
   characters of a format specifier. */
#define F_LJUST (1<<0) /* '-' flag, also set for a negative '*' width */
#define F_SIGN (1<<1) /* '+' flag */
#define F_BLANK (1<<2) /* ' ' flag */
#define F_ALT (1<<3) /* '#' flag */
#define F_ZERO (1<<4) /* '0' flag */
4124
4125static
4126#ifdef HAVE_STDARG_PROTOTYPES
4127int usprintf(register Py_UNICODE *buffer, char *format, ...)
4128#else
4129int usprintf(va_alist) va_dcl
4130#endif
4131{
4132 register int i;
4133 int len;
4134 va_list va;
4135 char *charbuffer;
4136#ifdef HAVE_STDARG_PROTOTYPES
4137 va_start(va, format);
4138#else
4139 Py_UNICODE *args;
4140 char *format;
4141
4142 va_start(va);
4143 buffer = va_arg(va, Py_UNICODE *);
4144 format = va_arg(va, char *);
4145#endif
4146
4147 /* First, format the string as char array, then expand to Py_UNICODE
4148 array. */
4149 charbuffer = (char *)buffer;
4150 len = vsprintf(charbuffer, format, va);
4151 for (i = len - 1; i >= 0; i--)
4152 buffer[i] = (Py_UNICODE) charbuffer[i];
4153
4154 va_end(va);
4155 return len;
4156}
4157
4158static int
4159formatfloat(Py_UNICODE *buf,
4160 int flags,
4161 int prec,
4162 int type,
4163 PyObject *v)
4164{
4165 char fmt[20];
4166 double x;
4167
4168 x = PyFloat_AsDouble(v);
4169 if (x == -1.0 && PyErr_Occurred())
4170 return -1;
4171 if (prec < 0)
4172 prec = 6;
4173 if (prec > 50)
4174 prec = 50; /* Arbitrary limitation */
4175 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4176 type = 'g';
4177 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4178 return usprintf(buf, fmt, x);
4179}
4180
4181static int
4182formatint(Py_UNICODE *buf,
4183 int flags,
4184 int prec,
4185 int type,
4186 PyObject *v)
4187{
4188 char fmt[20];
4189 long x;
4190
4191 x = PyInt_AsLong(v);
4192 if (x == -1 && PyErr_Occurred())
4193 return -1;
4194 if (prec < 0)
4195 prec = 1;
4196 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4197 return usprintf(buf, fmt, x);
4198}
4199
4200static int
4201formatchar(Py_UNICODE *buf,
4202 PyObject *v)
4203{
4204 if (PyUnicode_Check(v))
4205 buf[0] = PyUnicode_AS_UNICODE(v)[0];
4206
4207 else if (PyString_Check(v))
4208 buf[0] = (Py_UNICODE) PyString_AS_STRING(v)[0];
4209
4210 else {
4211 /* Integer input truncated to a character */
4212 long x;
4213 x = PyInt_AsLong(v);
4214 if (x == -1 && PyErr_Occurred())
4215 return -1;
4216 buf[0] = (char) x;
4217 }
4218 buf[1] = '\0';
4219 return 1;
4220}
4221
4222PyObject *PyUnicode_Format(PyObject *format,
4223 PyObject *args)
4224{
4225 Py_UNICODE *fmt, *res;
4226 int fmtcnt, rescnt, reslen, arglen, argidx;
4227 int args_owned = 0;
4228 PyUnicodeObject *result = NULL;
4229 PyObject *dict = NULL;
4230 PyObject *uformat;
4231
4232 if (format == NULL || args == NULL) {
4233 PyErr_BadInternalCall();
4234 return NULL;
4235 }
4236 uformat = PyUnicode_FromObject(format);
4237 fmt = PyUnicode_AS_UNICODE(uformat);
4238 fmtcnt = PyUnicode_GET_SIZE(uformat);
4239
4240 reslen = rescnt = fmtcnt + 100;
4241 result = _PyUnicode_New(reslen);
4242 if (result == NULL)
4243 goto onError;
4244 res = PyUnicode_AS_UNICODE(result);
4245
4246 if (PyTuple_Check(args)) {
4247 arglen = PyTuple_Size(args);
4248 argidx = 0;
4249 }
4250 else {
4251 arglen = -1;
4252 argidx = -2;
4253 }
4254 if (args->ob_type->tp_as_mapping)
4255 dict = args;
4256
4257 while (--fmtcnt >= 0) {
4258 if (*fmt != '%') {
4259 if (--rescnt < 0) {
4260 rescnt = fmtcnt + 100;
4261 reslen += rescnt;
4262 if (_PyUnicode_Resize(result, reslen) < 0)
4263 return NULL;
4264 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4265 --rescnt;
4266 }
4267 *res++ = *fmt++;
4268 }
4269 else {
4270 /* Got a format specifier */
4271 int flags = 0;
4272 int width = -1;
4273 int prec = -1;
4274 int size = 0;
4275 Py_UNICODE c = '\0';
4276 Py_UNICODE fill;
4277 PyObject *v = NULL;
4278 PyObject *temp = NULL;
4279 Py_UNICODE *buf;
4280 Py_UNICODE sign;
4281 int len;
4282 Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
4283
4284 fmt++;
4285 if (*fmt == '(') {
4286 Py_UNICODE *keystart;
4287 int keylen;
4288 PyObject *key;
4289 int pcount = 1;
4290
4291 if (dict == NULL) {
4292 PyErr_SetString(PyExc_TypeError,
4293 "format requires a mapping");
4294 goto onError;
4295 }
4296 ++fmt;
4297 --fmtcnt;
4298 keystart = fmt;
4299 /* Skip over balanced parentheses */
4300 while (pcount > 0 && --fmtcnt >= 0) {
4301 if (*fmt == ')')
4302 --pcount;
4303 else if (*fmt == '(')
4304 ++pcount;
4305 fmt++;
4306 }
4307 keylen = fmt - keystart - 1;
4308 if (fmtcnt < 0 || pcount > 0) {
4309 PyErr_SetString(PyExc_ValueError,
4310 "incomplete format key");
4311 goto onError;
4312 }
4313 /* keys are converted to strings (using UTF-8) and
4314 then looked up since Python uses strings to hold
4315 variables names etc. in its namespaces and we
4316 wouldn't want to break common idioms. The
4317 alternative would be using Unicode objects for the
4318 lookup but u"abc" and "abc" have different hash
4319 values (on purpose). */
4320 key = PyUnicode_EncodeUTF8(keystart,
4321 keylen,
4322 NULL);
4323 if (key == NULL)
4324 goto onError;
4325 if (args_owned) {
4326 Py_DECREF(args);
4327 args_owned = 0;
4328 }
4329 args = PyObject_GetItem(dict, key);
4330 Py_DECREF(key);
4331 if (args == NULL) {
4332 goto onError;
4333 }
4334 args_owned = 1;
4335 arglen = -1;
4336 argidx = -2;
4337 }
4338 while (--fmtcnt >= 0) {
4339 switch (c = *fmt++) {
4340 case '-': flags |= F_LJUST; continue;
4341 case '+': flags |= F_SIGN; continue;
4342 case ' ': flags |= F_BLANK; continue;
4343 case '#': flags |= F_ALT; continue;
4344 case '0': flags |= F_ZERO; continue;
4345 }
4346 break;
4347 }
4348 if (c == '*') {
4349 v = getnextarg(args, arglen, &argidx);
4350 if (v == NULL)
4351 goto onError;
4352 if (!PyInt_Check(v)) {
4353 PyErr_SetString(PyExc_TypeError,
4354 "* wants int");
4355 goto onError;
4356 }
4357 width = PyInt_AsLong(v);
4358 if (width < 0) {
4359 flags |= F_LJUST;
4360 width = -width;
4361 }
4362 if (--fmtcnt >= 0)
4363 c = *fmt++;
4364 }
4365 else if (c >= '0' && c <= '9') {
4366 width = c - '0';
4367 while (--fmtcnt >= 0) {
4368 c = *fmt++;
4369 if (c < '0' || c > '9')
4370 break;
4371 if ((width*10) / 10 != width) {
4372 PyErr_SetString(PyExc_ValueError,
4373 "width too big");
4374 goto onError;
4375 }
4376 width = width*10 + (c - '0');
4377 }
4378 }
4379 if (c == '.') {
4380 prec = 0;
4381 if (--fmtcnt >= 0)
4382 c = *fmt++;
4383 if (c == '*') {
4384 v = getnextarg(args, arglen, &argidx);
4385 if (v == NULL)
4386 goto onError;
4387 if (!PyInt_Check(v)) {
4388 PyErr_SetString(PyExc_TypeError,
4389 "* wants int");
4390 goto onError;
4391 }
4392 prec = PyInt_AsLong(v);
4393 if (prec < 0)
4394 prec = 0;
4395 if (--fmtcnt >= 0)
4396 c = *fmt++;
4397 }
4398 else if (c >= '0' && c <= '9') {
4399 prec = c - '0';
4400 while (--fmtcnt >= 0) {
4401 c = Py_CHARMASK(*fmt++);
4402 if (c < '0' || c > '9')
4403 break;
4404 if ((prec*10) / 10 != prec) {
4405 PyErr_SetString(PyExc_ValueError,
4406 "prec too big");
4407 goto onError;
4408 }
4409 prec = prec*10 + (c - '0');
4410 }
4411 }
4412 } /* prec */
4413 if (fmtcnt >= 0) {
4414 if (c == 'h' || c == 'l' || c == 'L') {
4415 size = c;
4416 if (--fmtcnt >= 0)
4417 c = *fmt++;
4418 }
4419 }
4420 if (fmtcnt < 0) {
4421 PyErr_SetString(PyExc_ValueError,
4422 "incomplete format");
4423 goto onError;
4424 }
4425 if (c != '%') {
4426 v = getnextarg(args, arglen, &argidx);
4427 if (v == NULL)
4428 goto onError;
4429 }
4430 sign = 0;
4431 fill = ' ';
4432 switch (c) {
4433
4434 case '%':
4435 buf = tmpbuf;
4436 buf[0] = '%';
4437 len = 1;
4438 break;
4439
4440 case 's':
4441 case 'r':
4442 if (PyUnicode_Check(v) && c == 's') {
4443 temp = v;
4444 Py_INCREF(temp);
4445 }
4446 else {
4447 PyObject *unicode;
4448 if (c == 's')
4449 temp = PyObject_Str(v);
4450 else
4451 temp = PyObject_Repr(v);
4452 if (temp == NULL)
4453 goto onError;
4454 if (!PyString_Check(temp)) {
4455 /* XXX Note: this should never happen, since
4456 PyObject_Repr() and PyObject_Str() assure
4457 this */
4458 Py_DECREF(temp);
4459 PyErr_SetString(PyExc_TypeError,
4460 "%s argument has non-string str()");
4461 goto onError;
4462 }
4463 unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
4464 PyString_GET_SIZE(temp),
4465 "strict");
4466 Py_DECREF(temp);
4467 temp = unicode;
4468 if (temp == NULL)
4469 goto onError;
4470 }
4471 buf = PyUnicode_AS_UNICODE(temp);
4472 len = PyUnicode_GET_SIZE(temp);
4473 if (prec >= 0 && len > prec)
4474 len = prec;
4475 break;
4476
4477 case 'i':
4478 case 'd':
4479 case 'u':
4480 case 'o':
4481 case 'x':
4482 case 'X':
4483 if (c == 'i')
4484 c = 'd';
4485 buf = tmpbuf;
4486 len = formatint(buf, flags, prec, c, v);
4487 if (len < 0)
4488 goto onError;
4489 sign = (c == 'd');
4490 if (flags & F_ZERO) {
4491 fill = '0';
4492 if ((flags&F_ALT) &&
4493 (c == 'x' || c == 'X') &&
4494 buf[0] == '0' && buf[1] == c) {
4495 *res++ = *buf++;
4496 *res++ = *buf++;
4497 rescnt -= 2;
4498 len -= 2;
4499 width -= 2;
4500 if (width < 0)
4501 width = 0;
4502 }
4503 }
4504 break;
4505
4506 case 'e':
4507 case 'E':
4508 case 'f':
4509 case 'g':
4510 case 'G':
4511 buf = tmpbuf;
4512 len = formatfloat(buf, flags, prec, c, v);
4513 if (len < 0)
4514 goto onError;
4515 sign = 1;
4516 if (flags&F_ZERO)
4517 fill = '0';
4518 break;
4519
4520 case 'c':
4521 buf = tmpbuf;
4522 len = formatchar(buf, v);
4523 if (len < 0)
4524 goto onError;
4525 break;
4526
4527 default:
4528 PyErr_Format(PyExc_ValueError,
4529 "unsupported format character '%c' (0x%x)",
4530 c, c);
4531 goto onError;
4532 }
4533 if (sign) {
4534 if (*buf == '-' || *buf == '+') {
4535 sign = *buf++;
4536 len--;
4537 }
4538 else if (flags & F_SIGN)
4539 sign = '+';
4540 else if (flags & F_BLANK)
4541 sign = ' ';
4542 else
4543 sign = 0;
4544 }
4545 if (width < len)
4546 width = len;
4547 if (rescnt < width + (sign != 0)) {
4548 reslen -= rescnt;
4549 rescnt = width + fmtcnt + 100;
4550 reslen += rescnt;
4551 if (_PyUnicode_Resize(result, reslen) < 0)
4552 return NULL;
4553 res = PyUnicode_AS_UNICODE(result)
4554 + reslen - rescnt;
4555 }
4556 if (sign) {
4557 if (fill != ' ')
4558 *res++ = sign;
4559 rescnt--;
4560 if (width > len)
4561 width--;
4562 }
4563 if (width > len && !(flags & F_LJUST)) {
4564 do {
4565 --rescnt;
4566 *res++ = fill;
4567 } while (--width > len);
4568 }
4569 if (sign && fill == ' ')
4570 *res++ = sign;
4571 memcpy(res, buf, len * sizeof(Py_UNICODE));
4572 res += len;
4573 rescnt -= len;
4574 while (--width >= len) {
4575 --rescnt;
4576 *res++ = ' ';
4577 }
4578 if (dict && (argidx < arglen) && c != '%') {
4579 PyErr_SetString(PyExc_TypeError,
4580 "not all arguments converted");
4581 goto onError;
4582 }
4583 Py_XDECREF(temp);
4584 } /* '%' */
4585 } /* until end */
4586 if (argidx < arglen && !dict) {
4587 PyErr_SetString(PyExc_TypeError,
4588 "not all arguments converted");
4589 goto onError;
4590 }
4591
4592 if (args_owned) {
4593 Py_DECREF(args);
4594 }
4595 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004596 if (_PyUnicode_Resize(result, reslen - rescnt))
4597 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004598 return (PyObject *)result;
4599
4600 onError:
4601 Py_XDECREF(result);
4602 Py_DECREF(uformat);
4603 if (args_owned) {
4604 Py_DECREF(args);
4605 }
4606 return NULL;
4607}
4608
/* Buffer protocol table, installed as tp_as_buffer of PyUnicode_Type.
   The read buffer exposes the raw Py_UNICODE data, the char buffer the
   UTF-8 form; the write-buffer hook always raises TypeError. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
4615
/* Type object for Unicode strings.  It wires up the sequence and buffer
   tables defined in this file together with the repr, str, compare,
   hash and getattr handlers; the number and mapping protocol slots are
   left empty. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0, /* ob_size */
    "unicode", /* tp_name */
    sizeof(PyUnicodeObject), /* tp_size */
    0, /* tp_itemsize */
    /* Slots */
    (destructor)_PyUnicode_Free, /* tp_dealloc */
    0, /* tp_print */
    (getattrfunc)unicode_getattr, /* tp_getattr */
    0, /* tp_setattr */
    (cmpfunc) unicode_compare, /* tp_compare */
    (reprfunc) unicode_repr, /* tp_repr */
    0, /* tp_as_number */
    &unicode_as_sequence, /* tp_as_sequence */
    0, /* tp_as_mapping */
    (hashfunc) unicode_hash, /* tp_hash*/
    0, /* tp_call*/
    (reprfunc) unicode_str, /* tp_str */
    (getattrofunc) NULL, /* tp_getattro */
    (setattrofunc) NULL, /* tp_setattro */
    &unicode_as_buffer, /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT, /* tp_flags */
};
4640
4641/* Initialize the Unicode implementation */
4642
4643void _PyUnicode_Init()
4644{
4645 /* Doublecheck the configuration... */
4646 if (sizeof(Py_UNICODE) != 2)
4647 Py_FatalError("Unicode configuration error: "
4648 "sizeof(Py_UNICODE) != 2 bytes");
4649
4650 unicode_empty = _PyUnicode_New(0);
4651}
4652
4653/* Finalize the Unicode implementation */
4654
4655void
4656_PyUnicode_Fini()
4657{
4658 PyUnicodeObject *u = unicode_freelist;
4659
4660 while (u != NULL) {
4661 PyUnicodeObject *v = u;
4662 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004663 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00004664 PyMem_DEL(v->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004665 Py_XDECREF(v->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +00004666 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004667 }
4668 Py_XDECREF(unicode_empty);
4669}