blob: e00a9b8f70bedf22755c04a8d60e1eb052be1a08 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg (mal@lemburg.com) according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "mymath.h"
68#include "unicodeobject.h"
69
70#if defined(HAVE_LIMITS_H)
71#include <limits.h>
72#else
73#define INT_MAX 2147483647
74#endif
75
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000076#ifdef MS_WIN32
77#include <windows.h>
78#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000079
Guido van Rossumd57fd912000-03-10 22:53:23 +000080/* Limit for the Unicode object free list */
81
82#define MAX_UNICODE_FREELIST_SIZE 1024
83
/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a length (in characters) less
   than this limit. This reduces malloc() overhead for small Unicode
   objects.

   At worst this will result in MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) +
    KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
    malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/
100
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000101#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +0000102
103/* Endianness switches; defaults to little endian */
104
105#ifdef WORDS_BIGENDIAN
106# define BYTEORDER_IS_BIG_ENDIAN
107#else
108# define BYTEORDER_IS_LITTLE_ENDIAN
109#endif
110
111/* --- Globals ------------------------------------------------------------ */
112
113/* The empty Unicode object */
114static PyUnicodeObject *unicode_empty = NULL;
115
116/* Free list for Unicode objects */
117static PyUnicodeObject *unicode_freelist = NULL;
118static int unicode_freelist_size = 0;
119
120/* --- Unicode Object ----------------------------------------------------- */
121
122static
123int _PyUnicode_Resize(register PyUnicodeObject *unicode,
124 int length)
125{
126 void *oldstr;
127
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000128 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000130 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000131
132 /* Resizing unicode_empty is not allowed. */
133 if (unicode == unicode_empty) {
134 PyErr_SetString(PyExc_SystemError,
135 "can't resize empty unicode object");
136 return -1;
137 }
138
139 /* We allocate one more byte to make sure the string is
140 Ux0000 terminated -- XXX is this needed ? */
141 oldstr = unicode->str;
142 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
143 if (!unicode->str) {
144 unicode->str = oldstr;
145 PyErr_NoMemory();
146 return -1;
147 }
148 unicode->str[length] = 0;
149 unicode->length = length;
150
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000151 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000152 /* Reset the object caches */
153 if (unicode->utf8str) {
154 Py_DECREF(unicode->utf8str);
155 unicode->utf8str = NULL;
156 }
157 unicode->hash = -1;
158
159 return 0;
160}
161
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000162int PyUnicode_Resize(PyObject **unicode,
163 int length)
164{
165 PyUnicodeObject *v;
166
167 if (unicode == NULL) {
168 PyErr_BadInternalCall();
169 return -1;
170 }
171 v = (PyUnicodeObject *)*unicode;
172 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
173 PyErr_BadInternalCall();
174 return -1;
175 }
176 return _PyUnicode_Resize(v, length);
177}
178
Guido van Rossumd57fd912000-03-10 22:53:23 +0000179/* We allocate one more byte to make sure the string is
180 Ux0000 terminated -- XXX is this needed ?
181
182 XXX This allocator could further be enhanced by assuring that the
183 free list never reduces its size below 1.
184
185*/
186
187static
188PyUnicodeObject *_PyUnicode_New(int length)
189{
190 register PyUnicodeObject *unicode;
191
192 /* Optimization for empty strings */
193 if (length == 0 && unicode_empty != NULL) {
194 Py_INCREF(unicode_empty);
195 return unicode_empty;
196 }
197
198 /* Unicode freelist & memory allocation */
199 if (unicode_freelist) {
200 unicode = unicode_freelist;
201 unicode_freelist = *(PyUnicodeObject **)unicode_freelist;
202 unicode_freelist_size--;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000203 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000204 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000205 /* Keep-Alive optimization: we only upsize the buffer,
206 never downsize it. */
207 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000208 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000209 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000210 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 }
212 }
213 else
214 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
215 }
216 else {
217 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
218 if (unicode == NULL)
219 return NULL;
220 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
221 }
222
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000223 if (!unicode->str) {
224 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000225 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000226 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000227 unicode->str[length] = 0;
228 unicode->length = length;
229 unicode->hash = -1;
230 unicode->utf8str = NULL;
231 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000232
233 onError:
234 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000235 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000236 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237}
238
239static
240void _PyUnicode_Free(register PyUnicodeObject *unicode)
241{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000243 /* Keep-Alive optimization */
244 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000245 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000246 unicode->str = NULL;
247 unicode->length = 0;
248 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000249 if (unicode->utf8str) {
250 Py_DECREF(unicode->utf8str);
251 unicode->utf8str = NULL;
252 }
253 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000254 *(PyUnicodeObject **)unicode = unicode_freelist;
255 unicode_freelist = unicode;
256 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 }
258 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000259 PyMem_DEL(unicode->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000260 Py_XDECREF(unicode->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000261 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 }
263}
264
265PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
266 int size)
267{
268 PyUnicodeObject *unicode;
269
270 unicode = _PyUnicode_New(size);
271 if (!unicode)
272 return NULL;
273
274 /* Copy the Unicode data into the new object */
275 if (u != NULL)
276 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
277
278 return (PyObject *)unicode;
279}
280
281#ifdef HAVE_WCHAR_H
282
283PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
284 int size)
285{
286 PyUnicodeObject *unicode;
287
288 if (w == NULL) {
289 PyErr_BadInternalCall();
290 return NULL;
291 }
292
293 unicode = _PyUnicode_New(size);
294 if (!unicode)
295 return NULL;
296
297 /* Copy the wchar_t data into the new object */
298#ifdef HAVE_USABLE_WCHAR_T
299 memcpy(unicode->str, w, size * sizeof(wchar_t));
300#else
301 {
302 register Py_UNICODE *u;
303 register int i;
304 u = PyUnicode_AS_UNICODE(unicode);
305 for (i = size; i >= 0; i--)
306 *u++ = *w++;
307 }
308#endif
309
310 return (PyObject *)unicode;
311}
312
313int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
314 register wchar_t *w,
315 int size)
316{
317 if (unicode == NULL) {
318 PyErr_BadInternalCall();
319 return -1;
320 }
321 if (size > PyUnicode_GET_SIZE(unicode))
322 size = PyUnicode_GET_SIZE(unicode);
323#ifdef HAVE_USABLE_WCHAR_T
324 memcpy(w, unicode->str, size * sizeof(wchar_t));
325#else
326 {
327 register Py_UNICODE *u;
328 register int i;
329 u = PyUnicode_AS_UNICODE(unicode);
330 for (i = size; i >= 0; i--)
331 *w++ = *u++;
332 }
333#endif
334
335 return size;
336}
337
338#endif
339
340PyObject *PyUnicode_FromObject(register PyObject *obj)
341{
342 const char *s;
343 int len;
344
345 if (obj == NULL) {
346 PyErr_BadInternalCall();
347 return NULL;
348 }
349 else if (PyUnicode_Check(obj)) {
350 Py_INCREF(obj);
351 return obj;
352 }
353 else if (PyString_Check(obj)) {
354 s = PyString_AS_STRING(obj);
355 len = PyString_GET_SIZE(obj);
356 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000357 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
358 /* Overwrite the error message with something more useful in
359 case of a TypeError. */
360 if (PyErr_ExceptionMatches(PyExc_TypeError))
361 PyErr_SetString(PyExc_TypeError,
362 "coercing to Unicode: need string or charbuffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000364 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000365 if (len == 0) {
366 Py_INCREF(unicode_empty);
367 return (PyObject *)unicode_empty;
368 }
369 return PyUnicode_DecodeUTF8(s, len, "strict");
370}
371
372PyObject *PyUnicode_Decode(const char *s,
373 int size,
374 const char *encoding,
375 const char *errors)
376{
377 PyObject *buffer = NULL, *unicode;
378
379 /* Shortcut for the default encoding UTF-8 */
380 if (encoding == NULL ||
381 (strcmp(encoding, "utf-8") == 0))
382 return PyUnicode_DecodeUTF8(s, size, errors);
383
384 /* Decode via the codec registry */
385 buffer = PyBuffer_FromMemory((void *)s, size);
386 if (buffer == NULL)
387 goto onError;
388 unicode = PyCodec_Decode(buffer, encoding, errors);
389 if (unicode == NULL)
390 goto onError;
391 if (!PyUnicode_Check(unicode)) {
392 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000393 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394 unicode->ob_type->tp_name);
395 Py_DECREF(unicode);
396 goto onError;
397 }
398 Py_DECREF(buffer);
399 return unicode;
400
401 onError:
402 Py_XDECREF(buffer);
403 return NULL;
404}
405
406PyObject *PyUnicode_Encode(const Py_UNICODE *s,
407 int size,
408 const char *encoding,
409 const char *errors)
410{
411 PyObject *v, *unicode;
412
413 unicode = PyUnicode_FromUnicode(s, size);
414 if (unicode == NULL)
415 return NULL;
416 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
417 Py_DECREF(unicode);
418 return v;
419}
420
421PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
422 const char *encoding,
423 const char *errors)
424{
425 PyObject *v;
426
427 if (!PyUnicode_Check(unicode)) {
428 PyErr_BadArgument();
429 goto onError;
430 }
431 /* Shortcut for the default encoding UTF-8 */
432 if ((encoding == NULL ||
433 (strcmp(encoding, "utf-8") == 0)) &&
434 errors == NULL)
435 return PyUnicode_AsUTF8String(unicode);
436
437 /* Encode via the codec registry */
438 v = PyCodec_Encode(unicode, encoding, errors);
439 if (v == NULL)
440 goto onError;
441 /* XXX Should we really enforce this ? */
442 if (!PyString_Check(v)) {
443 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000444 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000445 v->ob_type->tp_name);
446 Py_DECREF(v);
447 goto onError;
448 }
449 return v;
450
451 onError:
452 return NULL;
453}
454
455Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
456{
457 if (!PyUnicode_Check(unicode)) {
458 PyErr_BadArgument();
459 goto onError;
460 }
461 return PyUnicode_AS_UNICODE(unicode);
462
463 onError:
464 return NULL;
465}
466
467int PyUnicode_GetSize(PyObject *unicode)
468{
469 if (!PyUnicode_Check(unicode)) {
470 PyErr_BadArgument();
471 goto onError;
472 }
473 return PyUnicode_GET_SIZE(unicode);
474
475 onError:
476 return -1;
477}
478
479/* --- UTF-8 Codec -------------------------------------------------------- */
480
/* Map a UTF-8 encoded prefix byte to the length of the sequence it
   starts.  Zero means illegal prefix.  See RFC 2279 for details. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte sequences */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not valid as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFD: four, five and six byte sequences; 0xFE, 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0
};
502
503static
504int utf8_decoding_error(const char **source,
505 Py_UNICODE **dest,
506 const char *errors,
507 const char *details)
508{
509 if ((errors == NULL) ||
510 (strcmp(errors,"strict") == 0)) {
511 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000512 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000513 details);
514 return -1;
515 }
516 else if (strcmp(errors,"ignore") == 0) {
517 (*source)++;
518 return 0;
519 }
520 else if (strcmp(errors,"replace") == 0) {
521 (*source)++;
522 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
523 (*dest)++;
524 return 0;
525 }
526 else {
527 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000528 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000529 errors);
530 return -1;
531 }
532}
533
534#define UTF8_ERROR(details) do { \
535 if (utf8_decoding_error(&s, &p, errors, details)) \
536 goto onError; \
537 continue; \
538} while (0)
539
540PyObject *PyUnicode_DecodeUTF8(const char *s,
541 int size,
542 const char *errors)
543{
544 int n;
545 const char *e;
546 PyUnicodeObject *unicode;
547 Py_UNICODE *p;
548
549 /* Note: size will always be longer than the resulting Unicode
550 character count */
551 unicode = _PyUnicode_New(size);
552 if (!unicode)
553 return NULL;
554 if (size == 0)
555 return (PyObject *)unicode;
556
557 /* Unpack UTF-8 encoded data */
558 p = unicode->str;
559 e = s + size;
560
561 while (s < e) {
562 register Py_UNICODE ch = (unsigned char)*s;
563
564 if (ch < 0x80) {
565 *p++ = ch;
566 s++;
567 continue;
568 }
569
570 n = utf8_code_length[ch];
571
572 if (s + n > e)
573 UTF8_ERROR("unexpected end of data");
574
575 switch (n) {
576
577 case 0:
578 UTF8_ERROR("unexpected code byte");
579 break;
580
581 case 1:
582 UTF8_ERROR("internal error");
583 break;
584
585 case 2:
586 if ((s[1] & 0xc0) != 0x80)
587 UTF8_ERROR("invalid data");
588 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
589 if (ch < 0x80)
590 UTF8_ERROR("illegal encoding");
591 else
592 *p++ = ch;
593 break;
594
595 case 3:
596 if ((s[1] & 0xc0) != 0x80 ||
597 (s[2] & 0xc0) != 0x80)
598 UTF8_ERROR("invalid data");
599 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
600 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
601 UTF8_ERROR("illegal encoding");
602 else
603 *p++ = ch;
604 break;
605
606 default:
607 /* Other sizes are only needed for UCS-4 */
608 UTF8_ERROR("unsupported Unicode code range");
609 }
610 s += n;
611 }
612
613 /* Adjust length */
614 if (_PyUnicode_Resize(unicode, p - unicode->str))
615 goto onError;
616
617 return (PyObject *)unicode;
618
619onError:
620 Py_DECREF(unicode);
621 return NULL;
622}
623
624#undef UTF8_ERROR
625
626static
627int utf8_encoding_error(const Py_UNICODE **source,
628 char **dest,
629 const char *errors,
630 const char *details)
631{
632 if ((errors == NULL) ||
633 (strcmp(errors,"strict") == 0)) {
634 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000635 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636 details);
637 return -1;
638 }
639 else if (strcmp(errors,"ignore") == 0) {
640 return 0;
641 }
642 else if (strcmp(errors,"replace") == 0) {
643 **dest = '?';
644 (*dest)++;
645 return 0;
646 }
647 else {
648 PyErr_Format(PyExc_ValueError,
649 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000650 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000651 errors);
652 return -1;
653 }
654}
655
656PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
657 int size,
658 const char *errors)
659{
660 PyObject *v;
661 char *p;
662 char *q;
663
664 v = PyString_FromStringAndSize(NULL, 3 * size);
665 if (v == NULL)
666 return NULL;
667 if (size == 0)
668 goto done;
669
670 p = q = PyString_AS_STRING(v);
671 while (size-- > 0) {
672 Py_UNICODE ch = *s++;
673 if (ch < 0x80)
674 *p++ = (char) ch;
675 else if (ch < 0x0800) {
676 *p++ = 0xc0 | (ch >> 6);
677 *p++ = 0x80 | (ch & 0x3f);
678 } else if (0xD800 <= ch && ch <= 0xDFFF) {
679 /* These byte ranges are reserved for UTF-16 surrogate
680 bytes which the Python implementation currently does
681 not support. */
682 printf("code range problem: U+%04x\n", ch);
683 if (utf8_encoding_error(&s, &p, errors,
684 "unsupported code range"))
685 goto onError;
686 } else {
687 *p++ = 0xe0 | (ch >> 12);
688 *p++ = 0x80 | ((ch >> 6) & 0x3f);
689 *p++ = 0x80 | (ch & 0x3f);
690 }
691 }
692 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000693 if (_PyString_Resize(&v, p - q))
694 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000695
696 done:
697 return v;
698
699 onError:
700 Py_DECREF(v);
701 return NULL;
702}
703
704/* Return a Python string holding the UTF-8 encoded value of the
705 Unicode object.
706
707 The resulting string is cached in the Unicode object for subsequent
708 usage by this function. The cached version is needed to implement
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000709 the character buffer interface and will live (at least) as long as
710 the Unicode object itself.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000711
712 The refcount of the string is *not* incremented.
713
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000714 *** Exported for internal use by the interpreter only !!! ***
715
Guido van Rossumd57fd912000-03-10 22:53:23 +0000716*/
717
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000718PyObject *_PyUnicode_AsUTF8String(PyObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000719 const char *errors)
720{
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000721 PyObject *v = ((PyUnicodeObject *)unicode)->utf8str;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000722
723 if (v)
724 return v;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000725 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
726 PyUnicode_GET_SIZE(unicode),
Guido van Rossumd57fd912000-03-10 22:53:23 +0000727 errors);
728 if (v && errors == NULL)
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000729 ((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000730 return v;
731}
732
733PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
734{
735 PyObject *str;
736
737 if (!PyUnicode_Check(unicode)) {
738 PyErr_BadArgument();
739 return NULL;
740 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000741 str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000742 if (str == NULL)
743 return NULL;
744 Py_INCREF(str);
745 return str;
746}
747
748/* --- UTF-16 Codec ------------------------------------------------------- */
749
750static
751int utf16_decoding_error(const Py_UNICODE **source,
752 Py_UNICODE **dest,
753 const char *errors,
754 const char *details)
755{
756 if ((errors == NULL) ||
757 (strcmp(errors,"strict") == 0)) {
758 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000759 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000760 details);
761 return -1;
762 }
763 else if (strcmp(errors,"ignore") == 0) {
764 return 0;
765 }
766 else if (strcmp(errors,"replace") == 0) {
767 if (dest) {
768 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
769 (*dest)++;
770 }
771 return 0;
772 }
773 else {
774 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000775 "UTF-16 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000776 errors);
777 return -1;
778 }
779}
780
781#define UTF16_ERROR(details) do { \
782 if (utf16_decoding_error(&q, &p, errors, details)) \
783 goto onError; \
784 continue; \
785} while(0)
786
787PyObject *PyUnicode_DecodeUTF16(const char *s,
788 int size,
789 const char *errors,
790 int *byteorder)
791{
792 PyUnicodeObject *unicode;
793 Py_UNICODE *p;
794 const Py_UNICODE *q, *e;
795 int bo = 0;
796
797 /* size should be an even number */
798 if (size % sizeof(Py_UNICODE) != 0) {
799 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
800 return NULL;
801 /* The remaining input chars are ignored if we fall through
802 here... */
803 }
804
805 /* Note: size will always be longer than the resulting Unicode
806 character count */
807 unicode = _PyUnicode_New(size);
808 if (!unicode)
809 return NULL;
810 if (size == 0)
811 return (PyObject *)unicode;
812
813 /* Unpack UTF-16 encoded data */
814 p = unicode->str;
815 q = (Py_UNICODE *)s;
816 e = q + (size / sizeof(Py_UNICODE));
817
818 if (byteorder)
819 bo = *byteorder;
820
821 while (q < e) {
822 register Py_UNICODE ch = *q++;
823
824 /* Check for BOM marks (U+FEFF) in the input and adjust
825 current byte order setting accordingly. Swap input
826 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
827 !) */
828#ifdef BYTEORDER_IS_LITTLE_ENDIAN
829 if (ch == 0xFEFF) {
830 bo = -1;
831 continue;
832 } else if (ch == 0xFFFE) {
833 bo = 1;
834 continue;
835 }
836 if (bo == 1)
837 ch = (ch >> 8) | (ch << 8);
838#else
839 if (ch == 0xFEFF) {
840 bo = 1;
841 continue;
842 } else if (ch == 0xFFFE) {
843 bo = -1;
844 continue;
845 }
846 if (bo == -1)
847 ch = (ch >> 8) | (ch << 8);
848#endif
849 if (ch < 0xD800 || ch > 0xDFFF) {
850 *p++ = ch;
851 continue;
852 }
853
854 /* UTF-16 code pair: */
855 if (q >= e)
856 UTF16_ERROR("unexpected end of data");
857 if (0xDC00 <= *q && *q <= 0xDFFF) {
858 q++;
859 if (0xD800 <= *q && *q <= 0xDBFF)
860 /* This is valid data (a UTF-16 surrogate pair), but
861 we are not able to store this information since our
862 Py_UNICODE type only has 16 bits... this might
863 change someday, even though it's unlikely. */
864 UTF16_ERROR("code pairs are not supported");
865 else
866 continue;
867 }
868 UTF16_ERROR("illegal encoding");
869 }
870
871 if (byteorder)
872 *byteorder = bo;
873
874 /* Adjust length */
875 if (_PyUnicode_Resize(unicode, p - unicode->str))
876 goto onError;
877
878 return (PyObject *)unicode;
879
880onError:
881 Py_DECREF(unicode);
882 return NULL;
883}
884
885#undef UTF16_ERROR
886
887PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
888 int size,
889 const char *errors,
890 int byteorder)
891{
892 PyObject *v;
893 Py_UNICODE *p;
894 char *q;
895
896 /* We don't create UTF-16 pairs... */
897 v = PyString_FromStringAndSize(NULL,
898 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
899 if (v == NULL)
900 return NULL;
901 if (size == 0)
902 goto done;
903
904 q = PyString_AS_STRING(v);
905 p = (Py_UNICODE *)q;
906
907 if (byteorder == 0)
908 *p++ = 0xFEFF;
909 if (byteorder == 0 ||
910#ifdef BYTEORDER_IS_LITTLE_ENDIAN
911 byteorder == -1
912#else
913 byteorder == 1
914#endif
915 )
916 memcpy(p, s, size * sizeof(Py_UNICODE));
917 else
918 while (size-- > 0) {
919 Py_UNICODE ch = *s++;
920 *p++ = (ch >> 8) | (ch << 8);
921 }
922 done:
923 return v;
924}
925
926PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
927{
928 if (!PyUnicode_Check(unicode)) {
929 PyErr_BadArgument();
930 return NULL;
931 }
932 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
933 PyUnicode_GET_SIZE(unicode),
934 NULL,
935 0);
936}
937
938/* --- Unicode Escape Codec ----------------------------------------------- */
939
940static
941int unicodeescape_decoding_error(const char **source,
942 unsigned int *x,
943 const char *errors,
944 const char *details)
945{
946 if ((errors == NULL) ||
947 (strcmp(errors,"strict") == 0)) {
948 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000949 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000950 details);
951 return -1;
952 }
953 else if (strcmp(errors,"ignore") == 0) {
954 return 0;
955 }
956 else if (strcmp(errors,"replace") == 0) {
957 *x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
958 return 0;
959 }
960 else {
961 PyErr_Format(PyExc_ValueError,
962 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000963 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000964 errors);
965 return -1;
966 }
967}
968
969PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
970 int size,
971 const char *errors)
972{
973 PyUnicodeObject *v;
974 Py_UNICODE *p = NULL, *buf = NULL;
975 const char *end;
976
977 /* Escaped strings will always be longer than the resulting
978 Unicode string, so we start with size here and then reduce the
979 length after conversion to the true value. */
980 v = _PyUnicode_New(size);
981 if (v == NULL)
982 goto onError;
983 if (size == 0)
984 return (PyObject *)v;
985 p = buf = PyUnicode_AS_UNICODE(v);
986 end = s + size;
987 while (s < end) {
988 unsigned char c;
989 unsigned int x;
990 int i;
991
992 /* Non-escape characters are interpreted as Unicode ordinals */
993 if (*s != '\\') {
994 *p++ = (unsigned char)*s++;
995 continue;
996 }
997
998 /* \ - Escapes */
999 s++;
1000 switch (*s++) {
1001
1002 /* \x escapes */
1003 case '\n': break;
1004 case '\\': *p++ = '\\'; break;
1005 case '\'': *p++ = '\''; break;
1006 case '\"': *p++ = '\"'; break;
1007 case 'b': *p++ = '\b'; break;
1008 case 'f': *p++ = '\014'; break; /* FF */
1009 case 't': *p++ = '\t'; break;
1010 case 'n': *p++ = '\n'; break;
1011 case 'r': *p++ = '\r'; break;
1012 case 'v': *p++ = '\013'; break; /* VT */
1013 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1014
1015 /* \OOO (octal) escapes */
1016 case '0': case '1': case '2': case '3':
1017 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001018 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001019 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001020 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001021 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001022 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001023 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001024 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001025 break;
1026
1027 /* \xXXXX escape with 0-4 hex digits */
1028 case 'x':
1029 x = 0;
1030 c = (unsigned char)*s;
1031 if (isxdigit(c)) {
1032 do {
1033 x = (x<<4) & ~0xF;
1034 if ('0' <= c && c <= '9')
1035 x += c - '0';
1036 else if ('a' <= c && c <= 'f')
1037 x += 10 + c - 'a';
1038 else
1039 x += 10 + c - 'A';
1040 c = (unsigned char)*++s;
1041 } while (isxdigit(c));
1042 *p++ = x;
1043 } else {
1044 *p++ = '\\';
1045 *p++ = (unsigned char)s[-1];
1046 }
1047 break;
1048
1049 /* \uXXXX with 4 hex digits */
1050 case 'u':
1051 for (x = 0, i = 0; i < 4; i++) {
1052 c = (unsigned char)s[i];
1053 if (!isxdigit(c)) {
1054 if (unicodeescape_decoding_error(&s, &x, errors,
1055 "truncated \\uXXXX"))
1056 goto onError;
1057 i++;
1058 break;
1059 }
1060 x = (x<<4) & ~0xF;
1061 if (c >= '0' && c <= '9')
1062 x += c - '0';
1063 else if (c >= 'a' && c <= 'f')
1064 x += 10 + c - 'a';
1065 else
1066 x += 10 + c - 'A';
1067 }
1068 s += i;
1069 *p++ = x;
1070 break;
1071
1072 default:
1073 *p++ = '\\';
1074 *p++ = (unsigned char)s[-1];
1075 break;
1076 }
1077 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001078 if (_PyUnicode_Resize(v, (int)(p - buf)))
1079 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001080 return (PyObject *)v;
1081
1082 onError:
1083 Py_XDECREF(v);
1084 return NULL;
1085}
1086
1087/* Return a Unicode-Escape string version of the Unicode object.
1088
1089 If quotes is true, the string is enclosed in u"" or u'' quotes as
1090 appropriate.
1091
1092*/
1093
Barry Warsaw51ac5802000-03-20 16:36:48 +00001094static const Py_UNICODE *findchar(const Py_UNICODE *s,
1095 int size,
1096 Py_UNICODE ch);
1097
Guido van Rossumd57fd912000-03-10 22:53:23 +00001098static
1099PyObject *unicodeescape_string(const Py_UNICODE *s,
1100 int size,
1101 int quotes)
1102{
1103 PyObject *repr;
1104 char *p;
1105 char *q;
1106
1107 static const char *hexdigit = "0123456789ABCDEF";
1108
1109 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1110 if (repr == NULL)
1111 return NULL;
1112
1113 p = q = PyString_AS_STRING(repr);
1114
1115 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001116 *p++ = 'u';
1117 *p++ = (findchar(s, size, '\'') &&
1118 !findchar(s, size, '"')) ? '"' : '\'';
1119 }
1120 while (size-- > 0) {
1121 Py_UNICODE ch = *s++;
1122 /* Escape quotes */
1123 if (quotes && (ch == q[1] || ch == '\\')) {
1124 *p++ = '\\';
1125 *p++ = (char) ch;
1126 }
1127 /* Map 16-bit characters to '\uxxxx' */
1128 else if (ch >= 256) {
1129 *p++ = '\\';
1130 *p++ = 'u';
1131 *p++ = hexdigit[(ch >> 12) & 0xf];
1132 *p++ = hexdigit[(ch >> 8) & 0xf];
1133 *p++ = hexdigit[(ch >> 4) & 0xf];
1134 *p++ = hexdigit[ch & 15];
1135 }
1136 /* Map non-printable US ASCII to '\ooo' */
1137 else if (ch < ' ' || ch >= 128) {
1138 *p++ = '\\';
1139 *p++ = hexdigit[(ch >> 6) & 7];
1140 *p++ = hexdigit[(ch >> 3) & 7];
1141 *p++ = hexdigit[ch & 7];
1142 }
1143 /* Copy everything else as-is */
1144 else
1145 *p++ = (char) ch;
1146 }
1147 if (quotes)
1148 *p++ = q[1];
1149
1150 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001151 if (_PyString_Resize(&repr, p - q))
1152 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153
1154 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001155
1156 onError:
1157 Py_DECREF(repr);
1158 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001159}
1160
1161PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1162 int size)
1163{
1164 return unicodeescape_string(s, size, 0);
1165}
1166
1167PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1168{
1169 if (!PyUnicode_Check(unicode)) {
1170 PyErr_BadArgument();
1171 return NULL;
1172 }
1173 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1174 PyUnicode_GET_SIZE(unicode));
1175}
1176
1177/* --- Raw Unicode Escape Codec ------------------------------------------- */
1178
1179PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1180 int size,
1181 const char *errors)
1182{
1183 PyUnicodeObject *v;
1184 Py_UNICODE *p, *buf;
1185 const char *end;
1186 const char *bs;
1187
1188 /* Escaped strings will always be longer than the resulting
1189 Unicode string, so we start with size here and then reduce the
1190 length after conversion to the true value. */
1191 v = _PyUnicode_New(size);
1192 if (v == NULL)
1193 goto onError;
1194 if (size == 0)
1195 return (PyObject *)v;
1196 p = buf = PyUnicode_AS_UNICODE(v);
1197 end = s + size;
1198 while (s < end) {
1199 unsigned char c;
1200 unsigned int x;
1201 int i;
1202
1203 /* Non-escape characters are interpreted as Unicode ordinals */
1204 if (*s != '\\') {
1205 *p++ = (unsigned char)*s++;
1206 continue;
1207 }
1208
1209 /* \u-escapes are only interpreted iff the number of leading
1210 backslashes if odd */
1211 bs = s;
1212 for (;s < end;) {
1213 if (*s != '\\')
1214 break;
1215 *p++ = (unsigned char)*s++;
1216 }
1217 if (((s - bs) & 1) == 0 ||
1218 s >= end ||
1219 *s != 'u') {
1220 continue;
1221 }
1222 p--;
1223 s++;
1224
1225 /* \uXXXX with 4 hex digits */
1226 for (x = 0, i = 0; i < 4; i++) {
1227 c = (unsigned char)s[i];
1228 if (!isxdigit(c)) {
1229 if (unicodeescape_decoding_error(&s, &x, errors,
1230 "truncated \\uXXXX"))
1231 goto onError;
1232 i++;
1233 break;
1234 }
1235 x = (x<<4) & ~0xF;
1236 if (c >= '0' && c <= '9')
1237 x += c - '0';
1238 else if (c >= 'a' && c <= 'f')
1239 x += 10 + c - 'a';
1240 else
1241 x += 10 + c - 'A';
1242 }
1243 s += i;
1244 *p++ = x;
1245 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001246 if (_PyUnicode_Resize(v, (int)(p - buf)))
1247 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001248 return (PyObject *)v;
1249
1250 onError:
1251 Py_XDECREF(v);
1252 return NULL;
1253}
1254
1255PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1256 int size)
1257{
1258 PyObject *repr;
1259 char *p;
1260 char *q;
1261
1262 static const char *hexdigit = "0123456789ABCDEF";
1263
1264 repr = PyString_FromStringAndSize(NULL, 6 * size);
1265 if (repr == NULL)
1266 return NULL;
1267
1268 p = q = PyString_AS_STRING(repr);
1269 while (size-- > 0) {
1270 Py_UNICODE ch = *s++;
1271 /* Map 16-bit characters to '\uxxxx' */
1272 if (ch >= 256) {
1273 *p++ = '\\';
1274 *p++ = 'u';
1275 *p++ = hexdigit[(ch >> 12) & 0xf];
1276 *p++ = hexdigit[(ch >> 8) & 0xf];
1277 *p++ = hexdigit[(ch >> 4) & 0xf];
1278 *p++ = hexdigit[ch & 15];
1279 }
1280 /* Copy everything else as-is */
1281 else
1282 *p++ = (char) ch;
1283 }
1284 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001285 if (_PyString_Resize(&repr, p - q))
1286 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001287
1288 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001289
1290 onError:
1291 Py_DECREF(repr);
1292 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001293}
1294
1295PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1296{
1297 if (!PyUnicode_Check(unicode)) {
1298 PyErr_BadArgument();
1299 return NULL;
1300 }
1301 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1302 PyUnicode_GET_SIZE(unicode));
1303}
1304
1305/* --- Latin-1 Codec ------------------------------------------------------ */
1306
1307PyObject *PyUnicode_DecodeLatin1(const char *s,
1308 int size,
1309 const char *errors)
1310{
1311 PyUnicodeObject *v;
1312 Py_UNICODE *p;
1313
1314 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1315 v = _PyUnicode_New(size);
1316 if (v == NULL)
1317 goto onError;
1318 if (size == 0)
1319 return (PyObject *)v;
1320 p = PyUnicode_AS_UNICODE(v);
1321 while (size-- > 0)
1322 *p++ = (unsigned char)*s++;
1323 return (PyObject *)v;
1324
1325 onError:
1326 Py_XDECREF(v);
1327 return NULL;
1328}
1329
1330static
1331int latin1_encoding_error(const Py_UNICODE **source,
1332 char **dest,
1333 const char *errors,
1334 const char *details)
1335{
1336 if ((errors == NULL) ||
1337 (strcmp(errors,"strict") == 0)) {
1338 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001339 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001340 details);
1341 return -1;
1342 }
1343 else if (strcmp(errors,"ignore") == 0) {
1344 return 0;
1345 }
1346 else if (strcmp(errors,"replace") == 0) {
1347 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001348 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001349 return 0;
1350 }
1351 else {
1352 PyErr_Format(PyExc_ValueError,
1353 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001354 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001355 errors);
1356 return -1;
1357 }
1358}
1359
1360PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1361 int size,
1362 const char *errors)
1363{
1364 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001365 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001366 repr = PyString_FromStringAndSize(NULL, size);
1367 if (repr == NULL)
1368 return NULL;
1369
1370 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001371 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001372 while (size-- > 0) {
1373 Py_UNICODE ch = *p++;
1374 if (ch >= 256) {
1375 if (latin1_encoding_error(&p, &s, errors,
1376 "ordinal not in range(256)"))
1377 goto onError;
1378 }
1379 else
1380 *s++ = (char)ch;
1381 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001382 /* Resize if error handling skipped some characters */
1383 if (s - start < PyString_GET_SIZE(repr))
1384 if (_PyString_Resize(&repr, s - start))
1385 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001386 return repr;
1387
1388 onError:
1389 Py_DECREF(repr);
1390 return NULL;
1391}
1392
1393PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1394{
1395 if (!PyUnicode_Check(unicode)) {
1396 PyErr_BadArgument();
1397 return NULL;
1398 }
1399 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1400 PyUnicode_GET_SIZE(unicode),
1401 NULL);
1402}
1403
1404/* --- 7-bit ASCII Codec -------------------------------------------------- */
1405
1406static
1407int ascii_decoding_error(const char **source,
1408 Py_UNICODE **dest,
1409 const char *errors,
1410 const char *details)
1411{
1412 if ((errors == NULL) ||
1413 (strcmp(errors,"strict") == 0)) {
1414 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001415 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001416 details);
1417 return -1;
1418 }
1419 else if (strcmp(errors,"ignore") == 0) {
1420 return 0;
1421 }
1422 else if (strcmp(errors,"replace") == 0) {
1423 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1424 (*dest)++;
1425 return 0;
1426 }
1427 else {
1428 PyErr_Format(PyExc_ValueError,
1429 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001430 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001431 errors);
1432 return -1;
1433 }
1434}
1435
1436PyObject *PyUnicode_DecodeASCII(const char *s,
1437 int size,
1438 const char *errors)
1439{
1440 PyUnicodeObject *v;
1441 Py_UNICODE *p;
1442
1443 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1444 v = _PyUnicode_New(size);
1445 if (v == NULL)
1446 goto onError;
1447 if (size == 0)
1448 return (PyObject *)v;
1449 p = PyUnicode_AS_UNICODE(v);
1450 while (size-- > 0) {
1451 register unsigned char c;
1452
1453 c = (unsigned char)*s++;
1454 if (c < 128)
1455 *p++ = c;
1456 else if (ascii_decoding_error(&s, &p, errors,
1457 "ordinal not in range(128)"))
1458 goto onError;
1459 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001460 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1461 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1462 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001463 return (PyObject *)v;
1464
1465 onError:
1466 Py_XDECREF(v);
1467 return NULL;
1468}
1469
1470static
1471int ascii_encoding_error(const Py_UNICODE **source,
1472 char **dest,
1473 const char *errors,
1474 const char *details)
1475{
1476 if ((errors == NULL) ||
1477 (strcmp(errors,"strict") == 0)) {
1478 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001479 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480 details);
1481 return -1;
1482 }
1483 else if (strcmp(errors,"ignore") == 0) {
1484 return 0;
1485 }
1486 else if (strcmp(errors,"replace") == 0) {
1487 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001488 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001489 return 0;
1490 }
1491 else {
1492 PyErr_Format(PyExc_ValueError,
1493 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001494 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001495 errors);
1496 return -1;
1497 }
1498}
1499
1500PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1501 int size,
1502 const char *errors)
1503{
1504 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001505 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001506 repr = PyString_FromStringAndSize(NULL, size);
1507 if (repr == NULL)
1508 return NULL;
1509
1510 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001511 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001512 while (size-- > 0) {
1513 Py_UNICODE ch = *p++;
1514 if (ch >= 128) {
1515 if (ascii_encoding_error(&p, &s, errors,
1516 "ordinal not in range(128)"))
1517 goto onError;
1518 }
1519 else
1520 *s++ = (char)ch;
1521 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001522 /* Resize if error handling skipped some characters */
1523 if (s - start < PyString_GET_SIZE(repr))
1524 if (_PyString_Resize(&repr, s - start))
1525 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001526 return repr;
1527
1528 onError:
1529 Py_DECREF(repr);
1530 return NULL;
1531}
1532
1533PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1534{
1535 if (!PyUnicode_Check(unicode)) {
1536 PyErr_BadArgument();
1537 return NULL;
1538 }
1539 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1540 PyUnicode_GET_SIZE(unicode),
1541 NULL);
1542}
1543
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001544#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001545
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001546/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001547
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001548PyObject *PyUnicode_DecodeMBCS(const char *s,
1549 int size,
1550 const char *errors)
1551{
1552 PyUnicodeObject *v;
1553 Py_UNICODE *p;
1554
1555 /* First get the size of the result */
1556 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001557 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001558 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1559
1560 v = _PyUnicode_New(usize);
1561 if (v == NULL)
1562 return NULL;
1563 if (usize == 0)
1564 return (PyObject *)v;
1565 p = PyUnicode_AS_UNICODE(v);
1566 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1567 Py_DECREF(v);
1568 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1569 }
1570
1571 return (PyObject *)v;
1572}
1573
1574PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1575 int size,
1576 const char *errors)
1577{
1578 PyObject *repr;
1579 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001580 DWORD mbcssize;
1581
1582 /* If there are no characters, bail now! */
1583 if (size==0)
1584 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001585
1586 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001587 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001588 if (mbcssize==0)
1589 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1590
1591 repr = PyString_FromStringAndSize(NULL, mbcssize);
1592 if (repr == NULL)
1593 return NULL;
1594 if (mbcssize==0)
1595 return repr;
1596
1597 /* Do the conversion */
1598 s = PyString_AS_STRING(repr);
1599 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1600 Py_DECREF(repr);
1601 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1602 }
1603 return repr;
1604}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001605
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001606#endif /* MS_WIN32 */
1607
Guido van Rossumd57fd912000-03-10 22:53:23 +00001608/* --- Character Mapping Codec -------------------------------------------- */
1609
1610static
1611int charmap_decoding_error(const char **source,
1612 Py_UNICODE **dest,
1613 const char *errors,
1614 const char *details)
1615{
1616 if ((errors == NULL) ||
1617 (strcmp(errors,"strict") == 0)) {
1618 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001619 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001620 details);
1621 return -1;
1622 }
1623 else if (strcmp(errors,"ignore") == 0) {
1624 return 0;
1625 }
1626 else if (strcmp(errors,"replace") == 0) {
1627 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1628 (*dest)++;
1629 return 0;
1630 }
1631 else {
1632 PyErr_Format(PyExc_ValueError,
1633 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001634 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001635 errors);
1636 return -1;
1637 }
1638}
1639
1640PyObject *PyUnicode_DecodeCharmap(const char *s,
1641 int size,
1642 PyObject *mapping,
1643 const char *errors)
1644{
1645 PyUnicodeObject *v;
1646 Py_UNICODE *p;
1647
1648 /* Default to Latin-1 */
1649 if (mapping == NULL)
1650 return PyUnicode_DecodeLatin1(s, size, errors);
1651
1652 v = _PyUnicode_New(size);
1653 if (v == NULL)
1654 goto onError;
1655 if (size == 0)
1656 return (PyObject *)v;
1657 p = PyUnicode_AS_UNICODE(v);
1658 while (size-- > 0) {
1659 unsigned char ch = *s++;
1660 PyObject *w, *x;
1661
1662 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1663 w = PyInt_FromLong((long)ch);
1664 if (w == NULL)
1665 goto onError;
1666 x = PyObject_GetItem(mapping, w);
1667 Py_DECREF(w);
1668 if (x == NULL) {
1669 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1670 /* No mapping found: default to Latin-1 mapping */
1671 PyErr_Clear();
1672 *p++ = (Py_UNICODE)ch;
1673 continue;
1674 }
1675 goto onError;
1676 }
1677
1678 /* Apply mapping */
1679 if (PyInt_Check(x)) {
1680 int value = PyInt_AS_LONG(x);
1681 if (value < 0 || value > 65535) {
1682 PyErr_SetString(PyExc_TypeError,
1683 "character mapping must be in range(65336)");
1684 Py_DECREF(x);
1685 goto onError;
1686 }
1687 *p++ = (Py_UNICODE)value;
1688 }
1689 else if (x == Py_None) {
1690 /* undefined mapping */
1691 if (charmap_decoding_error(&s, &p, errors,
1692 "character maps to <undefined>")) {
1693 Py_DECREF(x);
1694 goto onError;
1695 }
1696 }
1697 else if (PyUnicode_Check(x)) {
1698 if (PyUnicode_GET_SIZE(x) != 1) {
1699 /* 1-n mapping */
1700 PyErr_SetString(PyExc_NotImplementedError,
1701 "1-n mappings are currently not implemented");
1702 Py_DECREF(x);
1703 goto onError;
1704 }
1705 *p++ = *PyUnicode_AS_UNICODE(x);
1706 }
1707 else {
1708 /* wrong return value */
1709 PyErr_SetString(PyExc_TypeError,
1710 "character mapping must return integer, None or unicode");
1711 Py_DECREF(x);
1712 goto onError;
1713 }
1714 Py_DECREF(x);
1715 }
1716 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1717 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1718 goto onError;
1719 return (PyObject *)v;
1720
1721 onError:
1722 Py_XDECREF(v);
1723 return NULL;
1724}
1725
1726static
1727int charmap_encoding_error(const Py_UNICODE **source,
1728 char **dest,
1729 const char *errors,
1730 const char *details)
1731{
1732 if ((errors == NULL) ||
1733 (strcmp(errors,"strict") == 0)) {
1734 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001735 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001736 details);
1737 return -1;
1738 }
1739 else if (strcmp(errors,"ignore") == 0) {
1740 return 0;
1741 }
1742 else if (strcmp(errors,"replace") == 0) {
1743 **dest = '?';
1744 (*dest)++;
1745 return 0;
1746 }
1747 else {
1748 PyErr_Format(PyExc_ValueError,
1749 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001750 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001751 errors);
1752 return -1;
1753 }
1754}
1755
1756PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
1757 int size,
1758 PyObject *mapping,
1759 const char *errors)
1760{
1761 PyObject *v;
1762 char *s;
1763
1764 /* Default to Latin-1 */
1765 if (mapping == NULL)
1766 return PyUnicode_EncodeLatin1(p, size, errors);
1767
1768 v = PyString_FromStringAndSize(NULL, size);
1769 if (v == NULL)
1770 return NULL;
1771 s = PyString_AS_STRING(v);
1772 while (size-- > 0) {
1773 Py_UNICODE ch = *p++;
1774 PyObject *w, *x;
1775
1776 /* Get mapping (Unicode ordinal -> string char, integer or None) */
1777 w = PyInt_FromLong((long)ch);
1778 if (w == NULL)
1779 goto onError;
1780 x = PyObject_GetItem(mapping, w);
1781 Py_DECREF(w);
1782 if (x == NULL) {
1783 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1784 /* No mapping found: default to Latin-1 mapping if possible */
1785 PyErr_Clear();
1786 if (ch < 256) {
1787 *s++ = (char)ch;
1788 continue;
1789 }
1790 else if (!charmap_encoding_error(&p, &s, errors,
1791 "missing character mapping"))
1792 continue;
1793 }
1794 goto onError;
1795 }
1796
1797 /* Apply mapping */
1798 if (PyInt_Check(x)) {
1799 int value = PyInt_AS_LONG(x);
1800 if (value < 0 || value > 255) {
1801 PyErr_SetString(PyExc_TypeError,
1802 "character mapping must be in range(256)");
1803 Py_DECREF(x);
1804 goto onError;
1805 }
1806 *s++ = (char)value;
1807 }
1808 else if (x == Py_None) {
1809 /* undefined mapping */
1810 if (charmap_encoding_error(&p, &s, errors,
1811 "character maps to <undefined>")) {
1812 Py_DECREF(x);
1813 goto onError;
1814 }
1815 }
1816 else if (PyString_Check(x)) {
1817 if (PyString_GET_SIZE(x) != 1) {
1818 /* 1-n mapping */
1819 PyErr_SetString(PyExc_NotImplementedError,
1820 "1-n mappings are currently not implemented");
1821 Py_DECREF(x);
1822 goto onError;
1823 }
1824 *s++ = *PyString_AS_STRING(x);
1825 }
1826 else {
1827 /* wrong return value */
1828 PyErr_SetString(PyExc_TypeError,
1829 "character mapping must return integer, None or unicode");
1830 Py_DECREF(x);
1831 goto onError;
1832 }
1833 Py_DECREF(x);
1834 }
1835 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
1836 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
1837 goto onError;
1838 return v;
1839
1840 onError:
1841 Py_DECREF(v);
1842 return NULL;
1843}
1844
1845PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
1846 PyObject *mapping)
1847{
1848 if (!PyUnicode_Check(unicode) || mapping == NULL) {
1849 PyErr_BadArgument();
1850 return NULL;
1851 }
1852 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
1853 PyUnicode_GET_SIZE(unicode),
1854 mapping,
1855 NULL);
1856}
1857
1858static
1859int translate_error(const Py_UNICODE **source,
1860 Py_UNICODE **dest,
1861 const char *errors,
1862 const char *details)
1863{
1864 if ((errors == NULL) ||
1865 (strcmp(errors,"strict") == 0)) {
1866 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001867 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001868 details);
1869 return -1;
1870 }
1871 else if (strcmp(errors,"ignore") == 0) {
1872 return 0;
1873 }
1874 else if (strcmp(errors,"replace") == 0) {
1875 **dest = '?';
1876 (*dest)++;
1877 return 0;
1878 }
1879 else {
1880 PyErr_Format(PyExc_ValueError,
1881 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001882 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001883 errors);
1884 return -1;
1885 }
1886}
1887
1888PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
1889 int size,
1890 PyObject *mapping,
1891 const char *errors)
1892{
1893 PyUnicodeObject *v;
1894 Py_UNICODE *p;
1895
1896 if (mapping == NULL) {
1897 PyErr_BadArgument();
1898 return NULL;
1899 }
1900
1901 /* Output will never be longer than input */
1902 v = _PyUnicode_New(size);
1903 if (v == NULL)
1904 goto onError;
1905 if (size == 0)
1906 goto done;
1907 p = PyUnicode_AS_UNICODE(v);
1908 while (size-- > 0) {
1909 Py_UNICODE ch = *s++;
1910 PyObject *w, *x;
1911
1912 /* Get mapping */
1913 w = PyInt_FromLong(ch);
1914 if (w == NULL)
1915 goto onError;
1916 x = PyObject_GetItem(mapping, w);
1917 Py_DECREF(w);
1918 if (x == NULL) {
1919 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1920 /* No mapping found: default to 1-1 mapping */
1921 PyErr_Clear();
1922 *p++ = ch;
1923 continue;
1924 }
1925 goto onError;
1926 }
1927
1928 /* Apply mapping */
1929 if (PyInt_Check(x))
1930 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
1931 else if (x == Py_None) {
1932 /* undefined mapping */
1933 if (translate_error(&s, &p, errors,
1934 "character maps to <undefined>")) {
1935 Py_DECREF(x);
1936 goto onError;
1937 }
1938 }
1939 else if (PyUnicode_Check(x)) {
1940 if (PyUnicode_GET_SIZE(x) != 1) {
1941 /* 1-n mapping */
1942 PyErr_SetString(PyExc_NotImplementedError,
1943 "1-n mappings are currently not implemented");
1944 Py_DECREF(x);
1945 goto onError;
1946 }
1947 *p++ = *PyUnicode_AS_UNICODE(x);
1948 }
1949 else {
1950 /* wrong return value */
1951 PyErr_SetString(PyExc_TypeError,
1952 "translate mapping must return integer, None or unicode");
1953 Py_DECREF(x);
1954 goto onError;
1955 }
1956 Py_DECREF(x);
1957 }
1958 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001959 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1960 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001961
1962 done:
1963 return (PyObject *)v;
1964
1965 onError:
1966 Py_XDECREF(v);
1967 return NULL;
1968}
1969
1970PyObject *PyUnicode_Translate(PyObject *str,
1971 PyObject *mapping,
1972 const char *errors)
1973{
1974 PyObject *result;
1975
1976 str = PyUnicode_FromObject(str);
1977 if (str == NULL)
1978 goto onError;
1979 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
1980 PyUnicode_GET_SIZE(str),
1981 mapping,
1982 errors);
1983 Py_DECREF(str);
1984 return result;
1985
1986 onError:
1987 Py_XDECREF(str);
1988 return NULL;
1989}
1990
Guido van Rossum9e896b32000-04-05 20:11:21 +00001991/* --- Decimal Encoder ---------------------------------------------------- */
1992
1993int PyUnicode_EncodeDecimal(Py_UNICODE *s,
1994 int length,
1995 char *output,
1996 const char *errors)
1997{
1998 Py_UNICODE *p, *end;
1999
2000 if (output == NULL) {
2001 PyErr_BadArgument();
2002 return -1;
2003 }
2004
2005 p = s;
2006 end = s + length;
2007 while (p < end) {
2008 register Py_UNICODE ch = *p++;
2009 int decimal;
2010
2011 if (Py_UNICODE_ISSPACE(ch)) {
2012 *output++ = ' ';
2013 continue;
2014 }
2015 decimal = Py_UNICODE_TODECIMAL(ch);
2016 if (decimal >= 0) {
2017 *output++ = '0' + decimal;
2018 continue;
2019 }
Guido van Rossumba477042000-04-06 18:18:10 +00002020 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002021 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002022 continue;
2023 }
2024 /* All other characters are considered invalid */
2025 if (errors == NULL || strcmp(errors, "strict") == 0) {
2026 PyErr_SetString(PyExc_ValueError,
2027 "invalid decimal Unicode string");
2028 goto onError;
2029 }
2030 else if (strcmp(errors, "ignore") == 0)
2031 continue;
2032 else if (strcmp(errors, "replace") == 0) {
2033 *output++ = '?';
2034 continue;
2035 }
2036 }
2037 /* 0-terminate the output string */
2038 *output++ = '\0';
2039 return 0;
2040
2041 onError:
2042 return -1;
2043}
2044
Guido van Rossumd57fd912000-03-10 22:53:23 +00002045/* --- Helpers ------------------------------------------------------------ */
2046
2047static
2048int count(PyUnicodeObject *self,
2049 int start,
2050 int end,
2051 PyUnicodeObject *substring)
2052{
2053 int count = 0;
2054
2055 end -= substring->length;
2056
2057 while (start <= end)
2058 if (Py_UNICODE_MATCH(self, start, substring)) {
2059 count++;
2060 start += substring->length;
2061 } else
2062 start++;
2063
2064 return count;
2065}
2066
2067int PyUnicode_Count(PyObject *str,
2068 PyObject *substr,
2069 int start,
2070 int end)
2071{
2072 int result;
2073
2074 str = PyUnicode_FromObject(str);
2075 if (str == NULL)
2076 return -1;
2077 substr = PyUnicode_FromObject(substr);
2078 if (substr == NULL) {
2079 Py_DECREF(substr);
2080 return -1;
2081 }
2082
2083 result = count((PyUnicodeObject *)str,
2084 start, end,
2085 (PyUnicodeObject *)substr);
2086
2087 Py_DECREF(str);
2088 Py_DECREF(substr);
2089 return result;
2090}
2091
2092static
2093int findstring(PyUnicodeObject *self,
2094 PyUnicodeObject *substring,
2095 int start,
2096 int end,
2097 int direction)
2098{
2099 if (start < 0)
2100 start += self->length;
2101 if (start < 0)
2102 start = 0;
2103
2104 if (substring->length == 0)
2105 return start;
2106
2107 if (end > self->length)
2108 end = self->length;
2109 if (end < 0)
2110 end += self->length;
2111 if (end < 0)
2112 end = 0;
2113
2114 end -= substring->length;
2115
2116 if (direction < 0) {
2117 for (; end >= start; end--)
2118 if (Py_UNICODE_MATCH(self, end, substring))
2119 return end;
2120 } else {
2121 for (; start <= end; start++)
2122 if (Py_UNICODE_MATCH(self, start, substring))
2123 return start;
2124 }
2125
2126 return -1;
2127}
2128
2129int PyUnicode_Find(PyObject *str,
2130 PyObject *substr,
2131 int start,
2132 int end,
2133 int direction)
2134{
2135 int result;
2136
2137 str = PyUnicode_FromObject(str);
2138 if (str == NULL)
2139 return -1;
2140 substr = PyUnicode_FromObject(substr);
2141 if (substr == NULL) {
2142 Py_DECREF(substr);
2143 return -1;
2144 }
2145
2146 result = findstring((PyUnicodeObject *)str,
2147 (PyUnicodeObject *)substr,
2148 start, end, direction);
2149 Py_DECREF(str);
2150 Py_DECREF(substr);
2151 return result;
2152}
2153
2154static
2155int tailmatch(PyUnicodeObject *self,
2156 PyUnicodeObject *substring,
2157 int start,
2158 int end,
2159 int direction)
2160{
2161 if (start < 0)
2162 start += self->length;
2163 if (start < 0)
2164 start = 0;
2165
2166 if (substring->length == 0)
2167 return 1;
2168
2169 if (end > self->length)
2170 end = self->length;
2171 if (end < 0)
2172 end += self->length;
2173 if (end < 0)
2174 end = 0;
2175
2176 end -= substring->length;
2177 if (end < start)
2178 return 0;
2179
2180 if (direction > 0) {
2181 if (Py_UNICODE_MATCH(self, end, substring))
2182 return 1;
2183 } else {
2184 if (Py_UNICODE_MATCH(self, start, substring))
2185 return 1;
2186 }
2187
2188 return 0;
2189}
2190
2191int PyUnicode_Tailmatch(PyObject *str,
2192 PyObject *substr,
2193 int start,
2194 int end,
2195 int direction)
2196{
2197 int result;
2198
2199 str = PyUnicode_FromObject(str);
2200 if (str == NULL)
2201 return -1;
2202 substr = PyUnicode_FromObject(substr);
2203 if (substr == NULL) {
2204 Py_DECREF(substr);
2205 return -1;
2206 }
2207
2208 result = tailmatch((PyUnicodeObject *)str,
2209 (PyUnicodeObject *)substr,
2210 start, end, direction);
2211 Py_DECREF(str);
2212 Py_DECREF(substr);
2213 return result;
2214}
2215
2216static
2217const Py_UNICODE *findchar(const Py_UNICODE *s,
2218 int size,
2219 Py_UNICODE ch)
2220{
2221 /* like wcschr, but doesn't stop at NULL characters */
2222
2223 while (size-- > 0) {
2224 if (*s == ch)
2225 return s;
2226 s++;
2227 }
2228
2229 return NULL;
2230}
2231
2232/* Apply fixfct filter to the Unicode object self and return a
2233 reference to the modified object */
2234
2235static
2236PyObject *fixup(PyUnicodeObject *self,
2237 int (*fixfct)(PyUnicodeObject *s))
2238{
2239
2240 PyUnicodeObject *u;
2241
2242 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2243 self->length);
2244 if (u == NULL)
2245 return NULL;
2246 if (!fixfct(u)) {
2247 /* fixfct should return TRUE if it modified the buffer. If
2248 FALSE, return a reference to the original buffer instead
2249 (to save space, not time) */
2250 Py_INCREF(self);
2251 Py_DECREF(u);
2252 return (PyObject*) self;
2253 }
2254 return (PyObject*) u;
2255}
2256
2257static
2258int fixupper(PyUnicodeObject *self)
2259{
2260 int len = self->length;
2261 Py_UNICODE *s = self->str;
2262 int status = 0;
2263
2264 while (len-- > 0) {
2265 register Py_UNICODE ch;
2266
2267 ch = Py_UNICODE_TOUPPER(*s);
2268 if (ch != *s) {
2269 status = 1;
2270 *s = ch;
2271 }
2272 s++;
2273 }
2274
2275 return status;
2276}
2277
2278static
2279int fixlower(PyUnicodeObject *self)
2280{
2281 int len = self->length;
2282 Py_UNICODE *s = self->str;
2283 int status = 0;
2284
2285 while (len-- > 0) {
2286 register Py_UNICODE ch;
2287
2288 ch = Py_UNICODE_TOLOWER(*s);
2289 if (ch != *s) {
2290 status = 1;
2291 *s = ch;
2292 }
2293 s++;
2294 }
2295
2296 return status;
2297}
2298
2299static
2300int fixswapcase(PyUnicodeObject *self)
2301{
2302 int len = self->length;
2303 Py_UNICODE *s = self->str;
2304 int status = 0;
2305
2306 while (len-- > 0) {
2307 if (Py_UNICODE_ISUPPER(*s)) {
2308 *s = Py_UNICODE_TOLOWER(*s);
2309 status = 1;
2310 } else if (Py_UNICODE_ISLOWER(*s)) {
2311 *s = Py_UNICODE_TOUPPER(*s);
2312 status = 1;
2313 }
2314 s++;
2315 }
2316
2317 return status;
2318}
2319
2320static
2321int fixcapitalize(PyUnicodeObject *self)
2322{
2323 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2324 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2325 return 1;
2326 }
2327 return 0;
2328}
2329
2330static
2331int fixtitle(PyUnicodeObject *self)
2332{
2333 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2334 register Py_UNICODE *e;
2335 int previous_is_cased;
2336
2337 /* Shortcut for single character strings */
2338 if (PyUnicode_GET_SIZE(self) == 1) {
2339 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2340 if (*p != ch) {
2341 *p = ch;
2342 return 1;
2343 }
2344 else
2345 return 0;
2346 }
2347
2348 e = p + PyUnicode_GET_SIZE(self);
2349 previous_is_cased = 0;
2350 for (; p < e; p++) {
2351 register const Py_UNICODE ch = *p;
2352
2353 if (previous_is_cased)
2354 *p = Py_UNICODE_TOLOWER(ch);
2355 else
2356 *p = Py_UNICODE_TOTITLE(ch);
2357
2358 if (Py_UNICODE_ISLOWER(ch) ||
2359 Py_UNICODE_ISUPPER(ch) ||
2360 Py_UNICODE_ISTITLE(ch))
2361 previous_is_cased = 1;
2362 else
2363 previous_is_cased = 0;
2364 }
2365 return 1;
2366}
2367
2368PyObject *PyUnicode_Join(PyObject *separator,
2369 PyObject *seq)
2370{
2371 Py_UNICODE *sep;
2372 int seplen;
2373 PyUnicodeObject *res = NULL;
2374 int reslen = 0;
2375 Py_UNICODE *p;
2376 int seqlen = 0;
2377 int sz = 100;
2378 int i;
2379
2380 seqlen = PySequence_Length(seq);
2381 if (seqlen < 0 && PyErr_Occurred())
2382 return NULL;
2383
2384 if (separator == NULL) {
2385 Py_UNICODE blank = ' ';
2386 sep = &blank;
2387 seplen = 1;
2388 }
2389 else {
2390 separator = PyUnicode_FromObject(separator);
2391 if (separator == NULL)
2392 return NULL;
2393 sep = PyUnicode_AS_UNICODE(separator);
2394 seplen = PyUnicode_GET_SIZE(separator);
2395 }
2396
2397 res = _PyUnicode_New(sz);
2398 if (res == NULL)
2399 goto onError;
2400 p = PyUnicode_AS_UNICODE(res);
2401 reslen = 0;
2402
2403 for (i = 0; i < seqlen; i++) {
2404 int itemlen;
2405 PyObject *item;
2406
2407 item = PySequence_GetItem(seq, i);
2408 if (item == NULL)
2409 goto onError;
2410 if (!PyUnicode_Check(item)) {
2411 PyObject *v;
2412 v = PyUnicode_FromObject(item);
2413 Py_DECREF(item);
2414 item = v;
2415 if (item == NULL)
2416 goto onError;
2417 }
2418 itemlen = PyUnicode_GET_SIZE(item);
2419 while (reslen + itemlen + seplen >= sz) {
2420 if (_PyUnicode_Resize(res, sz*2))
2421 goto onError;
2422 sz *= 2;
2423 p = PyUnicode_AS_UNICODE(res) + reslen;
2424 }
2425 if (i > 0) {
2426 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2427 p += seplen;
2428 reslen += seplen;
2429 }
2430 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2431 p += itemlen;
2432 reslen += itemlen;
2433 Py_DECREF(item);
2434 }
2435 if (_PyUnicode_Resize(res, reslen))
2436 goto onError;
2437
2438 Py_XDECREF(separator);
2439 return (PyObject *)res;
2440
2441 onError:
2442 Py_XDECREF(separator);
2443 Py_DECREF(res);
2444 return NULL;
2445}
2446
2447static
2448PyUnicodeObject *pad(PyUnicodeObject *self,
2449 int left,
2450 int right,
2451 Py_UNICODE fill)
2452{
2453 PyUnicodeObject *u;
2454
2455 if (left < 0)
2456 left = 0;
2457 if (right < 0)
2458 right = 0;
2459
2460 if (left == 0 && right == 0) {
2461 Py_INCREF(self);
2462 return self;
2463 }
2464
2465 u = _PyUnicode_New(left + self->length + right);
2466 if (u) {
2467 if (left)
2468 Py_UNICODE_FILL(u->str, fill, left);
2469 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2470 if (right)
2471 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2472 }
2473
2474 return u;
2475}
2476
/* Append the slice data[left:right] to the result list as a new Unicode
   object.

   The macro expands in the caller's scope and expects it to provide a
   PyObject *str scratch variable, the PyObject *list being filled and
   an onError label, which is jumped to when either the object creation
   or the append fails.  It is wrapped in do { } while (0) so that it
   behaves as a single statement; invoke it with a trailing semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        /* The list holds its own reference now. */                     \
        Py_DECREF(str);                                                 \
    } while (0)
2487
2488static
2489PyObject *split_whitespace(PyUnicodeObject *self,
2490 PyObject *list,
2491 int maxcount)
2492{
2493 register int i;
2494 register int j;
2495 int len = self->length;
2496 PyObject *str;
2497
2498 for (i = j = 0; i < len; ) {
2499 /* find a token */
2500 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2501 i++;
2502 j = i;
2503 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2504 i++;
2505 if (j < i) {
2506 if (maxcount-- <= 0)
2507 break;
2508 SPLIT_APPEND(self->str, j, i);
2509 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2510 i++;
2511 j = i;
2512 }
2513 }
2514 if (j < len) {
2515 SPLIT_APPEND(self->str, j, len);
2516 }
2517 return list;
2518
2519 onError:
2520 Py_DECREF(list);
2521 return NULL;
2522}
2523
2524PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002525 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002526{
2527 register int i;
2528 register int j;
2529 int len;
2530 PyObject *list;
2531 PyObject *str;
2532 Py_UNICODE *data;
2533
2534 string = PyUnicode_FromObject(string);
2535 if (string == NULL)
2536 return NULL;
2537 data = PyUnicode_AS_UNICODE(string);
2538 len = PyUnicode_GET_SIZE(string);
2539
Guido van Rossumd57fd912000-03-10 22:53:23 +00002540 list = PyList_New(0);
2541 if (!list)
2542 goto onError;
2543
2544 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002545 int eol;
2546
Guido van Rossumd57fd912000-03-10 22:53:23 +00002547 /* Find a line and append it */
2548 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2549 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002550
2551 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002552 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002553 if (i < len) {
2554 if (data[i] == '\r' && i + 1 < len &&
2555 data[i+1] == '\n')
2556 i += 2;
2557 else
2558 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002559 if (keepends)
2560 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002561 }
Guido van Rossum86662912000-04-11 15:38:46 +00002562 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002563 j = i;
2564 }
2565 if (j < len) {
2566 SPLIT_APPEND(data, j, len);
2567 }
2568
2569 Py_DECREF(string);
2570 return list;
2571
2572 onError:
2573 Py_DECREF(list);
2574 Py_DECREF(string);
2575 return NULL;
2576}
2577
2578static
2579PyObject *split_char(PyUnicodeObject *self,
2580 PyObject *list,
2581 Py_UNICODE ch,
2582 int maxcount)
2583{
2584 register int i;
2585 register int j;
2586 int len = self->length;
2587 PyObject *str;
2588
2589 for (i = j = 0; i < len; ) {
2590 if (self->str[i] == ch) {
2591 if (maxcount-- <= 0)
2592 break;
2593 SPLIT_APPEND(self->str, j, i);
2594 i = j = i + 1;
2595 } else
2596 i++;
2597 }
2598 if (j <= len) {
2599 SPLIT_APPEND(self->str, j, len);
2600 }
2601 return list;
2602
2603 onError:
2604 Py_DECREF(list);
2605 return NULL;
2606}
2607
2608static
2609PyObject *split_substring(PyUnicodeObject *self,
2610 PyObject *list,
2611 PyUnicodeObject *substring,
2612 int maxcount)
2613{
2614 register int i;
2615 register int j;
2616 int len = self->length;
2617 int sublen = substring->length;
2618 PyObject *str;
2619
2620 for (i = j = 0; i < len - sublen; ) {
2621 if (Py_UNICODE_MATCH(self, i, substring)) {
2622 if (maxcount-- <= 0)
2623 break;
2624 SPLIT_APPEND(self->str, j, i);
2625 i = j = i + sublen;
2626 } else
2627 i++;
2628 }
2629 if (j <= len) {
2630 SPLIT_APPEND(self->str, j, len);
2631 }
2632 return list;
2633
2634 onError:
2635 Py_DECREF(list);
2636 return NULL;
2637}
2638
2639#undef SPLIT_APPEND
2640
2641static
2642PyObject *split(PyUnicodeObject *self,
2643 PyUnicodeObject *substring,
2644 int maxcount)
2645{
2646 PyObject *list;
2647
2648 if (maxcount < 0)
2649 maxcount = INT_MAX;
2650
2651 list = PyList_New(0);
2652 if (!list)
2653 return NULL;
2654
2655 if (substring == NULL)
2656 return split_whitespace(self,list,maxcount);
2657
2658 else if (substring->length == 1)
2659 return split_char(self,list,substring->str[0],maxcount);
2660
2661 else if (substring->length == 0) {
2662 Py_DECREF(list);
2663 PyErr_SetString(PyExc_ValueError, "empty separator");
2664 return NULL;
2665 }
2666 else
2667 return split_substring(self,list,substring,maxcount);
2668}
2669
2670static
2671PyObject *strip(PyUnicodeObject *self,
2672 int left,
2673 int right)
2674{
2675 Py_UNICODE *p = self->str;
2676 int start = 0;
2677 int end = self->length;
2678
2679 if (left)
2680 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2681 start++;
2682
2683 if (right)
2684 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2685 end--;
2686
2687 if (start == 0 && end == self->length) {
2688 /* couldn't strip anything off, return original string */
2689 Py_INCREF(self);
2690 return (PyObject*) self;
2691 }
2692
2693 return (PyObject*) PyUnicode_FromUnicode(
2694 self->str + start,
2695 end - start
2696 );
2697}
2698
2699static
2700PyObject *replace(PyUnicodeObject *self,
2701 PyUnicodeObject *str1,
2702 PyUnicodeObject *str2,
2703 int maxcount)
2704{
2705 PyUnicodeObject *u;
2706
2707 if (maxcount < 0)
2708 maxcount = INT_MAX;
2709
2710 if (str1->length == 1 && str2->length == 1) {
2711 int i;
2712
2713 /* replace characters */
2714 if (!findchar(self->str, self->length, str1->str[0])) {
2715 /* nothing to replace, return original string */
2716 Py_INCREF(self);
2717 u = self;
2718 } else {
2719 Py_UNICODE u1 = str1->str[0];
2720 Py_UNICODE u2 = str2->str[0];
2721
2722 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
2723 self->str,
2724 self->length
2725 );
2726 if (u)
2727 for (i = 0; i < u->length; i++)
2728 if (u->str[i] == u1) {
2729 if (--maxcount < 0)
2730 break;
2731 u->str[i] = u2;
2732 }
2733 }
2734
2735 } else {
2736 int n, i;
2737 Py_UNICODE *p;
2738
2739 /* replace strings */
2740 n = count(self, 0, self->length, str1);
2741 if (n > maxcount)
2742 n = maxcount;
2743 if (n == 0) {
2744 /* nothing to replace, return original string */
2745 Py_INCREF(self);
2746 u = self;
2747 } else {
2748 u = _PyUnicode_New(
2749 self->length + n * (str2->length - str1->length));
2750 if (u) {
2751 i = 0;
2752 p = u->str;
2753 while (i <= self->length - str1->length)
2754 if (Py_UNICODE_MATCH(self, i, str1)) {
2755 /* replace string segment */
2756 Py_UNICODE_COPY(p, str2->str, str2->length);
2757 p += str2->length;
2758 i += str1->length;
2759 if (--n <= 0) {
2760 /* copy remaining part */
2761 Py_UNICODE_COPY(p, self->str+i, self->length-i);
2762 break;
2763 }
2764 } else
2765 *p++ = self->str[i++];
2766 }
2767 }
2768 }
2769
2770 return (PyObject *) u;
2771}
2772
2773/* --- Unicode Object Methods --------------------------------------------- */
2774
2775static char title__doc__[] =
2776"S.title() -> unicode\n\
2777\n\
2778Return a titlecased version of S, i.e. words start with title case\n\
2779characters, all remaining cased characters have lower case.";
2780
2781static PyObject*
2782unicode_title(PyUnicodeObject *self, PyObject *args)
2783{
2784 if (!PyArg_NoArgs(args))
2785 return NULL;
2786 return fixup(self, fixtitle);
2787}
2788
2789static char capitalize__doc__[] =
2790"S.capitalize() -> unicode\n\
2791\n\
2792Return a capitalized version of S, i.e. make the first character\n\
2793have upper case.";
2794
2795static PyObject*
2796unicode_capitalize(PyUnicodeObject *self, PyObject *args)
2797{
2798 if (!PyArg_NoArgs(args))
2799 return NULL;
2800 return fixup(self, fixcapitalize);
2801}
2802
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method S.capwords() (currently compiled out): split at whitespace,
   capitalize every word, and join the words with single spaces. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result = NULL;
    int i;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                                fixcapitalize);
        if (fixed == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, fixed);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
2843
2844static char center__doc__[] =
2845"S.center(width) -> unicode\n\
2846\n\
2847Return S centered in a Unicode string of length width. Padding is done\n\
2848using spaces.";
2849
2850static PyObject *
2851unicode_center(PyUnicodeObject *self, PyObject *args)
2852{
2853 int marg, left;
2854 int width;
2855
2856 if (!PyArg_ParseTuple(args, "i:center", &width))
2857 return NULL;
2858
2859 if (self->length >= width) {
2860 Py_INCREF(self);
2861 return (PyObject*) self;
2862 }
2863
2864 marg = width - self->length;
2865 left = marg / 2 + (marg & width & 1);
2866
2867 return (PyObject*) pad(self, left, marg - left, ' ');
2868}
2869
2870static int
2871unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
2872{
2873 int len1, len2;
2874 Py_UNICODE *s1 = str1->str;
2875 Py_UNICODE *s2 = str2->str;
2876
2877 len1 = str1->length;
2878 len2 = str2->length;
2879
2880 while (len1 > 0 && len2 > 0) {
2881 int cmp = (*s1++) - (*s2++);
2882 if (cmp)
2883 /* This should make Christian happy! */
2884 return (cmp < 0) ? -1 : (cmp != 0);
2885 len1--, len2--;
2886 }
2887
2888 return (len1 < len2) ? -1 : (len1 != len2);
2889}
2890
2891int PyUnicode_Compare(PyObject *left,
2892 PyObject *right)
2893{
2894 PyUnicodeObject *u = NULL, *v = NULL;
2895 int result;
2896
2897 /* Coerce the two arguments */
2898 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2899 if (u == NULL)
2900 goto onError;
2901 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2902 if (v == NULL)
2903 goto onError;
2904
2905 /* Shortcut for emtpy or interned objects */
2906 if (v == u) {
2907 Py_DECREF(u);
2908 Py_DECREF(v);
2909 return 0;
2910 }
2911
2912 result = unicode_compare(u, v);
2913
2914 Py_DECREF(u);
2915 Py_DECREF(v);
2916 return result;
2917
2918onError:
2919 Py_XDECREF(u);
2920 Py_XDECREF(v);
2921 return -1;
2922}
2923
Guido van Rossum403d68b2000-03-13 15:55:09 +00002924int PyUnicode_Contains(PyObject *container,
2925 PyObject *element)
2926{
2927 PyUnicodeObject *u = NULL, *v = NULL;
2928 int result;
2929 register const Py_UNICODE *p, *e;
2930 register Py_UNICODE ch;
2931
2932 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00002933 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
2934 if (v == NULL)
2935 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002936 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
2937 if (u == NULL) {
2938 Py_DECREF(v);
2939 goto onError;
2940 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00002941
2942 /* Check v in u */
2943 if (PyUnicode_GET_SIZE(v) != 1) {
2944 PyErr_SetString(PyExc_TypeError,
2945 "string member test needs char left operand");
2946 goto onError;
2947 }
2948 ch = *PyUnicode_AS_UNICODE(v);
2949 p = PyUnicode_AS_UNICODE(u);
2950 e = p + PyUnicode_GET_SIZE(u);
2951 result = 0;
2952 while (p < e) {
2953 if (*p++ == ch) {
2954 result = 1;
2955 break;
2956 }
2957 }
2958
2959 Py_DECREF(u);
2960 Py_DECREF(v);
2961 return result;
2962
2963onError:
2964 Py_XDECREF(u);
2965 Py_XDECREF(v);
2966 return -1;
2967}
2968
Guido van Rossumd57fd912000-03-10 22:53:23 +00002969/* Concat to string or Unicode object giving a new Unicode object. */
2970
2971PyObject *PyUnicode_Concat(PyObject *left,
2972 PyObject *right)
2973{
2974 PyUnicodeObject *u = NULL, *v = NULL, *w;
2975
2976 /* Coerce the two arguments */
2977 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2978 if (u == NULL)
2979 goto onError;
2980 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2981 if (v == NULL)
2982 goto onError;
2983
2984 /* Shortcuts */
2985 if (v == unicode_empty) {
2986 Py_DECREF(v);
2987 return (PyObject *)u;
2988 }
2989 if (u == unicode_empty) {
2990 Py_DECREF(u);
2991 return (PyObject *)v;
2992 }
2993
2994 /* Concat the two Unicode strings */
2995 w = _PyUnicode_New(u->length + v->length);
2996 if (w == NULL)
2997 goto onError;
2998 Py_UNICODE_COPY(w->str, u->str, u->length);
2999 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3000
3001 Py_DECREF(u);
3002 Py_DECREF(v);
3003 return (PyObject *)w;
3004
3005onError:
3006 Py_XDECREF(u);
3007 Py_XDECREF(v);
3008 return NULL;
3009}
3010
3011static char count__doc__[] =
3012"S.count(sub[, start[, end]]) -> int\n\
3013\n\
3014Return the number of occurrences of substring sub in Unicode string\n\
3015S[start:end]. Optional arguments start and end are\n\
3016interpreted as in slice notation.";
3017
3018static PyObject *
3019unicode_count(PyUnicodeObject *self, PyObject *args)
3020{
3021 PyUnicodeObject *substring;
3022 int start = 0;
3023 int end = INT_MAX;
3024 PyObject *result;
3025
Guido van Rossumb8872e62000-05-09 14:14:27 +00003026 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3027 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003028 return NULL;
3029
3030 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3031 (PyObject *)substring);
3032 if (substring == NULL)
3033 return NULL;
3034
3035 if (substring->length == 0) {
3036 Py_DECREF(substring);
3037 return PyInt_FromLong((long) 0);
3038 }
3039
3040 if (start < 0)
3041 start += self->length;
3042 if (start < 0)
3043 start = 0;
3044 if (end > self->length)
3045 end = self->length;
3046 if (end < 0)
3047 end += self->length;
3048 if (end < 0)
3049 end = 0;
3050
3051 result = PyInt_FromLong((long) count(self, start, end, substring));
3052
3053 Py_DECREF(substring);
3054 return result;
3055}
3056
3057static char encode__doc__[] =
3058"S.encode([encoding[,errors]]) -> string\n\
3059\n\
3060Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
3061errors may be given to set a different error handling scheme. Default\n\
3062is 'strict' meaning that encoding errors raise a ValueError. Other\n\
3063possible values are 'ignore' and 'replace'.";
3064
3065static PyObject *
3066unicode_encode(PyUnicodeObject *self, PyObject *args)
3067{
3068 char *encoding = NULL;
3069 char *errors = NULL;
3070 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3071 return NULL;
3072 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3073}
3074
3075static char expandtabs__doc__[] =
3076"S.expandtabs([tabsize]) -> unicode\n\
3077\n\
3078Return a copy of S where all tab characters are expanded using spaces.\n\
3079If tabsize is not given, a tab size of 8 characters is assumed.";
3080
3081static PyObject*
3082unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3083{
3084 Py_UNICODE *e;
3085 Py_UNICODE *p;
3086 Py_UNICODE *q;
3087 int i, j;
3088 PyUnicodeObject *u;
3089 int tabsize = 8;
3090
3091 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3092 return NULL;
3093
3094 /* First pass: determine size of ouput string */
3095 i = j = 0;
3096 e = self->str + self->length;
3097 for (p = self->str; p < e; p++)
3098 if (*p == '\t') {
3099 if (tabsize > 0)
3100 j += tabsize - (j % tabsize);
3101 }
3102 else {
3103 j++;
3104 if (*p == '\n' || *p == '\r') {
3105 i += j;
3106 j = 0;
3107 }
3108 }
3109
3110 /* Second pass: create output string and fill it */
3111 u = _PyUnicode_New(i + j);
3112 if (!u)
3113 return NULL;
3114
3115 j = 0;
3116 q = u->str;
3117
3118 for (p = self->str; p < e; p++)
3119 if (*p == '\t') {
3120 if (tabsize > 0) {
3121 i = tabsize - (j % tabsize);
3122 j += i;
3123 while (i--)
3124 *q++ = ' ';
3125 }
3126 }
3127 else {
3128 j++;
3129 *q++ = *p;
3130 if (*p == '\n' || *p == '\r')
3131 j = 0;
3132 }
3133
3134 return (PyObject*) u;
3135}
3136
3137static char find__doc__[] =
3138"S.find(sub [,start [,end]]) -> int\n\
3139\n\
3140Return the lowest index in S where substring sub is found,\n\
3141such that sub is contained within s[start,end]. Optional\n\
3142arguments start and end are interpreted as in slice notation.\n\
3143\n\
3144Return -1 on failure.";
3145
3146static PyObject *
3147unicode_find(PyUnicodeObject *self, PyObject *args)
3148{
3149 PyUnicodeObject *substring;
3150 int start = 0;
3151 int end = INT_MAX;
3152 PyObject *result;
3153
Guido van Rossumb8872e62000-05-09 14:14:27 +00003154 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3155 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003156 return NULL;
3157 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3158 (PyObject *)substring);
3159 if (substring == NULL)
3160 return NULL;
3161
3162 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3163
3164 Py_DECREF(substring);
3165 return result;
3166}
3167
3168static PyObject *
3169unicode_getitem(PyUnicodeObject *self, int index)
3170{
3171 if (index < 0 || index >= self->length) {
3172 PyErr_SetString(PyExc_IndexError, "string index out of range");
3173 return NULL;
3174 }
3175
3176 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3177}
3178
3179static long
3180unicode_hash(PyUnicodeObject *self)
3181{
3182 long hash;
3183 PyObject *utf8;
3184
3185 /* Since Unicode objects compare equal to their UTF-8 string
3186 counterparts, they should also use the UTF-8 strings as basis
3187 for their hash value. This is needed to assure that strings and
3188 Unicode objects behave in the same way as dictionary
3189 keys. Unfortunately, this costs some performance and also some
3190 memory if the cached UTF-8 representation is not used later
3191 on. */
3192 if (self->hash != -1)
3193 return self->hash;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00003194 utf8 = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003195 if (utf8 == NULL)
3196 return -1;
3197 hash = PyObject_Hash(utf8);
3198 if (hash == -1)
3199 return -1;
3200 self->hash = hash;
3201 return hash;
3202}
3203
3204static char index__doc__[] =
3205"S.index(sub [,start [,end]]) -> int\n\
3206\n\
3207Like S.find() but raise ValueError when the substring is not found.";
3208
3209static PyObject *
3210unicode_index(PyUnicodeObject *self, PyObject *args)
3211{
3212 int result;
3213 PyUnicodeObject *substring;
3214 int start = 0;
3215 int end = INT_MAX;
3216
Guido van Rossumb8872e62000-05-09 14:14:27 +00003217 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3218 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003219 return NULL;
3220
3221 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3222 (PyObject *)substring);
3223 if (substring == NULL)
3224 return NULL;
3225
3226 result = findstring(self, substring, start, end, 1);
3227
3228 Py_DECREF(substring);
3229 if (result < 0) {
3230 PyErr_SetString(PyExc_ValueError, "substring not found");
3231 return NULL;
3232 }
3233 return PyInt_FromLong(result);
3234}
3235
3236static char islower__doc__[] =
3237"S.islower() -> int\n\
3238\n\
3239Return 1 if all cased characters in S are lowercase and there is\n\
3240at least one cased character in S, 0 otherwise.";
3241
3242static PyObject*
3243unicode_islower(PyUnicodeObject *self, PyObject *args)
3244{
3245 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3246 register const Py_UNICODE *e;
3247 int cased;
3248
3249 if (!PyArg_NoArgs(args))
3250 return NULL;
3251
3252 /* Shortcut for single character strings */
3253 if (PyUnicode_GET_SIZE(self) == 1)
3254 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3255
3256 e = p + PyUnicode_GET_SIZE(self);
3257 cased = 0;
3258 for (; p < e; p++) {
3259 register const Py_UNICODE ch = *p;
3260
3261 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3262 return PyInt_FromLong(0);
3263 else if (!cased && Py_UNICODE_ISLOWER(ch))
3264 cased = 1;
3265 }
3266 return PyInt_FromLong(cased);
3267}
3268
3269static char isupper__doc__[] =
3270"S.isupper() -> int\n\
3271\n\
3272Return 1 if all cased characters in S are uppercase and there is\n\
3273at least one cased character in S, 0 otherwise.";
3274
3275static PyObject*
3276unicode_isupper(PyUnicodeObject *self, PyObject *args)
3277{
3278 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3279 register const Py_UNICODE *e;
3280 int cased;
3281
3282 if (!PyArg_NoArgs(args))
3283 return NULL;
3284
3285 /* Shortcut for single character strings */
3286 if (PyUnicode_GET_SIZE(self) == 1)
3287 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3288
3289 e = p + PyUnicode_GET_SIZE(self);
3290 cased = 0;
3291 for (; p < e; p++) {
3292 register const Py_UNICODE ch = *p;
3293
3294 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3295 return PyInt_FromLong(0);
3296 else if (!cased && Py_UNICODE_ISUPPER(ch))
3297 cased = 1;
3298 }
3299 return PyInt_FromLong(cased);
3300}
3301
3302static char istitle__doc__[] =
3303"S.istitle() -> int\n\
3304\n\
3305Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3306may only follow uncased characters and lowercase characters only cased\n\
3307ones. Return 0 otherwise.";
3308
3309static PyObject*
3310unicode_istitle(PyUnicodeObject *self, PyObject *args)
3311{
3312 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3313 register const Py_UNICODE *e;
3314 int cased, previous_is_cased;
3315
3316 if (!PyArg_NoArgs(args))
3317 return NULL;
3318
3319 /* Shortcut for single character strings */
3320 if (PyUnicode_GET_SIZE(self) == 1)
3321 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3322 (Py_UNICODE_ISUPPER(*p) != 0));
3323
3324 e = p + PyUnicode_GET_SIZE(self);
3325 cased = 0;
3326 previous_is_cased = 0;
3327 for (; p < e; p++) {
3328 register const Py_UNICODE ch = *p;
3329
3330 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3331 if (previous_is_cased)
3332 return PyInt_FromLong(0);
3333 previous_is_cased = 1;
3334 cased = 1;
3335 }
3336 else if (Py_UNICODE_ISLOWER(ch)) {
3337 if (!previous_is_cased)
3338 return PyInt_FromLong(0);
3339 previous_is_cased = 1;
3340 cased = 1;
3341 }
3342 else
3343 previous_is_cased = 0;
3344 }
3345 return PyInt_FromLong(cased);
3346}
3347
3348static char isspace__doc__[] =
3349"S.isspace() -> int\n\
3350\n\
3351Return 1 if there are only whitespace characters in S,\n\
33520 otherwise.";
3353
3354static PyObject*
3355unicode_isspace(PyUnicodeObject *self, PyObject *args)
3356{
3357 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3358 register const Py_UNICODE *e;
3359
3360 if (!PyArg_NoArgs(args))
3361 return NULL;
3362
3363 /* Shortcut for single character strings */
3364 if (PyUnicode_GET_SIZE(self) == 1 &&
3365 Py_UNICODE_ISSPACE(*p))
3366 return PyInt_FromLong(1);
3367
3368 e = p + PyUnicode_GET_SIZE(self);
3369 for (; p < e; p++) {
3370 if (!Py_UNICODE_ISSPACE(*p))
3371 return PyInt_FromLong(0);
3372 }
3373 return PyInt_FromLong(1);
3374}
3375
3376static char isdecimal__doc__[] =
3377"S.isdecimal() -> int\n\
3378\n\
3379Return 1 if there are only decimal characters in S,\n\
33800 otherwise.";
3381
3382static PyObject*
3383unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3384{
3385 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3386 register const Py_UNICODE *e;
3387
3388 if (!PyArg_NoArgs(args))
3389 return NULL;
3390
3391 /* Shortcut for single character strings */
3392 if (PyUnicode_GET_SIZE(self) == 1 &&
3393 Py_UNICODE_ISDECIMAL(*p))
3394 return PyInt_FromLong(1);
3395
3396 e = p + PyUnicode_GET_SIZE(self);
3397 for (; p < e; p++) {
3398 if (!Py_UNICODE_ISDECIMAL(*p))
3399 return PyInt_FromLong(0);
3400 }
3401 return PyInt_FromLong(1);
3402}
3403
3404static char isdigit__doc__[] =
3405"S.isdigit() -> int\n\
3406\n\
3407Return 1 if there are only digit characters in S,\n\
34080 otherwise.";
3409
3410static PyObject*
3411unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3412{
3413 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3414 register const Py_UNICODE *e;
3415
3416 if (!PyArg_NoArgs(args))
3417 return NULL;
3418
3419 /* Shortcut for single character strings */
3420 if (PyUnicode_GET_SIZE(self) == 1 &&
3421 Py_UNICODE_ISDIGIT(*p))
3422 return PyInt_FromLong(1);
3423
3424 e = p + PyUnicode_GET_SIZE(self);
3425 for (; p < e; p++) {
3426 if (!Py_UNICODE_ISDIGIT(*p))
3427 return PyInt_FromLong(0);
3428 }
3429 return PyInt_FromLong(1);
3430}
3431
3432static char isnumeric__doc__[] =
3433"S.isnumeric() -> int\n\
3434\n\
3435Return 1 if there are only numeric characters in S,\n\
34360 otherwise.";
3437
3438static PyObject*
3439unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3440{
3441 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3442 register const Py_UNICODE *e;
3443
3444 if (!PyArg_NoArgs(args))
3445 return NULL;
3446
3447 /* Shortcut for single character strings */
3448 if (PyUnicode_GET_SIZE(self) == 1 &&
3449 Py_UNICODE_ISNUMERIC(*p))
3450 return PyInt_FromLong(1);
3451
3452 e = p + PyUnicode_GET_SIZE(self);
3453 for (; p < e; p++) {
3454 if (!Py_UNICODE_ISNUMERIC(*p))
3455 return PyInt_FromLong(0);
3456 }
3457 return PyInt_FromLong(1);
3458}
3459
3460static char join__doc__[] =
3461"S.join(sequence) -> unicode\n\
3462\n\
3463Return a string which is the concatenation of the strings in the\n\
3464sequence. The separator between elements is S.";
3465
3466static PyObject*
3467unicode_join(PyUnicodeObject *self, PyObject *args)
3468{
3469 PyObject *data;
3470 if (!PyArg_ParseTuple(args, "O:join", &data))
3471 return NULL;
3472
3473 return PyUnicode_Join((PyObject *)self, data);
3474}
3475
3476static int
3477unicode_length(PyUnicodeObject *self)
3478{
3479 return self->length;
3480}
3481
3482static char ljust__doc__[] =
3483"S.ljust(width) -> unicode\n\
3484\n\
3485Return S left justified in a Unicode string of length width. Padding is\n\
3486done using spaces.";
3487
3488static PyObject *
3489unicode_ljust(PyUnicodeObject *self, PyObject *args)
3490{
3491 int width;
3492 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3493 return NULL;
3494
3495 if (self->length >= width) {
3496 Py_INCREF(self);
3497 return (PyObject*) self;
3498 }
3499
3500 return (PyObject*) pad(self, 0, width - self->length, ' ');
3501}
3502
3503static char lower__doc__[] =
3504"S.lower() -> unicode\n\
3505\n\
3506Return a copy of the string S converted to lowercase.";
3507
3508static PyObject*
3509unicode_lower(PyUnicodeObject *self, PyObject *args)
3510{
3511 if (!PyArg_NoArgs(args))
3512 return NULL;
3513 return fixup(self, fixlower);
3514}
3515
3516static char lstrip__doc__[] =
3517"S.lstrip() -> unicode\n\
3518\n\
3519Return a copy of the string S with leading whitespace removed.";
3520
3521static PyObject *
3522unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3523{
3524 if (!PyArg_NoArgs(args))
3525 return NULL;
3526 return strip(self, 1, 0);
3527}
3528
3529static PyObject*
3530unicode_repeat(PyUnicodeObject *str, int len)
3531{
3532 PyUnicodeObject *u;
3533 Py_UNICODE *p;
3534
3535 if (len < 0)
3536 len = 0;
3537
3538 if (len == 1) {
3539 /* no repeat, return original string */
3540 Py_INCREF(str);
3541 return (PyObject*) str;
3542 }
3543
3544 u = _PyUnicode_New(len * str->length);
3545 if (!u)
3546 return NULL;
3547
3548 p = u->str;
3549
3550 while (len-- > 0) {
3551 Py_UNICODE_COPY(p, str->str, str->length);
3552 p += str->length;
3553 }
3554
3555 return (PyObject*) u;
3556}
3557
3558PyObject *PyUnicode_Replace(PyObject *obj,
3559 PyObject *subobj,
3560 PyObject *replobj,
3561 int maxcount)
3562{
3563 PyObject *self;
3564 PyObject *str1;
3565 PyObject *str2;
3566 PyObject *result;
3567
3568 self = PyUnicode_FromObject(obj);
3569 if (self == NULL)
3570 return NULL;
3571 str1 = PyUnicode_FromObject(subobj);
3572 if (str1 == NULL) {
3573 Py_DECREF(self);
3574 return NULL;
3575 }
3576 str2 = PyUnicode_FromObject(replobj);
3577 if (str2 == NULL) {
3578 Py_DECREF(self);
3579 Py_DECREF(str1);
3580 return NULL;
3581 }
3582 result = replace((PyUnicodeObject *)self,
3583 (PyUnicodeObject *)str1,
3584 (PyUnicodeObject *)str2,
3585 maxcount);
3586 Py_DECREF(self);
3587 Py_DECREF(str1);
3588 Py_DECREF(str2);
3589 return result;
3590}
3591
3592static char replace__doc__[] =
3593"S.replace (old, new[, maxsplit]) -> unicode\n\
3594\n\
3595Return a copy of S with all occurrences of substring\n\
3596old replaced by new. If the optional argument maxsplit is\n\
3597given, only the first maxsplit occurrences are replaced.";
3598
3599static PyObject*
3600unicode_replace(PyUnicodeObject *self, PyObject *args)
3601{
3602 PyUnicodeObject *str1;
3603 PyUnicodeObject *str2;
3604 int maxcount = -1;
3605 PyObject *result;
3606
3607 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
3608 return NULL;
3609 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
3610 if (str1 == NULL)
3611 return NULL;
3612 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
3613 if (str2 == NULL)
3614 return NULL;
3615
3616 result = replace(self, str1, str2, maxcount);
3617
3618 Py_DECREF(str1);
3619 Py_DECREF(str2);
3620 return result;
3621}
3622
3623static
3624PyObject *unicode_repr(PyObject *unicode)
3625{
3626 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
3627 PyUnicode_GET_SIZE(unicode),
3628 1);
3629}
3630
3631static char rfind__doc__[] =
3632"S.rfind(sub [,start [,end]]) -> int\n\
3633\n\
3634Return the highest index in S where substring sub is found,\n\
3635such that sub is contained within s[start,end]. Optional\n\
3636arguments start and end are interpreted as in slice notation.\n\
3637\n\
3638Return -1 on failure.";
3639
3640static PyObject *
3641unicode_rfind(PyUnicodeObject *self, PyObject *args)
3642{
3643 PyUnicodeObject *substring;
3644 int start = 0;
3645 int end = INT_MAX;
3646 PyObject *result;
3647
Guido van Rossumb8872e62000-05-09 14:14:27 +00003648 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
3649 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003650 return NULL;
3651 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3652 (PyObject *)substring);
3653 if (substring == NULL)
3654 return NULL;
3655
3656 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
3657
3658 Py_DECREF(substring);
3659 return result;
3660}
3661
3662static char rindex__doc__[] =
3663"S.rindex(sub [,start [,end]]) -> int\n\
3664\n\
3665Like S.rfind() but raise ValueError when the substring is not found.";
3666
3667static PyObject *
3668unicode_rindex(PyUnicodeObject *self, PyObject *args)
3669{
3670 int result;
3671 PyUnicodeObject *substring;
3672 int start = 0;
3673 int end = INT_MAX;
3674
Guido van Rossumb8872e62000-05-09 14:14:27 +00003675 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
3676 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003677 return NULL;
3678 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3679 (PyObject *)substring);
3680 if (substring == NULL)
3681 return NULL;
3682
3683 result = findstring(self, substring, start, end, -1);
3684
3685 Py_DECREF(substring);
3686 if (result < 0) {
3687 PyErr_SetString(PyExc_ValueError, "substring not found");
3688 return NULL;
3689 }
3690 return PyInt_FromLong(result);
3691}
3692
3693static char rjust__doc__[] =
3694"S.rjust(width) -> unicode\n\
3695\n\
3696Return S right justified in a Unicode string of length width. Padding is\n\
3697done using spaces.";
3698
3699static PyObject *
3700unicode_rjust(PyUnicodeObject *self, PyObject *args)
3701{
3702 int width;
3703 if (!PyArg_ParseTuple(args, "i:rjust", &width))
3704 return NULL;
3705
3706 if (self->length >= width) {
3707 Py_INCREF(self);
3708 return (PyObject*) self;
3709 }
3710
3711 return (PyObject*) pad(self, width - self->length, 0, ' ');
3712}
3713
3714static char rstrip__doc__[] =
3715"S.rstrip() -> unicode\n\
3716\n\
3717Return a copy of the string S with trailing whitespace removed.";
3718
3719static PyObject *
3720unicode_rstrip(PyUnicodeObject *self, PyObject *args)
3721{
3722 if (!PyArg_NoArgs(args))
3723 return NULL;
3724 return strip(self, 0, 1);
3725}
3726
3727static PyObject*
3728unicode_slice(PyUnicodeObject *self, int start, int end)
3729{
3730 /* standard clamping */
3731 if (start < 0)
3732 start = 0;
3733 if (end < 0)
3734 end = 0;
3735 if (end > self->length)
3736 end = self->length;
3737 if (start == 0 && end == self->length) {
3738 /* full slice, return original string */
3739 Py_INCREF(self);
3740 return (PyObject*) self;
3741 }
3742 if (start > end)
3743 start = end;
3744 /* copy slice */
3745 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
3746 end - start);
3747}
3748
3749PyObject *PyUnicode_Split(PyObject *s,
3750 PyObject *sep,
3751 int maxsplit)
3752{
3753 PyObject *result;
3754
3755 s = PyUnicode_FromObject(s);
3756 if (s == NULL)
3757 return NULL;
3758 if (sep != NULL) {
3759 sep = PyUnicode_FromObject(sep);
3760 if (sep == NULL) {
3761 Py_DECREF(s);
3762 return NULL;
3763 }
3764 }
3765
3766 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
3767
3768 Py_DECREF(s);
3769 Py_XDECREF(sep);
3770 return result;
3771}
3772
3773static char split__doc__[] =
3774"S.split([sep [,maxsplit]]) -> list of strings\n\
3775\n\
3776Return a list of the words in S, using sep as the\n\
3777delimiter string. If maxsplit is given, at most maxsplit\n\
3778splits are done. If sep is not specified, any whitespace string\n\
3779is a separator.";
3780
3781static PyObject*
3782unicode_split(PyUnicodeObject *self, PyObject *args)
3783{
3784 PyObject *substring = Py_None;
3785 int maxcount = -1;
3786
3787 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
3788 return NULL;
3789
3790 if (substring == Py_None)
3791 return split(self, NULL, maxcount);
3792 else if (PyUnicode_Check(substring))
3793 return split(self, (PyUnicodeObject *)substring, maxcount);
3794 else
3795 return PyUnicode_Split((PyObject *)self, substring, maxcount);
3796}
3797
3798static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00003799"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00003800\n\
3801Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00003802Line breaks are not included in the resulting list unless keepends\n\
3803is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003804
3805static PyObject*
3806unicode_splitlines(PyUnicodeObject *self, PyObject *args)
3807{
Guido van Rossum86662912000-04-11 15:38:46 +00003808 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003809
Guido van Rossum86662912000-04-11 15:38:46 +00003810 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003811 return NULL;
3812
Guido van Rossum86662912000-04-11 15:38:46 +00003813 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003814}
3815
3816static
3817PyObject *unicode_str(PyUnicodeObject *self)
3818{
3819 return PyUnicode_AsUTF8String((PyObject *)self);
3820}
3821
3822static char strip__doc__[] =
3823"S.strip() -> unicode\n\
3824\n\
3825Return a copy of S with leading and trailing whitespace removed.";
3826
3827static PyObject *
3828unicode_strip(PyUnicodeObject *self, PyObject *args)
3829{
3830 if (!PyArg_NoArgs(args))
3831 return NULL;
3832 return strip(self, 1, 1);
3833}
3834
3835static char swapcase__doc__[] =
3836"S.swapcase() -> unicode\n\
3837\n\
3838Return a copy of S with uppercase characters converted to lowercase\n\
3839and vice versa.";
3840
3841static PyObject*
3842unicode_swapcase(PyUnicodeObject *self, PyObject *args)
3843{
3844 if (!PyArg_NoArgs(args))
3845 return NULL;
3846 return fixup(self, fixswapcase);
3847}
3848
3849static char translate__doc__[] =
3850"S.translate(table) -> unicode\n\
3851\n\
3852Return a copy of the string S, where all characters have been mapped\n\
3853through the given translation table, which must be a mapping of\n\
3854Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
3855are left untouched. Characters mapped to None are deleted.";
3856
3857static PyObject*
3858unicode_translate(PyUnicodeObject *self, PyObject *args)
3859{
3860 PyObject *table;
3861
3862 if (!PyArg_ParseTuple(args, "O:translate", &table))
3863 return NULL;
3864 return PyUnicode_TranslateCharmap(self->str,
3865 self->length,
3866 table,
3867 "ignore");
3868}
3869
3870static char upper__doc__[] =
3871"S.upper() -> unicode\n\
3872\n\
3873Return a copy of S converted to uppercase.";
3874
3875static PyObject*
3876unicode_upper(PyUnicodeObject *self, PyObject *args)
3877{
3878 if (!PyArg_NoArgs(args))
3879 return NULL;
3880 return fixup(self, fixupper);
3881}
3882
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* S.zfill(width) -- currently compiled out.

   When S is already at least width characters long, a new reference to
   S itself is returned.  Otherwise S is left-padded with '0' through
   pad(); if the first original character is '+' or '-', it is moved in
   front of the padding.  Returns NULL on argument errors or when pad()
   fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    if (u == NULL)
        /* pad() failed; do not touch u->str below */
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
3918
#if 0
/* Debug-only method (compiled out): report the current value of
   unicode_freelist_size as a Python int.  Accepts no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    return PyArg_NoArgs(args)
        ? PyInt_FromLong(unicode_freelist_size)
        : NULL;
}
#endif
3928
3929static char startswith__doc__[] =
3930"S.startswith(prefix[, start[, end]]) -> int\n\
3931\n\
3932Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
3933optional start, test S beginning at that position. With optional end, stop\n\
3934comparing S at that position.";
3935
3936static PyObject *
3937unicode_startswith(PyUnicodeObject *self,
3938 PyObject *args)
3939{
3940 PyUnicodeObject *substring;
3941 int start = 0;
3942 int end = INT_MAX;
3943 PyObject *result;
3944
Guido van Rossumb8872e62000-05-09 14:14:27 +00003945 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
3946 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003947 return NULL;
3948 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3949 (PyObject *)substring);
3950 if (substring == NULL)
3951 return NULL;
3952
3953 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
3954
3955 Py_DECREF(substring);
3956 return result;
3957}
3958
3959
3960static char endswith__doc__[] =
3961"S.endswith(suffix[, start[, end]]) -> int\n\
3962\n\
3963Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
3964optional start, test S beginning at that position. With optional end, stop\n\
3965comparing S at that position.";
3966
3967static PyObject *
3968unicode_endswith(PyUnicodeObject *self,
3969 PyObject *args)
3970{
3971 PyUnicodeObject *substring;
3972 int start = 0;
3973 int end = INT_MAX;
3974 PyObject *result;
3975
Guido van Rossumb8872e62000-05-09 14:14:27 +00003976 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
3977 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003978 return NULL;
3979 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3980 (PyObject *)substring);
3981 if (substring == NULL)
3982 return NULL;
3983
3984 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
3985
3986 Py_DECREF(substring);
3987 return result;
3988}
3989
3990
/* Method table for Unicode objects; unicode_getattr() searches it with
   Py_FindMethod().

   Each entry is {name, C function, flag, doc string}.  Among the
   methods visible in this file, entries flagged 1 parse their
   arguments with PyArg_ParseTuple() and entries flagged 0 check for
   an empty argument list with PyArg_NoArgs().  The table ends with a
   {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
#if 0
    /* Disabled together with the matching implementations. */
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    {NULL, NULL}
};
4041
4042static PyObject *
4043unicode_getattr(PyUnicodeObject *self, char *name)
4044{
4045 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4046}
4047
/* Sequence protocol slots for Unicode objects.  The two assignment
   slots are 0: no item or slice assignment handler is provided. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, /* sq_length */
    (binaryfunc) PyUnicode_Concat, /* sq_concat */
    (intargfunc) unicode_repeat, /* sq_repeat */
    (intargfunc) unicode_getitem, /* sq_item */
    (intintargfunc) unicode_slice, /* sq_slice */
    0, /* sq_ass_item */
    0, /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, /*sq_contains*/
};
4058
4059static int
4060unicode_buffer_getreadbuf(PyUnicodeObject *self,
4061 int index,
4062 const void **ptr)
4063{
4064 if (index != 0) {
4065 PyErr_SetString(PyExc_SystemError,
4066 "accessing non-existent unicode segment");
4067 return -1;
4068 }
4069 *ptr = (void *) self->str;
4070 return PyUnicode_GET_DATA_SIZE(self);
4071}
4072
4073static int
4074unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4075 const void **ptr)
4076{
4077 PyErr_SetString(PyExc_TypeError,
4078 "cannot use unicode as modifyable buffer");
4079 return -1;
4080}
4081
4082static int
4083unicode_buffer_getsegcount(PyUnicodeObject *self,
4084 int *lenp)
4085{
4086 if (lenp)
4087 *lenp = PyUnicode_GET_DATA_SIZE(self);
4088 return 1;
4089}
4090
4091static int
4092unicode_buffer_getcharbuf(PyUnicodeObject *self,
4093 int index,
4094 const void **ptr)
4095{
4096 PyObject *str;
4097
4098 if (index != 0) {
4099 PyErr_SetString(PyExc_SystemError,
4100 "accessing non-existent unicode segment");
4101 return -1;
4102 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +00004103 str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004104 if (str == NULL)
4105 return -1;
4106 *ptr = (void *) PyString_AS_STRING(str);
4107 return PyString_GET_SIZE(str);
4108}
4109
4110/* Helpers for PyUnicode_Format() */
4111
4112static PyObject *
4113getnextarg(args, arglen, p_argidx)
4114 PyObject *args;
4115int arglen;
4116int *p_argidx;
4117{
4118 int argidx = *p_argidx;
4119 if (argidx < arglen) {
4120 (*p_argidx)++;
4121 if (arglen < 0)
4122 return args;
4123 else
4124 return PyTuple_GetItem(args, argidx);
4125 }
4126 PyErr_SetString(PyExc_TypeError,
4127 "not enough arguments for format string");
4128 return NULL;
4129}
4130
/* Conversion flag bits collected by PyUnicode_Format() while it scans
   the characters that follow a '%'. */
#define F_LJUST (1<<0)  /* '-' flag, or a negative '*' width */
#define F_SIGN (1<<1)   /* '+' flag */
#define F_BLANK (1<<2)  /* ' ' flag */
#define F_ALT (1<<3)    /* '#' flag */
#define F_ZERO (1<<4)   /* '0' flag */
4136
4137static
4138#ifdef HAVE_STDARG_PROTOTYPES
4139int usprintf(register Py_UNICODE *buffer, char *format, ...)
4140#else
4141int usprintf(va_alist) va_dcl
4142#endif
4143{
4144 register int i;
4145 int len;
4146 va_list va;
4147 char *charbuffer;
4148#ifdef HAVE_STDARG_PROTOTYPES
4149 va_start(va, format);
4150#else
4151 Py_UNICODE *args;
4152 char *format;
4153
4154 va_start(va);
4155 buffer = va_arg(va, Py_UNICODE *);
4156 format = va_arg(va, char *);
4157#endif
4158
4159 /* First, format the string as char array, then expand to Py_UNICODE
4160 array. */
4161 charbuffer = (char *)buffer;
4162 len = vsprintf(charbuffer, format, va);
4163 for (i = len - 1; i >= 0; i--)
4164 buffer[i] = (Py_UNICODE) charbuffer[i];
4165
4166 va_end(va);
4167 return len;
4168}
4169
4170static int
4171formatfloat(Py_UNICODE *buf,
4172 int flags,
4173 int prec,
4174 int type,
4175 PyObject *v)
4176{
4177 char fmt[20];
4178 double x;
4179
4180 x = PyFloat_AsDouble(v);
4181 if (x == -1.0 && PyErr_Occurred())
4182 return -1;
4183 if (prec < 0)
4184 prec = 6;
4185 if (prec > 50)
4186 prec = 50; /* Arbitrary limitation */
4187 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4188 type = 'g';
4189 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4190 return usprintf(buf, fmt, x);
4191}
4192
4193static int
4194formatint(Py_UNICODE *buf,
4195 int flags,
4196 int prec,
4197 int type,
4198 PyObject *v)
4199{
4200 char fmt[20];
4201 long x;
4202
4203 x = PyInt_AsLong(v);
4204 if (x == -1 && PyErr_Occurred())
4205 return -1;
4206 if (prec < 0)
4207 prec = 1;
4208 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4209 return usprintf(buf, fmt, x);
4210}
4211
4212static int
4213formatchar(Py_UNICODE *buf,
4214 PyObject *v)
4215{
4216 if (PyUnicode_Check(v))
4217 buf[0] = PyUnicode_AS_UNICODE(v)[0];
4218
4219 else if (PyString_Check(v))
4220 buf[0] = (Py_UNICODE) PyString_AS_STRING(v)[0];
4221
4222 else {
4223 /* Integer input truncated to a character */
4224 long x;
4225 x = PyInt_AsLong(v);
4226 if (x == -1 && PyErr_Occurred())
4227 return -1;
4228 buf[0] = (char) x;
4229 }
4230 buf[1] = '\0';
4231 return 1;
4232}
4233
4234PyObject *PyUnicode_Format(PyObject *format,
4235 PyObject *args)
4236{
4237 Py_UNICODE *fmt, *res;
4238 int fmtcnt, rescnt, reslen, arglen, argidx;
4239 int args_owned = 0;
4240 PyUnicodeObject *result = NULL;
4241 PyObject *dict = NULL;
4242 PyObject *uformat;
4243
4244 if (format == NULL || args == NULL) {
4245 PyErr_BadInternalCall();
4246 return NULL;
4247 }
4248 uformat = PyUnicode_FromObject(format);
4249 fmt = PyUnicode_AS_UNICODE(uformat);
4250 fmtcnt = PyUnicode_GET_SIZE(uformat);
4251
4252 reslen = rescnt = fmtcnt + 100;
4253 result = _PyUnicode_New(reslen);
4254 if (result == NULL)
4255 goto onError;
4256 res = PyUnicode_AS_UNICODE(result);
4257
4258 if (PyTuple_Check(args)) {
4259 arglen = PyTuple_Size(args);
4260 argidx = 0;
4261 }
4262 else {
4263 arglen = -1;
4264 argidx = -2;
4265 }
4266 if (args->ob_type->tp_as_mapping)
4267 dict = args;
4268
4269 while (--fmtcnt >= 0) {
4270 if (*fmt != '%') {
4271 if (--rescnt < 0) {
4272 rescnt = fmtcnt + 100;
4273 reslen += rescnt;
4274 if (_PyUnicode_Resize(result, reslen) < 0)
4275 return NULL;
4276 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4277 --rescnt;
4278 }
4279 *res++ = *fmt++;
4280 }
4281 else {
4282 /* Got a format specifier */
4283 int flags = 0;
4284 int width = -1;
4285 int prec = -1;
4286 int size = 0;
4287 Py_UNICODE c = '\0';
4288 Py_UNICODE fill;
4289 PyObject *v = NULL;
4290 PyObject *temp = NULL;
4291 Py_UNICODE *buf;
4292 Py_UNICODE sign;
4293 int len;
4294 Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
4295
4296 fmt++;
4297 if (*fmt == '(') {
4298 Py_UNICODE *keystart;
4299 int keylen;
4300 PyObject *key;
4301 int pcount = 1;
4302
4303 if (dict == NULL) {
4304 PyErr_SetString(PyExc_TypeError,
4305 "format requires a mapping");
4306 goto onError;
4307 }
4308 ++fmt;
4309 --fmtcnt;
4310 keystart = fmt;
4311 /* Skip over balanced parentheses */
4312 while (pcount > 0 && --fmtcnt >= 0) {
4313 if (*fmt == ')')
4314 --pcount;
4315 else if (*fmt == '(')
4316 ++pcount;
4317 fmt++;
4318 }
4319 keylen = fmt - keystart - 1;
4320 if (fmtcnt < 0 || pcount > 0) {
4321 PyErr_SetString(PyExc_ValueError,
4322 "incomplete format key");
4323 goto onError;
4324 }
4325 /* keys are converted to strings (using UTF-8) and
4326 then looked up since Python uses strings to hold
4327 variables names etc. in its namespaces and we
4328 wouldn't want to break common idioms. The
4329 alternative would be using Unicode objects for the
4330 lookup but u"abc" and "abc" have different hash
4331 values (on purpose). */
4332 key = PyUnicode_EncodeUTF8(keystart,
4333 keylen,
4334 NULL);
4335 if (key == NULL)
4336 goto onError;
4337 if (args_owned) {
4338 Py_DECREF(args);
4339 args_owned = 0;
4340 }
4341 args = PyObject_GetItem(dict, key);
4342 Py_DECREF(key);
4343 if (args == NULL) {
4344 goto onError;
4345 }
4346 args_owned = 1;
4347 arglen = -1;
4348 argidx = -2;
4349 }
4350 while (--fmtcnt >= 0) {
4351 switch (c = *fmt++) {
4352 case '-': flags |= F_LJUST; continue;
4353 case '+': flags |= F_SIGN; continue;
4354 case ' ': flags |= F_BLANK; continue;
4355 case '#': flags |= F_ALT; continue;
4356 case '0': flags |= F_ZERO; continue;
4357 }
4358 break;
4359 }
4360 if (c == '*') {
4361 v = getnextarg(args, arglen, &argidx);
4362 if (v == NULL)
4363 goto onError;
4364 if (!PyInt_Check(v)) {
4365 PyErr_SetString(PyExc_TypeError,
4366 "* wants int");
4367 goto onError;
4368 }
4369 width = PyInt_AsLong(v);
4370 if (width < 0) {
4371 flags |= F_LJUST;
4372 width = -width;
4373 }
4374 if (--fmtcnt >= 0)
4375 c = *fmt++;
4376 }
4377 else if (c >= '0' && c <= '9') {
4378 width = c - '0';
4379 while (--fmtcnt >= 0) {
4380 c = *fmt++;
4381 if (c < '0' || c > '9')
4382 break;
4383 if ((width*10) / 10 != width) {
4384 PyErr_SetString(PyExc_ValueError,
4385 "width too big");
4386 goto onError;
4387 }
4388 width = width*10 + (c - '0');
4389 }
4390 }
4391 if (c == '.') {
4392 prec = 0;
4393 if (--fmtcnt >= 0)
4394 c = *fmt++;
4395 if (c == '*') {
4396 v = getnextarg(args, arglen, &argidx);
4397 if (v == NULL)
4398 goto onError;
4399 if (!PyInt_Check(v)) {
4400 PyErr_SetString(PyExc_TypeError,
4401 "* wants int");
4402 goto onError;
4403 }
4404 prec = PyInt_AsLong(v);
4405 if (prec < 0)
4406 prec = 0;
4407 if (--fmtcnt >= 0)
4408 c = *fmt++;
4409 }
4410 else if (c >= '0' && c <= '9') {
4411 prec = c - '0';
4412 while (--fmtcnt >= 0) {
4413 c = Py_CHARMASK(*fmt++);
4414 if (c < '0' || c > '9')
4415 break;
4416 if ((prec*10) / 10 != prec) {
4417 PyErr_SetString(PyExc_ValueError,
4418 "prec too big");
4419 goto onError;
4420 }
4421 prec = prec*10 + (c - '0');
4422 }
4423 }
4424 } /* prec */
4425 if (fmtcnt >= 0) {
4426 if (c == 'h' || c == 'l' || c == 'L') {
4427 size = c;
4428 if (--fmtcnt >= 0)
4429 c = *fmt++;
4430 }
4431 }
4432 if (fmtcnt < 0) {
4433 PyErr_SetString(PyExc_ValueError,
4434 "incomplete format");
4435 goto onError;
4436 }
4437 if (c != '%') {
4438 v = getnextarg(args, arglen, &argidx);
4439 if (v == NULL)
4440 goto onError;
4441 }
4442 sign = 0;
4443 fill = ' ';
4444 switch (c) {
4445
4446 case '%':
4447 buf = tmpbuf;
4448 buf[0] = '%';
4449 len = 1;
4450 break;
4451
4452 case 's':
4453 case 'r':
4454 if (PyUnicode_Check(v) && c == 's') {
4455 temp = v;
4456 Py_INCREF(temp);
4457 }
4458 else {
4459 PyObject *unicode;
4460 if (c == 's')
4461 temp = PyObject_Str(v);
4462 else
4463 temp = PyObject_Repr(v);
4464 if (temp == NULL)
4465 goto onError;
4466 if (!PyString_Check(temp)) {
4467 /* XXX Note: this should never happen, since
4468 PyObject_Repr() and PyObject_Str() assure
4469 this */
4470 Py_DECREF(temp);
4471 PyErr_SetString(PyExc_TypeError,
4472 "%s argument has non-string str()");
4473 goto onError;
4474 }
4475 unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
4476 PyString_GET_SIZE(temp),
4477 "strict");
4478 Py_DECREF(temp);
4479 temp = unicode;
4480 if (temp == NULL)
4481 goto onError;
4482 }
4483 buf = PyUnicode_AS_UNICODE(temp);
4484 len = PyUnicode_GET_SIZE(temp);
4485 if (prec >= 0 && len > prec)
4486 len = prec;
4487 break;
4488
4489 case 'i':
4490 case 'd':
4491 case 'u':
4492 case 'o':
4493 case 'x':
4494 case 'X':
4495 if (c == 'i')
4496 c = 'd';
4497 buf = tmpbuf;
4498 len = formatint(buf, flags, prec, c, v);
4499 if (len < 0)
4500 goto onError;
4501 sign = (c == 'd');
4502 if (flags & F_ZERO) {
4503 fill = '0';
4504 if ((flags&F_ALT) &&
4505 (c == 'x' || c == 'X') &&
4506 buf[0] == '0' && buf[1] == c) {
4507 *res++ = *buf++;
4508 *res++ = *buf++;
4509 rescnt -= 2;
4510 len -= 2;
4511 width -= 2;
4512 if (width < 0)
4513 width = 0;
4514 }
4515 }
4516 break;
4517
4518 case 'e':
4519 case 'E':
4520 case 'f':
4521 case 'g':
4522 case 'G':
4523 buf = tmpbuf;
4524 len = formatfloat(buf, flags, prec, c, v);
4525 if (len < 0)
4526 goto onError;
4527 sign = 1;
4528 if (flags&F_ZERO)
4529 fill = '0';
4530 break;
4531
4532 case 'c':
4533 buf = tmpbuf;
4534 len = formatchar(buf, v);
4535 if (len < 0)
4536 goto onError;
4537 break;
4538
4539 default:
4540 PyErr_Format(PyExc_ValueError,
4541 "unsupported format character '%c' (0x%x)",
4542 c, c);
4543 goto onError;
4544 }
4545 if (sign) {
4546 if (*buf == '-' || *buf == '+') {
4547 sign = *buf++;
4548 len--;
4549 }
4550 else if (flags & F_SIGN)
4551 sign = '+';
4552 else if (flags & F_BLANK)
4553 sign = ' ';
4554 else
4555 sign = 0;
4556 }
4557 if (width < len)
4558 width = len;
4559 if (rescnt < width + (sign != 0)) {
4560 reslen -= rescnt;
4561 rescnt = width + fmtcnt + 100;
4562 reslen += rescnt;
4563 if (_PyUnicode_Resize(result, reslen) < 0)
4564 return NULL;
4565 res = PyUnicode_AS_UNICODE(result)
4566 + reslen - rescnt;
4567 }
4568 if (sign) {
4569 if (fill != ' ')
4570 *res++ = sign;
4571 rescnt--;
4572 if (width > len)
4573 width--;
4574 }
4575 if (width > len && !(flags & F_LJUST)) {
4576 do {
4577 --rescnt;
4578 *res++ = fill;
4579 } while (--width > len);
4580 }
4581 if (sign && fill == ' ')
4582 *res++ = sign;
4583 memcpy(res, buf, len * sizeof(Py_UNICODE));
4584 res += len;
4585 rescnt -= len;
4586 while (--width >= len) {
4587 --rescnt;
4588 *res++ = ' ';
4589 }
4590 if (dict && (argidx < arglen) && c != '%') {
4591 PyErr_SetString(PyExc_TypeError,
4592 "not all arguments converted");
4593 goto onError;
4594 }
4595 Py_XDECREF(temp);
4596 } /* '%' */
4597 } /* until end */
4598 if (argidx < arglen && !dict) {
4599 PyErr_SetString(PyExc_TypeError,
4600 "not all arguments converted");
4601 goto onError;
4602 }
4603
4604 if (args_owned) {
4605 Py_DECREF(args);
4606 }
4607 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004608 if (_PyUnicode_Resize(result, reslen - rescnt))
4609 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004610 return (PyObject *)result;
4611
4612 onError:
4613 Py_XDECREF(result);
4614 Py_DECREF(uformat);
4615 if (args_owned) {
4616 Py_DECREF(args);
4617 }
4618 return NULL;
4619}
4620
/* Buffer protocol slots for Unicode objects, in the order read
   buffer, write buffer, segment count, character buffer. */
static PyBufferProcs unicode_as_buffer = {
    (getreadbufferproc) unicode_buffer_getreadbuf,
    (getwritebufferproc) unicode_buffer_getwritebuf,
    (getsegcountproc) unicode_buffer_getsegcount,
    (getcharbufferproc) unicode_buffer_getcharbuf,
};
4627
/* Type object for Unicode strings.  It wires up the deallocator,
   attribute lookup, comparison, repr/str, hashing and the sequence
   and buffer protocol tables defined in this file; the number and
   mapping protocol slots are left empty. */
PyTypeObject PyUnicode_Type = {
    PyObject_HEAD_INIT(&PyType_Type)
    0, /* ob_size */
    "unicode", /* tp_name */
    sizeof(PyUnicodeObject), /* tp_size */
    0, /* tp_itemsize */
    /* Slots */
    (destructor)_PyUnicode_Free, /* tp_dealloc */
    0, /* tp_print */
    (getattrfunc)unicode_getattr, /* tp_getattr */
    0, /* tp_setattr */
    (cmpfunc) unicode_compare, /* tp_compare */
    (reprfunc) unicode_repr, /* tp_repr */
    0, /* tp_as_number */
    &unicode_as_sequence, /* tp_as_sequence */
    0, /* tp_as_mapping */
    (hashfunc) unicode_hash, /* tp_hash*/
    0, /* tp_call*/
    (reprfunc) unicode_str, /* tp_str */
    (getattrofunc) NULL, /* tp_getattro */
    (setattrofunc) NULL, /* tp_setattro */
    &unicode_as_buffer, /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT, /* tp_flags */
};
4652
4653/* Initialize the Unicode implementation */
4654
4655void _PyUnicode_Init()
4656{
4657 /* Doublecheck the configuration... */
4658 if (sizeof(Py_UNICODE) != 2)
4659 Py_FatalError("Unicode configuration error: "
4660 "sizeof(Py_UNICODE) != 2 bytes");
4661
4662 unicode_empty = _PyUnicode_New(0);
4663}
4664
4665/* Finalize the Unicode implementation */
4666
4667void
4668_PyUnicode_Fini()
4669{
4670 PyUnicodeObject *u = unicode_freelist;
4671
4672 while (u != NULL) {
4673 PyUnicodeObject *v = u;
4674 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004675 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00004676 PyMem_DEL(v->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004677 Py_XDECREF(v->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +00004678 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004679 }
4680 Py_XDECREF(unicode_empty);
4681}