blob: fa7c5ea2e61fe201d4b394ec9c29fbfd2935f9f2 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "mymath.h"
68#include "unicodeobject.h"
69
70#if defined(HAVE_LIMITS_H)
71#include <limits.h>
72#else
73#define INT_MAX 2147483647
74#endif
75
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000076#ifdef MS_WIN32
77#include <windows.h>
78#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000079
/* Upper bound on the number of Unicode objects kept on the free list. */

#define MAX_UNICODE_FREELIST_SIZE 1024

/* Threshold for the free list "keep alive" optimization.

   Objects placed on the free list keep their character buffer when
   their length is below this limit; longer buffers are released.
   This saves malloc() calls for small Unicode objects.

   Worst case, MAX_UNICODE_FREELIST_SIZE *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes are held as unused garbage.

   A limit of 0 effectively disables the feature.

   Note: This is an experimental feature ! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is set */

#if defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
110
111/* --- Globals ------------------------------------------------------------ */
112
113/* The empty Unicode object */
114static PyUnicodeObject *unicode_empty = NULL;
115
116/* Free list for Unicode objects */
117static PyUnicodeObject *unicode_freelist = NULL;
118static int unicode_freelist_size = 0;
119
Fred Drakee4315f52000-05-09 19:53:39 +0000120/* Default encoding to use and assume when NULL is passed as encoding
121 parameter; it is initialized by _PyUnicode_Init().
122
123 Always use the PyUnicode_SetDefaultEncoding() and
124 PyUnicode_GetDefaultEncoding() APIs to access this global.
125
126*/
127
128static char unicode_default_encoding[100];
129
Guido van Rossumd57fd912000-03-10 22:53:23 +0000130/* --- Unicode Object ----------------------------------------------------- */
131
132static
133int _PyUnicode_Resize(register PyUnicodeObject *unicode,
134 int length)
135{
136 void *oldstr;
137
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000138 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000139 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000140 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000141
142 /* Resizing unicode_empty is not allowed. */
143 if (unicode == unicode_empty) {
144 PyErr_SetString(PyExc_SystemError,
145 "can't resize empty unicode object");
146 return -1;
147 }
148
149 /* We allocate one more byte to make sure the string is
150 Ux0000 terminated -- XXX is this needed ? */
151 oldstr = unicode->str;
152 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
153 if (!unicode->str) {
154 unicode->str = oldstr;
155 PyErr_NoMemory();
156 return -1;
157 }
158 unicode->str[length] = 0;
159 unicode->length = length;
160
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000161 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000162 /* Reset the object caches */
163 if (unicode->utf8str) {
164 Py_DECREF(unicode->utf8str);
165 unicode->utf8str = NULL;
166 }
167 unicode->hash = -1;
168
169 return 0;
170}
171
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000172int PyUnicode_Resize(PyObject **unicode,
173 int length)
174{
175 PyUnicodeObject *v;
176
177 if (unicode == NULL) {
178 PyErr_BadInternalCall();
179 return -1;
180 }
181 v = (PyUnicodeObject *)*unicode;
182 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
183 PyErr_BadInternalCall();
184 return -1;
185 }
186 return _PyUnicode_Resize(v, length);
187}
188
Guido van Rossumd57fd912000-03-10 22:53:23 +0000189/* We allocate one more byte to make sure the string is
190 Ux0000 terminated -- XXX is this needed ?
191
192 XXX This allocator could further be enhanced by assuring that the
193 free list never reduces its size below 1.
194
195*/
196
197static
198PyUnicodeObject *_PyUnicode_New(int length)
199{
200 register PyUnicodeObject *unicode;
201
202 /* Optimization for empty strings */
203 if (length == 0 && unicode_empty != NULL) {
204 Py_INCREF(unicode_empty);
205 return unicode_empty;
206 }
207
208 /* Unicode freelist & memory allocation */
209 if (unicode_freelist) {
210 unicode = unicode_freelist;
211 unicode_freelist = *(PyUnicodeObject **)unicode_freelist;
212 unicode_freelist_size--;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000213 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000214 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000215 /* Keep-Alive optimization: we only upsize the buffer,
216 never downsize it. */
217 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000219 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000220 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000221 }
222 }
223 else
224 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
225 }
226 else {
227 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
228 if (unicode == NULL)
229 return NULL;
230 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
231 }
232
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000233 if (!unicode->str) {
234 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000235 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000236 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 unicode->str[length] = 0;
238 unicode->length = length;
239 unicode->hash = -1;
240 unicode->utf8str = NULL;
241 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000242
243 onError:
244 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000245 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000246 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247}
248
249static
250void _PyUnicode_Free(register PyUnicodeObject *unicode)
251{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 /* Keep-Alive optimization */
254 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000255 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 unicode->str = NULL;
257 unicode->length = 0;
258 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000259 if (unicode->utf8str) {
260 Py_DECREF(unicode->utf8str);
261 unicode->utf8str = NULL;
262 }
263 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 *(PyUnicodeObject **)unicode = unicode_freelist;
265 unicode_freelist = unicode;
266 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 }
268 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000269 PyMem_DEL(unicode->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000270 Py_XDECREF(unicode->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000271 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 }
273}
274
275PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
276 int size)
277{
278 PyUnicodeObject *unicode;
279
280 unicode = _PyUnicode_New(size);
281 if (!unicode)
282 return NULL;
283
284 /* Copy the Unicode data into the new object */
285 if (u != NULL)
286 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
287
288 return (PyObject *)unicode;
289}
290
291#ifdef HAVE_WCHAR_H
292
293PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
294 int size)
295{
296 PyUnicodeObject *unicode;
297
298 if (w == NULL) {
299 PyErr_BadInternalCall();
300 return NULL;
301 }
302
303 unicode = _PyUnicode_New(size);
304 if (!unicode)
305 return NULL;
306
307 /* Copy the wchar_t data into the new object */
308#ifdef HAVE_USABLE_WCHAR_T
309 memcpy(unicode->str, w, size * sizeof(wchar_t));
310#else
311 {
312 register Py_UNICODE *u;
313 register int i;
314 u = PyUnicode_AS_UNICODE(unicode);
315 for (i = size; i >= 0; i--)
316 *u++ = *w++;
317 }
318#endif
319
320 return (PyObject *)unicode;
321}
322
323int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
324 register wchar_t *w,
325 int size)
326{
327 if (unicode == NULL) {
328 PyErr_BadInternalCall();
329 return -1;
330 }
331 if (size > PyUnicode_GET_SIZE(unicode))
332 size = PyUnicode_GET_SIZE(unicode);
333#ifdef HAVE_USABLE_WCHAR_T
334 memcpy(w, unicode->str, size * sizeof(wchar_t));
335#else
336 {
337 register Py_UNICODE *u;
338 register int i;
339 u = PyUnicode_AS_UNICODE(unicode);
340 for (i = size; i >= 0; i--)
341 *w++ = *u++;
342 }
343#endif
344
345 return size;
346}
347
348#endif
349
350PyObject *PyUnicode_FromObject(register PyObject *obj)
351{
352 const char *s;
353 int len;
354
355 if (obj == NULL) {
356 PyErr_BadInternalCall();
357 return NULL;
358 }
359 else if (PyUnicode_Check(obj)) {
360 Py_INCREF(obj);
361 return obj;
362 }
363 else if (PyString_Check(obj)) {
364 s = PyString_AS_STRING(obj);
365 len = PyString_GET_SIZE(obj);
366 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000367 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
368 /* Overwrite the error message with something more useful in
369 case of a TypeError. */
370 if (PyErr_ExceptionMatches(PyExc_TypeError))
371 PyErr_SetString(PyExc_TypeError,
372 "coercing to Unicode: need string or charbuffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000373 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000374 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000375 if (len == 0) {
376 Py_INCREF(unicode_empty);
377 return (PyObject *)unicode_empty;
378 }
Fred Drakee4315f52000-05-09 19:53:39 +0000379 return PyUnicode_Decode(s, len, NULL, "strict");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000380}
381
382PyObject *PyUnicode_Decode(const char *s,
383 int size,
384 const char *encoding,
385 const char *errors)
386{
387 PyObject *buffer = NULL, *unicode;
388
Fred Drakee4315f52000-05-09 19:53:39 +0000389 if (encoding == NULL)
390 encoding = PyUnicode_GetDefaultEncoding();
391
392 /* Shortcuts for common default encodings */
393 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000394 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000395 else if (strcmp(encoding, "latin-1") == 0)
396 return PyUnicode_DecodeLatin1(s, size, errors);
397 else if (strcmp(encoding, "ascii") == 0)
398 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399
400 /* Decode via the codec registry */
401 buffer = PyBuffer_FromMemory((void *)s, size);
402 if (buffer == NULL)
403 goto onError;
404 unicode = PyCodec_Decode(buffer, encoding, errors);
405 if (unicode == NULL)
406 goto onError;
407 if (!PyUnicode_Check(unicode)) {
408 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000409 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000410 unicode->ob_type->tp_name);
411 Py_DECREF(unicode);
412 goto onError;
413 }
414 Py_DECREF(buffer);
415 return unicode;
416
417 onError:
418 Py_XDECREF(buffer);
419 return NULL;
420}
421
422PyObject *PyUnicode_Encode(const Py_UNICODE *s,
423 int size,
424 const char *encoding,
425 const char *errors)
426{
427 PyObject *v, *unicode;
428
429 unicode = PyUnicode_FromUnicode(s, size);
430 if (unicode == NULL)
431 return NULL;
432 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
433 Py_DECREF(unicode);
434 return v;
435}
436
437PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
438 const char *encoding,
439 const char *errors)
440{
441 PyObject *v;
442
443 if (!PyUnicode_Check(unicode)) {
444 PyErr_BadArgument();
445 goto onError;
446 }
Fred Drakee4315f52000-05-09 19:53:39 +0000447
448 if (encoding == NULL)
449 encoding = PyUnicode_GetDefaultEncoding();
450
451 /* Shortcuts for common default encodings */
452 if (errors == NULL) {
453 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000454 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000455 else if (strcmp(encoding, "latin-1") == 0)
456 return PyUnicode_AsLatin1String(unicode);
457 else if (strcmp(encoding, "ascii") == 0)
458 return PyUnicode_AsASCIIString(unicode);
459 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460
461 /* Encode via the codec registry */
462 v = PyCodec_Encode(unicode, encoding, errors);
463 if (v == NULL)
464 goto onError;
465 /* XXX Should we really enforce this ? */
466 if (!PyString_Check(v)) {
467 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000468 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000469 v->ob_type->tp_name);
470 Py_DECREF(v);
471 goto onError;
472 }
473 return v;
474
475 onError:
476 return NULL;
477}
478
479Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
480{
481 if (!PyUnicode_Check(unicode)) {
482 PyErr_BadArgument();
483 goto onError;
484 }
485 return PyUnicode_AS_UNICODE(unicode);
486
487 onError:
488 return NULL;
489}
490
491int PyUnicode_GetSize(PyObject *unicode)
492{
493 if (!PyUnicode_Check(unicode)) {
494 PyErr_BadArgument();
495 goto onError;
496 }
497 return PyUnicode_GET_SIZE(unicode);
498
499 onError:
500 return -1;
501}
502
Fred Drakee4315f52000-05-09 19:53:39 +0000503const char *PyUnicode_GetDefaultEncoding()
504{
505 return unicode_default_encoding;
506}
507
508int PyUnicode_SetDefaultEncoding(const char *encoding)
509{
510 PyObject *v;
511
512 /* Make sure the encoding is valid. As side effect, this also
513 loads the encoding into the codec registry cache. */
514 v = _PyCodec_Lookup(encoding);
515 if (v == NULL)
516 goto onError;
517 Py_DECREF(v);
518 strncpy(unicode_default_encoding,
519 encoding,
520 sizeof(unicode_default_encoding));
521 return 0;
522
523 onError:
524 return -1;
525}
526
Guido van Rossumd57fd912000-03-10 22:53:23 +0000527/* --- UTF-8 Codec -------------------------------------------------------- */
528
/* Length of a UTF-8 sequence, indexed by its first byte.  A value of
   zero marks a byte that cannot start a sequence.  See RFC 2279. */
static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: one byte */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not a valid first byte */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFD: four, five and six bytes; 0xFE, 0xFF: invalid */
    4, 4, 4, 4, 4, 4, 4, 4,  5, 5, 5, 5, 6, 6, 0, 0
};
550
551static
552int utf8_decoding_error(const char **source,
553 Py_UNICODE **dest,
554 const char *errors,
555 const char *details)
556{
557 if ((errors == NULL) ||
558 (strcmp(errors,"strict") == 0)) {
559 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000560 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000561 details);
562 return -1;
563 }
564 else if (strcmp(errors,"ignore") == 0) {
565 (*source)++;
566 return 0;
567 }
568 else if (strcmp(errors,"replace") == 0) {
569 (*source)++;
570 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
571 (*dest)++;
572 return 0;
573 }
574 else {
575 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000576 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000577 errors);
578 return -1;
579 }
580}
581
582#define UTF8_ERROR(details) do { \
583 if (utf8_decoding_error(&s, &p, errors, details)) \
584 goto onError; \
585 continue; \
586} while (0)
587
588PyObject *PyUnicode_DecodeUTF8(const char *s,
589 int size,
590 const char *errors)
591{
592 int n;
593 const char *e;
594 PyUnicodeObject *unicode;
595 Py_UNICODE *p;
596
597 /* Note: size will always be longer than the resulting Unicode
598 character count */
599 unicode = _PyUnicode_New(size);
600 if (!unicode)
601 return NULL;
602 if (size == 0)
603 return (PyObject *)unicode;
604
605 /* Unpack UTF-8 encoded data */
606 p = unicode->str;
607 e = s + size;
608
609 while (s < e) {
610 register Py_UNICODE ch = (unsigned char)*s;
611
612 if (ch < 0x80) {
613 *p++ = ch;
614 s++;
615 continue;
616 }
617
618 n = utf8_code_length[ch];
619
620 if (s + n > e)
621 UTF8_ERROR("unexpected end of data");
622
623 switch (n) {
624
625 case 0:
626 UTF8_ERROR("unexpected code byte");
627 break;
628
629 case 1:
630 UTF8_ERROR("internal error");
631 break;
632
633 case 2:
634 if ((s[1] & 0xc0) != 0x80)
635 UTF8_ERROR("invalid data");
636 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
637 if (ch < 0x80)
638 UTF8_ERROR("illegal encoding");
639 else
640 *p++ = ch;
641 break;
642
643 case 3:
644 if ((s[1] & 0xc0) != 0x80 ||
645 (s[2] & 0xc0) != 0x80)
646 UTF8_ERROR("invalid data");
647 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
648 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
649 UTF8_ERROR("illegal encoding");
650 else
651 *p++ = ch;
652 break;
653
654 default:
655 /* Other sizes are only needed for UCS-4 */
656 UTF8_ERROR("unsupported Unicode code range");
657 }
658 s += n;
659 }
660
661 /* Adjust length */
662 if (_PyUnicode_Resize(unicode, p - unicode->str))
663 goto onError;
664
665 return (PyObject *)unicode;
666
667onError:
668 Py_DECREF(unicode);
669 return NULL;
670}
671
672#undef UTF8_ERROR
673
674static
675int utf8_encoding_error(const Py_UNICODE **source,
676 char **dest,
677 const char *errors,
678 const char *details)
679{
680 if ((errors == NULL) ||
681 (strcmp(errors,"strict") == 0)) {
682 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000683 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000684 details);
685 return -1;
686 }
687 else if (strcmp(errors,"ignore") == 0) {
688 return 0;
689 }
690 else if (strcmp(errors,"replace") == 0) {
691 **dest = '?';
692 (*dest)++;
693 return 0;
694 }
695 else {
696 PyErr_Format(PyExc_ValueError,
697 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000698 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000699 errors);
700 return -1;
701 }
702}
703
704PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
705 int size,
706 const char *errors)
707{
708 PyObject *v;
709 char *p;
710 char *q;
711
712 v = PyString_FromStringAndSize(NULL, 3 * size);
713 if (v == NULL)
714 return NULL;
715 if (size == 0)
716 goto done;
717
718 p = q = PyString_AS_STRING(v);
719 while (size-- > 0) {
720 Py_UNICODE ch = *s++;
721 if (ch < 0x80)
722 *p++ = (char) ch;
723 else if (ch < 0x0800) {
724 *p++ = 0xc0 | (ch >> 6);
725 *p++ = 0x80 | (ch & 0x3f);
726 } else if (0xD800 <= ch && ch <= 0xDFFF) {
727 /* These byte ranges are reserved for UTF-16 surrogate
728 bytes which the Python implementation currently does
729 not support. */
730 printf("code range problem: U+%04x\n", ch);
731 if (utf8_encoding_error(&s, &p, errors,
732 "unsupported code range"))
733 goto onError;
734 } else {
735 *p++ = 0xe0 | (ch >> 12);
736 *p++ = 0x80 | ((ch >> 6) & 0x3f);
737 *p++ = 0x80 | (ch & 0x3f);
738 }
739 }
740 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000741 if (_PyString_Resize(&v, p - q))
742 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000743
744 done:
745 return v;
746
747 onError:
748 Py_DECREF(v);
749 return NULL;
750}
751
752/* Return a Python string holding the UTF-8 encoded value of the
753 Unicode object.
754
755 The resulting string is cached in the Unicode object for subsequent
756 usage by this function. The cached version is needed to implement
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000757 the character buffer interface and will live (at least) as long as
758 the Unicode object itself.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000759
760 The refcount of the string is *not* incremented.
761
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000762 *** Exported for internal use by the interpreter only !!! ***
763
Guido van Rossumd57fd912000-03-10 22:53:23 +0000764*/
765
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000766PyObject *_PyUnicode_AsUTF8String(PyObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000767 const char *errors)
768{
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000769 PyObject *v = ((PyUnicodeObject *)unicode)->utf8str;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000770
771 if (v)
772 return v;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000773 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
774 PyUnicode_GET_SIZE(unicode),
Guido van Rossumd57fd912000-03-10 22:53:23 +0000775 errors);
776 if (v && errors == NULL)
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000777 ((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000778 return v;
779}
780
781PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
782{
783 PyObject *str;
784
785 if (!PyUnicode_Check(unicode)) {
786 PyErr_BadArgument();
787 return NULL;
788 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000789 str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000790 if (str == NULL)
791 return NULL;
792 Py_INCREF(str);
793 return str;
794}
795
796/* --- UTF-16 Codec ------------------------------------------------------- */
797
798static
799int utf16_decoding_error(const Py_UNICODE **source,
800 Py_UNICODE **dest,
801 const char *errors,
802 const char *details)
803{
804 if ((errors == NULL) ||
805 (strcmp(errors,"strict") == 0)) {
806 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000807 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000808 details);
809 return -1;
810 }
811 else if (strcmp(errors,"ignore") == 0) {
812 return 0;
813 }
814 else if (strcmp(errors,"replace") == 0) {
815 if (dest) {
816 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
817 (*dest)++;
818 }
819 return 0;
820 }
821 else {
822 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000823 "UTF-16 decoding error; "
824 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000825 errors);
826 return -1;
827 }
828}
829
830#define UTF16_ERROR(details) do { \
831 if (utf16_decoding_error(&q, &p, errors, details)) \
832 goto onError; \
833 continue; \
834} while(0)
835
836PyObject *PyUnicode_DecodeUTF16(const char *s,
837 int size,
838 const char *errors,
839 int *byteorder)
840{
841 PyUnicodeObject *unicode;
842 Py_UNICODE *p;
843 const Py_UNICODE *q, *e;
844 int bo = 0;
845
846 /* size should be an even number */
847 if (size % sizeof(Py_UNICODE) != 0) {
848 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
849 return NULL;
850 /* The remaining input chars are ignored if we fall through
851 here... */
852 }
853
854 /* Note: size will always be longer than the resulting Unicode
855 character count */
856 unicode = _PyUnicode_New(size);
857 if (!unicode)
858 return NULL;
859 if (size == 0)
860 return (PyObject *)unicode;
861
862 /* Unpack UTF-16 encoded data */
863 p = unicode->str;
864 q = (Py_UNICODE *)s;
865 e = q + (size / sizeof(Py_UNICODE));
866
867 if (byteorder)
868 bo = *byteorder;
869
870 while (q < e) {
871 register Py_UNICODE ch = *q++;
872
873 /* Check for BOM marks (U+FEFF) in the input and adjust
874 current byte order setting accordingly. Swap input
875 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
876 !) */
877#ifdef BYTEORDER_IS_LITTLE_ENDIAN
878 if (ch == 0xFEFF) {
879 bo = -1;
880 continue;
881 } else if (ch == 0xFFFE) {
882 bo = 1;
883 continue;
884 }
885 if (bo == 1)
886 ch = (ch >> 8) | (ch << 8);
887#else
888 if (ch == 0xFEFF) {
889 bo = 1;
890 continue;
891 } else if (ch == 0xFFFE) {
892 bo = -1;
893 continue;
894 }
895 if (bo == -1)
896 ch = (ch >> 8) | (ch << 8);
897#endif
898 if (ch < 0xD800 || ch > 0xDFFF) {
899 *p++ = ch;
900 continue;
901 }
902
903 /* UTF-16 code pair: */
904 if (q >= e)
905 UTF16_ERROR("unexpected end of data");
906 if (0xDC00 <= *q && *q <= 0xDFFF) {
907 q++;
908 if (0xD800 <= *q && *q <= 0xDBFF)
909 /* This is valid data (a UTF-16 surrogate pair), but
910 we are not able to store this information since our
911 Py_UNICODE type only has 16 bits... this might
912 change someday, even though it's unlikely. */
913 UTF16_ERROR("code pairs are not supported");
914 else
915 continue;
916 }
917 UTF16_ERROR("illegal encoding");
918 }
919
920 if (byteorder)
921 *byteorder = bo;
922
923 /* Adjust length */
924 if (_PyUnicode_Resize(unicode, p - unicode->str))
925 goto onError;
926
927 return (PyObject *)unicode;
928
929onError:
930 Py_DECREF(unicode);
931 return NULL;
932}
933
934#undef UTF16_ERROR
935
936PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
937 int size,
938 const char *errors,
939 int byteorder)
940{
941 PyObject *v;
942 Py_UNICODE *p;
943 char *q;
944
945 /* We don't create UTF-16 pairs... */
946 v = PyString_FromStringAndSize(NULL,
947 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
948 if (v == NULL)
949 return NULL;
950 if (size == 0)
951 goto done;
952
953 q = PyString_AS_STRING(v);
954 p = (Py_UNICODE *)q;
955
956 if (byteorder == 0)
957 *p++ = 0xFEFF;
958 if (byteorder == 0 ||
959#ifdef BYTEORDER_IS_LITTLE_ENDIAN
960 byteorder == -1
961#else
962 byteorder == 1
963#endif
964 )
965 memcpy(p, s, size * sizeof(Py_UNICODE));
966 else
967 while (size-- > 0) {
968 Py_UNICODE ch = *s++;
969 *p++ = (ch >> 8) | (ch << 8);
970 }
971 done:
972 return v;
973}
974
975PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
976{
977 if (!PyUnicode_Check(unicode)) {
978 PyErr_BadArgument();
979 return NULL;
980 }
981 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
982 PyUnicode_GET_SIZE(unicode),
983 NULL,
984 0);
985}
986
987/* --- Unicode Escape Codec ----------------------------------------------- */
988
989static
990int unicodeescape_decoding_error(const char **source,
991 unsigned int *x,
992 const char *errors,
993 const char *details)
994{
995 if ((errors == NULL) ||
996 (strcmp(errors,"strict") == 0)) {
997 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000998 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000999 details);
1000 return -1;
1001 }
1002 else if (strcmp(errors,"ignore") == 0) {
1003 return 0;
1004 }
1005 else if (strcmp(errors,"replace") == 0) {
1006 *x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
1007 return 0;
1008 }
1009 else {
1010 PyErr_Format(PyExc_ValueError,
1011 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001012 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001013 errors);
1014 return -1;
1015 }
1016}
1017
1018PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1019 int size,
1020 const char *errors)
1021{
1022 PyUnicodeObject *v;
1023 Py_UNICODE *p = NULL, *buf = NULL;
1024 const char *end;
1025
1026 /* Escaped strings will always be longer than the resulting
1027 Unicode string, so we start with size here and then reduce the
1028 length after conversion to the true value. */
1029 v = _PyUnicode_New(size);
1030 if (v == NULL)
1031 goto onError;
1032 if (size == 0)
1033 return (PyObject *)v;
1034 p = buf = PyUnicode_AS_UNICODE(v);
1035 end = s + size;
1036 while (s < end) {
1037 unsigned char c;
1038 unsigned int x;
1039 int i;
1040
1041 /* Non-escape characters are interpreted as Unicode ordinals */
1042 if (*s != '\\') {
1043 *p++ = (unsigned char)*s++;
1044 continue;
1045 }
1046
1047 /* \ - Escapes */
1048 s++;
1049 switch (*s++) {
1050
1051 /* \x escapes */
1052 case '\n': break;
1053 case '\\': *p++ = '\\'; break;
1054 case '\'': *p++ = '\''; break;
1055 case '\"': *p++ = '\"'; break;
1056 case 'b': *p++ = '\b'; break;
1057 case 'f': *p++ = '\014'; break; /* FF */
1058 case 't': *p++ = '\t'; break;
1059 case 'n': *p++ = '\n'; break;
1060 case 'r': *p++ = '\r'; break;
1061 case 'v': *p++ = '\013'; break; /* VT */
1062 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1063
1064 /* \OOO (octal) escapes */
1065 case '0': case '1': case '2': case '3':
1066 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001067 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001068 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001069 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001070 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001071 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001073 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074 break;
1075
1076 /* \xXXXX escape with 0-4 hex digits */
1077 case 'x':
1078 x = 0;
1079 c = (unsigned char)*s;
1080 if (isxdigit(c)) {
1081 do {
1082 x = (x<<4) & ~0xF;
1083 if ('0' <= c && c <= '9')
1084 x += c - '0';
1085 else if ('a' <= c && c <= 'f')
1086 x += 10 + c - 'a';
1087 else
1088 x += 10 + c - 'A';
1089 c = (unsigned char)*++s;
1090 } while (isxdigit(c));
1091 *p++ = x;
1092 } else {
1093 *p++ = '\\';
1094 *p++ = (unsigned char)s[-1];
1095 }
1096 break;
1097
1098 /* \uXXXX with 4 hex digits */
1099 case 'u':
1100 for (x = 0, i = 0; i < 4; i++) {
1101 c = (unsigned char)s[i];
1102 if (!isxdigit(c)) {
1103 if (unicodeescape_decoding_error(&s, &x, errors,
1104 "truncated \\uXXXX"))
1105 goto onError;
1106 i++;
1107 break;
1108 }
1109 x = (x<<4) & ~0xF;
1110 if (c >= '0' && c <= '9')
1111 x += c - '0';
1112 else if (c >= 'a' && c <= 'f')
1113 x += 10 + c - 'a';
1114 else
1115 x += 10 + c - 'A';
1116 }
1117 s += i;
1118 *p++ = x;
1119 break;
1120
1121 default:
1122 *p++ = '\\';
1123 *p++ = (unsigned char)s[-1];
1124 break;
1125 }
1126 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001127 if (_PyUnicode_Resize(v, (int)(p - buf)))
1128 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001129 return (PyObject *)v;
1130
1131 onError:
1132 Py_XDECREF(v);
1133 return NULL;
1134}
1135
1136/* Return a Unicode-Escape string version of the Unicode object.
1137
1138 If quotes is true, the string is enclosed in u"" or u'' quotes as
1139 appropriate.
1140
1141*/
1142
Barry Warsaw51ac5802000-03-20 16:36:48 +00001143static const Py_UNICODE *findchar(const Py_UNICODE *s,
1144 int size,
1145 Py_UNICODE ch);
1146
Guido van Rossumd57fd912000-03-10 22:53:23 +00001147static
1148PyObject *unicodeescape_string(const Py_UNICODE *s,
1149 int size,
1150 int quotes)
1151{
1152 PyObject *repr;
1153 char *p;
1154 char *q;
1155
1156 static const char *hexdigit = "0123456789ABCDEF";
1157
1158 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1159 if (repr == NULL)
1160 return NULL;
1161
1162 p = q = PyString_AS_STRING(repr);
1163
1164 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001165 *p++ = 'u';
1166 *p++ = (findchar(s, size, '\'') &&
1167 !findchar(s, size, '"')) ? '"' : '\'';
1168 }
1169 while (size-- > 0) {
1170 Py_UNICODE ch = *s++;
1171 /* Escape quotes */
1172 if (quotes && (ch == q[1] || ch == '\\')) {
1173 *p++ = '\\';
1174 *p++ = (char) ch;
1175 }
1176 /* Map 16-bit characters to '\uxxxx' */
1177 else if (ch >= 256) {
1178 *p++ = '\\';
1179 *p++ = 'u';
1180 *p++ = hexdigit[(ch >> 12) & 0xf];
1181 *p++ = hexdigit[(ch >> 8) & 0xf];
1182 *p++ = hexdigit[(ch >> 4) & 0xf];
1183 *p++ = hexdigit[ch & 15];
1184 }
1185 /* Map non-printable US ASCII to '\ooo' */
1186 else if (ch < ' ' || ch >= 128) {
1187 *p++ = '\\';
1188 *p++ = hexdigit[(ch >> 6) & 7];
1189 *p++ = hexdigit[(ch >> 3) & 7];
1190 *p++ = hexdigit[ch & 7];
1191 }
1192 /* Copy everything else as-is */
1193 else
1194 *p++ = (char) ch;
1195 }
1196 if (quotes)
1197 *p++ = q[1];
1198
1199 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001200 if (_PyString_Resize(&repr, p - q))
1201 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202
1203 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001204
1205 onError:
1206 Py_DECREF(repr);
1207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208}
1209
1210PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1211 int size)
1212{
1213 return unicodeescape_string(s, size, 0);
1214}
1215
1216PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1217{
1218 if (!PyUnicode_Check(unicode)) {
1219 PyErr_BadArgument();
1220 return NULL;
1221 }
1222 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1223 PyUnicode_GET_SIZE(unicode));
1224}
1225
1226/* --- Raw Unicode Escape Codec ------------------------------------------- */
1227
1228PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1229 int size,
1230 const char *errors)
1231{
1232 PyUnicodeObject *v;
1233 Py_UNICODE *p, *buf;
1234 const char *end;
1235 const char *bs;
1236
1237 /* Escaped strings will always be longer than the resulting
1238 Unicode string, so we start with size here and then reduce the
1239 length after conversion to the true value. */
1240 v = _PyUnicode_New(size);
1241 if (v == NULL)
1242 goto onError;
1243 if (size == 0)
1244 return (PyObject *)v;
1245 p = buf = PyUnicode_AS_UNICODE(v);
1246 end = s + size;
1247 while (s < end) {
1248 unsigned char c;
1249 unsigned int x;
1250 int i;
1251
1252 /* Non-escape characters are interpreted as Unicode ordinals */
1253 if (*s != '\\') {
1254 *p++ = (unsigned char)*s++;
1255 continue;
1256 }
1257
1258 /* \u-escapes are only interpreted iff the number of leading
1259 backslashes if odd */
1260 bs = s;
1261 for (;s < end;) {
1262 if (*s != '\\')
1263 break;
1264 *p++ = (unsigned char)*s++;
1265 }
1266 if (((s - bs) & 1) == 0 ||
1267 s >= end ||
1268 *s != 'u') {
1269 continue;
1270 }
1271 p--;
1272 s++;
1273
1274 /* \uXXXX with 4 hex digits */
1275 for (x = 0, i = 0; i < 4; i++) {
1276 c = (unsigned char)s[i];
1277 if (!isxdigit(c)) {
1278 if (unicodeescape_decoding_error(&s, &x, errors,
1279 "truncated \\uXXXX"))
1280 goto onError;
1281 i++;
1282 break;
1283 }
1284 x = (x<<4) & ~0xF;
1285 if (c >= '0' && c <= '9')
1286 x += c - '0';
1287 else if (c >= 'a' && c <= 'f')
1288 x += 10 + c - 'a';
1289 else
1290 x += 10 + c - 'A';
1291 }
1292 s += i;
1293 *p++ = x;
1294 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001295 if (_PyUnicode_Resize(v, (int)(p - buf)))
1296 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001297 return (PyObject *)v;
1298
1299 onError:
1300 Py_XDECREF(v);
1301 return NULL;
1302}
1303
1304PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1305 int size)
1306{
1307 PyObject *repr;
1308 char *p;
1309 char *q;
1310
1311 static const char *hexdigit = "0123456789ABCDEF";
1312
1313 repr = PyString_FromStringAndSize(NULL, 6 * size);
1314 if (repr == NULL)
1315 return NULL;
1316
1317 p = q = PyString_AS_STRING(repr);
1318 while (size-- > 0) {
1319 Py_UNICODE ch = *s++;
1320 /* Map 16-bit characters to '\uxxxx' */
1321 if (ch >= 256) {
1322 *p++ = '\\';
1323 *p++ = 'u';
1324 *p++ = hexdigit[(ch >> 12) & 0xf];
1325 *p++ = hexdigit[(ch >> 8) & 0xf];
1326 *p++ = hexdigit[(ch >> 4) & 0xf];
1327 *p++ = hexdigit[ch & 15];
1328 }
1329 /* Copy everything else as-is */
1330 else
1331 *p++ = (char) ch;
1332 }
1333 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001334 if (_PyString_Resize(&repr, p - q))
1335 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001336
1337 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001338
1339 onError:
1340 Py_DECREF(repr);
1341 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001342}
1343
1344PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1345{
1346 if (!PyUnicode_Check(unicode)) {
1347 PyErr_BadArgument();
1348 return NULL;
1349 }
1350 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1351 PyUnicode_GET_SIZE(unicode));
1352}
1353
1354/* --- Latin-1 Codec ------------------------------------------------------ */
1355
1356PyObject *PyUnicode_DecodeLatin1(const char *s,
1357 int size,
1358 const char *errors)
1359{
1360 PyUnicodeObject *v;
1361 Py_UNICODE *p;
1362
1363 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1364 v = _PyUnicode_New(size);
1365 if (v == NULL)
1366 goto onError;
1367 if (size == 0)
1368 return (PyObject *)v;
1369 p = PyUnicode_AS_UNICODE(v);
1370 while (size-- > 0)
1371 *p++ = (unsigned char)*s++;
1372 return (PyObject *)v;
1373
1374 onError:
1375 Py_XDECREF(v);
1376 return NULL;
1377}
1378
1379static
1380int latin1_encoding_error(const Py_UNICODE **source,
1381 char **dest,
1382 const char *errors,
1383 const char *details)
1384{
1385 if ((errors == NULL) ||
1386 (strcmp(errors,"strict") == 0)) {
1387 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001388 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001389 details);
1390 return -1;
1391 }
1392 else if (strcmp(errors,"ignore") == 0) {
1393 return 0;
1394 }
1395 else if (strcmp(errors,"replace") == 0) {
1396 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001397 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001398 return 0;
1399 }
1400 else {
1401 PyErr_Format(PyExc_ValueError,
1402 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001403 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001404 errors);
1405 return -1;
1406 }
1407}
1408
1409PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1410 int size,
1411 const char *errors)
1412{
1413 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001414 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001415 repr = PyString_FromStringAndSize(NULL, size);
1416 if (repr == NULL)
1417 return NULL;
1418
1419 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001420 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001421 while (size-- > 0) {
1422 Py_UNICODE ch = *p++;
1423 if (ch >= 256) {
1424 if (latin1_encoding_error(&p, &s, errors,
1425 "ordinal not in range(256)"))
1426 goto onError;
1427 }
1428 else
1429 *s++ = (char)ch;
1430 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001431 /* Resize if error handling skipped some characters */
1432 if (s - start < PyString_GET_SIZE(repr))
1433 if (_PyString_Resize(&repr, s - start))
1434 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001435 return repr;
1436
1437 onError:
1438 Py_DECREF(repr);
1439 return NULL;
1440}
1441
1442PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1443{
1444 if (!PyUnicode_Check(unicode)) {
1445 PyErr_BadArgument();
1446 return NULL;
1447 }
1448 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1449 PyUnicode_GET_SIZE(unicode),
1450 NULL);
1451}
1452
1453/* --- 7-bit ASCII Codec -------------------------------------------------- */
1454
1455static
1456int ascii_decoding_error(const char **source,
1457 Py_UNICODE **dest,
1458 const char *errors,
1459 const char *details)
1460{
1461 if ((errors == NULL) ||
1462 (strcmp(errors,"strict") == 0)) {
1463 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001464 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001465 details);
1466 return -1;
1467 }
1468 else if (strcmp(errors,"ignore") == 0) {
1469 return 0;
1470 }
1471 else if (strcmp(errors,"replace") == 0) {
1472 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1473 (*dest)++;
1474 return 0;
1475 }
1476 else {
1477 PyErr_Format(PyExc_ValueError,
1478 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001479 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001480 errors);
1481 return -1;
1482 }
1483}
1484
1485PyObject *PyUnicode_DecodeASCII(const char *s,
1486 int size,
1487 const char *errors)
1488{
1489 PyUnicodeObject *v;
1490 Py_UNICODE *p;
1491
1492 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1493 v = _PyUnicode_New(size);
1494 if (v == NULL)
1495 goto onError;
1496 if (size == 0)
1497 return (PyObject *)v;
1498 p = PyUnicode_AS_UNICODE(v);
1499 while (size-- > 0) {
1500 register unsigned char c;
1501
1502 c = (unsigned char)*s++;
1503 if (c < 128)
1504 *p++ = c;
1505 else if (ascii_decoding_error(&s, &p, errors,
1506 "ordinal not in range(128)"))
1507 goto onError;
1508 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001509 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1510 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1511 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001512 return (PyObject *)v;
1513
1514 onError:
1515 Py_XDECREF(v);
1516 return NULL;
1517}
1518
1519static
1520int ascii_encoding_error(const Py_UNICODE **source,
1521 char **dest,
1522 const char *errors,
1523 const char *details)
1524{
1525 if ((errors == NULL) ||
1526 (strcmp(errors,"strict") == 0)) {
1527 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001528 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001529 details);
1530 return -1;
1531 }
1532 else if (strcmp(errors,"ignore") == 0) {
1533 return 0;
1534 }
1535 else if (strcmp(errors,"replace") == 0) {
1536 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001537 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001538 return 0;
1539 }
1540 else {
1541 PyErr_Format(PyExc_ValueError,
1542 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001543 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001544 errors);
1545 return -1;
1546 }
1547}
1548
1549PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1550 int size,
1551 const char *errors)
1552{
1553 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001554 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001555 repr = PyString_FromStringAndSize(NULL, size);
1556 if (repr == NULL)
1557 return NULL;
1558
1559 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001560 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001561 while (size-- > 0) {
1562 Py_UNICODE ch = *p++;
1563 if (ch >= 128) {
1564 if (ascii_encoding_error(&p, &s, errors,
1565 "ordinal not in range(128)"))
1566 goto onError;
1567 }
1568 else
1569 *s++ = (char)ch;
1570 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001571 /* Resize if error handling skipped some characters */
1572 if (s - start < PyString_GET_SIZE(repr))
1573 if (_PyString_Resize(&repr, s - start))
1574 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001575 return repr;
1576
1577 onError:
1578 Py_DECREF(repr);
1579 return NULL;
1580}
1581
1582PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1583{
1584 if (!PyUnicode_Check(unicode)) {
1585 PyErr_BadArgument();
1586 return NULL;
1587 }
1588 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1589 PyUnicode_GET_SIZE(unicode),
1590 NULL);
1591}
1592
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001593#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001594
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001595/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001596
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001597PyObject *PyUnicode_DecodeMBCS(const char *s,
1598 int size,
1599 const char *errors)
1600{
1601 PyUnicodeObject *v;
1602 Py_UNICODE *p;
1603
1604 /* First get the size of the result */
1605 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001606 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001607 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1608
1609 v = _PyUnicode_New(usize);
1610 if (v == NULL)
1611 return NULL;
1612 if (usize == 0)
1613 return (PyObject *)v;
1614 p = PyUnicode_AS_UNICODE(v);
1615 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1616 Py_DECREF(v);
1617 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1618 }
1619
1620 return (PyObject *)v;
1621}
1622
1623PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1624 int size,
1625 const char *errors)
1626{
1627 PyObject *repr;
1628 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001629 DWORD mbcssize;
1630
1631 /* If there are no characters, bail now! */
1632 if (size==0)
1633 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001634
1635 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001636 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001637 if (mbcssize==0)
1638 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1639
1640 repr = PyString_FromStringAndSize(NULL, mbcssize);
1641 if (repr == NULL)
1642 return NULL;
1643 if (mbcssize==0)
1644 return repr;
1645
1646 /* Do the conversion */
1647 s = PyString_AS_STRING(repr);
1648 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1649 Py_DECREF(repr);
1650 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1651 }
1652 return repr;
1653}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001654
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001655#endif /* MS_WIN32 */
1656
Guido van Rossumd57fd912000-03-10 22:53:23 +00001657/* --- Character Mapping Codec -------------------------------------------- */
1658
1659static
1660int charmap_decoding_error(const char **source,
1661 Py_UNICODE **dest,
1662 const char *errors,
1663 const char *details)
1664{
1665 if ((errors == NULL) ||
1666 (strcmp(errors,"strict") == 0)) {
1667 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001668 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001669 details);
1670 return -1;
1671 }
1672 else if (strcmp(errors,"ignore") == 0) {
1673 return 0;
1674 }
1675 else if (strcmp(errors,"replace") == 0) {
1676 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1677 (*dest)++;
1678 return 0;
1679 }
1680 else {
1681 PyErr_Format(PyExc_ValueError,
1682 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001683 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001684 errors);
1685 return -1;
1686 }
1687}
1688
1689PyObject *PyUnicode_DecodeCharmap(const char *s,
1690 int size,
1691 PyObject *mapping,
1692 const char *errors)
1693{
1694 PyUnicodeObject *v;
1695 Py_UNICODE *p;
1696
1697 /* Default to Latin-1 */
1698 if (mapping == NULL)
1699 return PyUnicode_DecodeLatin1(s, size, errors);
1700
1701 v = _PyUnicode_New(size);
1702 if (v == NULL)
1703 goto onError;
1704 if (size == 0)
1705 return (PyObject *)v;
1706 p = PyUnicode_AS_UNICODE(v);
1707 while (size-- > 0) {
1708 unsigned char ch = *s++;
1709 PyObject *w, *x;
1710
1711 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1712 w = PyInt_FromLong((long)ch);
1713 if (w == NULL)
1714 goto onError;
1715 x = PyObject_GetItem(mapping, w);
1716 Py_DECREF(w);
1717 if (x == NULL) {
1718 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1719 /* No mapping found: default to Latin-1 mapping */
1720 PyErr_Clear();
1721 *p++ = (Py_UNICODE)ch;
1722 continue;
1723 }
1724 goto onError;
1725 }
1726
1727 /* Apply mapping */
1728 if (PyInt_Check(x)) {
1729 int value = PyInt_AS_LONG(x);
1730 if (value < 0 || value > 65535) {
1731 PyErr_SetString(PyExc_TypeError,
1732 "character mapping must be in range(65336)");
1733 Py_DECREF(x);
1734 goto onError;
1735 }
1736 *p++ = (Py_UNICODE)value;
1737 }
1738 else if (x == Py_None) {
1739 /* undefined mapping */
1740 if (charmap_decoding_error(&s, &p, errors,
1741 "character maps to <undefined>")) {
1742 Py_DECREF(x);
1743 goto onError;
1744 }
1745 }
1746 else if (PyUnicode_Check(x)) {
1747 if (PyUnicode_GET_SIZE(x) != 1) {
1748 /* 1-n mapping */
1749 PyErr_SetString(PyExc_NotImplementedError,
1750 "1-n mappings are currently not implemented");
1751 Py_DECREF(x);
1752 goto onError;
1753 }
1754 *p++ = *PyUnicode_AS_UNICODE(x);
1755 }
1756 else {
1757 /* wrong return value */
1758 PyErr_SetString(PyExc_TypeError,
1759 "character mapping must return integer, None or unicode");
1760 Py_DECREF(x);
1761 goto onError;
1762 }
1763 Py_DECREF(x);
1764 }
1765 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1766 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1767 goto onError;
1768 return (PyObject *)v;
1769
1770 onError:
1771 Py_XDECREF(v);
1772 return NULL;
1773}
1774
1775static
1776int charmap_encoding_error(const Py_UNICODE **source,
1777 char **dest,
1778 const char *errors,
1779 const char *details)
1780{
1781 if ((errors == NULL) ||
1782 (strcmp(errors,"strict") == 0)) {
1783 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001784 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001785 details);
1786 return -1;
1787 }
1788 else if (strcmp(errors,"ignore") == 0) {
1789 return 0;
1790 }
1791 else if (strcmp(errors,"replace") == 0) {
1792 **dest = '?';
1793 (*dest)++;
1794 return 0;
1795 }
1796 else {
1797 PyErr_Format(PyExc_ValueError,
1798 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001799 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001800 errors);
1801 return -1;
1802 }
1803}
1804
1805PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
1806 int size,
1807 PyObject *mapping,
1808 const char *errors)
1809{
1810 PyObject *v;
1811 char *s;
1812
1813 /* Default to Latin-1 */
1814 if (mapping == NULL)
1815 return PyUnicode_EncodeLatin1(p, size, errors);
1816
1817 v = PyString_FromStringAndSize(NULL, size);
1818 if (v == NULL)
1819 return NULL;
1820 s = PyString_AS_STRING(v);
1821 while (size-- > 0) {
1822 Py_UNICODE ch = *p++;
1823 PyObject *w, *x;
1824
1825 /* Get mapping (Unicode ordinal -> string char, integer or None) */
1826 w = PyInt_FromLong((long)ch);
1827 if (w == NULL)
1828 goto onError;
1829 x = PyObject_GetItem(mapping, w);
1830 Py_DECREF(w);
1831 if (x == NULL) {
1832 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1833 /* No mapping found: default to Latin-1 mapping if possible */
1834 PyErr_Clear();
1835 if (ch < 256) {
1836 *s++ = (char)ch;
1837 continue;
1838 }
1839 else if (!charmap_encoding_error(&p, &s, errors,
1840 "missing character mapping"))
1841 continue;
1842 }
1843 goto onError;
1844 }
1845
1846 /* Apply mapping */
1847 if (PyInt_Check(x)) {
1848 int value = PyInt_AS_LONG(x);
1849 if (value < 0 || value > 255) {
1850 PyErr_SetString(PyExc_TypeError,
1851 "character mapping must be in range(256)");
1852 Py_DECREF(x);
1853 goto onError;
1854 }
1855 *s++ = (char)value;
1856 }
1857 else if (x == Py_None) {
1858 /* undefined mapping */
1859 if (charmap_encoding_error(&p, &s, errors,
1860 "character maps to <undefined>")) {
1861 Py_DECREF(x);
1862 goto onError;
1863 }
1864 }
1865 else if (PyString_Check(x)) {
1866 if (PyString_GET_SIZE(x) != 1) {
1867 /* 1-n mapping */
1868 PyErr_SetString(PyExc_NotImplementedError,
1869 "1-n mappings are currently not implemented");
1870 Py_DECREF(x);
1871 goto onError;
1872 }
1873 *s++ = *PyString_AS_STRING(x);
1874 }
1875 else {
1876 /* wrong return value */
1877 PyErr_SetString(PyExc_TypeError,
1878 "character mapping must return integer, None or unicode");
1879 Py_DECREF(x);
1880 goto onError;
1881 }
1882 Py_DECREF(x);
1883 }
1884 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
1885 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
1886 goto onError;
1887 return v;
1888
1889 onError:
1890 Py_DECREF(v);
1891 return NULL;
1892}
1893
1894PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
1895 PyObject *mapping)
1896{
1897 if (!PyUnicode_Check(unicode) || mapping == NULL) {
1898 PyErr_BadArgument();
1899 return NULL;
1900 }
1901 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
1902 PyUnicode_GET_SIZE(unicode),
1903 mapping,
1904 NULL);
1905}
1906
1907static
1908int translate_error(const Py_UNICODE **source,
1909 Py_UNICODE **dest,
1910 const char *errors,
1911 const char *details)
1912{
1913 if ((errors == NULL) ||
1914 (strcmp(errors,"strict") == 0)) {
1915 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001916 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001917 details);
1918 return -1;
1919 }
1920 else if (strcmp(errors,"ignore") == 0) {
1921 return 0;
1922 }
1923 else if (strcmp(errors,"replace") == 0) {
1924 **dest = '?';
1925 (*dest)++;
1926 return 0;
1927 }
1928 else {
1929 PyErr_Format(PyExc_ValueError,
1930 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001931 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932 errors);
1933 return -1;
1934 }
1935}
1936
1937PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
1938 int size,
1939 PyObject *mapping,
1940 const char *errors)
1941{
1942 PyUnicodeObject *v;
1943 Py_UNICODE *p;
1944
1945 if (mapping == NULL) {
1946 PyErr_BadArgument();
1947 return NULL;
1948 }
1949
1950 /* Output will never be longer than input */
1951 v = _PyUnicode_New(size);
1952 if (v == NULL)
1953 goto onError;
1954 if (size == 0)
1955 goto done;
1956 p = PyUnicode_AS_UNICODE(v);
1957 while (size-- > 0) {
1958 Py_UNICODE ch = *s++;
1959 PyObject *w, *x;
1960
1961 /* Get mapping */
1962 w = PyInt_FromLong(ch);
1963 if (w == NULL)
1964 goto onError;
1965 x = PyObject_GetItem(mapping, w);
1966 Py_DECREF(w);
1967 if (x == NULL) {
1968 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1969 /* No mapping found: default to 1-1 mapping */
1970 PyErr_Clear();
1971 *p++ = ch;
1972 continue;
1973 }
1974 goto onError;
1975 }
1976
1977 /* Apply mapping */
1978 if (PyInt_Check(x))
1979 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
1980 else if (x == Py_None) {
1981 /* undefined mapping */
1982 if (translate_error(&s, &p, errors,
1983 "character maps to <undefined>")) {
1984 Py_DECREF(x);
1985 goto onError;
1986 }
1987 }
1988 else if (PyUnicode_Check(x)) {
1989 if (PyUnicode_GET_SIZE(x) != 1) {
1990 /* 1-n mapping */
1991 PyErr_SetString(PyExc_NotImplementedError,
1992 "1-n mappings are currently not implemented");
1993 Py_DECREF(x);
1994 goto onError;
1995 }
1996 *p++ = *PyUnicode_AS_UNICODE(x);
1997 }
1998 else {
1999 /* wrong return value */
2000 PyErr_SetString(PyExc_TypeError,
2001 "translate mapping must return integer, None or unicode");
2002 Py_DECREF(x);
2003 goto onError;
2004 }
2005 Py_DECREF(x);
2006 }
2007 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002008 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2009 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010
2011 done:
2012 return (PyObject *)v;
2013
2014 onError:
2015 Py_XDECREF(v);
2016 return NULL;
2017}
2018
2019PyObject *PyUnicode_Translate(PyObject *str,
2020 PyObject *mapping,
2021 const char *errors)
2022{
2023 PyObject *result;
2024
2025 str = PyUnicode_FromObject(str);
2026 if (str == NULL)
2027 goto onError;
2028 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2029 PyUnicode_GET_SIZE(str),
2030 mapping,
2031 errors);
2032 Py_DECREF(str);
2033 return result;
2034
2035 onError:
2036 Py_XDECREF(str);
2037 return NULL;
2038}
2039
Guido van Rossum9e896b32000-04-05 20:11:21 +00002040/* --- Decimal Encoder ---------------------------------------------------- */
2041
2042int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2043 int length,
2044 char *output,
2045 const char *errors)
2046{
2047 Py_UNICODE *p, *end;
2048
2049 if (output == NULL) {
2050 PyErr_BadArgument();
2051 return -1;
2052 }
2053
2054 p = s;
2055 end = s + length;
2056 while (p < end) {
2057 register Py_UNICODE ch = *p++;
2058 int decimal;
2059
2060 if (Py_UNICODE_ISSPACE(ch)) {
2061 *output++ = ' ';
2062 continue;
2063 }
2064 decimal = Py_UNICODE_TODECIMAL(ch);
2065 if (decimal >= 0) {
2066 *output++ = '0' + decimal;
2067 continue;
2068 }
Guido van Rossumba477042000-04-06 18:18:10 +00002069 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002070 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002071 continue;
2072 }
2073 /* All other characters are considered invalid */
2074 if (errors == NULL || strcmp(errors, "strict") == 0) {
2075 PyErr_SetString(PyExc_ValueError,
2076 "invalid decimal Unicode string");
2077 goto onError;
2078 }
2079 else if (strcmp(errors, "ignore") == 0)
2080 continue;
2081 else if (strcmp(errors, "replace") == 0) {
2082 *output++ = '?';
2083 continue;
2084 }
2085 }
2086 /* 0-terminate the output string */
2087 *output++ = '\0';
2088 return 0;
2089
2090 onError:
2091 return -1;
2092}
2093
Guido van Rossumd57fd912000-03-10 22:53:23 +00002094/* --- Helpers ------------------------------------------------------------ */
2095
2096static
2097int count(PyUnicodeObject *self,
2098 int start,
2099 int end,
2100 PyUnicodeObject *substring)
2101{
2102 int count = 0;
2103
2104 end -= substring->length;
2105
2106 while (start <= end)
2107 if (Py_UNICODE_MATCH(self, start, substring)) {
2108 count++;
2109 start += substring->length;
2110 } else
2111 start++;
2112
2113 return count;
2114}
2115
2116int PyUnicode_Count(PyObject *str,
2117 PyObject *substr,
2118 int start,
2119 int end)
2120{
2121 int result;
2122
2123 str = PyUnicode_FromObject(str);
2124 if (str == NULL)
2125 return -1;
2126 substr = PyUnicode_FromObject(substr);
2127 if (substr == NULL) {
2128 Py_DECREF(substr);
2129 return -1;
2130 }
2131
2132 result = count((PyUnicodeObject *)str,
2133 start, end,
2134 (PyUnicodeObject *)substr);
2135
2136 Py_DECREF(str);
2137 Py_DECREF(substr);
2138 return result;
2139}
2140
2141static
2142int findstring(PyUnicodeObject *self,
2143 PyUnicodeObject *substring,
2144 int start,
2145 int end,
2146 int direction)
2147{
2148 if (start < 0)
2149 start += self->length;
2150 if (start < 0)
2151 start = 0;
2152
2153 if (substring->length == 0)
2154 return start;
2155
2156 if (end > self->length)
2157 end = self->length;
2158 if (end < 0)
2159 end += self->length;
2160 if (end < 0)
2161 end = 0;
2162
2163 end -= substring->length;
2164
2165 if (direction < 0) {
2166 for (; end >= start; end--)
2167 if (Py_UNICODE_MATCH(self, end, substring))
2168 return end;
2169 } else {
2170 for (; start <= end; start++)
2171 if (Py_UNICODE_MATCH(self, start, substring))
2172 return start;
2173 }
2174
2175 return -1;
2176}
2177
2178int PyUnicode_Find(PyObject *str,
2179 PyObject *substr,
2180 int start,
2181 int end,
2182 int direction)
2183{
2184 int result;
2185
2186 str = PyUnicode_FromObject(str);
2187 if (str == NULL)
2188 return -1;
2189 substr = PyUnicode_FromObject(substr);
2190 if (substr == NULL) {
2191 Py_DECREF(substr);
2192 return -1;
2193 }
2194
2195 result = findstring((PyUnicodeObject *)str,
2196 (PyUnicodeObject *)substr,
2197 start, end, direction);
2198 Py_DECREF(str);
2199 Py_DECREF(substr);
2200 return result;
2201}
2202
2203static
2204int tailmatch(PyUnicodeObject *self,
2205 PyUnicodeObject *substring,
2206 int start,
2207 int end,
2208 int direction)
2209{
2210 if (start < 0)
2211 start += self->length;
2212 if (start < 0)
2213 start = 0;
2214
2215 if (substring->length == 0)
2216 return 1;
2217
2218 if (end > self->length)
2219 end = self->length;
2220 if (end < 0)
2221 end += self->length;
2222 if (end < 0)
2223 end = 0;
2224
2225 end -= substring->length;
2226 if (end < start)
2227 return 0;
2228
2229 if (direction > 0) {
2230 if (Py_UNICODE_MATCH(self, end, substring))
2231 return 1;
2232 } else {
2233 if (Py_UNICODE_MATCH(self, start, substring))
2234 return 1;
2235 }
2236
2237 return 0;
2238}
2239
2240int PyUnicode_Tailmatch(PyObject *str,
2241 PyObject *substr,
2242 int start,
2243 int end,
2244 int direction)
2245{
2246 int result;
2247
2248 str = PyUnicode_FromObject(str);
2249 if (str == NULL)
2250 return -1;
2251 substr = PyUnicode_FromObject(substr);
2252 if (substr == NULL) {
2253 Py_DECREF(substr);
2254 return -1;
2255 }
2256
2257 result = tailmatch((PyUnicodeObject *)str,
2258 (PyUnicodeObject *)substr,
2259 start, end, direction);
2260 Py_DECREF(str);
2261 Py_DECREF(substr);
2262 return result;
2263}
2264
2265static
2266const Py_UNICODE *findchar(const Py_UNICODE *s,
2267 int size,
2268 Py_UNICODE ch)
2269{
2270 /* like wcschr, but doesn't stop at NULL characters */
2271
2272 while (size-- > 0) {
2273 if (*s == ch)
2274 return s;
2275 s++;
2276 }
2277
2278 return NULL;
2279}
2280
2281/* Apply fixfct filter to the Unicode object self and return a
2282 reference to the modified object */
2283
2284static
2285PyObject *fixup(PyUnicodeObject *self,
2286 int (*fixfct)(PyUnicodeObject *s))
2287{
2288
2289 PyUnicodeObject *u;
2290
2291 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2292 self->length);
2293 if (u == NULL)
2294 return NULL;
2295 if (!fixfct(u)) {
2296 /* fixfct should return TRUE if it modified the buffer. If
2297 FALSE, return a reference to the original buffer instead
2298 (to save space, not time) */
2299 Py_INCREF(self);
2300 Py_DECREF(u);
2301 return (PyObject*) self;
2302 }
2303 return (PyObject*) u;
2304}
2305
2306static
2307int fixupper(PyUnicodeObject *self)
2308{
2309 int len = self->length;
2310 Py_UNICODE *s = self->str;
2311 int status = 0;
2312
2313 while (len-- > 0) {
2314 register Py_UNICODE ch;
2315
2316 ch = Py_UNICODE_TOUPPER(*s);
2317 if (ch != *s) {
2318 status = 1;
2319 *s = ch;
2320 }
2321 s++;
2322 }
2323
2324 return status;
2325}
2326
2327static
2328int fixlower(PyUnicodeObject *self)
2329{
2330 int len = self->length;
2331 Py_UNICODE *s = self->str;
2332 int status = 0;
2333
2334 while (len-- > 0) {
2335 register Py_UNICODE ch;
2336
2337 ch = Py_UNICODE_TOLOWER(*s);
2338 if (ch != *s) {
2339 status = 1;
2340 *s = ch;
2341 }
2342 s++;
2343 }
2344
2345 return status;
2346}
2347
2348static
2349int fixswapcase(PyUnicodeObject *self)
2350{
2351 int len = self->length;
2352 Py_UNICODE *s = self->str;
2353 int status = 0;
2354
2355 while (len-- > 0) {
2356 if (Py_UNICODE_ISUPPER(*s)) {
2357 *s = Py_UNICODE_TOLOWER(*s);
2358 status = 1;
2359 } else if (Py_UNICODE_ISLOWER(*s)) {
2360 *s = Py_UNICODE_TOUPPER(*s);
2361 status = 1;
2362 }
2363 s++;
2364 }
2365
2366 return status;
2367}
2368
2369static
2370int fixcapitalize(PyUnicodeObject *self)
2371{
2372 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2373 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2374 return 1;
2375 }
2376 return 0;
2377}
2378
2379static
2380int fixtitle(PyUnicodeObject *self)
2381{
2382 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2383 register Py_UNICODE *e;
2384 int previous_is_cased;
2385
2386 /* Shortcut for single character strings */
2387 if (PyUnicode_GET_SIZE(self) == 1) {
2388 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2389 if (*p != ch) {
2390 *p = ch;
2391 return 1;
2392 }
2393 else
2394 return 0;
2395 }
2396
2397 e = p + PyUnicode_GET_SIZE(self);
2398 previous_is_cased = 0;
2399 for (; p < e; p++) {
2400 register const Py_UNICODE ch = *p;
2401
2402 if (previous_is_cased)
2403 *p = Py_UNICODE_TOLOWER(ch);
2404 else
2405 *p = Py_UNICODE_TOTITLE(ch);
2406
2407 if (Py_UNICODE_ISLOWER(ch) ||
2408 Py_UNICODE_ISUPPER(ch) ||
2409 Py_UNICODE_ISTITLE(ch))
2410 previous_is_cased = 1;
2411 else
2412 previous_is_cased = 0;
2413 }
2414 return 1;
2415}
2416
2417PyObject *PyUnicode_Join(PyObject *separator,
2418 PyObject *seq)
2419{
2420 Py_UNICODE *sep;
2421 int seplen;
2422 PyUnicodeObject *res = NULL;
2423 int reslen = 0;
2424 Py_UNICODE *p;
2425 int seqlen = 0;
2426 int sz = 100;
2427 int i;
2428
2429 seqlen = PySequence_Length(seq);
2430 if (seqlen < 0 && PyErr_Occurred())
2431 return NULL;
2432
2433 if (separator == NULL) {
2434 Py_UNICODE blank = ' ';
2435 sep = &blank;
2436 seplen = 1;
2437 }
2438 else {
2439 separator = PyUnicode_FromObject(separator);
2440 if (separator == NULL)
2441 return NULL;
2442 sep = PyUnicode_AS_UNICODE(separator);
2443 seplen = PyUnicode_GET_SIZE(separator);
2444 }
2445
2446 res = _PyUnicode_New(sz);
2447 if (res == NULL)
2448 goto onError;
2449 p = PyUnicode_AS_UNICODE(res);
2450 reslen = 0;
2451
2452 for (i = 0; i < seqlen; i++) {
2453 int itemlen;
2454 PyObject *item;
2455
2456 item = PySequence_GetItem(seq, i);
2457 if (item == NULL)
2458 goto onError;
2459 if (!PyUnicode_Check(item)) {
2460 PyObject *v;
2461 v = PyUnicode_FromObject(item);
2462 Py_DECREF(item);
2463 item = v;
2464 if (item == NULL)
2465 goto onError;
2466 }
2467 itemlen = PyUnicode_GET_SIZE(item);
2468 while (reslen + itemlen + seplen >= sz) {
2469 if (_PyUnicode_Resize(res, sz*2))
2470 goto onError;
2471 sz *= 2;
2472 p = PyUnicode_AS_UNICODE(res) + reslen;
2473 }
2474 if (i > 0) {
2475 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2476 p += seplen;
2477 reslen += seplen;
2478 }
2479 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2480 p += itemlen;
2481 reslen += itemlen;
2482 Py_DECREF(item);
2483 }
2484 if (_PyUnicode_Resize(res, reslen))
2485 goto onError;
2486
2487 Py_XDECREF(separator);
2488 return (PyObject *)res;
2489
2490 onError:
2491 Py_XDECREF(separator);
2492 Py_DECREF(res);
2493 return NULL;
2494}
2495
2496static
2497PyUnicodeObject *pad(PyUnicodeObject *self,
2498 int left,
2499 int right,
2500 Py_UNICODE fill)
2501{
2502 PyUnicodeObject *u;
2503
2504 if (left < 0)
2505 left = 0;
2506 if (right < 0)
2507 right = 0;
2508
2509 if (left == 0 && right == 0) {
2510 Py_INCREF(self);
2511 return self;
2512 }
2513
2514 u = _PyUnicode_New(left + self->length + right);
2515 if (u) {
2516 if (left)
2517 Py_UNICODE_FILL(u->str, fill, left);
2518 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2519 if (right)
2520 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2521 }
2522
2523 return u;
2524}
2525
/* Append the slice data[left:right] to list as a new Unicode object.

   The macro relies on the expansion site providing a PyObject *str
   scratch variable, a list variable and an onError label; it jumps to
   onError if the slice cannot be created or appended.  The temporary
   reference held in str is released in either case.  Wrapped in
   do { } while (0) so that it behaves as a single statement. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2536
2537static
2538PyObject *split_whitespace(PyUnicodeObject *self,
2539 PyObject *list,
2540 int maxcount)
2541{
2542 register int i;
2543 register int j;
2544 int len = self->length;
2545 PyObject *str;
2546
2547 for (i = j = 0; i < len; ) {
2548 /* find a token */
2549 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2550 i++;
2551 j = i;
2552 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2553 i++;
2554 if (j < i) {
2555 if (maxcount-- <= 0)
2556 break;
2557 SPLIT_APPEND(self->str, j, i);
2558 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2559 i++;
2560 j = i;
2561 }
2562 }
2563 if (j < len) {
2564 SPLIT_APPEND(self->str, j, len);
2565 }
2566 return list;
2567
2568 onError:
2569 Py_DECREF(list);
2570 return NULL;
2571}
2572
2573PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002574 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002575{
2576 register int i;
2577 register int j;
2578 int len;
2579 PyObject *list;
2580 PyObject *str;
2581 Py_UNICODE *data;
2582
2583 string = PyUnicode_FromObject(string);
2584 if (string == NULL)
2585 return NULL;
2586 data = PyUnicode_AS_UNICODE(string);
2587 len = PyUnicode_GET_SIZE(string);
2588
Guido van Rossumd57fd912000-03-10 22:53:23 +00002589 list = PyList_New(0);
2590 if (!list)
2591 goto onError;
2592
2593 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002594 int eol;
2595
Guido van Rossumd57fd912000-03-10 22:53:23 +00002596 /* Find a line and append it */
2597 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2598 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002599
2600 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002601 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002602 if (i < len) {
2603 if (data[i] == '\r' && i + 1 < len &&
2604 data[i+1] == '\n')
2605 i += 2;
2606 else
2607 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002608 if (keepends)
2609 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610 }
Guido van Rossum86662912000-04-11 15:38:46 +00002611 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002612 j = i;
2613 }
2614 if (j < len) {
2615 SPLIT_APPEND(data, j, len);
2616 }
2617
2618 Py_DECREF(string);
2619 return list;
2620
2621 onError:
2622 Py_DECREF(list);
2623 Py_DECREF(string);
2624 return NULL;
2625}
2626
2627static
2628PyObject *split_char(PyUnicodeObject *self,
2629 PyObject *list,
2630 Py_UNICODE ch,
2631 int maxcount)
2632{
2633 register int i;
2634 register int j;
2635 int len = self->length;
2636 PyObject *str;
2637
2638 for (i = j = 0; i < len; ) {
2639 if (self->str[i] == ch) {
2640 if (maxcount-- <= 0)
2641 break;
2642 SPLIT_APPEND(self->str, j, i);
2643 i = j = i + 1;
2644 } else
2645 i++;
2646 }
2647 if (j <= len) {
2648 SPLIT_APPEND(self->str, j, len);
2649 }
2650 return list;
2651
2652 onError:
2653 Py_DECREF(list);
2654 return NULL;
2655}
2656
2657static
2658PyObject *split_substring(PyUnicodeObject *self,
2659 PyObject *list,
2660 PyUnicodeObject *substring,
2661 int maxcount)
2662{
2663 register int i;
2664 register int j;
2665 int len = self->length;
2666 int sublen = substring->length;
2667 PyObject *str;
2668
2669 for (i = j = 0; i < len - sublen; ) {
2670 if (Py_UNICODE_MATCH(self, i, substring)) {
2671 if (maxcount-- <= 0)
2672 break;
2673 SPLIT_APPEND(self->str, j, i);
2674 i = j = i + sublen;
2675 } else
2676 i++;
2677 }
2678 if (j <= len) {
2679 SPLIT_APPEND(self->str, j, len);
2680 }
2681 return list;
2682
2683 onError:
2684 Py_DECREF(list);
2685 return NULL;
2686}
2687
2688#undef SPLIT_APPEND
2689
2690static
2691PyObject *split(PyUnicodeObject *self,
2692 PyUnicodeObject *substring,
2693 int maxcount)
2694{
2695 PyObject *list;
2696
2697 if (maxcount < 0)
2698 maxcount = INT_MAX;
2699
2700 list = PyList_New(0);
2701 if (!list)
2702 return NULL;
2703
2704 if (substring == NULL)
2705 return split_whitespace(self,list,maxcount);
2706
2707 else if (substring->length == 1)
2708 return split_char(self,list,substring->str[0],maxcount);
2709
2710 else if (substring->length == 0) {
2711 Py_DECREF(list);
2712 PyErr_SetString(PyExc_ValueError, "empty separator");
2713 return NULL;
2714 }
2715 else
2716 return split_substring(self,list,substring,maxcount);
2717}
2718
2719static
2720PyObject *strip(PyUnicodeObject *self,
2721 int left,
2722 int right)
2723{
2724 Py_UNICODE *p = self->str;
2725 int start = 0;
2726 int end = self->length;
2727
2728 if (left)
2729 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2730 start++;
2731
2732 if (right)
2733 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2734 end--;
2735
2736 if (start == 0 && end == self->length) {
2737 /* couldn't strip anything off, return original string */
2738 Py_INCREF(self);
2739 return (PyObject*) self;
2740 }
2741
2742 return (PyObject*) PyUnicode_FromUnicode(
2743 self->str + start,
2744 end - start
2745 );
2746}
2747
2748static
2749PyObject *replace(PyUnicodeObject *self,
2750 PyUnicodeObject *str1,
2751 PyUnicodeObject *str2,
2752 int maxcount)
2753{
2754 PyUnicodeObject *u;
2755
2756 if (maxcount < 0)
2757 maxcount = INT_MAX;
2758
2759 if (str1->length == 1 && str2->length == 1) {
2760 int i;
2761
2762 /* replace characters */
2763 if (!findchar(self->str, self->length, str1->str[0])) {
2764 /* nothing to replace, return original string */
2765 Py_INCREF(self);
2766 u = self;
2767 } else {
2768 Py_UNICODE u1 = str1->str[0];
2769 Py_UNICODE u2 = str2->str[0];
2770
2771 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
2772 self->str,
2773 self->length
2774 );
2775 if (u)
2776 for (i = 0; i < u->length; i++)
2777 if (u->str[i] == u1) {
2778 if (--maxcount < 0)
2779 break;
2780 u->str[i] = u2;
2781 }
2782 }
2783
2784 } else {
2785 int n, i;
2786 Py_UNICODE *p;
2787
2788 /* replace strings */
2789 n = count(self, 0, self->length, str1);
2790 if (n > maxcount)
2791 n = maxcount;
2792 if (n == 0) {
2793 /* nothing to replace, return original string */
2794 Py_INCREF(self);
2795 u = self;
2796 } else {
2797 u = _PyUnicode_New(
2798 self->length + n * (str2->length - str1->length));
2799 if (u) {
2800 i = 0;
2801 p = u->str;
2802 while (i <= self->length - str1->length)
2803 if (Py_UNICODE_MATCH(self, i, str1)) {
2804 /* replace string segment */
2805 Py_UNICODE_COPY(p, str2->str, str2->length);
2806 p += str2->length;
2807 i += str1->length;
2808 if (--n <= 0) {
2809 /* copy remaining part */
2810 Py_UNICODE_COPY(p, self->str+i, self->length-i);
2811 break;
2812 }
2813 } else
2814 *p++ = self->str[i++];
2815 }
2816 }
2817 }
2818
2819 return (PyObject *) u;
2820}
2821
2822/* --- Unicode Object Methods --------------------------------------------- */
2823
2824static char title__doc__[] =
2825"S.title() -> unicode\n\
2826\n\
2827Return a titlecased version of S, i.e. words start with title case\n\
2828characters, all remaining cased characters have lower case.";
2829
2830static PyObject*
2831unicode_title(PyUnicodeObject *self, PyObject *args)
2832{
2833 if (!PyArg_NoArgs(args))
2834 return NULL;
2835 return fixup(self, fixtitle);
2836}
2837
2838static char capitalize__doc__[] =
2839"S.capitalize() -> unicode\n\
2840\n\
2841Return a capitalized version of S, i.e. make the first character\n\
2842have upper case.";
2843
2844static PyObject*
2845unicode_capitalize(PyUnicodeObject *self, PyObject *args)
2846{
2847 if (!PyArg_NoArgs(args))
2848 return NULL;
2849 return fixup(self, fixcapitalize);
2850}
2851
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords() (currently compiled out): split at whitespace,
   capitalize every word and join the words with single blanks. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *joined;
    int k;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *old = PyList_GET_ITEM(words, k);
        PyObject *capitalized = fixup((PyUnicodeObject *)old,
                                      fixcapitalize);

        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old);
        PyList_SET_ITEM(words, k, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
2892
2893static char center__doc__[] =
2894"S.center(width) -> unicode\n\
2895\n\
2896Return S centered in a Unicode string of length width. Padding is done\n\
2897using spaces.";
2898
2899static PyObject *
2900unicode_center(PyUnicodeObject *self, PyObject *args)
2901{
2902 int marg, left;
2903 int width;
2904
2905 if (!PyArg_ParseTuple(args, "i:center", &width))
2906 return NULL;
2907
2908 if (self->length >= width) {
2909 Py_INCREF(self);
2910 return (PyObject*) self;
2911 }
2912
2913 marg = width - self->length;
2914 left = marg / 2 + (marg & width & 1);
2915
2916 return (PyObject*) pad(self, left, marg - left, ' ');
2917}
2918
2919static int
2920unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
2921{
2922 int len1, len2;
2923 Py_UNICODE *s1 = str1->str;
2924 Py_UNICODE *s2 = str2->str;
2925
2926 len1 = str1->length;
2927 len2 = str2->length;
2928
2929 while (len1 > 0 && len2 > 0) {
2930 int cmp = (*s1++) - (*s2++);
2931 if (cmp)
2932 /* This should make Christian happy! */
2933 return (cmp < 0) ? -1 : (cmp != 0);
2934 len1--, len2--;
2935 }
2936
2937 return (len1 < len2) ? -1 : (len1 != len2);
2938}
2939
2940int PyUnicode_Compare(PyObject *left,
2941 PyObject *right)
2942{
2943 PyUnicodeObject *u = NULL, *v = NULL;
2944 int result;
2945
2946 /* Coerce the two arguments */
2947 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2948 if (u == NULL)
2949 goto onError;
2950 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2951 if (v == NULL)
2952 goto onError;
2953
2954 /* Shortcut for emtpy or interned objects */
2955 if (v == u) {
2956 Py_DECREF(u);
2957 Py_DECREF(v);
2958 return 0;
2959 }
2960
2961 result = unicode_compare(u, v);
2962
2963 Py_DECREF(u);
2964 Py_DECREF(v);
2965 return result;
2966
2967onError:
2968 Py_XDECREF(u);
2969 Py_XDECREF(v);
2970 return -1;
2971}
2972
Guido van Rossum403d68b2000-03-13 15:55:09 +00002973int PyUnicode_Contains(PyObject *container,
2974 PyObject *element)
2975{
2976 PyUnicodeObject *u = NULL, *v = NULL;
2977 int result;
2978 register const Py_UNICODE *p, *e;
2979 register Py_UNICODE ch;
2980
2981 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00002982 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
2983 if (v == NULL)
2984 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002985 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
2986 if (u == NULL) {
2987 Py_DECREF(v);
2988 goto onError;
2989 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00002990
2991 /* Check v in u */
2992 if (PyUnicode_GET_SIZE(v) != 1) {
2993 PyErr_SetString(PyExc_TypeError,
2994 "string member test needs char left operand");
2995 goto onError;
2996 }
2997 ch = *PyUnicode_AS_UNICODE(v);
2998 p = PyUnicode_AS_UNICODE(u);
2999 e = p + PyUnicode_GET_SIZE(u);
3000 result = 0;
3001 while (p < e) {
3002 if (*p++ == ch) {
3003 result = 1;
3004 break;
3005 }
3006 }
3007
3008 Py_DECREF(u);
3009 Py_DECREF(v);
3010 return result;
3011
3012onError:
3013 Py_XDECREF(u);
3014 Py_XDECREF(v);
3015 return -1;
3016}
3017
Guido van Rossumd57fd912000-03-10 22:53:23 +00003018/* Concat to string or Unicode object giving a new Unicode object. */
3019
3020PyObject *PyUnicode_Concat(PyObject *left,
3021 PyObject *right)
3022{
3023 PyUnicodeObject *u = NULL, *v = NULL, *w;
3024
3025 /* Coerce the two arguments */
3026 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3027 if (u == NULL)
3028 goto onError;
3029 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3030 if (v == NULL)
3031 goto onError;
3032
3033 /* Shortcuts */
3034 if (v == unicode_empty) {
3035 Py_DECREF(v);
3036 return (PyObject *)u;
3037 }
3038 if (u == unicode_empty) {
3039 Py_DECREF(u);
3040 return (PyObject *)v;
3041 }
3042
3043 /* Concat the two Unicode strings */
3044 w = _PyUnicode_New(u->length + v->length);
3045 if (w == NULL)
3046 goto onError;
3047 Py_UNICODE_COPY(w->str, u->str, u->length);
3048 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3049
3050 Py_DECREF(u);
3051 Py_DECREF(v);
3052 return (PyObject *)w;
3053
3054onError:
3055 Py_XDECREF(u);
3056 Py_XDECREF(v);
3057 return NULL;
3058}
3059
3060static char count__doc__[] =
3061"S.count(sub[, start[, end]]) -> int\n\
3062\n\
3063Return the number of occurrences of substring sub in Unicode string\n\
3064S[start:end]. Optional arguments start and end are\n\
3065interpreted as in slice notation.";
3066
3067static PyObject *
3068unicode_count(PyUnicodeObject *self, PyObject *args)
3069{
3070 PyUnicodeObject *substring;
3071 int start = 0;
3072 int end = INT_MAX;
3073 PyObject *result;
3074
Guido van Rossumb8872e62000-05-09 14:14:27 +00003075 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3076 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003077 return NULL;
3078
3079 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3080 (PyObject *)substring);
3081 if (substring == NULL)
3082 return NULL;
3083
3084 if (substring->length == 0) {
3085 Py_DECREF(substring);
3086 return PyInt_FromLong((long) 0);
3087 }
3088
3089 if (start < 0)
3090 start += self->length;
3091 if (start < 0)
3092 start = 0;
3093 if (end > self->length)
3094 end = self->length;
3095 if (end < 0)
3096 end += self->length;
3097 if (end < 0)
3098 end = 0;
3099
3100 result = PyInt_FromLong((long) count(self, start, end, substring));
3101
3102 Py_DECREF(substring);
3103 return result;
3104}
3105
3106static char encode__doc__[] =
3107"S.encode([encoding[,errors]]) -> string\n\
3108\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003109Return an encoded string version of S. Default encoding is the current\n\
3110default string encoding. errors may be given to set a different error\n\
3111handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3112a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003113
3114static PyObject *
3115unicode_encode(PyUnicodeObject *self, PyObject *args)
3116{
3117 char *encoding = NULL;
3118 char *errors = NULL;
3119 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3120 return NULL;
3121 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3122}
3123
3124static char expandtabs__doc__[] =
3125"S.expandtabs([tabsize]) -> unicode\n\
3126\n\
3127Return a copy of S where all tab characters are expanded using spaces.\n\
3128If tabsize is not given, a tab size of 8 characters is assumed.";
3129
3130static PyObject*
3131unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3132{
3133 Py_UNICODE *e;
3134 Py_UNICODE *p;
3135 Py_UNICODE *q;
3136 int i, j;
3137 PyUnicodeObject *u;
3138 int tabsize = 8;
3139
3140 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3141 return NULL;
3142
3143 /* First pass: determine size of ouput string */
3144 i = j = 0;
3145 e = self->str + self->length;
3146 for (p = self->str; p < e; p++)
3147 if (*p == '\t') {
3148 if (tabsize > 0)
3149 j += tabsize - (j % tabsize);
3150 }
3151 else {
3152 j++;
3153 if (*p == '\n' || *p == '\r') {
3154 i += j;
3155 j = 0;
3156 }
3157 }
3158
3159 /* Second pass: create output string and fill it */
3160 u = _PyUnicode_New(i + j);
3161 if (!u)
3162 return NULL;
3163
3164 j = 0;
3165 q = u->str;
3166
3167 for (p = self->str; p < e; p++)
3168 if (*p == '\t') {
3169 if (tabsize > 0) {
3170 i = tabsize - (j % tabsize);
3171 j += i;
3172 while (i--)
3173 *q++ = ' ';
3174 }
3175 }
3176 else {
3177 j++;
3178 *q++ = *p;
3179 if (*p == '\n' || *p == '\r')
3180 j = 0;
3181 }
3182
3183 return (PyObject*) u;
3184}
3185
3186static char find__doc__[] =
3187"S.find(sub [,start [,end]]) -> int\n\
3188\n\
3189Return the lowest index in S where substring sub is found,\n\
3190such that sub is contained within s[start,end]. Optional\n\
3191arguments start and end are interpreted as in slice notation.\n\
3192\n\
3193Return -1 on failure.";
3194
3195static PyObject *
3196unicode_find(PyUnicodeObject *self, PyObject *args)
3197{
3198 PyUnicodeObject *substring;
3199 int start = 0;
3200 int end = INT_MAX;
3201 PyObject *result;
3202
Guido van Rossumb8872e62000-05-09 14:14:27 +00003203 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3204 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205 return NULL;
3206 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3207 (PyObject *)substring);
3208 if (substring == NULL)
3209 return NULL;
3210
3211 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3212
3213 Py_DECREF(substring);
3214 return result;
3215}
3216
3217static PyObject *
3218unicode_getitem(PyUnicodeObject *self, int index)
3219{
3220 if (index < 0 || index >= self->length) {
3221 PyErr_SetString(PyExc_IndexError, "string index out of range");
3222 return NULL;
3223 }
3224
3225 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3226}
3227
3228static long
3229unicode_hash(PyUnicodeObject *self)
3230{
3231 long hash;
3232 PyObject *utf8;
3233
3234 /* Since Unicode objects compare equal to their UTF-8 string
3235 counterparts, they should also use the UTF-8 strings as basis
3236 for their hash value. This is needed to assure that strings and
3237 Unicode objects behave in the same way as dictionary
3238 keys. Unfortunately, this costs some performance and also some
3239 memory if the cached UTF-8 representation is not used later
3240 on. */
3241 if (self->hash != -1)
3242 return self->hash;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00003243 utf8 = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003244 if (utf8 == NULL)
3245 return -1;
3246 hash = PyObject_Hash(utf8);
3247 if (hash == -1)
3248 return -1;
3249 self->hash = hash;
3250 return hash;
3251}
3252
3253static char index__doc__[] =
3254"S.index(sub [,start [,end]]) -> int\n\
3255\n\
3256Like S.find() but raise ValueError when the substring is not found.";
3257
3258static PyObject *
3259unicode_index(PyUnicodeObject *self, PyObject *args)
3260{
3261 int result;
3262 PyUnicodeObject *substring;
3263 int start = 0;
3264 int end = INT_MAX;
3265
Guido van Rossumb8872e62000-05-09 14:14:27 +00003266 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3267 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003268 return NULL;
3269
3270 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3271 (PyObject *)substring);
3272 if (substring == NULL)
3273 return NULL;
3274
3275 result = findstring(self, substring, start, end, 1);
3276
3277 Py_DECREF(substring);
3278 if (result < 0) {
3279 PyErr_SetString(PyExc_ValueError, "substring not found");
3280 return NULL;
3281 }
3282 return PyInt_FromLong(result);
3283}
3284
3285static char islower__doc__[] =
3286"S.islower() -> int\n\
3287\n\
3288Return 1 if all cased characters in S are lowercase and there is\n\
3289at least one cased character in S, 0 otherwise.";
3290
3291static PyObject*
3292unicode_islower(PyUnicodeObject *self, PyObject *args)
3293{
3294 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3295 register const Py_UNICODE *e;
3296 int cased;
3297
3298 if (!PyArg_NoArgs(args))
3299 return NULL;
3300
3301 /* Shortcut for single character strings */
3302 if (PyUnicode_GET_SIZE(self) == 1)
3303 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3304
3305 e = p + PyUnicode_GET_SIZE(self);
3306 cased = 0;
3307 for (; p < e; p++) {
3308 register const Py_UNICODE ch = *p;
3309
3310 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3311 return PyInt_FromLong(0);
3312 else if (!cased && Py_UNICODE_ISLOWER(ch))
3313 cased = 1;
3314 }
3315 return PyInt_FromLong(cased);
3316}
3317
3318static char isupper__doc__[] =
3319"S.isupper() -> int\n\
3320\n\
3321Return 1 if all cased characters in S are uppercase and there is\n\
3322at least one cased character in S, 0 otherwise.";
3323
3324static PyObject*
3325unicode_isupper(PyUnicodeObject *self, PyObject *args)
3326{
3327 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3328 register const Py_UNICODE *e;
3329 int cased;
3330
3331 if (!PyArg_NoArgs(args))
3332 return NULL;
3333
3334 /* Shortcut for single character strings */
3335 if (PyUnicode_GET_SIZE(self) == 1)
3336 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3337
3338 e = p + PyUnicode_GET_SIZE(self);
3339 cased = 0;
3340 for (; p < e; p++) {
3341 register const Py_UNICODE ch = *p;
3342
3343 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3344 return PyInt_FromLong(0);
3345 else if (!cased && Py_UNICODE_ISUPPER(ch))
3346 cased = 1;
3347 }
3348 return PyInt_FromLong(cased);
3349}
3350
3351static char istitle__doc__[] =
3352"S.istitle() -> int\n\
3353\n\
3354Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3355may only follow uncased characters and lowercase characters only cased\n\
3356ones. Return 0 otherwise.";
3357
3358static PyObject*
3359unicode_istitle(PyUnicodeObject *self, PyObject *args)
3360{
3361 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3362 register const Py_UNICODE *e;
3363 int cased, previous_is_cased;
3364
3365 if (!PyArg_NoArgs(args))
3366 return NULL;
3367
3368 /* Shortcut for single character strings */
3369 if (PyUnicode_GET_SIZE(self) == 1)
3370 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3371 (Py_UNICODE_ISUPPER(*p) != 0));
3372
3373 e = p + PyUnicode_GET_SIZE(self);
3374 cased = 0;
3375 previous_is_cased = 0;
3376 for (; p < e; p++) {
3377 register const Py_UNICODE ch = *p;
3378
3379 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3380 if (previous_is_cased)
3381 return PyInt_FromLong(0);
3382 previous_is_cased = 1;
3383 cased = 1;
3384 }
3385 else if (Py_UNICODE_ISLOWER(ch)) {
3386 if (!previous_is_cased)
3387 return PyInt_FromLong(0);
3388 previous_is_cased = 1;
3389 cased = 1;
3390 }
3391 else
3392 previous_is_cased = 0;
3393 }
3394 return PyInt_FromLong(cased);
3395}
3396
3397static char isspace__doc__[] =
3398"S.isspace() -> int\n\
3399\n\
3400Return 1 if there are only whitespace characters in S,\n\
34010 otherwise.";
3402
3403static PyObject*
3404unicode_isspace(PyUnicodeObject *self, PyObject *args)
3405{
3406 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3407 register const Py_UNICODE *e;
3408
3409 if (!PyArg_NoArgs(args))
3410 return NULL;
3411
3412 /* Shortcut for single character strings */
3413 if (PyUnicode_GET_SIZE(self) == 1 &&
3414 Py_UNICODE_ISSPACE(*p))
3415 return PyInt_FromLong(1);
3416
3417 e = p + PyUnicode_GET_SIZE(self);
3418 for (; p < e; p++) {
3419 if (!Py_UNICODE_ISSPACE(*p))
3420 return PyInt_FromLong(0);
3421 }
3422 return PyInt_FromLong(1);
3423}
3424
3425static char isdecimal__doc__[] =
3426"S.isdecimal() -> int\n\
3427\n\
3428Return 1 if there are only decimal characters in S,\n\
34290 otherwise.";
3430
3431static PyObject*
3432unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3433{
3434 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3435 register const Py_UNICODE *e;
3436
3437 if (!PyArg_NoArgs(args))
3438 return NULL;
3439
3440 /* Shortcut for single character strings */
3441 if (PyUnicode_GET_SIZE(self) == 1 &&
3442 Py_UNICODE_ISDECIMAL(*p))
3443 return PyInt_FromLong(1);
3444
3445 e = p + PyUnicode_GET_SIZE(self);
3446 for (; p < e; p++) {
3447 if (!Py_UNICODE_ISDECIMAL(*p))
3448 return PyInt_FromLong(0);
3449 }
3450 return PyInt_FromLong(1);
3451}
3452
3453static char isdigit__doc__[] =
3454"S.isdigit() -> int\n\
3455\n\
3456Return 1 if there are only digit characters in S,\n\
34570 otherwise.";
3458
3459static PyObject*
3460unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3461{
3462 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3463 register const Py_UNICODE *e;
3464
3465 if (!PyArg_NoArgs(args))
3466 return NULL;
3467
3468 /* Shortcut for single character strings */
3469 if (PyUnicode_GET_SIZE(self) == 1 &&
3470 Py_UNICODE_ISDIGIT(*p))
3471 return PyInt_FromLong(1);
3472
3473 e = p + PyUnicode_GET_SIZE(self);
3474 for (; p < e; p++) {
3475 if (!Py_UNICODE_ISDIGIT(*p))
3476 return PyInt_FromLong(0);
3477 }
3478 return PyInt_FromLong(1);
3479}
3480
3481static char isnumeric__doc__[] =
3482"S.isnumeric() -> int\n\
3483\n\
3484Return 1 if there are only numeric characters in S,\n\
34850 otherwise.";
3486
3487static PyObject*
3488unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3489{
3490 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3491 register const Py_UNICODE *e;
3492
3493 if (!PyArg_NoArgs(args))
3494 return NULL;
3495
3496 /* Shortcut for single character strings */
3497 if (PyUnicode_GET_SIZE(self) == 1 &&
3498 Py_UNICODE_ISNUMERIC(*p))
3499 return PyInt_FromLong(1);
3500
3501 e = p + PyUnicode_GET_SIZE(self);
3502 for (; p < e; p++) {
3503 if (!Py_UNICODE_ISNUMERIC(*p))
3504 return PyInt_FromLong(0);
3505 }
3506 return PyInt_FromLong(1);
3507}
3508
3509static char join__doc__[] =
3510"S.join(sequence) -> unicode\n\
3511\n\
3512Return a string which is the concatenation of the strings in the\n\
3513sequence. The separator between elements is S.";
3514
3515static PyObject*
3516unicode_join(PyUnicodeObject *self, PyObject *args)
3517{
3518 PyObject *data;
3519 if (!PyArg_ParseTuple(args, "O:join", &data))
3520 return NULL;
3521
3522 return PyUnicode_Join((PyObject *)self, data);
3523}
3524
3525static int
3526unicode_length(PyUnicodeObject *self)
3527{
3528 return self->length;
3529}
3530
3531static char ljust__doc__[] =
3532"S.ljust(width) -> unicode\n\
3533\n\
3534Return S left justified in a Unicode string of length width. Padding is\n\
3535done using spaces.";
3536
3537static PyObject *
3538unicode_ljust(PyUnicodeObject *self, PyObject *args)
3539{
3540 int width;
3541 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3542 return NULL;
3543
3544 if (self->length >= width) {
3545 Py_INCREF(self);
3546 return (PyObject*) self;
3547 }
3548
3549 return (PyObject*) pad(self, 0, width - self->length, ' ');
3550}
3551
3552static char lower__doc__[] =
3553"S.lower() -> unicode\n\
3554\n\
3555Return a copy of the string S converted to lowercase.";
3556
3557static PyObject*
3558unicode_lower(PyUnicodeObject *self, PyObject *args)
3559{
3560 if (!PyArg_NoArgs(args))
3561 return NULL;
3562 return fixup(self, fixlower);
3563}
3564
3565static char lstrip__doc__[] =
3566"S.lstrip() -> unicode\n\
3567\n\
3568Return a copy of the string S with leading whitespace removed.";
3569
3570static PyObject *
3571unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3572{
3573 if (!PyArg_NoArgs(args))
3574 return NULL;
3575 return strip(self, 1, 0);
3576}
3577
3578static PyObject*
3579unicode_repeat(PyUnicodeObject *str, int len)
3580{
3581 PyUnicodeObject *u;
3582 Py_UNICODE *p;
3583
3584 if (len < 0)
3585 len = 0;
3586
3587 if (len == 1) {
3588 /* no repeat, return original string */
3589 Py_INCREF(str);
3590 return (PyObject*) str;
3591 }
3592
3593 u = _PyUnicode_New(len * str->length);
3594 if (!u)
3595 return NULL;
3596
3597 p = u->str;
3598
3599 while (len-- > 0) {
3600 Py_UNICODE_COPY(p, str->str, str->length);
3601 p += str->length;
3602 }
3603
3604 return (PyObject*) u;
3605}
3606
3607PyObject *PyUnicode_Replace(PyObject *obj,
3608 PyObject *subobj,
3609 PyObject *replobj,
3610 int maxcount)
3611{
3612 PyObject *self;
3613 PyObject *str1;
3614 PyObject *str2;
3615 PyObject *result;
3616
3617 self = PyUnicode_FromObject(obj);
3618 if (self == NULL)
3619 return NULL;
3620 str1 = PyUnicode_FromObject(subobj);
3621 if (str1 == NULL) {
3622 Py_DECREF(self);
3623 return NULL;
3624 }
3625 str2 = PyUnicode_FromObject(replobj);
3626 if (str2 == NULL) {
3627 Py_DECREF(self);
3628 Py_DECREF(str1);
3629 return NULL;
3630 }
3631 result = replace((PyUnicodeObject *)self,
3632 (PyUnicodeObject *)str1,
3633 (PyUnicodeObject *)str2,
3634 maxcount);
3635 Py_DECREF(self);
3636 Py_DECREF(str1);
3637 Py_DECREF(str2);
3638 return result;
3639}
3640
3641static char replace__doc__[] =
3642"S.replace (old, new[, maxsplit]) -> unicode\n\
3643\n\
3644Return a copy of S with all occurrences of substring\n\
3645old replaced by new. If the optional argument maxsplit is\n\
3646given, only the first maxsplit occurrences are replaced.";
3647
3648static PyObject*
3649unicode_replace(PyUnicodeObject *self, PyObject *args)
3650{
3651 PyUnicodeObject *str1;
3652 PyUnicodeObject *str2;
3653 int maxcount = -1;
3654 PyObject *result;
3655
3656 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
3657 return NULL;
3658 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
3659 if (str1 == NULL)
3660 return NULL;
3661 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
3662 if (str2 == NULL)
3663 return NULL;
3664
3665 result = replace(self, str1, str2, maxcount);
3666
3667 Py_DECREF(str1);
3668 Py_DECREF(str2);
3669 return result;
3670}
3671
3672static
3673PyObject *unicode_repr(PyObject *unicode)
3674{
3675 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
3676 PyUnicode_GET_SIZE(unicode),
3677 1);
3678}
3679
3680static char rfind__doc__[] =
3681"S.rfind(sub [,start [,end]]) -> int\n\
3682\n\
3683Return the highest index in S where substring sub is found,\n\
3684such that sub is contained within s[start,end]. Optional\n\
3685arguments start and end are interpreted as in slice notation.\n\
3686\n\
3687Return -1 on failure.";
3688
3689static PyObject *
3690unicode_rfind(PyUnicodeObject *self, PyObject *args)
3691{
3692 PyUnicodeObject *substring;
3693 int start = 0;
3694 int end = INT_MAX;
3695 PyObject *result;
3696
Guido van Rossumb8872e62000-05-09 14:14:27 +00003697 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
3698 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003699 return NULL;
3700 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3701 (PyObject *)substring);
3702 if (substring == NULL)
3703 return NULL;
3704
3705 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
3706
3707 Py_DECREF(substring);
3708 return result;
3709}
3710
3711static char rindex__doc__[] =
3712"S.rindex(sub [,start [,end]]) -> int\n\
3713\n\
3714Like S.rfind() but raise ValueError when the substring is not found.";
3715
3716static PyObject *
3717unicode_rindex(PyUnicodeObject *self, PyObject *args)
3718{
3719 int result;
3720 PyUnicodeObject *substring;
3721 int start = 0;
3722 int end = INT_MAX;
3723
Guido van Rossumb8872e62000-05-09 14:14:27 +00003724 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
3725 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003726 return NULL;
3727 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3728 (PyObject *)substring);
3729 if (substring == NULL)
3730 return NULL;
3731
3732 result = findstring(self, substring, start, end, -1);
3733
3734 Py_DECREF(substring);
3735 if (result < 0) {
3736 PyErr_SetString(PyExc_ValueError, "substring not found");
3737 return NULL;
3738 }
3739 return PyInt_FromLong(result);
3740}
3741
3742static char rjust__doc__[] =
3743"S.rjust(width) -> unicode\n\
3744\n\
3745Return S right justified in a Unicode string of length width. Padding is\n\
3746done using spaces.";
3747
3748static PyObject *
3749unicode_rjust(PyUnicodeObject *self, PyObject *args)
3750{
3751 int width;
3752 if (!PyArg_ParseTuple(args, "i:rjust", &width))
3753 return NULL;
3754
3755 if (self->length >= width) {
3756 Py_INCREF(self);
3757 return (PyObject*) self;
3758 }
3759
3760 return (PyObject*) pad(self, width - self->length, 0, ' ');
3761}
3762
3763static char rstrip__doc__[] =
3764"S.rstrip() -> unicode\n\
3765\n\
3766Return a copy of the string S with trailing whitespace removed.";
3767
3768static PyObject *
3769unicode_rstrip(PyUnicodeObject *self, PyObject *args)
3770{
3771 if (!PyArg_NoArgs(args))
3772 return NULL;
3773 return strip(self, 0, 1);
3774}
3775
3776static PyObject*
3777unicode_slice(PyUnicodeObject *self, int start, int end)
3778{
3779 /* standard clamping */
3780 if (start < 0)
3781 start = 0;
3782 if (end < 0)
3783 end = 0;
3784 if (end > self->length)
3785 end = self->length;
3786 if (start == 0 && end == self->length) {
3787 /* full slice, return original string */
3788 Py_INCREF(self);
3789 return (PyObject*) self;
3790 }
3791 if (start > end)
3792 start = end;
3793 /* copy slice */
3794 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
3795 end - start);
3796}
3797
3798PyObject *PyUnicode_Split(PyObject *s,
3799 PyObject *sep,
3800 int maxsplit)
3801{
3802 PyObject *result;
3803
3804 s = PyUnicode_FromObject(s);
3805 if (s == NULL)
3806 return NULL;
3807 if (sep != NULL) {
3808 sep = PyUnicode_FromObject(sep);
3809 if (sep == NULL) {
3810 Py_DECREF(s);
3811 return NULL;
3812 }
3813 }
3814
3815 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
3816
3817 Py_DECREF(s);
3818 Py_XDECREF(sep);
3819 return result;
3820}
3821
3822static char split__doc__[] =
3823"S.split([sep [,maxsplit]]) -> list of strings\n\
3824\n\
3825Return a list of the words in S, using sep as the\n\
3826delimiter string. If maxsplit is given, at most maxsplit\n\
3827splits are done. If sep is not specified, any whitespace string\n\
3828is a separator.";
3829
3830static PyObject*
3831unicode_split(PyUnicodeObject *self, PyObject *args)
3832{
3833 PyObject *substring = Py_None;
3834 int maxcount = -1;
3835
3836 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
3837 return NULL;
3838
3839 if (substring == Py_None)
3840 return split(self, NULL, maxcount);
3841 else if (PyUnicode_Check(substring))
3842 return split(self, (PyUnicodeObject *)substring, maxcount);
3843 else
3844 return PyUnicode_Split((PyObject *)self, substring, maxcount);
3845}
3846
3847static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00003848"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00003849\n\
3850Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00003851Line breaks are not included in the resulting list unless keepends\n\
3852is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003853
3854static PyObject*
3855unicode_splitlines(PyUnicodeObject *self, PyObject *args)
3856{
Guido van Rossum86662912000-04-11 15:38:46 +00003857 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003858
Guido van Rossum86662912000-04-11 15:38:46 +00003859 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003860 return NULL;
3861
Guido van Rossum86662912000-04-11 15:38:46 +00003862 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003863}
3864
3865static
3866PyObject *unicode_str(PyUnicodeObject *self)
3867{
Fred Drakee4315f52000-05-09 19:53:39 +00003868 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003869}
3870
3871static char strip__doc__[] =
3872"S.strip() -> unicode\n\
3873\n\
3874Return a copy of S with leading and trailing whitespace removed.";
3875
3876static PyObject *
3877unicode_strip(PyUnicodeObject *self, PyObject *args)
3878{
3879 if (!PyArg_NoArgs(args))
3880 return NULL;
3881 return strip(self, 1, 1);
3882}
3883
3884static char swapcase__doc__[] =
3885"S.swapcase() -> unicode\n\
3886\n\
3887Return a copy of S with uppercase characters converted to lowercase\n\
3888and vice versa.";
3889
3890static PyObject*
3891unicode_swapcase(PyUnicodeObject *self, PyObject *args)
3892{
3893 if (!PyArg_NoArgs(args))
3894 return NULL;
3895 return fixup(self, fixswapcase);
3896}
3897
3898static char translate__doc__[] =
3899"S.translate(table) -> unicode\n\
3900\n\
3901Return a copy of the string S, where all characters have been mapped\n\
3902through the given translation table, which must be a mapping of\n\
3903Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
3904are left untouched. Characters mapped to None are deleted.";
3905
3906static PyObject*
3907unicode_translate(PyUnicodeObject *self, PyObject *args)
3908{
3909 PyObject *table;
3910
3911 if (!PyArg_ParseTuple(args, "O:translate", &table))
3912 return NULL;
3913 return PyUnicode_TranslateCharmap(self->str,
3914 self->length,
3915 table,
3916 "ignore");
3917}
3918
3919static char upper__doc__[] =
3920"S.upper() -> unicode\n\
3921\n\
3922Return a copy of S converted to uppercase.";
3923
3924static PyObject*
3925unicode_upper(PyUnicodeObject *self, PyObject *args)
3926{
3927 if (!PyArg_NoArgs(args))
3928 return NULL;
3929 return fixup(self, fixupper);
3930}
3931
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* S.zfill(width) implementation (currently compiled out).

   If S is already at least `width` long it is returned unchanged with
   a new reference.  Otherwise pad() prepends (width - length) '0'
   characters, and a leading '+' or '-' of the original text is moved
   to the very front of the padded result.  Returns NULL if argument
   parsing or pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* The result is dereferenced right below, so a failed pad() must
       be caught here rather than crashing on a NULL pointer. */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
3967
#if 0
/* Debugging aid (compiled out): report the current value of
   unicode_freelist_size as an int object.  Takes no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    return PyArg_NoArgs(args) ? PyInt_FromLong(unicode_freelist_size)
                              : NULL;
}
#endif
3977
3978static char startswith__doc__[] =
3979"S.startswith(prefix[, start[, end]]) -> int\n\
3980\n\
3981Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
3982optional start, test S beginning at that position. With optional end, stop\n\
3983comparing S at that position.";
3984
3985static PyObject *
3986unicode_startswith(PyUnicodeObject *self,
3987 PyObject *args)
3988{
3989 PyUnicodeObject *substring;
3990 int start = 0;
3991 int end = INT_MAX;
3992 PyObject *result;
3993
Guido van Rossumb8872e62000-05-09 14:14:27 +00003994 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
3995 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003996 return NULL;
3997 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3998 (PyObject *)substring);
3999 if (substring == NULL)
4000 return NULL;
4001
4002 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4003
4004 Py_DECREF(substring);
4005 return result;
4006}
4007
4008
4009static char endswith__doc__[] =
4010"S.endswith(suffix[, start[, end]]) -> int\n\
4011\n\
4012Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4013optional start, test S beginning at that position. With optional end, stop\n\
4014comparing S at that position.";
4015
4016static PyObject *
4017unicode_endswith(PyUnicodeObject *self,
4018 PyObject *args)
4019{
4020 PyUnicodeObject *substring;
4021 int start = 0;
4022 int end = INT_MAX;
4023 PyObject *result;
4024
Guido van Rossumb8872e62000-05-09 14:14:27 +00004025 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4026 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004027 return NULL;
4028 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4029 (PyObject *)substring);
4030 if (substring == NULL)
4031 return NULL;
4032
4033 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4034
4035 Py_DECREF(substring);
4036 return result;
4037}
4038
4039
4040static PyMethodDef unicode_methods[] = {
4041
4042 /* Order is according to common usage: often used methods should
4043 appear first, since lookup is done sequentially. */
4044
4045 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4046 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4047 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4048 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4049 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4050 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4051 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4052 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4053 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4054 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4055 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4056 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4057 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4058 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4059/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4060 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4061 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4062 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4063 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4064 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4065 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4066 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4067 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4068 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4069 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4070 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4071 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4072 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4073 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4074 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4075 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4076 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4077 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
4078#if 0
4079 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4080 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4081#endif
4082
4083#if 0
4084 /* This one is just used for debugging the implementation. */
4085 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4086#endif
4087
4088 {NULL, NULL}
4089};
4090
4091static PyObject *
4092unicode_getattr(PyUnicodeObject *self, char *name)
4093{
4094 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4095}
4096
4097static PySequenceMethods unicode_as_sequence = {
4098 (inquiry) unicode_length, /* sq_length */
4099 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4100 (intargfunc) unicode_repeat, /* sq_repeat */
4101 (intargfunc) unicode_getitem, /* sq_item */
4102 (intintargfunc) unicode_slice, /* sq_slice */
4103 0, /* sq_ass_item */
4104 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004105 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004106};
4107
4108static int
4109unicode_buffer_getreadbuf(PyUnicodeObject *self,
4110 int index,
4111 const void **ptr)
4112{
4113 if (index != 0) {
4114 PyErr_SetString(PyExc_SystemError,
4115 "accessing non-existent unicode segment");
4116 return -1;
4117 }
4118 *ptr = (void *) self->str;
4119 return PyUnicode_GET_DATA_SIZE(self);
4120}
4121
4122static int
4123unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4124 const void **ptr)
4125{
4126 PyErr_SetString(PyExc_TypeError,
4127 "cannot use unicode as modifyable buffer");
4128 return -1;
4129}
4130
4131static int
4132unicode_buffer_getsegcount(PyUnicodeObject *self,
4133 int *lenp)
4134{
4135 if (lenp)
4136 *lenp = PyUnicode_GET_DATA_SIZE(self);
4137 return 1;
4138}
4139
4140static int
4141unicode_buffer_getcharbuf(PyUnicodeObject *self,
4142 int index,
4143 const void **ptr)
4144{
4145 PyObject *str;
4146
4147 if (index != 0) {
4148 PyErr_SetString(PyExc_SystemError,
4149 "accessing non-existent unicode segment");
4150 return -1;
4151 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +00004152 str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004153 if (str == NULL)
4154 return -1;
4155 *ptr = (void *) PyString_AS_STRING(str);
4156 return PyString_GET_SIZE(str);
4157}
4158
4159/* Helpers for PyUnicode_Format() */
4160
4161static PyObject *
4162getnextarg(args, arglen, p_argidx)
4163 PyObject *args;
4164int arglen;
4165int *p_argidx;
4166{
4167 int argidx = *p_argidx;
4168 if (argidx < arglen) {
4169 (*p_argidx)++;
4170 if (arglen < 0)
4171 return args;
4172 else
4173 return PyTuple_GetItem(args, argidx);
4174 }
4175 PyErr_SetString(PyExc_TypeError,
4176 "not enough arguments for format string");
4177 return NULL;
4178}
4179
/* Conversion flag bits; PyUnicode_Format() sets each one when it meets
   the flag character noted alongside. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
4185
4186static
4187#ifdef HAVE_STDARG_PROTOTYPES
4188int usprintf(register Py_UNICODE *buffer, char *format, ...)
4189#else
4190int usprintf(va_alist) va_dcl
4191#endif
4192{
4193 register int i;
4194 int len;
4195 va_list va;
4196 char *charbuffer;
4197#ifdef HAVE_STDARG_PROTOTYPES
4198 va_start(va, format);
4199#else
4200 Py_UNICODE *args;
4201 char *format;
4202
4203 va_start(va);
4204 buffer = va_arg(va, Py_UNICODE *);
4205 format = va_arg(va, char *);
4206#endif
4207
4208 /* First, format the string as char array, then expand to Py_UNICODE
4209 array. */
4210 charbuffer = (char *)buffer;
4211 len = vsprintf(charbuffer, format, va);
4212 for (i = len - 1; i >= 0; i--)
4213 buffer[i] = (Py_UNICODE) charbuffer[i];
4214
4215 va_end(va);
4216 return len;
4217}
4218
4219static int
4220formatfloat(Py_UNICODE *buf,
4221 int flags,
4222 int prec,
4223 int type,
4224 PyObject *v)
4225{
4226 char fmt[20];
4227 double x;
4228
4229 x = PyFloat_AsDouble(v);
4230 if (x == -1.0 && PyErr_Occurred())
4231 return -1;
4232 if (prec < 0)
4233 prec = 6;
4234 if (prec > 50)
4235 prec = 50; /* Arbitrary limitation */
4236 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4237 type = 'g';
4238 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4239 return usprintf(buf, fmt, x);
4240}
4241
4242static int
4243formatint(Py_UNICODE *buf,
4244 int flags,
4245 int prec,
4246 int type,
4247 PyObject *v)
4248{
4249 char fmt[20];
4250 long x;
4251
4252 x = PyInt_AsLong(v);
4253 if (x == -1 && PyErr_Occurred())
4254 return -1;
4255 if (prec < 0)
4256 prec = 1;
4257 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4258 return usprintf(buf, fmt, x);
4259}
4260
4261static int
4262formatchar(Py_UNICODE *buf,
4263 PyObject *v)
4264{
4265 if (PyUnicode_Check(v))
4266 buf[0] = PyUnicode_AS_UNICODE(v)[0];
4267
4268 else if (PyString_Check(v))
4269 buf[0] = (Py_UNICODE) PyString_AS_STRING(v)[0];
4270
4271 else {
4272 /* Integer input truncated to a character */
4273 long x;
4274 x = PyInt_AsLong(v);
4275 if (x == -1 && PyErr_Occurred())
4276 return -1;
4277 buf[0] = (char) x;
4278 }
4279 buf[1] = '\0';
4280 return 1;
4281}
4282
4283PyObject *PyUnicode_Format(PyObject *format,
4284 PyObject *args)
4285{
4286 Py_UNICODE *fmt, *res;
4287 int fmtcnt, rescnt, reslen, arglen, argidx;
4288 int args_owned = 0;
4289 PyUnicodeObject *result = NULL;
4290 PyObject *dict = NULL;
4291 PyObject *uformat;
4292
4293 if (format == NULL || args == NULL) {
4294 PyErr_BadInternalCall();
4295 return NULL;
4296 }
4297 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004298 if (uformat == NULL)
4299 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004300 fmt = PyUnicode_AS_UNICODE(uformat);
4301 fmtcnt = PyUnicode_GET_SIZE(uformat);
4302
4303 reslen = rescnt = fmtcnt + 100;
4304 result = _PyUnicode_New(reslen);
4305 if (result == NULL)
4306 goto onError;
4307 res = PyUnicode_AS_UNICODE(result);
4308
4309 if (PyTuple_Check(args)) {
4310 arglen = PyTuple_Size(args);
4311 argidx = 0;
4312 }
4313 else {
4314 arglen = -1;
4315 argidx = -2;
4316 }
4317 if (args->ob_type->tp_as_mapping)
4318 dict = args;
4319
4320 while (--fmtcnt >= 0) {
4321 if (*fmt != '%') {
4322 if (--rescnt < 0) {
4323 rescnt = fmtcnt + 100;
4324 reslen += rescnt;
4325 if (_PyUnicode_Resize(result, reslen) < 0)
4326 return NULL;
4327 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4328 --rescnt;
4329 }
4330 *res++ = *fmt++;
4331 }
4332 else {
4333 /* Got a format specifier */
4334 int flags = 0;
4335 int width = -1;
4336 int prec = -1;
4337 int size = 0;
4338 Py_UNICODE c = '\0';
4339 Py_UNICODE fill;
4340 PyObject *v = NULL;
4341 PyObject *temp = NULL;
4342 Py_UNICODE *buf;
4343 Py_UNICODE sign;
4344 int len;
4345 Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
4346
4347 fmt++;
4348 if (*fmt == '(') {
4349 Py_UNICODE *keystart;
4350 int keylen;
4351 PyObject *key;
4352 int pcount = 1;
4353
4354 if (dict == NULL) {
4355 PyErr_SetString(PyExc_TypeError,
4356 "format requires a mapping");
4357 goto onError;
4358 }
4359 ++fmt;
4360 --fmtcnt;
4361 keystart = fmt;
4362 /* Skip over balanced parentheses */
4363 while (pcount > 0 && --fmtcnt >= 0) {
4364 if (*fmt == ')')
4365 --pcount;
4366 else if (*fmt == '(')
4367 ++pcount;
4368 fmt++;
4369 }
4370 keylen = fmt - keystart - 1;
4371 if (fmtcnt < 0 || pcount > 0) {
4372 PyErr_SetString(PyExc_ValueError,
4373 "incomplete format key");
4374 goto onError;
4375 }
Fred Drakee4315f52000-05-09 19:53:39 +00004376 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004377 then looked up since Python uses strings to hold
4378 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004379 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004380 key = PyUnicode_EncodeUTF8(keystart,
4381 keylen,
4382 NULL);
4383 if (key == NULL)
4384 goto onError;
4385 if (args_owned) {
4386 Py_DECREF(args);
4387 args_owned = 0;
4388 }
4389 args = PyObject_GetItem(dict, key);
4390 Py_DECREF(key);
4391 if (args == NULL) {
4392 goto onError;
4393 }
4394 args_owned = 1;
4395 arglen = -1;
4396 argidx = -2;
4397 }
4398 while (--fmtcnt >= 0) {
4399 switch (c = *fmt++) {
4400 case '-': flags |= F_LJUST; continue;
4401 case '+': flags |= F_SIGN; continue;
4402 case ' ': flags |= F_BLANK; continue;
4403 case '#': flags |= F_ALT; continue;
4404 case '0': flags |= F_ZERO; continue;
4405 }
4406 break;
4407 }
4408 if (c == '*') {
4409 v = getnextarg(args, arglen, &argidx);
4410 if (v == NULL)
4411 goto onError;
4412 if (!PyInt_Check(v)) {
4413 PyErr_SetString(PyExc_TypeError,
4414 "* wants int");
4415 goto onError;
4416 }
4417 width = PyInt_AsLong(v);
4418 if (width < 0) {
4419 flags |= F_LJUST;
4420 width = -width;
4421 }
4422 if (--fmtcnt >= 0)
4423 c = *fmt++;
4424 }
4425 else if (c >= '0' && c <= '9') {
4426 width = c - '0';
4427 while (--fmtcnt >= 0) {
4428 c = *fmt++;
4429 if (c < '0' || c > '9')
4430 break;
4431 if ((width*10) / 10 != width) {
4432 PyErr_SetString(PyExc_ValueError,
4433 "width too big");
4434 goto onError;
4435 }
4436 width = width*10 + (c - '0');
4437 }
4438 }
4439 if (c == '.') {
4440 prec = 0;
4441 if (--fmtcnt >= 0)
4442 c = *fmt++;
4443 if (c == '*') {
4444 v = getnextarg(args, arglen, &argidx);
4445 if (v == NULL)
4446 goto onError;
4447 if (!PyInt_Check(v)) {
4448 PyErr_SetString(PyExc_TypeError,
4449 "* wants int");
4450 goto onError;
4451 }
4452 prec = PyInt_AsLong(v);
4453 if (prec < 0)
4454 prec = 0;
4455 if (--fmtcnt >= 0)
4456 c = *fmt++;
4457 }
4458 else if (c >= '0' && c <= '9') {
4459 prec = c - '0';
4460 while (--fmtcnt >= 0) {
4461 c = Py_CHARMASK(*fmt++);
4462 if (c < '0' || c > '9')
4463 break;
4464 if ((prec*10) / 10 != prec) {
4465 PyErr_SetString(PyExc_ValueError,
4466 "prec too big");
4467 goto onError;
4468 }
4469 prec = prec*10 + (c - '0');
4470 }
4471 }
4472 } /* prec */
4473 if (fmtcnt >= 0) {
4474 if (c == 'h' || c == 'l' || c == 'L') {
4475 size = c;
4476 if (--fmtcnt >= 0)
4477 c = *fmt++;
4478 }
4479 }
4480 if (fmtcnt < 0) {
4481 PyErr_SetString(PyExc_ValueError,
4482 "incomplete format");
4483 goto onError;
4484 }
4485 if (c != '%') {
4486 v = getnextarg(args, arglen, &argidx);
4487 if (v == NULL)
4488 goto onError;
4489 }
4490 sign = 0;
4491 fill = ' ';
4492 switch (c) {
4493
4494 case '%':
4495 buf = tmpbuf;
4496 buf[0] = '%';
4497 len = 1;
4498 break;
4499
4500 case 's':
4501 case 'r':
4502 if (PyUnicode_Check(v) && c == 's') {
4503 temp = v;
4504 Py_INCREF(temp);
4505 }
4506 else {
4507 PyObject *unicode;
4508 if (c == 's')
4509 temp = PyObject_Str(v);
4510 else
4511 temp = PyObject_Repr(v);
4512 if (temp == NULL)
4513 goto onError;
4514 if (!PyString_Check(temp)) {
4515 /* XXX Note: this should never happen, since
4516 PyObject_Repr() and PyObject_Str() assure
4517 this */
4518 Py_DECREF(temp);
4519 PyErr_SetString(PyExc_TypeError,
4520 "%s argument has non-string str()");
4521 goto onError;
4522 }
Fred Drakee4315f52000-05-09 19:53:39 +00004523 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00004524 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00004525 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004526 "strict");
4527 Py_DECREF(temp);
4528 temp = unicode;
4529 if (temp == NULL)
4530 goto onError;
4531 }
4532 buf = PyUnicode_AS_UNICODE(temp);
4533 len = PyUnicode_GET_SIZE(temp);
4534 if (prec >= 0 && len > prec)
4535 len = prec;
4536 break;
4537
4538 case 'i':
4539 case 'd':
4540 case 'u':
4541 case 'o':
4542 case 'x':
4543 case 'X':
4544 if (c == 'i')
4545 c = 'd';
4546 buf = tmpbuf;
4547 len = formatint(buf, flags, prec, c, v);
4548 if (len < 0)
4549 goto onError;
4550 sign = (c == 'd');
4551 if (flags & F_ZERO) {
4552 fill = '0';
4553 if ((flags&F_ALT) &&
4554 (c == 'x' || c == 'X') &&
4555 buf[0] == '0' && buf[1] == c) {
4556 *res++ = *buf++;
4557 *res++ = *buf++;
4558 rescnt -= 2;
4559 len -= 2;
4560 width -= 2;
4561 if (width < 0)
4562 width = 0;
4563 }
4564 }
4565 break;
4566
4567 case 'e':
4568 case 'E':
4569 case 'f':
4570 case 'g':
4571 case 'G':
4572 buf = tmpbuf;
4573 len = formatfloat(buf, flags, prec, c, v);
4574 if (len < 0)
4575 goto onError;
4576 sign = 1;
4577 if (flags&F_ZERO)
4578 fill = '0';
4579 break;
4580
4581 case 'c':
4582 buf = tmpbuf;
4583 len = formatchar(buf, v);
4584 if (len < 0)
4585 goto onError;
4586 break;
4587
4588 default:
4589 PyErr_Format(PyExc_ValueError,
4590 "unsupported format character '%c' (0x%x)",
4591 c, c);
4592 goto onError;
4593 }
4594 if (sign) {
4595 if (*buf == '-' || *buf == '+') {
4596 sign = *buf++;
4597 len--;
4598 }
4599 else if (flags & F_SIGN)
4600 sign = '+';
4601 else if (flags & F_BLANK)
4602 sign = ' ';
4603 else
4604 sign = 0;
4605 }
4606 if (width < len)
4607 width = len;
4608 if (rescnt < width + (sign != 0)) {
4609 reslen -= rescnt;
4610 rescnt = width + fmtcnt + 100;
4611 reslen += rescnt;
4612 if (_PyUnicode_Resize(result, reslen) < 0)
4613 return NULL;
4614 res = PyUnicode_AS_UNICODE(result)
4615 + reslen - rescnt;
4616 }
4617 if (sign) {
4618 if (fill != ' ')
4619 *res++ = sign;
4620 rescnt--;
4621 if (width > len)
4622 width--;
4623 }
4624 if (width > len && !(flags & F_LJUST)) {
4625 do {
4626 --rescnt;
4627 *res++ = fill;
4628 } while (--width > len);
4629 }
4630 if (sign && fill == ' ')
4631 *res++ = sign;
4632 memcpy(res, buf, len * sizeof(Py_UNICODE));
4633 res += len;
4634 rescnt -= len;
4635 while (--width >= len) {
4636 --rescnt;
4637 *res++ = ' ';
4638 }
4639 if (dict && (argidx < arglen) && c != '%') {
4640 PyErr_SetString(PyExc_TypeError,
4641 "not all arguments converted");
4642 goto onError;
4643 }
4644 Py_XDECREF(temp);
4645 } /* '%' */
4646 } /* until end */
4647 if (argidx < arglen && !dict) {
4648 PyErr_SetString(PyExc_TypeError,
4649 "not all arguments converted");
4650 goto onError;
4651 }
4652
4653 if (args_owned) {
4654 Py_DECREF(args);
4655 }
4656 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004657 if (_PyUnicode_Resize(result, reslen - rescnt))
4658 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004659 return (PyObject *)result;
4660
4661 onError:
4662 Py_XDECREF(result);
4663 Py_DECREF(uformat);
4664 if (args_owned) {
4665 Py_DECREF(args);
4666 }
4667 return NULL;
4668}
4669
4670static PyBufferProcs unicode_as_buffer = {
4671 (getreadbufferproc) unicode_buffer_getreadbuf,
4672 (getwritebufferproc) unicode_buffer_getwritebuf,
4673 (getsegcountproc) unicode_buffer_getsegcount,
4674 (getcharbufferproc) unicode_buffer_getcharbuf,
4675};
4676
4677PyTypeObject PyUnicode_Type = {
4678 PyObject_HEAD_INIT(&PyType_Type)
4679 0, /* ob_size */
4680 "unicode", /* tp_name */
4681 sizeof(PyUnicodeObject), /* tp_size */
4682 0, /* tp_itemsize */
4683 /* Slots */
4684 (destructor)_PyUnicode_Free, /* tp_dealloc */
4685 0, /* tp_print */
4686 (getattrfunc)unicode_getattr, /* tp_getattr */
4687 0, /* tp_setattr */
4688 (cmpfunc) unicode_compare, /* tp_compare */
4689 (reprfunc) unicode_repr, /* tp_repr */
4690 0, /* tp_as_number */
4691 &unicode_as_sequence, /* tp_as_sequence */
4692 0, /* tp_as_mapping */
4693 (hashfunc) unicode_hash, /* tp_hash*/
4694 0, /* tp_call*/
4695 (reprfunc) unicode_str, /* tp_str */
4696 (getattrofunc) NULL, /* tp_getattro */
4697 (setattrofunc) NULL, /* tp_setattro */
4698 &unicode_as_buffer, /* tp_as_buffer */
4699 Py_TPFLAGS_DEFAULT, /* tp_flags */
4700};
4701
4702/* Initialize the Unicode implementation */
4703
4704void _PyUnicode_Init()
4705{
4706 /* Doublecheck the configuration... */
4707 if (sizeof(Py_UNICODE) != 2)
4708 Py_FatalError("Unicode configuration error: "
4709 "sizeof(Py_UNICODE) != 2 bytes");
4710
Fred Drakee4315f52000-05-09 19:53:39 +00004711 /* Init the implementation */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004712 unicode_empty = _PyUnicode_New(0);
Fred Drakee4315f52000-05-09 19:53:39 +00004713 strcpy(unicode_default_encoding, "utf-8");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004714}
4715
4716/* Finalize the Unicode implementation */
4717
4718void
4719_PyUnicode_Fini()
4720{
4721 PyUnicodeObject *u = unicode_freelist;
4722
4723 while (u != NULL) {
4724 PyUnicodeObject *v = u;
4725 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004726 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00004727 PyMem_DEL(v->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004728 Py_XDECREF(v->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +00004729 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730 }
4731 Py_XDECREF(unicode_empty);
4732}