blob: 1ea83f6ea56de9e444b972d4e33eb0383baacca9 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "mymath.h"
68#include "unicodeobject.h"
69
70#if defined(HAVE_LIMITS_H)
71#include <limits.h>
72#else
73#define INT_MAX 2147483647
74#endif
75
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000076#ifdef MS_WIN32
77#include <windows.h>
78#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000079
Guido van Rossumd57fd912000-03-10 22:53:23 +000080/* Limit for the Unicode object free list */
81
82#define MAX_UNICODE_FREELIST_SIZE 1024
83
84/* Limit for the Unicode object free list stay alive optimization.
85
86 The implementation will keep allocated Unicode memory intact for
87 all objects on the free list having a size less than this
88 limit. This reduces malloc() overhead for small Unicode objects.
89
Barry Warsaw51ac5802000-03-20 16:36:48 +000090 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000091 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000092 malloc()-overhead) bytes of unused garbage.
93
94 Setting the limit to 0 effectively turns the feature off.
95
Guido van Rossumfd4b9572000-04-10 13:51:10 +000096 Note: This is an experimental feature ! If you get core dumps when
97 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
99*/
100
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000101#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +0000102
103/* Endianness switches; defaults to little endian */
104
105#ifdef WORDS_BIGENDIAN
106# define BYTEORDER_IS_BIG_ENDIAN
107#else
108# define BYTEORDER_IS_LITTLE_ENDIAN
109#endif
110
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000111/* --- Globals ------------------------------------------------------------
112
113 The globals are initialized by the _PyUnicode_Init() API and should
114 not be used before calling that API.
115
116*/
Guido van Rossumd57fd912000-03-10 22:53:23 +0000117
118/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000119static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000120
121/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000122static PyUnicodeObject *unicode_freelist;
123static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000124
Fred Drakee4315f52000-05-09 19:53:39 +0000125/* Default encoding to use and assume when NULL is passed as encoding
126 parameter; it is initialized by _PyUnicode_Init().
127
128 Always use the PyUnicode_SetDefaultEncoding() and
129 PyUnicode_GetDefaultEncoding() APIs to access this global.
130
131*/
132
133static char unicode_default_encoding[100];
134
Guido van Rossumd57fd912000-03-10 22:53:23 +0000135/* --- Unicode Object ----------------------------------------------------- */
136
137static
138int _PyUnicode_Resize(register PyUnicodeObject *unicode,
139 int length)
140{
141 void *oldstr;
142
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000143 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000145 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000146
147 /* Resizing unicode_empty is not allowed. */
148 if (unicode == unicode_empty) {
149 PyErr_SetString(PyExc_SystemError,
150 "can't resize empty unicode object");
151 return -1;
152 }
153
154 /* We allocate one more byte to make sure the string is
155 Ux0000 terminated -- XXX is this needed ? */
156 oldstr = unicode->str;
157 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
158 if (!unicode->str) {
159 unicode->str = oldstr;
160 PyErr_NoMemory();
161 return -1;
162 }
163 unicode->str[length] = 0;
164 unicode->length = length;
165
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000166 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 /* Reset the object caches */
168 if (unicode->utf8str) {
169 Py_DECREF(unicode->utf8str);
170 unicode->utf8str = NULL;
171 }
172 unicode->hash = -1;
173
174 return 0;
175}
176
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000177int PyUnicode_Resize(PyObject **unicode,
178 int length)
179{
180 PyUnicodeObject *v;
181
182 if (unicode == NULL) {
183 PyErr_BadInternalCall();
184 return -1;
185 }
186 v = (PyUnicodeObject *)*unicode;
187 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
188 PyErr_BadInternalCall();
189 return -1;
190 }
191 return _PyUnicode_Resize(v, length);
192}
193
Guido van Rossumd57fd912000-03-10 22:53:23 +0000194/* We allocate one more byte to make sure the string is
195 Ux0000 terminated -- XXX is this needed ?
196
197 XXX This allocator could further be enhanced by assuring that the
198 free list never reduces its size below 1.
199
200*/
201
202static
203PyUnicodeObject *_PyUnicode_New(int length)
204{
205 register PyUnicodeObject *unicode;
206
207 /* Optimization for empty strings */
208 if (length == 0 && unicode_empty != NULL) {
209 Py_INCREF(unicode_empty);
210 return unicode_empty;
211 }
212
213 /* Unicode freelist & memory allocation */
214 if (unicode_freelist) {
215 unicode = unicode_freelist;
216 unicode_freelist = *(PyUnicodeObject **)unicode_freelist;
217 unicode_freelist_size--;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000218 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000220 /* Keep-Alive optimization: we only upsize the buffer,
221 never downsize it. */
222 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000224 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000225 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 }
227 }
228 else
229 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
230 }
231 else {
232 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
233 if (unicode == NULL)
234 return NULL;
235 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
236 }
237
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000238 if (!unicode->str) {
239 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000240 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000241 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 unicode->str[length] = 0;
243 unicode->length = length;
244 unicode->hash = -1;
245 unicode->utf8str = NULL;
246 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000247
248 onError:
249 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000250 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000251 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252}
253
254static
255void _PyUnicode_Free(register PyUnicodeObject *unicode)
256{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000258 /* Keep-Alive optimization */
259 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 unicode->str = NULL;
262 unicode->length = 0;
263 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000264 if (unicode->utf8str) {
265 Py_DECREF(unicode->utf8str);
266 unicode->utf8str = NULL;
267 }
268 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 *(PyUnicodeObject **)unicode = unicode_freelist;
270 unicode_freelist = unicode;
271 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 }
273 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000274 PyMem_DEL(unicode->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000275 Py_XDECREF(unicode->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000276 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 }
278}
279
280PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
281 int size)
282{
283 PyUnicodeObject *unicode;
284
285 unicode = _PyUnicode_New(size);
286 if (!unicode)
287 return NULL;
288
289 /* Copy the Unicode data into the new object */
290 if (u != NULL)
291 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
292
293 return (PyObject *)unicode;
294}
295
296#ifdef HAVE_WCHAR_H
297
298PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
299 int size)
300{
301 PyUnicodeObject *unicode;
302
303 if (w == NULL) {
304 PyErr_BadInternalCall();
305 return NULL;
306 }
307
308 unicode = _PyUnicode_New(size);
309 if (!unicode)
310 return NULL;
311
312 /* Copy the wchar_t data into the new object */
313#ifdef HAVE_USABLE_WCHAR_T
314 memcpy(unicode->str, w, size * sizeof(wchar_t));
315#else
316 {
317 register Py_UNICODE *u;
318 register int i;
319 u = PyUnicode_AS_UNICODE(unicode);
320 for (i = size; i >= 0; i--)
321 *u++ = *w++;
322 }
323#endif
324
325 return (PyObject *)unicode;
326}
327
328int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
329 register wchar_t *w,
330 int size)
331{
332 if (unicode == NULL) {
333 PyErr_BadInternalCall();
334 return -1;
335 }
336 if (size > PyUnicode_GET_SIZE(unicode))
337 size = PyUnicode_GET_SIZE(unicode);
338#ifdef HAVE_USABLE_WCHAR_T
339 memcpy(w, unicode->str, size * sizeof(wchar_t));
340#else
341 {
342 register Py_UNICODE *u;
343 register int i;
344 u = PyUnicode_AS_UNICODE(unicode);
345 for (i = size; i >= 0; i--)
346 *w++ = *u++;
347 }
348#endif
349
350 return size;
351}
352
353#endif
354
355PyObject *PyUnicode_FromObject(register PyObject *obj)
356{
357 const char *s;
358 int len;
359
360 if (obj == NULL) {
361 PyErr_BadInternalCall();
362 return NULL;
363 }
364 else if (PyUnicode_Check(obj)) {
365 Py_INCREF(obj);
366 return obj;
367 }
368 else if (PyString_Check(obj)) {
369 s = PyString_AS_STRING(obj);
370 len = PyString_GET_SIZE(obj);
371 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000372 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
373 /* Overwrite the error message with something more useful in
374 case of a TypeError. */
375 if (PyErr_ExceptionMatches(PyExc_TypeError))
376 PyErr_SetString(PyExc_TypeError,
377 "coercing to Unicode: need string or charbuffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000379 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000380 if (len == 0) {
381 Py_INCREF(unicode_empty);
382 return (PyObject *)unicode_empty;
383 }
Fred Drakee4315f52000-05-09 19:53:39 +0000384 return PyUnicode_Decode(s, len, NULL, "strict");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385}
386
387PyObject *PyUnicode_Decode(const char *s,
388 int size,
389 const char *encoding,
390 const char *errors)
391{
392 PyObject *buffer = NULL, *unicode;
393
Fred Drakee4315f52000-05-09 19:53:39 +0000394 if (encoding == NULL)
395 encoding = PyUnicode_GetDefaultEncoding();
396
397 /* Shortcuts for common default encodings */
398 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000400 else if (strcmp(encoding, "latin-1") == 0)
401 return PyUnicode_DecodeLatin1(s, size, errors);
402 else if (strcmp(encoding, "ascii") == 0)
403 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000404
405 /* Decode via the codec registry */
406 buffer = PyBuffer_FromMemory((void *)s, size);
407 if (buffer == NULL)
408 goto onError;
409 unicode = PyCodec_Decode(buffer, encoding, errors);
410 if (unicode == NULL)
411 goto onError;
412 if (!PyUnicode_Check(unicode)) {
413 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000414 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 unicode->ob_type->tp_name);
416 Py_DECREF(unicode);
417 goto onError;
418 }
419 Py_DECREF(buffer);
420 return unicode;
421
422 onError:
423 Py_XDECREF(buffer);
424 return NULL;
425}
426
427PyObject *PyUnicode_Encode(const Py_UNICODE *s,
428 int size,
429 const char *encoding,
430 const char *errors)
431{
432 PyObject *v, *unicode;
433
434 unicode = PyUnicode_FromUnicode(s, size);
435 if (unicode == NULL)
436 return NULL;
437 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
438 Py_DECREF(unicode);
439 return v;
440}
441
442PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
443 const char *encoding,
444 const char *errors)
445{
446 PyObject *v;
447
448 if (!PyUnicode_Check(unicode)) {
449 PyErr_BadArgument();
450 goto onError;
451 }
Fred Drakee4315f52000-05-09 19:53:39 +0000452
453 if (encoding == NULL)
454 encoding = PyUnicode_GetDefaultEncoding();
455
456 /* Shortcuts for common default encodings */
457 if (errors == NULL) {
458 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000459 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000460 else if (strcmp(encoding, "latin-1") == 0)
461 return PyUnicode_AsLatin1String(unicode);
462 else if (strcmp(encoding, "ascii") == 0)
463 return PyUnicode_AsASCIIString(unicode);
464 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000465
466 /* Encode via the codec registry */
467 v = PyCodec_Encode(unicode, encoding, errors);
468 if (v == NULL)
469 goto onError;
470 /* XXX Should we really enforce this ? */
471 if (!PyString_Check(v)) {
472 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000473 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474 v->ob_type->tp_name);
475 Py_DECREF(v);
476 goto onError;
477 }
478 return v;
479
480 onError:
481 return NULL;
482}
483
484Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
485{
486 if (!PyUnicode_Check(unicode)) {
487 PyErr_BadArgument();
488 goto onError;
489 }
490 return PyUnicode_AS_UNICODE(unicode);
491
492 onError:
493 return NULL;
494}
495
496int PyUnicode_GetSize(PyObject *unicode)
497{
498 if (!PyUnicode_Check(unicode)) {
499 PyErr_BadArgument();
500 goto onError;
501 }
502 return PyUnicode_GET_SIZE(unicode);
503
504 onError:
505 return -1;
506}
507
Fred Drakee4315f52000-05-09 19:53:39 +0000508const char *PyUnicode_GetDefaultEncoding()
509{
510 return unicode_default_encoding;
511}
512
513int PyUnicode_SetDefaultEncoding(const char *encoding)
514{
515 PyObject *v;
516
517 /* Make sure the encoding is valid. As side effect, this also
518 loads the encoding into the codec registry cache. */
519 v = _PyCodec_Lookup(encoding);
520 if (v == NULL)
521 goto onError;
522 Py_DECREF(v);
523 strncpy(unicode_default_encoding,
524 encoding,
525 sizeof(unicode_default_encoding));
526 return 0;
527
528 onError:
529 return -1;
530}
531
Guido van Rossumd57fd912000-03-10 22:53:23 +0000532/* --- UTF-8 Codec -------------------------------------------------------- */
533
/* Map a UTF-8 lead byte to the total length of the sequence it
   starts.  Zero means the byte is not a legal lead byte (continuation
   bytes 0x80-0xBF and the bytes 0xFE/0xFF).  See RFC 2279 for details.

   The table is never written to, hence const. */
static
const char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xFD: four to six byte sequences; 0xFE, 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
555
556static
557int utf8_decoding_error(const char **source,
558 Py_UNICODE **dest,
559 const char *errors,
560 const char *details)
561{
562 if ((errors == NULL) ||
563 (strcmp(errors,"strict") == 0)) {
564 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000565 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000566 details);
567 return -1;
568 }
569 else if (strcmp(errors,"ignore") == 0) {
570 (*source)++;
571 return 0;
572 }
573 else if (strcmp(errors,"replace") == 0) {
574 (*source)++;
575 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
576 (*dest)++;
577 return 0;
578 }
579 else {
580 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000581 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000582 errors);
583 return -1;
584 }
585}
586
587#define UTF8_ERROR(details) do { \
588 if (utf8_decoding_error(&s, &p, errors, details)) \
589 goto onError; \
590 continue; \
591} while (0)
592
593PyObject *PyUnicode_DecodeUTF8(const char *s,
594 int size,
595 const char *errors)
596{
597 int n;
598 const char *e;
599 PyUnicodeObject *unicode;
600 Py_UNICODE *p;
601
602 /* Note: size will always be longer than the resulting Unicode
603 character count */
604 unicode = _PyUnicode_New(size);
605 if (!unicode)
606 return NULL;
607 if (size == 0)
608 return (PyObject *)unicode;
609
610 /* Unpack UTF-8 encoded data */
611 p = unicode->str;
612 e = s + size;
613
614 while (s < e) {
615 register Py_UNICODE ch = (unsigned char)*s;
616
617 if (ch < 0x80) {
618 *p++ = ch;
619 s++;
620 continue;
621 }
622
623 n = utf8_code_length[ch];
624
625 if (s + n > e)
626 UTF8_ERROR("unexpected end of data");
627
628 switch (n) {
629
630 case 0:
631 UTF8_ERROR("unexpected code byte");
632 break;
633
634 case 1:
635 UTF8_ERROR("internal error");
636 break;
637
638 case 2:
639 if ((s[1] & 0xc0) != 0x80)
640 UTF8_ERROR("invalid data");
641 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
642 if (ch < 0x80)
643 UTF8_ERROR("illegal encoding");
644 else
645 *p++ = ch;
646 break;
647
648 case 3:
649 if ((s[1] & 0xc0) != 0x80 ||
650 (s[2] & 0xc0) != 0x80)
651 UTF8_ERROR("invalid data");
652 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
653 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
654 UTF8_ERROR("illegal encoding");
655 else
656 *p++ = ch;
657 break;
658
659 default:
660 /* Other sizes are only needed for UCS-4 */
661 UTF8_ERROR("unsupported Unicode code range");
662 }
663 s += n;
664 }
665
666 /* Adjust length */
667 if (_PyUnicode_Resize(unicode, p - unicode->str))
668 goto onError;
669
670 return (PyObject *)unicode;
671
672onError:
673 Py_DECREF(unicode);
674 return NULL;
675}
676
677#undef UTF8_ERROR
678
679static
680int utf8_encoding_error(const Py_UNICODE **source,
681 char **dest,
682 const char *errors,
683 const char *details)
684{
685 if ((errors == NULL) ||
686 (strcmp(errors,"strict") == 0)) {
687 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000688 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689 details);
690 return -1;
691 }
692 else if (strcmp(errors,"ignore") == 0) {
693 return 0;
694 }
695 else if (strcmp(errors,"replace") == 0) {
696 **dest = '?';
697 (*dest)++;
698 return 0;
699 }
700 else {
701 PyErr_Format(PyExc_ValueError,
702 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000703 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000704 errors);
705 return -1;
706 }
707}
708
709PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
710 int size,
711 const char *errors)
712{
713 PyObject *v;
714 char *p;
715 char *q;
716
717 v = PyString_FromStringAndSize(NULL, 3 * size);
718 if (v == NULL)
719 return NULL;
720 if (size == 0)
721 goto done;
722
723 p = q = PyString_AS_STRING(v);
724 while (size-- > 0) {
725 Py_UNICODE ch = *s++;
726 if (ch < 0x80)
727 *p++ = (char) ch;
728 else if (ch < 0x0800) {
729 *p++ = 0xc0 | (ch >> 6);
730 *p++ = 0x80 | (ch & 0x3f);
731 } else if (0xD800 <= ch && ch <= 0xDFFF) {
732 /* These byte ranges are reserved for UTF-16 surrogate
733 bytes which the Python implementation currently does
734 not support. */
735 printf("code range problem: U+%04x\n", ch);
736 if (utf8_encoding_error(&s, &p, errors,
737 "unsupported code range"))
738 goto onError;
739 } else {
740 *p++ = 0xe0 | (ch >> 12);
741 *p++ = 0x80 | ((ch >> 6) & 0x3f);
742 *p++ = 0x80 | (ch & 0x3f);
743 }
744 }
745 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000746 if (_PyString_Resize(&v, p - q))
747 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000748
749 done:
750 return v;
751
752 onError:
753 Py_DECREF(v);
754 return NULL;
755}
756
757/* Return a Python string holding the UTF-8 encoded value of the
758 Unicode object.
759
760 The resulting string is cached in the Unicode object for subsequent
761 usage by this function. The cached version is needed to implement
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000762 the character buffer interface and will live (at least) as long as
763 the Unicode object itself.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000764
765 The refcount of the string is *not* incremented.
766
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000767 *** Exported for internal use by the interpreter only !!! ***
768
Guido van Rossumd57fd912000-03-10 22:53:23 +0000769*/
770
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000771PyObject *_PyUnicode_AsUTF8String(PyObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000772 const char *errors)
773{
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000774 PyObject *v = ((PyUnicodeObject *)unicode)->utf8str;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000775
776 if (v)
777 return v;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000778 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
779 PyUnicode_GET_SIZE(unicode),
Guido van Rossumd57fd912000-03-10 22:53:23 +0000780 errors);
781 if (v && errors == NULL)
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000782 ((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000783 return v;
784}
785
786PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
787{
788 PyObject *str;
789
790 if (!PyUnicode_Check(unicode)) {
791 PyErr_BadArgument();
792 return NULL;
793 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000794 str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000795 if (str == NULL)
796 return NULL;
797 Py_INCREF(str);
798 return str;
799}
800
801/* --- UTF-16 Codec ------------------------------------------------------- */
802
803static
804int utf16_decoding_error(const Py_UNICODE **source,
805 Py_UNICODE **dest,
806 const char *errors,
807 const char *details)
808{
809 if ((errors == NULL) ||
810 (strcmp(errors,"strict") == 0)) {
811 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000812 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000813 details);
814 return -1;
815 }
816 else if (strcmp(errors,"ignore") == 0) {
817 return 0;
818 }
819 else if (strcmp(errors,"replace") == 0) {
820 if (dest) {
821 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
822 (*dest)++;
823 }
824 return 0;
825 }
826 else {
827 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000828 "UTF-16 decoding error; "
829 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000830 errors);
831 return -1;
832 }
833}
834
835#define UTF16_ERROR(details) do { \
836 if (utf16_decoding_error(&q, &p, errors, details)) \
837 goto onError; \
838 continue; \
839} while(0)
840
841PyObject *PyUnicode_DecodeUTF16(const char *s,
842 int size,
843 const char *errors,
844 int *byteorder)
845{
846 PyUnicodeObject *unicode;
847 Py_UNICODE *p;
848 const Py_UNICODE *q, *e;
849 int bo = 0;
850
851 /* size should be an even number */
852 if (size % sizeof(Py_UNICODE) != 0) {
853 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
854 return NULL;
855 /* The remaining input chars are ignored if we fall through
856 here... */
857 }
858
859 /* Note: size will always be longer than the resulting Unicode
860 character count */
861 unicode = _PyUnicode_New(size);
862 if (!unicode)
863 return NULL;
864 if (size == 0)
865 return (PyObject *)unicode;
866
867 /* Unpack UTF-16 encoded data */
868 p = unicode->str;
869 q = (Py_UNICODE *)s;
870 e = q + (size / sizeof(Py_UNICODE));
871
872 if (byteorder)
873 bo = *byteorder;
874
875 while (q < e) {
876 register Py_UNICODE ch = *q++;
877
878 /* Check for BOM marks (U+FEFF) in the input and adjust
879 current byte order setting accordingly. Swap input
880 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
881 !) */
882#ifdef BYTEORDER_IS_LITTLE_ENDIAN
883 if (ch == 0xFEFF) {
884 bo = -1;
885 continue;
886 } else if (ch == 0xFFFE) {
887 bo = 1;
888 continue;
889 }
890 if (bo == 1)
891 ch = (ch >> 8) | (ch << 8);
892#else
893 if (ch == 0xFEFF) {
894 bo = 1;
895 continue;
896 } else if (ch == 0xFFFE) {
897 bo = -1;
898 continue;
899 }
900 if (bo == -1)
901 ch = (ch >> 8) | (ch << 8);
902#endif
903 if (ch < 0xD800 || ch > 0xDFFF) {
904 *p++ = ch;
905 continue;
906 }
907
908 /* UTF-16 code pair: */
909 if (q >= e)
910 UTF16_ERROR("unexpected end of data");
911 if (0xDC00 <= *q && *q <= 0xDFFF) {
912 q++;
913 if (0xD800 <= *q && *q <= 0xDBFF)
914 /* This is valid data (a UTF-16 surrogate pair), but
915 we are not able to store this information since our
916 Py_UNICODE type only has 16 bits... this might
917 change someday, even though it's unlikely. */
918 UTF16_ERROR("code pairs are not supported");
919 else
920 continue;
921 }
922 UTF16_ERROR("illegal encoding");
923 }
924
925 if (byteorder)
926 *byteorder = bo;
927
928 /* Adjust length */
929 if (_PyUnicode_Resize(unicode, p - unicode->str))
930 goto onError;
931
932 return (PyObject *)unicode;
933
934onError:
935 Py_DECREF(unicode);
936 return NULL;
937}
938
939#undef UTF16_ERROR
940
941PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
942 int size,
943 const char *errors,
944 int byteorder)
945{
946 PyObject *v;
947 Py_UNICODE *p;
948 char *q;
949
950 /* We don't create UTF-16 pairs... */
951 v = PyString_FromStringAndSize(NULL,
952 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
953 if (v == NULL)
954 return NULL;
955 if (size == 0)
956 goto done;
957
958 q = PyString_AS_STRING(v);
959 p = (Py_UNICODE *)q;
960
961 if (byteorder == 0)
962 *p++ = 0xFEFF;
963 if (byteorder == 0 ||
964#ifdef BYTEORDER_IS_LITTLE_ENDIAN
965 byteorder == -1
966#else
967 byteorder == 1
968#endif
969 )
970 memcpy(p, s, size * sizeof(Py_UNICODE));
971 else
972 while (size-- > 0) {
973 Py_UNICODE ch = *s++;
974 *p++ = (ch >> 8) | (ch << 8);
975 }
976 done:
977 return v;
978}
979
980PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
981{
982 if (!PyUnicode_Check(unicode)) {
983 PyErr_BadArgument();
984 return NULL;
985 }
986 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
987 PyUnicode_GET_SIZE(unicode),
988 NULL,
989 0);
990}
991
992/* --- Unicode Escape Codec ----------------------------------------------- */
993
994static
995int unicodeescape_decoding_error(const char **source,
996 unsigned int *x,
997 const char *errors,
998 const char *details)
999{
1000 if ((errors == NULL) ||
1001 (strcmp(errors,"strict") == 0)) {
1002 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001003 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001004 details);
1005 return -1;
1006 }
1007 else if (strcmp(errors,"ignore") == 0) {
1008 return 0;
1009 }
1010 else if (strcmp(errors,"replace") == 0) {
1011 *x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
1012 return 0;
1013 }
1014 else {
1015 PyErr_Format(PyExc_ValueError,
1016 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001017 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001018 errors);
1019 return -1;
1020 }
1021}
1022
1023PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1024 int size,
1025 const char *errors)
1026{
1027 PyUnicodeObject *v;
1028 Py_UNICODE *p = NULL, *buf = NULL;
1029 const char *end;
1030
1031 /* Escaped strings will always be longer than the resulting
1032 Unicode string, so we start with size here and then reduce the
1033 length after conversion to the true value. */
1034 v = _PyUnicode_New(size);
1035 if (v == NULL)
1036 goto onError;
1037 if (size == 0)
1038 return (PyObject *)v;
1039 p = buf = PyUnicode_AS_UNICODE(v);
1040 end = s + size;
1041 while (s < end) {
1042 unsigned char c;
1043 unsigned int x;
1044 int i;
1045
1046 /* Non-escape characters are interpreted as Unicode ordinals */
1047 if (*s != '\\') {
1048 *p++ = (unsigned char)*s++;
1049 continue;
1050 }
1051
1052 /* \ - Escapes */
1053 s++;
1054 switch (*s++) {
1055
1056 /* \x escapes */
1057 case '\n': break;
1058 case '\\': *p++ = '\\'; break;
1059 case '\'': *p++ = '\''; break;
1060 case '\"': *p++ = '\"'; break;
1061 case 'b': *p++ = '\b'; break;
1062 case 'f': *p++ = '\014'; break; /* FF */
1063 case 't': *p++ = '\t'; break;
1064 case 'n': *p++ = '\n'; break;
1065 case 'r': *p++ = '\r'; break;
1066 case 'v': *p++ = '\013'; break; /* VT */
1067 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1068
1069 /* \OOO (octal) escapes */
1070 case '0': case '1': case '2': case '3':
1071 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001072 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001073 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001074 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001076 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001077 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001078 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079 break;
1080
1081 /* \xXXXX escape with 0-4 hex digits */
1082 case 'x':
1083 x = 0;
1084 c = (unsigned char)*s;
1085 if (isxdigit(c)) {
1086 do {
1087 x = (x<<4) & ~0xF;
1088 if ('0' <= c && c <= '9')
1089 x += c - '0';
1090 else if ('a' <= c && c <= 'f')
1091 x += 10 + c - 'a';
1092 else
1093 x += 10 + c - 'A';
1094 c = (unsigned char)*++s;
1095 } while (isxdigit(c));
1096 *p++ = x;
1097 } else {
1098 *p++ = '\\';
1099 *p++ = (unsigned char)s[-1];
1100 }
1101 break;
1102
1103 /* \uXXXX with 4 hex digits */
1104 case 'u':
1105 for (x = 0, i = 0; i < 4; i++) {
1106 c = (unsigned char)s[i];
1107 if (!isxdigit(c)) {
1108 if (unicodeescape_decoding_error(&s, &x, errors,
1109 "truncated \\uXXXX"))
1110 goto onError;
1111 i++;
1112 break;
1113 }
1114 x = (x<<4) & ~0xF;
1115 if (c >= '0' && c <= '9')
1116 x += c - '0';
1117 else if (c >= 'a' && c <= 'f')
1118 x += 10 + c - 'a';
1119 else
1120 x += 10 + c - 'A';
1121 }
1122 s += i;
1123 *p++ = x;
1124 break;
1125
1126 default:
1127 *p++ = '\\';
1128 *p++ = (unsigned char)s[-1];
1129 break;
1130 }
1131 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001132 if (_PyUnicode_Resize(v, (int)(p - buf)))
1133 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001134 return (PyObject *)v;
1135
1136 onError:
1137 Py_XDECREF(v);
1138 return NULL;
1139}
1140
1141/* Return a Unicode-Escape string version of the Unicode object.
1142
1143 If quotes is true, the string is enclosed in u"" or u'' quotes as
1144 appropriate.
1145
1146*/
1147
Barry Warsaw51ac5802000-03-20 16:36:48 +00001148static const Py_UNICODE *findchar(const Py_UNICODE *s,
1149 int size,
1150 Py_UNICODE ch);
1151
Guido van Rossumd57fd912000-03-10 22:53:23 +00001152static
1153PyObject *unicodeescape_string(const Py_UNICODE *s,
1154 int size,
1155 int quotes)
1156{
1157 PyObject *repr;
1158 char *p;
1159 char *q;
1160
1161 static const char *hexdigit = "0123456789ABCDEF";
1162
1163 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1164 if (repr == NULL)
1165 return NULL;
1166
1167 p = q = PyString_AS_STRING(repr);
1168
1169 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 *p++ = 'u';
1171 *p++ = (findchar(s, size, '\'') &&
1172 !findchar(s, size, '"')) ? '"' : '\'';
1173 }
1174 while (size-- > 0) {
1175 Py_UNICODE ch = *s++;
1176 /* Escape quotes */
1177 if (quotes && (ch == q[1] || ch == '\\')) {
1178 *p++ = '\\';
1179 *p++ = (char) ch;
1180 }
1181 /* Map 16-bit characters to '\uxxxx' */
1182 else if (ch >= 256) {
1183 *p++ = '\\';
1184 *p++ = 'u';
1185 *p++ = hexdigit[(ch >> 12) & 0xf];
1186 *p++ = hexdigit[(ch >> 8) & 0xf];
1187 *p++ = hexdigit[(ch >> 4) & 0xf];
1188 *p++ = hexdigit[ch & 15];
1189 }
1190 /* Map non-printable US ASCII to '\ooo' */
1191 else if (ch < ' ' || ch >= 128) {
1192 *p++ = '\\';
1193 *p++ = hexdigit[(ch >> 6) & 7];
1194 *p++ = hexdigit[(ch >> 3) & 7];
1195 *p++ = hexdigit[ch & 7];
1196 }
1197 /* Copy everything else as-is */
1198 else
1199 *p++ = (char) ch;
1200 }
1201 if (quotes)
1202 *p++ = q[1];
1203
1204 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001205 if (_PyString_Resize(&repr, p - q))
1206 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001207
1208 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001209
1210 onError:
1211 Py_DECREF(repr);
1212 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213}
1214
1215PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1216 int size)
1217{
1218 return unicodeescape_string(s, size, 0);
1219}
1220
1221PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1222{
1223 if (!PyUnicode_Check(unicode)) {
1224 PyErr_BadArgument();
1225 return NULL;
1226 }
1227 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1228 PyUnicode_GET_SIZE(unicode));
1229}
1230
1231/* --- Raw Unicode Escape Codec ------------------------------------------- */
1232
1233PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1234 int size,
1235 const char *errors)
1236{
1237 PyUnicodeObject *v;
1238 Py_UNICODE *p, *buf;
1239 const char *end;
1240 const char *bs;
1241
1242 /* Escaped strings will always be longer than the resulting
1243 Unicode string, so we start with size here and then reduce the
1244 length after conversion to the true value. */
1245 v = _PyUnicode_New(size);
1246 if (v == NULL)
1247 goto onError;
1248 if (size == 0)
1249 return (PyObject *)v;
1250 p = buf = PyUnicode_AS_UNICODE(v);
1251 end = s + size;
1252 while (s < end) {
1253 unsigned char c;
1254 unsigned int x;
1255 int i;
1256
1257 /* Non-escape characters are interpreted as Unicode ordinals */
1258 if (*s != '\\') {
1259 *p++ = (unsigned char)*s++;
1260 continue;
1261 }
1262
1263 /* \u-escapes are only interpreted iff the number of leading
1264 backslashes if odd */
1265 bs = s;
1266 for (;s < end;) {
1267 if (*s != '\\')
1268 break;
1269 *p++ = (unsigned char)*s++;
1270 }
1271 if (((s - bs) & 1) == 0 ||
1272 s >= end ||
1273 *s != 'u') {
1274 continue;
1275 }
1276 p--;
1277 s++;
1278
1279 /* \uXXXX with 4 hex digits */
1280 for (x = 0, i = 0; i < 4; i++) {
1281 c = (unsigned char)s[i];
1282 if (!isxdigit(c)) {
1283 if (unicodeescape_decoding_error(&s, &x, errors,
1284 "truncated \\uXXXX"))
1285 goto onError;
1286 i++;
1287 break;
1288 }
1289 x = (x<<4) & ~0xF;
1290 if (c >= '0' && c <= '9')
1291 x += c - '0';
1292 else if (c >= 'a' && c <= 'f')
1293 x += 10 + c - 'a';
1294 else
1295 x += 10 + c - 'A';
1296 }
1297 s += i;
1298 *p++ = x;
1299 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001300 if (_PyUnicode_Resize(v, (int)(p - buf)))
1301 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001302 return (PyObject *)v;
1303
1304 onError:
1305 Py_XDECREF(v);
1306 return NULL;
1307}
1308
1309PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1310 int size)
1311{
1312 PyObject *repr;
1313 char *p;
1314 char *q;
1315
1316 static const char *hexdigit = "0123456789ABCDEF";
1317
1318 repr = PyString_FromStringAndSize(NULL, 6 * size);
1319 if (repr == NULL)
1320 return NULL;
1321
1322 p = q = PyString_AS_STRING(repr);
1323 while (size-- > 0) {
1324 Py_UNICODE ch = *s++;
1325 /* Map 16-bit characters to '\uxxxx' */
1326 if (ch >= 256) {
1327 *p++ = '\\';
1328 *p++ = 'u';
1329 *p++ = hexdigit[(ch >> 12) & 0xf];
1330 *p++ = hexdigit[(ch >> 8) & 0xf];
1331 *p++ = hexdigit[(ch >> 4) & 0xf];
1332 *p++ = hexdigit[ch & 15];
1333 }
1334 /* Copy everything else as-is */
1335 else
1336 *p++ = (char) ch;
1337 }
1338 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001339 if (_PyString_Resize(&repr, p - q))
1340 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001341
1342 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001343
1344 onError:
1345 Py_DECREF(repr);
1346 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001347}
1348
1349PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1350{
1351 if (!PyUnicode_Check(unicode)) {
1352 PyErr_BadArgument();
1353 return NULL;
1354 }
1355 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1356 PyUnicode_GET_SIZE(unicode));
1357}
1358
1359/* --- Latin-1 Codec ------------------------------------------------------ */
1360
1361PyObject *PyUnicode_DecodeLatin1(const char *s,
1362 int size,
1363 const char *errors)
1364{
1365 PyUnicodeObject *v;
1366 Py_UNICODE *p;
1367
1368 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1369 v = _PyUnicode_New(size);
1370 if (v == NULL)
1371 goto onError;
1372 if (size == 0)
1373 return (PyObject *)v;
1374 p = PyUnicode_AS_UNICODE(v);
1375 while (size-- > 0)
1376 *p++ = (unsigned char)*s++;
1377 return (PyObject *)v;
1378
1379 onError:
1380 Py_XDECREF(v);
1381 return NULL;
1382}
1383
1384static
1385int latin1_encoding_error(const Py_UNICODE **source,
1386 char **dest,
1387 const char *errors,
1388 const char *details)
1389{
1390 if ((errors == NULL) ||
1391 (strcmp(errors,"strict") == 0)) {
1392 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001393 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001394 details);
1395 return -1;
1396 }
1397 else if (strcmp(errors,"ignore") == 0) {
1398 return 0;
1399 }
1400 else if (strcmp(errors,"replace") == 0) {
1401 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001402 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001403 return 0;
1404 }
1405 else {
1406 PyErr_Format(PyExc_ValueError,
1407 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001408 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001409 errors);
1410 return -1;
1411 }
1412}
1413
1414PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1415 int size,
1416 const char *errors)
1417{
1418 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001419 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001420 repr = PyString_FromStringAndSize(NULL, size);
1421 if (repr == NULL)
1422 return NULL;
1423
1424 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001425 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001426 while (size-- > 0) {
1427 Py_UNICODE ch = *p++;
1428 if (ch >= 256) {
1429 if (latin1_encoding_error(&p, &s, errors,
1430 "ordinal not in range(256)"))
1431 goto onError;
1432 }
1433 else
1434 *s++ = (char)ch;
1435 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001436 /* Resize if error handling skipped some characters */
1437 if (s - start < PyString_GET_SIZE(repr))
1438 if (_PyString_Resize(&repr, s - start))
1439 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001440 return repr;
1441
1442 onError:
1443 Py_DECREF(repr);
1444 return NULL;
1445}
1446
1447PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1448{
1449 if (!PyUnicode_Check(unicode)) {
1450 PyErr_BadArgument();
1451 return NULL;
1452 }
1453 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1454 PyUnicode_GET_SIZE(unicode),
1455 NULL);
1456}
1457
1458/* --- 7-bit ASCII Codec -------------------------------------------------- */
1459
1460static
1461int ascii_decoding_error(const char **source,
1462 Py_UNICODE **dest,
1463 const char *errors,
1464 const char *details)
1465{
1466 if ((errors == NULL) ||
1467 (strcmp(errors,"strict") == 0)) {
1468 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001469 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001470 details);
1471 return -1;
1472 }
1473 else if (strcmp(errors,"ignore") == 0) {
1474 return 0;
1475 }
1476 else if (strcmp(errors,"replace") == 0) {
1477 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1478 (*dest)++;
1479 return 0;
1480 }
1481 else {
1482 PyErr_Format(PyExc_ValueError,
1483 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001484 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001485 errors);
1486 return -1;
1487 }
1488}
1489
1490PyObject *PyUnicode_DecodeASCII(const char *s,
1491 int size,
1492 const char *errors)
1493{
1494 PyUnicodeObject *v;
1495 Py_UNICODE *p;
1496
1497 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1498 v = _PyUnicode_New(size);
1499 if (v == NULL)
1500 goto onError;
1501 if (size == 0)
1502 return (PyObject *)v;
1503 p = PyUnicode_AS_UNICODE(v);
1504 while (size-- > 0) {
1505 register unsigned char c;
1506
1507 c = (unsigned char)*s++;
1508 if (c < 128)
1509 *p++ = c;
1510 else if (ascii_decoding_error(&s, &p, errors,
1511 "ordinal not in range(128)"))
1512 goto onError;
1513 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001514 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1515 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1516 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001517 return (PyObject *)v;
1518
1519 onError:
1520 Py_XDECREF(v);
1521 return NULL;
1522}
1523
1524static
1525int ascii_encoding_error(const Py_UNICODE **source,
1526 char **dest,
1527 const char *errors,
1528 const char *details)
1529{
1530 if ((errors == NULL) ||
1531 (strcmp(errors,"strict") == 0)) {
1532 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001533 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001534 details);
1535 return -1;
1536 }
1537 else if (strcmp(errors,"ignore") == 0) {
1538 return 0;
1539 }
1540 else if (strcmp(errors,"replace") == 0) {
1541 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001542 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001543 return 0;
1544 }
1545 else {
1546 PyErr_Format(PyExc_ValueError,
1547 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001548 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549 errors);
1550 return -1;
1551 }
1552}
1553
1554PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1555 int size,
1556 const char *errors)
1557{
1558 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001559 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001560 repr = PyString_FromStringAndSize(NULL, size);
1561 if (repr == NULL)
1562 return NULL;
1563
1564 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001565 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001566 while (size-- > 0) {
1567 Py_UNICODE ch = *p++;
1568 if (ch >= 128) {
1569 if (ascii_encoding_error(&p, &s, errors,
1570 "ordinal not in range(128)"))
1571 goto onError;
1572 }
1573 else
1574 *s++ = (char)ch;
1575 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001576 /* Resize if error handling skipped some characters */
1577 if (s - start < PyString_GET_SIZE(repr))
1578 if (_PyString_Resize(&repr, s - start))
1579 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001580 return repr;
1581
1582 onError:
1583 Py_DECREF(repr);
1584 return NULL;
1585}
1586
1587PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1588{
1589 if (!PyUnicode_Check(unicode)) {
1590 PyErr_BadArgument();
1591 return NULL;
1592 }
1593 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1594 PyUnicode_GET_SIZE(unicode),
1595 NULL);
1596}
1597
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001598#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001599
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001600/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001601
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001602PyObject *PyUnicode_DecodeMBCS(const char *s,
1603 int size,
1604 const char *errors)
1605{
1606 PyUnicodeObject *v;
1607 Py_UNICODE *p;
1608
1609 /* First get the size of the result */
1610 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001611 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001612 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1613
1614 v = _PyUnicode_New(usize);
1615 if (v == NULL)
1616 return NULL;
1617 if (usize == 0)
1618 return (PyObject *)v;
1619 p = PyUnicode_AS_UNICODE(v);
1620 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1621 Py_DECREF(v);
1622 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1623 }
1624
1625 return (PyObject *)v;
1626}
1627
1628PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1629 int size,
1630 const char *errors)
1631{
1632 PyObject *repr;
1633 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001634 DWORD mbcssize;
1635
1636 /* If there are no characters, bail now! */
1637 if (size==0)
1638 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001639
1640 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001641 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001642 if (mbcssize==0)
1643 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1644
1645 repr = PyString_FromStringAndSize(NULL, mbcssize);
1646 if (repr == NULL)
1647 return NULL;
1648 if (mbcssize==0)
1649 return repr;
1650
1651 /* Do the conversion */
1652 s = PyString_AS_STRING(repr);
1653 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1654 Py_DECREF(repr);
1655 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1656 }
1657 return repr;
1658}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001659
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001660#endif /* MS_WIN32 */
1661
Guido van Rossumd57fd912000-03-10 22:53:23 +00001662/* --- Character Mapping Codec -------------------------------------------- */
1663
1664static
1665int charmap_decoding_error(const char **source,
1666 Py_UNICODE **dest,
1667 const char *errors,
1668 const char *details)
1669{
1670 if ((errors == NULL) ||
1671 (strcmp(errors,"strict") == 0)) {
1672 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001673 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674 details);
1675 return -1;
1676 }
1677 else if (strcmp(errors,"ignore") == 0) {
1678 return 0;
1679 }
1680 else if (strcmp(errors,"replace") == 0) {
1681 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1682 (*dest)++;
1683 return 0;
1684 }
1685 else {
1686 PyErr_Format(PyExc_ValueError,
1687 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001688 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001689 errors);
1690 return -1;
1691 }
1692}
1693
1694PyObject *PyUnicode_DecodeCharmap(const char *s,
1695 int size,
1696 PyObject *mapping,
1697 const char *errors)
1698{
1699 PyUnicodeObject *v;
1700 Py_UNICODE *p;
1701
1702 /* Default to Latin-1 */
1703 if (mapping == NULL)
1704 return PyUnicode_DecodeLatin1(s, size, errors);
1705
1706 v = _PyUnicode_New(size);
1707 if (v == NULL)
1708 goto onError;
1709 if (size == 0)
1710 return (PyObject *)v;
1711 p = PyUnicode_AS_UNICODE(v);
1712 while (size-- > 0) {
1713 unsigned char ch = *s++;
1714 PyObject *w, *x;
1715
1716 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1717 w = PyInt_FromLong((long)ch);
1718 if (w == NULL)
1719 goto onError;
1720 x = PyObject_GetItem(mapping, w);
1721 Py_DECREF(w);
1722 if (x == NULL) {
1723 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1724 /* No mapping found: default to Latin-1 mapping */
1725 PyErr_Clear();
1726 *p++ = (Py_UNICODE)ch;
1727 continue;
1728 }
1729 goto onError;
1730 }
1731
1732 /* Apply mapping */
1733 if (PyInt_Check(x)) {
1734 int value = PyInt_AS_LONG(x);
1735 if (value < 0 || value > 65535) {
1736 PyErr_SetString(PyExc_TypeError,
1737 "character mapping must be in range(65336)");
1738 Py_DECREF(x);
1739 goto onError;
1740 }
1741 *p++ = (Py_UNICODE)value;
1742 }
1743 else if (x == Py_None) {
1744 /* undefined mapping */
1745 if (charmap_decoding_error(&s, &p, errors,
1746 "character maps to <undefined>")) {
1747 Py_DECREF(x);
1748 goto onError;
1749 }
1750 }
1751 else if (PyUnicode_Check(x)) {
1752 if (PyUnicode_GET_SIZE(x) != 1) {
1753 /* 1-n mapping */
1754 PyErr_SetString(PyExc_NotImplementedError,
1755 "1-n mappings are currently not implemented");
1756 Py_DECREF(x);
1757 goto onError;
1758 }
1759 *p++ = *PyUnicode_AS_UNICODE(x);
1760 }
1761 else {
1762 /* wrong return value */
1763 PyErr_SetString(PyExc_TypeError,
1764 "character mapping must return integer, None or unicode");
1765 Py_DECREF(x);
1766 goto onError;
1767 }
1768 Py_DECREF(x);
1769 }
1770 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1771 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1772 goto onError;
1773 return (PyObject *)v;
1774
1775 onError:
1776 Py_XDECREF(v);
1777 return NULL;
1778}
1779
1780static
1781int charmap_encoding_error(const Py_UNICODE **source,
1782 char **dest,
1783 const char *errors,
1784 const char *details)
1785{
1786 if ((errors == NULL) ||
1787 (strcmp(errors,"strict") == 0)) {
1788 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001789 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790 details);
1791 return -1;
1792 }
1793 else if (strcmp(errors,"ignore") == 0) {
1794 return 0;
1795 }
1796 else if (strcmp(errors,"replace") == 0) {
1797 **dest = '?';
1798 (*dest)++;
1799 return 0;
1800 }
1801 else {
1802 PyErr_Format(PyExc_ValueError,
1803 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001804 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805 errors);
1806 return -1;
1807 }
1808}
1809
1810PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
1811 int size,
1812 PyObject *mapping,
1813 const char *errors)
1814{
1815 PyObject *v;
1816 char *s;
1817
1818 /* Default to Latin-1 */
1819 if (mapping == NULL)
1820 return PyUnicode_EncodeLatin1(p, size, errors);
1821
1822 v = PyString_FromStringAndSize(NULL, size);
1823 if (v == NULL)
1824 return NULL;
1825 s = PyString_AS_STRING(v);
1826 while (size-- > 0) {
1827 Py_UNICODE ch = *p++;
1828 PyObject *w, *x;
1829
1830 /* Get mapping (Unicode ordinal -> string char, integer or None) */
1831 w = PyInt_FromLong((long)ch);
1832 if (w == NULL)
1833 goto onError;
1834 x = PyObject_GetItem(mapping, w);
1835 Py_DECREF(w);
1836 if (x == NULL) {
1837 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1838 /* No mapping found: default to Latin-1 mapping if possible */
1839 PyErr_Clear();
1840 if (ch < 256) {
1841 *s++ = (char)ch;
1842 continue;
1843 }
1844 else if (!charmap_encoding_error(&p, &s, errors,
1845 "missing character mapping"))
1846 continue;
1847 }
1848 goto onError;
1849 }
1850
1851 /* Apply mapping */
1852 if (PyInt_Check(x)) {
1853 int value = PyInt_AS_LONG(x);
1854 if (value < 0 || value > 255) {
1855 PyErr_SetString(PyExc_TypeError,
1856 "character mapping must be in range(256)");
1857 Py_DECREF(x);
1858 goto onError;
1859 }
1860 *s++ = (char)value;
1861 }
1862 else if (x == Py_None) {
1863 /* undefined mapping */
1864 if (charmap_encoding_error(&p, &s, errors,
1865 "character maps to <undefined>")) {
1866 Py_DECREF(x);
1867 goto onError;
1868 }
1869 }
1870 else if (PyString_Check(x)) {
1871 if (PyString_GET_SIZE(x) != 1) {
1872 /* 1-n mapping */
1873 PyErr_SetString(PyExc_NotImplementedError,
1874 "1-n mappings are currently not implemented");
1875 Py_DECREF(x);
1876 goto onError;
1877 }
1878 *s++ = *PyString_AS_STRING(x);
1879 }
1880 else {
1881 /* wrong return value */
1882 PyErr_SetString(PyExc_TypeError,
1883 "character mapping must return integer, None or unicode");
1884 Py_DECREF(x);
1885 goto onError;
1886 }
1887 Py_DECREF(x);
1888 }
1889 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
1890 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
1891 goto onError;
1892 return v;
1893
1894 onError:
1895 Py_DECREF(v);
1896 return NULL;
1897}
1898
1899PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
1900 PyObject *mapping)
1901{
1902 if (!PyUnicode_Check(unicode) || mapping == NULL) {
1903 PyErr_BadArgument();
1904 return NULL;
1905 }
1906 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
1907 PyUnicode_GET_SIZE(unicode),
1908 mapping,
1909 NULL);
1910}
1911
1912static
1913int translate_error(const Py_UNICODE **source,
1914 Py_UNICODE **dest,
1915 const char *errors,
1916 const char *details)
1917{
1918 if ((errors == NULL) ||
1919 (strcmp(errors,"strict") == 0)) {
1920 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001921 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001922 details);
1923 return -1;
1924 }
1925 else if (strcmp(errors,"ignore") == 0) {
1926 return 0;
1927 }
1928 else if (strcmp(errors,"replace") == 0) {
1929 **dest = '?';
1930 (*dest)++;
1931 return 0;
1932 }
1933 else {
1934 PyErr_Format(PyExc_ValueError,
1935 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001936 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 errors);
1938 return -1;
1939 }
1940}
1941
1942PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
1943 int size,
1944 PyObject *mapping,
1945 const char *errors)
1946{
1947 PyUnicodeObject *v;
1948 Py_UNICODE *p;
1949
1950 if (mapping == NULL) {
1951 PyErr_BadArgument();
1952 return NULL;
1953 }
1954
1955 /* Output will never be longer than input */
1956 v = _PyUnicode_New(size);
1957 if (v == NULL)
1958 goto onError;
1959 if (size == 0)
1960 goto done;
1961 p = PyUnicode_AS_UNICODE(v);
1962 while (size-- > 0) {
1963 Py_UNICODE ch = *s++;
1964 PyObject *w, *x;
1965
1966 /* Get mapping */
1967 w = PyInt_FromLong(ch);
1968 if (w == NULL)
1969 goto onError;
1970 x = PyObject_GetItem(mapping, w);
1971 Py_DECREF(w);
1972 if (x == NULL) {
1973 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1974 /* No mapping found: default to 1-1 mapping */
1975 PyErr_Clear();
1976 *p++ = ch;
1977 continue;
1978 }
1979 goto onError;
1980 }
1981
1982 /* Apply mapping */
1983 if (PyInt_Check(x))
1984 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
1985 else if (x == Py_None) {
1986 /* undefined mapping */
1987 if (translate_error(&s, &p, errors,
1988 "character maps to <undefined>")) {
1989 Py_DECREF(x);
1990 goto onError;
1991 }
1992 }
1993 else if (PyUnicode_Check(x)) {
1994 if (PyUnicode_GET_SIZE(x) != 1) {
1995 /* 1-n mapping */
1996 PyErr_SetString(PyExc_NotImplementedError,
1997 "1-n mappings are currently not implemented");
1998 Py_DECREF(x);
1999 goto onError;
2000 }
2001 *p++ = *PyUnicode_AS_UNICODE(x);
2002 }
2003 else {
2004 /* wrong return value */
2005 PyErr_SetString(PyExc_TypeError,
2006 "translate mapping must return integer, None or unicode");
2007 Py_DECREF(x);
2008 goto onError;
2009 }
2010 Py_DECREF(x);
2011 }
2012 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002013 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2014 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002015
2016 done:
2017 return (PyObject *)v;
2018
2019 onError:
2020 Py_XDECREF(v);
2021 return NULL;
2022}
2023
2024PyObject *PyUnicode_Translate(PyObject *str,
2025 PyObject *mapping,
2026 const char *errors)
2027{
2028 PyObject *result;
2029
2030 str = PyUnicode_FromObject(str);
2031 if (str == NULL)
2032 goto onError;
2033 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2034 PyUnicode_GET_SIZE(str),
2035 mapping,
2036 errors);
2037 Py_DECREF(str);
2038 return result;
2039
2040 onError:
2041 Py_XDECREF(str);
2042 return NULL;
2043}
2044
Guido van Rossum9e896b32000-04-05 20:11:21 +00002045/* --- Decimal Encoder ---------------------------------------------------- */
2046
2047int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2048 int length,
2049 char *output,
2050 const char *errors)
2051{
2052 Py_UNICODE *p, *end;
2053
2054 if (output == NULL) {
2055 PyErr_BadArgument();
2056 return -1;
2057 }
2058
2059 p = s;
2060 end = s + length;
2061 while (p < end) {
2062 register Py_UNICODE ch = *p++;
2063 int decimal;
2064
2065 if (Py_UNICODE_ISSPACE(ch)) {
2066 *output++ = ' ';
2067 continue;
2068 }
2069 decimal = Py_UNICODE_TODECIMAL(ch);
2070 if (decimal >= 0) {
2071 *output++ = '0' + decimal;
2072 continue;
2073 }
Guido van Rossumba477042000-04-06 18:18:10 +00002074 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002075 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002076 continue;
2077 }
2078 /* All other characters are considered invalid */
2079 if (errors == NULL || strcmp(errors, "strict") == 0) {
2080 PyErr_SetString(PyExc_ValueError,
2081 "invalid decimal Unicode string");
2082 goto onError;
2083 }
2084 else if (strcmp(errors, "ignore") == 0)
2085 continue;
2086 else if (strcmp(errors, "replace") == 0) {
2087 *output++ = '?';
2088 continue;
2089 }
2090 }
2091 /* 0-terminate the output string */
2092 *output++ = '\0';
2093 return 0;
2094
2095 onError:
2096 return -1;
2097}
2098
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099/* --- Helpers ------------------------------------------------------------ */
2100
2101static
2102int count(PyUnicodeObject *self,
2103 int start,
2104 int end,
2105 PyUnicodeObject *substring)
2106{
2107 int count = 0;
2108
2109 end -= substring->length;
2110
2111 while (start <= end)
2112 if (Py_UNICODE_MATCH(self, start, substring)) {
2113 count++;
2114 start += substring->length;
2115 } else
2116 start++;
2117
2118 return count;
2119}
2120
2121int PyUnicode_Count(PyObject *str,
2122 PyObject *substr,
2123 int start,
2124 int end)
2125{
2126 int result;
2127
2128 str = PyUnicode_FromObject(str);
2129 if (str == NULL)
2130 return -1;
2131 substr = PyUnicode_FromObject(substr);
2132 if (substr == NULL) {
2133 Py_DECREF(substr);
2134 return -1;
2135 }
2136
2137 result = count((PyUnicodeObject *)str,
2138 start, end,
2139 (PyUnicodeObject *)substr);
2140
2141 Py_DECREF(str);
2142 Py_DECREF(substr);
2143 return result;
2144}
2145
2146static
2147int findstring(PyUnicodeObject *self,
2148 PyUnicodeObject *substring,
2149 int start,
2150 int end,
2151 int direction)
2152{
2153 if (start < 0)
2154 start += self->length;
2155 if (start < 0)
2156 start = 0;
2157
2158 if (substring->length == 0)
2159 return start;
2160
2161 if (end > self->length)
2162 end = self->length;
2163 if (end < 0)
2164 end += self->length;
2165 if (end < 0)
2166 end = 0;
2167
2168 end -= substring->length;
2169
2170 if (direction < 0) {
2171 for (; end >= start; end--)
2172 if (Py_UNICODE_MATCH(self, end, substring))
2173 return end;
2174 } else {
2175 for (; start <= end; start++)
2176 if (Py_UNICODE_MATCH(self, start, substring))
2177 return start;
2178 }
2179
2180 return -1;
2181}
2182
2183int PyUnicode_Find(PyObject *str,
2184 PyObject *substr,
2185 int start,
2186 int end,
2187 int direction)
2188{
2189 int result;
2190
2191 str = PyUnicode_FromObject(str);
2192 if (str == NULL)
2193 return -1;
2194 substr = PyUnicode_FromObject(substr);
2195 if (substr == NULL) {
2196 Py_DECREF(substr);
2197 return -1;
2198 }
2199
2200 result = findstring((PyUnicodeObject *)str,
2201 (PyUnicodeObject *)substr,
2202 start, end, direction);
2203 Py_DECREF(str);
2204 Py_DECREF(substr);
2205 return result;
2206}
2207
2208static
2209int tailmatch(PyUnicodeObject *self,
2210 PyUnicodeObject *substring,
2211 int start,
2212 int end,
2213 int direction)
2214{
2215 if (start < 0)
2216 start += self->length;
2217 if (start < 0)
2218 start = 0;
2219
2220 if (substring->length == 0)
2221 return 1;
2222
2223 if (end > self->length)
2224 end = self->length;
2225 if (end < 0)
2226 end += self->length;
2227 if (end < 0)
2228 end = 0;
2229
2230 end -= substring->length;
2231 if (end < start)
2232 return 0;
2233
2234 if (direction > 0) {
2235 if (Py_UNICODE_MATCH(self, end, substring))
2236 return 1;
2237 } else {
2238 if (Py_UNICODE_MATCH(self, start, substring))
2239 return 1;
2240 }
2241
2242 return 0;
2243}
2244
2245int PyUnicode_Tailmatch(PyObject *str,
2246 PyObject *substr,
2247 int start,
2248 int end,
2249 int direction)
2250{
2251 int result;
2252
2253 str = PyUnicode_FromObject(str);
2254 if (str == NULL)
2255 return -1;
2256 substr = PyUnicode_FromObject(substr);
2257 if (substr == NULL) {
2258 Py_DECREF(substr);
2259 return -1;
2260 }
2261
2262 result = tailmatch((PyUnicodeObject *)str,
2263 (PyUnicodeObject *)substr,
2264 start, end, direction);
2265 Py_DECREF(str);
2266 Py_DECREF(substr);
2267 return result;
2268}
2269
2270static
2271const Py_UNICODE *findchar(const Py_UNICODE *s,
2272 int size,
2273 Py_UNICODE ch)
2274{
2275 /* like wcschr, but doesn't stop at NULL characters */
2276
2277 while (size-- > 0) {
2278 if (*s == ch)
2279 return s;
2280 s++;
2281 }
2282
2283 return NULL;
2284}
2285
2286/* Apply fixfct filter to the Unicode object self and return a
2287 reference to the modified object */
2288
2289static
2290PyObject *fixup(PyUnicodeObject *self,
2291 int (*fixfct)(PyUnicodeObject *s))
2292{
2293
2294 PyUnicodeObject *u;
2295
2296 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2297 self->length);
2298 if (u == NULL)
2299 return NULL;
2300 if (!fixfct(u)) {
2301 /* fixfct should return TRUE if it modified the buffer. If
2302 FALSE, return a reference to the original buffer instead
2303 (to save space, not time) */
2304 Py_INCREF(self);
2305 Py_DECREF(u);
2306 return (PyObject*) self;
2307 }
2308 return (PyObject*) u;
2309}
2310
2311static
2312int fixupper(PyUnicodeObject *self)
2313{
2314 int len = self->length;
2315 Py_UNICODE *s = self->str;
2316 int status = 0;
2317
2318 while (len-- > 0) {
2319 register Py_UNICODE ch;
2320
2321 ch = Py_UNICODE_TOUPPER(*s);
2322 if (ch != *s) {
2323 status = 1;
2324 *s = ch;
2325 }
2326 s++;
2327 }
2328
2329 return status;
2330}
2331
2332static
2333int fixlower(PyUnicodeObject *self)
2334{
2335 int len = self->length;
2336 Py_UNICODE *s = self->str;
2337 int status = 0;
2338
2339 while (len-- > 0) {
2340 register Py_UNICODE ch;
2341
2342 ch = Py_UNICODE_TOLOWER(*s);
2343 if (ch != *s) {
2344 status = 1;
2345 *s = ch;
2346 }
2347 s++;
2348 }
2349
2350 return status;
2351}
2352
2353static
2354int fixswapcase(PyUnicodeObject *self)
2355{
2356 int len = self->length;
2357 Py_UNICODE *s = self->str;
2358 int status = 0;
2359
2360 while (len-- > 0) {
2361 if (Py_UNICODE_ISUPPER(*s)) {
2362 *s = Py_UNICODE_TOLOWER(*s);
2363 status = 1;
2364 } else if (Py_UNICODE_ISLOWER(*s)) {
2365 *s = Py_UNICODE_TOUPPER(*s);
2366 status = 1;
2367 }
2368 s++;
2369 }
2370
2371 return status;
2372}
2373
2374static
2375int fixcapitalize(PyUnicodeObject *self)
2376{
2377 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2378 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2379 return 1;
2380 }
2381 return 0;
2382}
2383
2384static
2385int fixtitle(PyUnicodeObject *self)
2386{
2387 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2388 register Py_UNICODE *e;
2389 int previous_is_cased;
2390
2391 /* Shortcut for single character strings */
2392 if (PyUnicode_GET_SIZE(self) == 1) {
2393 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2394 if (*p != ch) {
2395 *p = ch;
2396 return 1;
2397 }
2398 else
2399 return 0;
2400 }
2401
2402 e = p + PyUnicode_GET_SIZE(self);
2403 previous_is_cased = 0;
2404 for (; p < e; p++) {
2405 register const Py_UNICODE ch = *p;
2406
2407 if (previous_is_cased)
2408 *p = Py_UNICODE_TOLOWER(ch);
2409 else
2410 *p = Py_UNICODE_TOTITLE(ch);
2411
2412 if (Py_UNICODE_ISLOWER(ch) ||
2413 Py_UNICODE_ISUPPER(ch) ||
2414 Py_UNICODE_ISTITLE(ch))
2415 previous_is_cased = 1;
2416 else
2417 previous_is_cased = 0;
2418 }
2419 return 1;
2420}
2421
2422PyObject *PyUnicode_Join(PyObject *separator,
2423 PyObject *seq)
2424{
2425 Py_UNICODE *sep;
2426 int seplen;
2427 PyUnicodeObject *res = NULL;
2428 int reslen = 0;
2429 Py_UNICODE *p;
2430 int seqlen = 0;
2431 int sz = 100;
2432 int i;
2433
2434 seqlen = PySequence_Length(seq);
2435 if (seqlen < 0 && PyErr_Occurred())
2436 return NULL;
2437
2438 if (separator == NULL) {
2439 Py_UNICODE blank = ' ';
2440 sep = &blank;
2441 seplen = 1;
2442 }
2443 else {
2444 separator = PyUnicode_FromObject(separator);
2445 if (separator == NULL)
2446 return NULL;
2447 sep = PyUnicode_AS_UNICODE(separator);
2448 seplen = PyUnicode_GET_SIZE(separator);
2449 }
2450
2451 res = _PyUnicode_New(sz);
2452 if (res == NULL)
2453 goto onError;
2454 p = PyUnicode_AS_UNICODE(res);
2455 reslen = 0;
2456
2457 for (i = 0; i < seqlen; i++) {
2458 int itemlen;
2459 PyObject *item;
2460
2461 item = PySequence_GetItem(seq, i);
2462 if (item == NULL)
2463 goto onError;
2464 if (!PyUnicode_Check(item)) {
2465 PyObject *v;
2466 v = PyUnicode_FromObject(item);
2467 Py_DECREF(item);
2468 item = v;
2469 if (item == NULL)
2470 goto onError;
2471 }
2472 itemlen = PyUnicode_GET_SIZE(item);
2473 while (reslen + itemlen + seplen >= sz) {
2474 if (_PyUnicode_Resize(res, sz*2))
2475 goto onError;
2476 sz *= 2;
2477 p = PyUnicode_AS_UNICODE(res) + reslen;
2478 }
2479 if (i > 0) {
2480 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2481 p += seplen;
2482 reslen += seplen;
2483 }
2484 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2485 p += itemlen;
2486 reslen += itemlen;
2487 Py_DECREF(item);
2488 }
2489 if (_PyUnicode_Resize(res, reslen))
2490 goto onError;
2491
2492 Py_XDECREF(separator);
2493 return (PyObject *)res;
2494
2495 onError:
2496 Py_XDECREF(separator);
2497 Py_DECREF(res);
2498 return NULL;
2499}
2500
2501static
2502PyUnicodeObject *pad(PyUnicodeObject *self,
2503 int left,
2504 int right,
2505 Py_UNICODE fill)
2506{
2507 PyUnicodeObject *u;
2508
2509 if (left < 0)
2510 left = 0;
2511 if (right < 0)
2512 right = 0;
2513
2514 if (left == 0 && right == 0) {
2515 Py_INCREF(self);
2516 return self;
2517 }
2518
2519 u = _PyUnicode_New(left + self->length + right);
2520 if (u) {
2521 if (left)
2522 Py_UNICODE_FILL(u->str, fill, left);
2523 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2524 if (right)
2525 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2526 }
2527
2528 return u;
2529}
2530
/* Append the slice data[left:right] as a new Unicode object to list.

   Expects the following names in the enclosing scope: a PyObject *str
   scratch variable, the PyObject *list to append to and an onError
   label which is jumped to on failure.  Wrapped in do/while(0) so that
   it behaves like a single statement; use it with a trailing
   semicolon. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2541
2542static
2543PyObject *split_whitespace(PyUnicodeObject *self,
2544 PyObject *list,
2545 int maxcount)
2546{
2547 register int i;
2548 register int j;
2549 int len = self->length;
2550 PyObject *str;
2551
2552 for (i = j = 0; i < len; ) {
2553 /* find a token */
2554 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2555 i++;
2556 j = i;
2557 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2558 i++;
2559 if (j < i) {
2560 if (maxcount-- <= 0)
2561 break;
2562 SPLIT_APPEND(self->str, j, i);
2563 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2564 i++;
2565 j = i;
2566 }
2567 }
2568 if (j < len) {
2569 SPLIT_APPEND(self->str, j, len);
2570 }
2571 return list;
2572
2573 onError:
2574 Py_DECREF(list);
2575 return NULL;
2576}
2577
2578PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002579 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002580{
2581 register int i;
2582 register int j;
2583 int len;
2584 PyObject *list;
2585 PyObject *str;
2586 Py_UNICODE *data;
2587
2588 string = PyUnicode_FromObject(string);
2589 if (string == NULL)
2590 return NULL;
2591 data = PyUnicode_AS_UNICODE(string);
2592 len = PyUnicode_GET_SIZE(string);
2593
Guido van Rossumd57fd912000-03-10 22:53:23 +00002594 list = PyList_New(0);
2595 if (!list)
2596 goto onError;
2597
2598 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002599 int eol;
2600
Guido van Rossumd57fd912000-03-10 22:53:23 +00002601 /* Find a line and append it */
2602 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2603 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604
2605 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002606 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002607 if (i < len) {
2608 if (data[i] == '\r' && i + 1 < len &&
2609 data[i+1] == '\n')
2610 i += 2;
2611 else
2612 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002613 if (keepends)
2614 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002615 }
Guido van Rossum86662912000-04-11 15:38:46 +00002616 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002617 j = i;
2618 }
2619 if (j < len) {
2620 SPLIT_APPEND(data, j, len);
2621 }
2622
2623 Py_DECREF(string);
2624 return list;
2625
2626 onError:
2627 Py_DECREF(list);
2628 Py_DECREF(string);
2629 return NULL;
2630}
2631
2632static
2633PyObject *split_char(PyUnicodeObject *self,
2634 PyObject *list,
2635 Py_UNICODE ch,
2636 int maxcount)
2637{
2638 register int i;
2639 register int j;
2640 int len = self->length;
2641 PyObject *str;
2642
2643 for (i = j = 0; i < len; ) {
2644 if (self->str[i] == ch) {
2645 if (maxcount-- <= 0)
2646 break;
2647 SPLIT_APPEND(self->str, j, i);
2648 i = j = i + 1;
2649 } else
2650 i++;
2651 }
2652 if (j <= len) {
2653 SPLIT_APPEND(self->str, j, len);
2654 }
2655 return list;
2656
2657 onError:
2658 Py_DECREF(list);
2659 return NULL;
2660}
2661
2662static
2663PyObject *split_substring(PyUnicodeObject *self,
2664 PyObject *list,
2665 PyUnicodeObject *substring,
2666 int maxcount)
2667{
2668 register int i;
2669 register int j;
2670 int len = self->length;
2671 int sublen = substring->length;
2672 PyObject *str;
2673
2674 for (i = j = 0; i < len - sublen; ) {
2675 if (Py_UNICODE_MATCH(self, i, substring)) {
2676 if (maxcount-- <= 0)
2677 break;
2678 SPLIT_APPEND(self->str, j, i);
2679 i = j = i + sublen;
2680 } else
2681 i++;
2682 }
2683 if (j <= len) {
2684 SPLIT_APPEND(self->str, j, len);
2685 }
2686 return list;
2687
2688 onError:
2689 Py_DECREF(list);
2690 return NULL;
2691}
2692
2693#undef SPLIT_APPEND
2694
2695static
2696PyObject *split(PyUnicodeObject *self,
2697 PyUnicodeObject *substring,
2698 int maxcount)
2699{
2700 PyObject *list;
2701
2702 if (maxcount < 0)
2703 maxcount = INT_MAX;
2704
2705 list = PyList_New(0);
2706 if (!list)
2707 return NULL;
2708
2709 if (substring == NULL)
2710 return split_whitespace(self,list,maxcount);
2711
2712 else if (substring->length == 1)
2713 return split_char(self,list,substring->str[0],maxcount);
2714
2715 else if (substring->length == 0) {
2716 Py_DECREF(list);
2717 PyErr_SetString(PyExc_ValueError, "empty separator");
2718 return NULL;
2719 }
2720 else
2721 return split_substring(self,list,substring,maxcount);
2722}
2723
2724static
2725PyObject *strip(PyUnicodeObject *self,
2726 int left,
2727 int right)
2728{
2729 Py_UNICODE *p = self->str;
2730 int start = 0;
2731 int end = self->length;
2732
2733 if (left)
2734 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2735 start++;
2736
2737 if (right)
2738 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2739 end--;
2740
2741 if (start == 0 && end == self->length) {
2742 /* couldn't strip anything off, return original string */
2743 Py_INCREF(self);
2744 return (PyObject*) self;
2745 }
2746
2747 return (PyObject*) PyUnicode_FromUnicode(
2748 self->str + start,
2749 end - start
2750 );
2751}
2752
2753static
2754PyObject *replace(PyUnicodeObject *self,
2755 PyUnicodeObject *str1,
2756 PyUnicodeObject *str2,
2757 int maxcount)
2758{
2759 PyUnicodeObject *u;
2760
2761 if (maxcount < 0)
2762 maxcount = INT_MAX;
2763
2764 if (str1->length == 1 && str2->length == 1) {
2765 int i;
2766
2767 /* replace characters */
2768 if (!findchar(self->str, self->length, str1->str[0])) {
2769 /* nothing to replace, return original string */
2770 Py_INCREF(self);
2771 u = self;
2772 } else {
2773 Py_UNICODE u1 = str1->str[0];
2774 Py_UNICODE u2 = str2->str[0];
2775
2776 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
2777 self->str,
2778 self->length
2779 );
2780 if (u)
2781 for (i = 0; i < u->length; i++)
2782 if (u->str[i] == u1) {
2783 if (--maxcount < 0)
2784 break;
2785 u->str[i] = u2;
2786 }
2787 }
2788
2789 } else {
2790 int n, i;
2791 Py_UNICODE *p;
2792
2793 /* replace strings */
2794 n = count(self, 0, self->length, str1);
2795 if (n > maxcount)
2796 n = maxcount;
2797 if (n == 0) {
2798 /* nothing to replace, return original string */
2799 Py_INCREF(self);
2800 u = self;
2801 } else {
2802 u = _PyUnicode_New(
2803 self->length + n * (str2->length - str1->length));
2804 if (u) {
2805 i = 0;
2806 p = u->str;
2807 while (i <= self->length - str1->length)
2808 if (Py_UNICODE_MATCH(self, i, str1)) {
2809 /* replace string segment */
2810 Py_UNICODE_COPY(p, str2->str, str2->length);
2811 p += str2->length;
2812 i += str1->length;
2813 if (--n <= 0) {
2814 /* copy remaining part */
2815 Py_UNICODE_COPY(p, self->str+i, self->length-i);
2816 break;
2817 }
2818 } else
2819 *p++ = self->str[i++];
2820 }
2821 }
2822 }
2823
2824 return (PyObject *) u;
2825}
2826
2827/* --- Unicode Object Methods --------------------------------------------- */
2828
2829static char title__doc__[] =
2830"S.title() -> unicode\n\
2831\n\
2832Return a titlecased version of S, i.e. words start with title case\n\
2833characters, all remaining cased characters have lower case.";
2834
2835static PyObject*
2836unicode_title(PyUnicodeObject *self, PyObject *args)
2837{
2838 if (!PyArg_NoArgs(args))
2839 return NULL;
2840 return fixup(self, fixtitle);
2841}
2842
2843static char capitalize__doc__[] =
2844"S.capitalize() -> unicode\n\
2845\n\
2846Return a capitalized version of S, i.e. make the first character\n\
2847have upper case.";
2848
2849static PyObject*
2850unicode_capitalize(PyUnicodeObject *self, PyObject *args)
2851{
2852 if (!PyArg_NoArgs(args))
2853 return NULL;
2854 return fixup(self, fixcapitalize);
2855}
2856
#if 0
/* Currently compiled out. */
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method S.capwords(): split self at whitespace, capitalize every
   word and join the words again with single blanks. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result = NULL;
    int i;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *fixed;

        fixed = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, i),
                      fixcapitalize);
        if (fixed == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, i));
        PyList_SET_ITEM(words, i, fixed);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

done:
    Py_DECREF(words);
    return result;
}
#endif
2897
2898static char center__doc__[] =
2899"S.center(width) -> unicode\n\
2900\n\
2901Return S centered in a Unicode string of length width. Padding is done\n\
2902using spaces.";
2903
2904static PyObject *
2905unicode_center(PyUnicodeObject *self, PyObject *args)
2906{
2907 int marg, left;
2908 int width;
2909
2910 if (!PyArg_ParseTuple(args, "i:center", &width))
2911 return NULL;
2912
2913 if (self->length >= width) {
2914 Py_INCREF(self);
2915 return (PyObject*) self;
2916 }
2917
2918 marg = width - self->length;
2919 left = marg / 2 + (marg & width & 1);
2920
2921 return (PyObject*) pad(self, left, marg - left, ' ');
2922}
2923
2924static int
2925unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
2926{
2927 int len1, len2;
2928 Py_UNICODE *s1 = str1->str;
2929 Py_UNICODE *s2 = str2->str;
2930
2931 len1 = str1->length;
2932 len2 = str2->length;
2933
2934 while (len1 > 0 && len2 > 0) {
2935 int cmp = (*s1++) - (*s2++);
2936 if (cmp)
2937 /* This should make Christian happy! */
2938 return (cmp < 0) ? -1 : (cmp != 0);
2939 len1--, len2--;
2940 }
2941
2942 return (len1 < len2) ? -1 : (len1 != len2);
2943}
2944
2945int PyUnicode_Compare(PyObject *left,
2946 PyObject *right)
2947{
2948 PyUnicodeObject *u = NULL, *v = NULL;
2949 int result;
2950
2951 /* Coerce the two arguments */
2952 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2953 if (u == NULL)
2954 goto onError;
2955 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2956 if (v == NULL)
2957 goto onError;
2958
2959 /* Shortcut for emtpy or interned objects */
2960 if (v == u) {
2961 Py_DECREF(u);
2962 Py_DECREF(v);
2963 return 0;
2964 }
2965
2966 result = unicode_compare(u, v);
2967
2968 Py_DECREF(u);
2969 Py_DECREF(v);
2970 return result;
2971
2972onError:
2973 Py_XDECREF(u);
2974 Py_XDECREF(v);
2975 return -1;
2976}
2977
Guido van Rossum403d68b2000-03-13 15:55:09 +00002978int PyUnicode_Contains(PyObject *container,
2979 PyObject *element)
2980{
2981 PyUnicodeObject *u = NULL, *v = NULL;
2982 int result;
2983 register const Py_UNICODE *p, *e;
2984 register Py_UNICODE ch;
2985
2986 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00002987 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
2988 if (v == NULL)
2989 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002990 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
2991 if (u == NULL) {
2992 Py_DECREF(v);
2993 goto onError;
2994 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00002995
2996 /* Check v in u */
2997 if (PyUnicode_GET_SIZE(v) != 1) {
2998 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00002999 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003000 goto onError;
3001 }
3002 ch = *PyUnicode_AS_UNICODE(v);
3003 p = PyUnicode_AS_UNICODE(u);
3004 e = p + PyUnicode_GET_SIZE(u);
3005 result = 0;
3006 while (p < e) {
3007 if (*p++ == ch) {
3008 result = 1;
3009 break;
3010 }
3011 }
3012
3013 Py_DECREF(u);
3014 Py_DECREF(v);
3015 return result;
3016
3017onError:
3018 Py_XDECREF(u);
3019 Py_XDECREF(v);
3020 return -1;
3021}
3022
Guido van Rossumd57fd912000-03-10 22:53:23 +00003023/* Concat to string or Unicode object giving a new Unicode object. */
3024
3025PyObject *PyUnicode_Concat(PyObject *left,
3026 PyObject *right)
3027{
3028 PyUnicodeObject *u = NULL, *v = NULL, *w;
3029
3030 /* Coerce the two arguments */
3031 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3032 if (u == NULL)
3033 goto onError;
3034 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3035 if (v == NULL)
3036 goto onError;
3037
3038 /* Shortcuts */
3039 if (v == unicode_empty) {
3040 Py_DECREF(v);
3041 return (PyObject *)u;
3042 }
3043 if (u == unicode_empty) {
3044 Py_DECREF(u);
3045 return (PyObject *)v;
3046 }
3047
3048 /* Concat the two Unicode strings */
3049 w = _PyUnicode_New(u->length + v->length);
3050 if (w == NULL)
3051 goto onError;
3052 Py_UNICODE_COPY(w->str, u->str, u->length);
3053 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3054
3055 Py_DECREF(u);
3056 Py_DECREF(v);
3057 return (PyObject *)w;
3058
3059onError:
3060 Py_XDECREF(u);
3061 Py_XDECREF(v);
3062 return NULL;
3063}
3064
3065static char count__doc__[] =
3066"S.count(sub[, start[, end]]) -> int\n\
3067\n\
3068Return the number of occurrences of substring sub in Unicode string\n\
3069S[start:end]. Optional arguments start and end are\n\
3070interpreted as in slice notation.";
3071
3072static PyObject *
3073unicode_count(PyUnicodeObject *self, PyObject *args)
3074{
3075 PyUnicodeObject *substring;
3076 int start = 0;
3077 int end = INT_MAX;
3078 PyObject *result;
3079
Guido van Rossumb8872e62000-05-09 14:14:27 +00003080 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3081 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003082 return NULL;
3083
3084 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3085 (PyObject *)substring);
3086 if (substring == NULL)
3087 return NULL;
3088
3089 if (substring->length == 0) {
3090 Py_DECREF(substring);
3091 return PyInt_FromLong((long) 0);
3092 }
3093
3094 if (start < 0)
3095 start += self->length;
3096 if (start < 0)
3097 start = 0;
3098 if (end > self->length)
3099 end = self->length;
3100 if (end < 0)
3101 end += self->length;
3102 if (end < 0)
3103 end = 0;
3104
3105 result = PyInt_FromLong((long) count(self, start, end, substring));
3106
3107 Py_DECREF(substring);
3108 return result;
3109}
3110
3111static char encode__doc__[] =
3112"S.encode([encoding[,errors]]) -> string\n\
3113\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003114Return an encoded string version of S. Default encoding is the current\n\
3115default string encoding. errors may be given to set a different error\n\
3116handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3117a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003118
3119static PyObject *
3120unicode_encode(PyUnicodeObject *self, PyObject *args)
3121{
3122 char *encoding = NULL;
3123 char *errors = NULL;
3124 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3125 return NULL;
3126 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3127}
3128
3129static char expandtabs__doc__[] =
3130"S.expandtabs([tabsize]) -> unicode\n\
3131\n\
3132Return a copy of S where all tab characters are expanded using spaces.\n\
3133If tabsize is not given, a tab size of 8 characters is assumed.";
3134
3135static PyObject*
3136unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3137{
3138 Py_UNICODE *e;
3139 Py_UNICODE *p;
3140 Py_UNICODE *q;
3141 int i, j;
3142 PyUnicodeObject *u;
3143 int tabsize = 8;
3144
3145 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3146 return NULL;
3147
3148 /* First pass: determine size of ouput string */
3149 i = j = 0;
3150 e = self->str + self->length;
3151 for (p = self->str; p < e; p++)
3152 if (*p == '\t') {
3153 if (tabsize > 0)
3154 j += tabsize - (j % tabsize);
3155 }
3156 else {
3157 j++;
3158 if (*p == '\n' || *p == '\r') {
3159 i += j;
3160 j = 0;
3161 }
3162 }
3163
3164 /* Second pass: create output string and fill it */
3165 u = _PyUnicode_New(i + j);
3166 if (!u)
3167 return NULL;
3168
3169 j = 0;
3170 q = u->str;
3171
3172 for (p = self->str; p < e; p++)
3173 if (*p == '\t') {
3174 if (tabsize > 0) {
3175 i = tabsize - (j % tabsize);
3176 j += i;
3177 while (i--)
3178 *q++ = ' ';
3179 }
3180 }
3181 else {
3182 j++;
3183 *q++ = *p;
3184 if (*p == '\n' || *p == '\r')
3185 j = 0;
3186 }
3187
3188 return (PyObject*) u;
3189}
3190
3191static char find__doc__[] =
3192"S.find(sub [,start [,end]]) -> int\n\
3193\n\
3194Return the lowest index in S where substring sub is found,\n\
3195such that sub is contained within s[start,end]. Optional\n\
3196arguments start and end are interpreted as in slice notation.\n\
3197\n\
3198Return -1 on failure.";
3199
3200static PyObject *
3201unicode_find(PyUnicodeObject *self, PyObject *args)
3202{
3203 PyUnicodeObject *substring;
3204 int start = 0;
3205 int end = INT_MAX;
3206 PyObject *result;
3207
Guido van Rossumb8872e62000-05-09 14:14:27 +00003208 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3209 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003210 return NULL;
3211 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3212 (PyObject *)substring);
3213 if (substring == NULL)
3214 return NULL;
3215
3216 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3217
3218 Py_DECREF(substring);
3219 return result;
3220}
3221
3222static PyObject *
3223unicode_getitem(PyUnicodeObject *self, int index)
3224{
3225 if (index < 0 || index >= self->length) {
3226 PyErr_SetString(PyExc_IndexError, "string index out of range");
3227 return NULL;
3228 }
3229
3230 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3231}
3232
3233static long
3234unicode_hash(PyUnicodeObject *self)
3235{
3236 long hash;
3237 PyObject *utf8;
3238
3239 /* Since Unicode objects compare equal to their UTF-8 string
3240 counterparts, they should also use the UTF-8 strings as basis
3241 for their hash value. This is needed to assure that strings and
3242 Unicode objects behave in the same way as dictionary
3243 keys. Unfortunately, this costs some performance and also some
3244 memory if the cached UTF-8 representation is not used later
3245 on. */
3246 if (self->hash != -1)
3247 return self->hash;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00003248 utf8 = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003249 if (utf8 == NULL)
3250 return -1;
3251 hash = PyObject_Hash(utf8);
3252 if (hash == -1)
3253 return -1;
3254 self->hash = hash;
3255 return hash;
3256}
3257
3258static char index__doc__[] =
3259"S.index(sub [,start [,end]]) -> int\n\
3260\n\
3261Like S.find() but raise ValueError when the substring is not found.";
3262
3263static PyObject *
3264unicode_index(PyUnicodeObject *self, PyObject *args)
3265{
3266 int result;
3267 PyUnicodeObject *substring;
3268 int start = 0;
3269 int end = INT_MAX;
3270
Guido van Rossumb8872e62000-05-09 14:14:27 +00003271 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3272 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003273 return NULL;
3274
3275 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3276 (PyObject *)substring);
3277 if (substring == NULL)
3278 return NULL;
3279
3280 result = findstring(self, substring, start, end, 1);
3281
3282 Py_DECREF(substring);
3283 if (result < 0) {
3284 PyErr_SetString(PyExc_ValueError, "substring not found");
3285 return NULL;
3286 }
3287 return PyInt_FromLong(result);
3288}
3289
3290static char islower__doc__[] =
3291"S.islower() -> int\n\
3292\n\
3293Return 1 if all cased characters in S are lowercase and there is\n\
3294at least one cased character in S, 0 otherwise.";
3295
3296static PyObject*
3297unicode_islower(PyUnicodeObject *self, PyObject *args)
3298{
3299 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3300 register const Py_UNICODE *e;
3301 int cased;
3302
3303 if (!PyArg_NoArgs(args))
3304 return NULL;
3305
3306 /* Shortcut for single character strings */
3307 if (PyUnicode_GET_SIZE(self) == 1)
3308 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3309
3310 e = p + PyUnicode_GET_SIZE(self);
3311 cased = 0;
3312 for (; p < e; p++) {
3313 register const Py_UNICODE ch = *p;
3314
3315 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3316 return PyInt_FromLong(0);
3317 else if (!cased && Py_UNICODE_ISLOWER(ch))
3318 cased = 1;
3319 }
3320 return PyInt_FromLong(cased);
3321}
3322
3323static char isupper__doc__[] =
3324"S.isupper() -> int\n\
3325\n\
3326Return 1 if all cased characters in S are uppercase and there is\n\
3327at least one cased character in S, 0 otherwise.";
3328
3329static PyObject*
3330unicode_isupper(PyUnicodeObject *self, PyObject *args)
3331{
3332 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3333 register const Py_UNICODE *e;
3334 int cased;
3335
3336 if (!PyArg_NoArgs(args))
3337 return NULL;
3338
3339 /* Shortcut for single character strings */
3340 if (PyUnicode_GET_SIZE(self) == 1)
3341 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3342
3343 e = p + PyUnicode_GET_SIZE(self);
3344 cased = 0;
3345 for (; p < e; p++) {
3346 register const Py_UNICODE ch = *p;
3347
3348 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3349 return PyInt_FromLong(0);
3350 else if (!cased && Py_UNICODE_ISUPPER(ch))
3351 cased = 1;
3352 }
3353 return PyInt_FromLong(cased);
3354}
3355
3356static char istitle__doc__[] =
3357"S.istitle() -> int\n\
3358\n\
3359Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3360may only follow uncased characters and lowercase characters only cased\n\
3361ones. Return 0 otherwise.";
3362
3363static PyObject*
3364unicode_istitle(PyUnicodeObject *self, PyObject *args)
3365{
3366 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3367 register const Py_UNICODE *e;
3368 int cased, previous_is_cased;
3369
3370 if (!PyArg_NoArgs(args))
3371 return NULL;
3372
3373 /* Shortcut for single character strings */
3374 if (PyUnicode_GET_SIZE(self) == 1)
3375 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3376 (Py_UNICODE_ISUPPER(*p) != 0));
3377
3378 e = p + PyUnicode_GET_SIZE(self);
3379 cased = 0;
3380 previous_is_cased = 0;
3381 for (; p < e; p++) {
3382 register const Py_UNICODE ch = *p;
3383
3384 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3385 if (previous_is_cased)
3386 return PyInt_FromLong(0);
3387 previous_is_cased = 1;
3388 cased = 1;
3389 }
3390 else if (Py_UNICODE_ISLOWER(ch)) {
3391 if (!previous_is_cased)
3392 return PyInt_FromLong(0);
3393 previous_is_cased = 1;
3394 cased = 1;
3395 }
3396 else
3397 previous_is_cased = 0;
3398 }
3399 return PyInt_FromLong(cased);
3400}
3401
3402static char isspace__doc__[] =
3403"S.isspace() -> int\n\
3404\n\
3405Return 1 if there are only whitespace characters in S,\n\
34060 otherwise.";
3407
3408static PyObject*
3409unicode_isspace(PyUnicodeObject *self, PyObject *args)
3410{
3411 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3412 register const Py_UNICODE *e;
3413
3414 if (!PyArg_NoArgs(args))
3415 return NULL;
3416
3417 /* Shortcut for single character strings */
3418 if (PyUnicode_GET_SIZE(self) == 1 &&
3419 Py_UNICODE_ISSPACE(*p))
3420 return PyInt_FromLong(1);
3421
3422 e = p + PyUnicode_GET_SIZE(self);
3423 for (; p < e; p++) {
3424 if (!Py_UNICODE_ISSPACE(*p))
3425 return PyInt_FromLong(0);
3426 }
3427 return PyInt_FromLong(1);
3428}
3429
3430static char isdecimal__doc__[] =
3431"S.isdecimal() -> int\n\
3432\n\
3433Return 1 if there are only decimal characters in S,\n\
34340 otherwise.";
3435
3436static PyObject*
3437unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3438{
3439 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3440 register const Py_UNICODE *e;
3441
3442 if (!PyArg_NoArgs(args))
3443 return NULL;
3444
3445 /* Shortcut for single character strings */
3446 if (PyUnicode_GET_SIZE(self) == 1 &&
3447 Py_UNICODE_ISDECIMAL(*p))
3448 return PyInt_FromLong(1);
3449
3450 e = p + PyUnicode_GET_SIZE(self);
3451 for (; p < e; p++) {
3452 if (!Py_UNICODE_ISDECIMAL(*p))
3453 return PyInt_FromLong(0);
3454 }
3455 return PyInt_FromLong(1);
3456}
3457
3458static char isdigit__doc__[] =
3459"S.isdigit() -> int\n\
3460\n\
3461Return 1 if there are only digit characters in S,\n\
34620 otherwise.";
3463
3464static PyObject*
3465unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3466{
3467 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3468 register const Py_UNICODE *e;
3469
3470 if (!PyArg_NoArgs(args))
3471 return NULL;
3472
3473 /* Shortcut for single character strings */
3474 if (PyUnicode_GET_SIZE(self) == 1 &&
3475 Py_UNICODE_ISDIGIT(*p))
3476 return PyInt_FromLong(1);
3477
3478 e = p + PyUnicode_GET_SIZE(self);
3479 for (; p < e; p++) {
3480 if (!Py_UNICODE_ISDIGIT(*p))
3481 return PyInt_FromLong(0);
3482 }
3483 return PyInt_FromLong(1);
3484}
3485
3486static char isnumeric__doc__[] =
3487"S.isnumeric() -> int\n\
3488\n\
3489Return 1 if there are only numeric characters in S,\n\
34900 otherwise.";
3491
3492static PyObject*
3493unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3494{
3495 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3496 register const Py_UNICODE *e;
3497
3498 if (!PyArg_NoArgs(args))
3499 return NULL;
3500
3501 /* Shortcut for single character strings */
3502 if (PyUnicode_GET_SIZE(self) == 1 &&
3503 Py_UNICODE_ISNUMERIC(*p))
3504 return PyInt_FromLong(1);
3505
3506 e = p + PyUnicode_GET_SIZE(self);
3507 for (; p < e; p++) {
3508 if (!Py_UNICODE_ISNUMERIC(*p))
3509 return PyInt_FromLong(0);
3510 }
3511 return PyInt_FromLong(1);
3512}
3513
3514static char join__doc__[] =
3515"S.join(sequence) -> unicode\n\
3516\n\
3517Return a string which is the concatenation of the strings in the\n\
3518sequence. The separator between elements is S.";
3519
3520static PyObject*
3521unicode_join(PyUnicodeObject *self, PyObject *args)
3522{
3523 PyObject *data;
3524 if (!PyArg_ParseTuple(args, "O:join", &data))
3525 return NULL;
3526
3527 return PyUnicode_Join((PyObject *)self, data);
3528}
3529
3530static int
3531unicode_length(PyUnicodeObject *self)
3532{
3533 return self->length;
3534}
3535
3536static char ljust__doc__[] =
3537"S.ljust(width) -> unicode\n\
3538\n\
3539Return S left justified in a Unicode string of length width. Padding is\n\
3540done using spaces.";
3541
3542static PyObject *
3543unicode_ljust(PyUnicodeObject *self, PyObject *args)
3544{
3545 int width;
3546 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3547 return NULL;
3548
3549 if (self->length >= width) {
3550 Py_INCREF(self);
3551 return (PyObject*) self;
3552 }
3553
3554 return (PyObject*) pad(self, 0, width - self->length, ' ');
3555}
3556
3557static char lower__doc__[] =
3558"S.lower() -> unicode\n\
3559\n\
3560Return a copy of the string S converted to lowercase.";
3561
3562static PyObject*
3563unicode_lower(PyUnicodeObject *self, PyObject *args)
3564{
3565 if (!PyArg_NoArgs(args))
3566 return NULL;
3567 return fixup(self, fixlower);
3568}
3569
3570static char lstrip__doc__[] =
3571"S.lstrip() -> unicode\n\
3572\n\
3573Return a copy of the string S with leading whitespace removed.";
3574
3575static PyObject *
3576unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3577{
3578 if (!PyArg_NoArgs(args))
3579 return NULL;
3580 return strip(self, 1, 0);
3581}
3582
3583static PyObject*
3584unicode_repeat(PyUnicodeObject *str, int len)
3585{
3586 PyUnicodeObject *u;
3587 Py_UNICODE *p;
3588
3589 if (len < 0)
3590 len = 0;
3591
3592 if (len == 1) {
3593 /* no repeat, return original string */
3594 Py_INCREF(str);
3595 return (PyObject*) str;
3596 }
3597
3598 u = _PyUnicode_New(len * str->length);
3599 if (!u)
3600 return NULL;
3601
3602 p = u->str;
3603
3604 while (len-- > 0) {
3605 Py_UNICODE_COPY(p, str->str, str->length);
3606 p += str->length;
3607 }
3608
3609 return (PyObject*) u;
3610}
3611
3612PyObject *PyUnicode_Replace(PyObject *obj,
3613 PyObject *subobj,
3614 PyObject *replobj,
3615 int maxcount)
3616{
3617 PyObject *self;
3618 PyObject *str1;
3619 PyObject *str2;
3620 PyObject *result;
3621
3622 self = PyUnicode_FromObject(obj);
3623 if (self == NULL)
3624 return NULL;
3625 str1 = PyUnicode_FromObject(subobj);
3626 if (str1 == NULL) {
3627 Py_DECREF(self);
3628 return NULL;
3629 }
3630 str2 = PyUnicode_FromObject(replobj);
3631 if (str2 == NULL) {
3632 Py_DECREF(self);
3633 Py_DECREF(str1);
3634 return NULL;
3635 }
3636 result = replace((PyUnicodeObject *)self,
3637 (PyUnicodeObject *)str1,
3638 (PyUnicodeObject *)str2,
3639 maxcount);
3640 Py_DECREF(self);
3641 Py_DECREF(str1);
3642 Py_DECREF(str2);
3643 return result;
3644}
3645
3646static char replace__doc__[] =
3647"S.replace (old, new[, maxsplit]) -> unicode\n\
3648\n\
3649Return a copy of S with all occurrences of substring\n\
3650old replaced by new. If the optional argument maxsplit is\n\
3651given, only the first maxsplit occurrences are replaced.";
3652
3653static PyObject*
3654unicode_replace(PyUnicodeObject *self, PyObject *args)
3655{
3656 PyUnicodeObject *str1;
3657 PyUnicodeObject *str2;
3658 int maxcount = -1;
3659 PyObject *result;
3660
3661 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
3662 return NULL;
3663 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
3664 if (str1 == NULL)
3665 return NULL;
3666 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
3667 if (str2 == NULL)
3668 return NULL;
3669
3670 result = replace(self, str1, str2, maxcount);
3671
3672 Py_DECREF(str1);
3673 Py_DECREF(str2);
3674 return result;
3675}
3676
3677static
3678PyObject *unicode_repr(PyObject *unicode)
3679{
3680 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
3681 PyUnicode_GET_SIZE(unicode),
3682 1);
3683}
3684
3685static char rfind__doc__[] =
3686"S.rfind(sub [,start [,end]]) -> int\n\
3687\n\
3688Return the highest index in S where substring sub is found,\n\
3689such that sub is contained within s[start,end]. Optional\n\
3690arguments start and end are interpreted as in slice notation.\n\
3691\n\
3692Return -1 on failure.";
3693
3694static PyObject *
3695unicode_rfind(PyUnicodeObject *self, PyObject *args)
3696{
3697 PyUnicodeObject *substring;
3698 int start = 0;
3699 int end = INT_MAX;
3700 PyObject *result;
3701
Guido van Rossumb8872e62000-05-09 14:14:27 +00003702 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
3703 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003704 return NULL;
3705 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3706 (PyObject *)substring);
3707 if (substring == NULL)
3708 return NULL;
3709
3710 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
3711
3712 Py_DECREF(substring);
3713 return result;
3714}
3715
3716static char rindex__doc__[] =
3717"S.rindex(sub [,start [,end]]) -> int\n\
3718\n\
3719Like S.rfind() but raise ValueError when the substring is not found.";
3720
3721static PyObject *
3722unicode_rindex(PyUnicodeObject *self, PyObject *args)
3723{
3724 int result;
3725 PyUnicodeObject *substring;
3726 int start = 0;
3727 int end = INT_MAX;
3728
Guido van Rossumb8872e62000-05-09 14:14:27 +00003729 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
3730 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 return NULL;
3732 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3733 (PyObject *)substring);
3734 if (substring == NULL)
3735 return NULL;
3736
3737 result = findstring(self, substring, start, end, -1);
3738
3739 Py_DECREF(substring);
3740 if (result < 0) {
3741 PyErr_SetString(PyExc_ValueError, "substring not found");
3742 return NULL;
3743 }
3744 return PyInt_FromLong(result);
3745}
3746
3747static char rjust__doc__[] =
3748"S.rjust(width) -> unicode\n\
3749\n\
3750Return S right justified in a Unicode string of length width. Padding is\n\
3751done using spaces.";
3752
3753static PyObject *
3754unicode_rjust(PyUnicodeObject *self, PyObject *args)
3755{
3756 int width;
3757 if (!PyArg_ParseTuple(args, "i:rjust", &width))
3758 return NULL;
3759
3760 if (self->length >= width) {
3761 Py_INCREF(self);
3762 return (PyObject*) self;
3763 }
3764
3765 return (PyObject*) pad(self, width - self->length, 0, ' ');
3766}
3767
3768static char rstrip__doc__[] =
3769"S.rstrip() -> unicode\n\
3770\n\
3771Return a copy of the string S with trailing whitespace removed.";
3772
3773static PyObject *
3774unicode_rstrip(PyUnicodeObject *self, PyObject *args)
3775{
3776 if (!PyArg_NoArgs(args))
3777 return NULL;
3778 return strip(self, 0, 1);
3779}
3780
3781static PyObject*
3782unicode_slice(PyUnicodeObject *self, int start, int end)
3783{
3784 /* standard clamping */
3785 if (start < 0)
3786 start = 0;
3787 if (end < 0)
3788 end = 0;
3789 if (end > self->length)
3790 end = self->length;
3791 if (start == 0 && end == self->length) {
3792 /* full slice, return original string */
3793 Py_INCREF(self);
3794 return (PyObject*) self;
3795 }
3796 if (start > end)
3797 start = end;
3798 /* copy slice */
3799 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
3800 end - start);
3801}
3802
3803PyObject *PyUnicode_Split(PyObject *s,
3804 PyObject *sep,
3805 int maxsplit)
3806{
3807 PyObject *result;
3808
3809 s = PyUnicode_FromObject(s);
3810 if (s == NULL)
3811 return NULL;
3812 if (sep != NULL) {
3813 sep = PyUnicode_FromObject(sep);
3814 if (sep == NULL) {
3815 Py_DECREF(s);
3816 return NULL;
3817 }
3818 }
3819
3820 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
3821
3822 Py_DECREF(s);
3823 Py_XDECREF(sep);
3824 return result;
3825}
3826
3827static char split__doc__[] =
3828"S.split([sep [,maxsplit]]) -> list of strings\n\
3829\n\
3830Return a list of the words in S, using sep as the\n\
3831delimiter string. If maxsplit is given, at most maxsplit\n\
3832splits are done. If sep is not specified, any whitespace string\n\
3833is a separator.";
3834
3835static PyObject*
3836unicode_split(PyUnicodeObject *self, PyObject *args)
3837{
3838 PyObject *substring = Py_None;
3839 int maxcount = -1;
3840
3841 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
3842 return NULL;
3843
3844 if (substring == Py_None)
3845 return split(self, NULL, maxcount);
3846 else if (PyUnicode_Check(substring))
3847 return split(self, (PyUnicodeObject *)substring, maxcount);
3848 else
3849 return PyUnicode_Split((PyObject *)self, substring, maxcount);
3850}
3851
3852static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00003853"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00003854\n\
3855Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00003856Line breaks are not included in the resulting list unless keepends\n\
3857is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003858
3859static PyObject*
3860unicode_splitlines(PyUnicodeObject *self, PyObject *args)
3861{
Guido van Rossum86662912000-04-11 15:38:46 +00003862 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003863
Guido van Rossum86662912000-04-11 15:38:46 +00003864 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003865 return NULL;
3866
Guido van Rossum86662912000-04-11 15:38:46 +00003867 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003868}
3869
3870static
3871PyObject *unicode_str(PyUnicodeObject *self)
3872{
Fred Drakee4315f52000-05-09 19:53:39 +00003873 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003874}
3875
3876static char strip__doc__[] =
3877"S.strip() -> unicode\n\
3878\n\
3879Return a copy of S with leading and trailing whitespace removed.";
3880
3881static PyObject *
3882unicode_strip(PyUnicodeObject *self, PyObject *args)
3883{
3884 if (!PyArg_NoArgs(args))
3885 return NULL;
3886 return strip(self, 1, 1);
3887}
3888
3889static char swapcase__doc__[] =
3890"S.swapcase() -> unicode\n\
3891\n\
3892Return a copy of S with uppercase characters converted to lowercase\n\
3893and vice versa.";
3894
3895static PyObject*
3896unicode_swapcase(PyUnicodeObject *self, PyObject *args)
3897{
3898 if (!PyArg_NoArgs(args))
3899 return NULL;
3900 return fixup(self, fixswapcase);
3901}
3902
3903static char translate__doc__[] =
3904"S.translate(table) -> unicode\n\
3905\n\
3906Return a copy of the string S, where all characters have been mapped\n\
3907through the given translation table, which must be a mapping of\n\
3908Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
3909are left untouched. Characters mapped to None are deleted.";
3910
3911static PyObject*
3912unicode_translate(PyUnicodeObject *self, PyObject *args)
3913{
3914 PyObject *table;
3915
3916 if (!PyArg_ParseTuple(args, "O:translate", &table))
3917 return NULL;
3918 return PyUnicode_TranslateCharmap(self->str,
3919 self->length,
3920 table,
3921 "ignore");
3922}
3923
3924static char upper__doc__[] =
3925"S.upper() -> unicode\n\
3926\n\
3927Return a copy of S converted to uppercase.";
3928
3929static PyObject*
3930unicode_upper(PyUnicodeObject *self, PyObject *args)
3931{
3932 if (!PyArg_NoArgs(args))
3933 return NULL;
3934 return fixup(self, fixupper);
3935}
3936
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* S.zfill(width): left-pad S with '0' up to width characters.

   When S is already at least width long, S itself is returned with a
   new reference.  If the first character of S is '+' or '-', it is
   moved in front of the inserted zeros.  Returns NULL when argument
   parsing fails or pad() returns NULL.

   (Currently compiled out by the surrounding #if 0.) */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* Do not touch u->str below unless pad() actually produced a string. */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
3972
#if 0
/* Debugging helper: takes no arguments and returns the current value of
   unicode_freelist_size as an int object.  (Compiled out by #if 0.) */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);

    return NULL;
}
#endif
3982
3983static char startswith__doc__[] =
3984"S.startswith(prefix[, start[, end]]) -> int\n\
3985\n\
3986Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
3987optional start, test S beginning at that position. With optional end, stop\n\
3988comparing S at that position.";
3989
3990static PyObject *
3991unicode_startswith(PyUnicodeObject *self,
3992 PyObject *args)
3993{
3994 PyUnicodeObject *substring;
3995 int start = 0;
3996 int end = INT_MAX;
3997 PyObject *result;
3998
Guido van Rossumb8872e62000-05-09 14:14:27 +00003999 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4000 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004001 return NULL;
4002 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4003 (PyObject *)substring);
4004 if (substring == NULL)
4005 return NULL;
4006
4007 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4008
4009 Py_DECREF(substring);
4010 return result;
4011}
4012
4013
4014static char endswith__doc__[] =
4015"S.endswith(suffix[, start[, end]]) -> int\n\
4016\n\
4017Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4018optional start, test S beginning at that position. With optional end, stop\n\
4019comparing S at that position.";
4020
4021static PyObject *
4022unicode_endswith(PyUnicodeObject *self,
4023 PyObject *args)
4024{
4025 PyUnicodeObject *substring;
4026 int start = 0;
4027 int end = INT_MAX;
4028 PyObject *result;
4029
Guido van Rossumb8872e62000-05-09 14:14:27 +00004030 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4031 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004032 return NULL;
4033 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4034 (PyObject *)substring);
4035 if (substring == NULL)
4036 return NULL;
4037
4038 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4039
4040 Py_DECREF(substring);
4041 return result;
4042}
4043
4044
/* Method table for Unicode objects; unicode_getattr() passes it to
   Py_FindMethod().

   Each entry is {method name, C implementation, flags, docstring}.
   In this file the entries flagged 1 are implemented with
   PyArg_ParseTuple() and the entries flagged 0 with PyArg_NoArgs().
   The table is terminated by the {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially.  */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
    /* Disabled: the zfill implementation is itself compiled out
       with #if 0. */
#if 0
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    {NULL, NULL}
};
4095
4096static PyObject *
4097unicode_getattr(PyUnicodeObject *self, char *name)
4098{
4099 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4100}
4101
/* Sequence protocol slots for Unicode objects, referenced from
   PyUnicode_Type.tp_as_sequence.  Item and slice assignment are left
   empty (0). */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length,           /* sq_length */
    (binaryfunc) PyUnicode_Concat,      /* sq_concat */
    (intargfunc) unicode_repeat,        /* sq_repeat */
    (intargfunc) unicode_getitem,       /* sq_item */
    (intintargfunc) unicode_slice,      /* sq_slice */
    0,                                  /* sq_ass_item */
    0,                                  /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains,     /*sq_contains*/
};
4112
4113static int
4114unicode_buffer_getreadbuf(PyUnicodeObject *self,
4115 int index,
4116 const void **ptr)
4117{
4118 if (index != 0) {
4119 PyErr_SetString(PyExc_SystemError,
4120 "accessing non-existent unicode segment");
4121 return -1;
4122 }
4123 *ptr = (void *) self->str;
4124 return PyUnicode_GET_DATA_SIZE(self);
4125}
4126
4127static int
4128unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4129 const void **ptr)
4130{
4131 PyErr_SetString(PyExc_TypeError,
4132 "cannot use unicode as modifyable buffer");
4133 return -1;
4134}
4135
4136static int
4137unicode_buffer_getsegcount(PyUnicodeObject *self,
4138 int *lenp)
4139{
4140 if (lenp)
4141 *lenp = PyUnicode_GET_DATA_SIZE(self);
4142 return 1;
4143}
4144
4145static int
4146unicode_buffer_getcharbuf(PyUnicodeObject *self,
4147 int index,
4148 const void **ptr)
4149{
4150 PyObject *str;
4151
4152 if (index != 0) {
4153 PyErr_SetString(PyExc_SystemError,
4154 "accessing non-existent unicode segment");
4155 return -1;
4156 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +00004157 str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004158 if (str == NULL)
4159 return -1;
4160 *ptr = (void *) PyString_AS_STRING(str);
4161 return PyString_GET_SIZE(str);
4162}
4163
4164/* Helpers for PyUnicode_Format() */
4165
4166static PyObject *
4167getnextarg(args, arglen, p_argidx)
4168 PyObject *args;
4169int arglen;
4170int *p_argidx;
4171{
4172 int argidx = *p_argidx;
4173 if (argidx < arglen) {
4174 (*p_argidx)++;
4175 if (arglen < 0)
4176 return args;
4177 else
4178 return PyTuple_GetItem(args, argidx);
4179 }
4180 PyErr_SetString(PyExc_TypeError,
4181 "not enough arguments for format string");
4182 return NULL;
4183}
4184
/* Bit flags collected by PyUnicode_Format() while parsing the flag
   characters that follow a '%'. */
#define F_LJUST (1<<0)  /* '-' flag (also set for a negative '*' width) */
#define F_SIGN  (1<<1)  /* '+' flag */
#define F_BLANK (1<<2)  /* ' ' flag */
#define F_ALT   (1<<3)  /* '#' flag */
#define F_ZERO  (1<<4)  /* '0' flag */
4190
4191static
4192#ifdef HAVE_STDARG_PROTOTYPES
4193int usprintf(register Py_UNICODE *buffer, char *format, ...)
4194#else
4195int usprintf(va_alist) va_dcl
4196#endif
4197{
4198 register int i;
4199 int len;
4200 va_list va;
4201 char *charbuffer;
4202#ifdef HAVE_STDARG_PROTOTYPES
4203 va_start(va, format);
4204#else
4205 Py_UNICODE *args;
4206 char *format;
4207
4208 va_start(va);
4209 buffer = va_arg(va, Py_UNICODE *);
4210 format = va_arg(va, char *);
4211#endif
4212
4213 /* First, format the string as char array, then expand to Py_UNICODE
4214 array. */
4215 charbuffer = (char *)buffer;
4216 len = vsprintf(charbuffer, format, va);
4217 for (i = len - 1; i >= 0; i--)
4218 buffer[i] = (Py_UNICODE) charbuffer[i];
4219
4220 va_end(va);
4221 return len;
4222}
4223
4224static int
4225formatfloat(Py_UNICODE *buf,
4226 int flags,
4227 int prec,
4228 int type,
4229 PyObject *v)
4230{
4231 char fmt[20];
4232 double x;
4233
4234 x = PyFloat_AsDouble(v);
4235 if (x == -1.0 && PyErr_Occurred())
4236 return -1;
4237 if (prec < 0)
4238 prec = 6;
4239 if (prec > 50)
4240 prec = 50; /* Arbitrary limitation */
4241 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4242 type = 'g';
4243 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4244 return usprintf(buf, fmt, x);
4245}
4246
4247static int
4248formatint(Py_UNICODE *buf,
4249 int flags,
4250 int prec,
4251 int type,
4252 PyObject *v)
4253{
4254 char fmt[20];
4255 long x;
4256
4257 x = PyInt_AsLong(v);
4258 if (x == -1 && PyErr_Occurred())
4259 return -1;
4260 if (prec < 0)
4261 prec = 1;
4262 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4263 return usprintf(buf, fmt, x);
4264}
4265
4266static int
4267formatchar(Py_UNICODE *buf,
4268 PyObject *v)
4269{
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004270 if (PyUnicode_Check(v)) {
4271 if (PyUnicode_GET_SIZE(v) != 1)
4272 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004273 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004274 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004275
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004276 else if (PyString_Check(v)) {
4277 if (PyString_GET_SIZE(v) != 1)
4278 goto onError;
4279 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4280 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004281
4282 else {
4283 /* Integer input truncated to a character */
4284 long x;
4285 x = PyInt_AsLong(v);
4286 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004287 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004288 buf[0] = (char) x;
4289 }
4290 buf[1] = '\0';
4291 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004292
4293 onError:
4294 PyErr_SetString(PyExc_TypeError,
4295 "%c requires int or char");
4296 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004297}
4298
4299PyObject *PyUnicode_Format(PyObject *format,
4300 PyObject *args)
4301{
4302 Py_UNICODE *fmt, *res;
4303 int fmtcnt, rescnt, reslen, arglen, argidx;
4304 int args_owned = 0;
4305 PyUnicodeObject *result = NULL;
4306 PyObject *dict = NULL;
4307 PyObject *uformat;
4308
4309 if (format == NULL || args == NULL) {
4310 PyErr_BadInternalCall();
4311 return NULL;
4312 }
4313 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004314 if (uformat == NULL)
4315 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004316 fmt = PyUnicode_AS_UNICODE(uformat);
4317 fmtcnt = PyUnicode_GET_SIZE(uformat);
4318
4319 reslen = rescnt = fmtcnt + 100;
4320 result = _PyUnicode_New(reslen);
4321 if (result == NULL)
4322 goto onError;
4323 res = PyUnicode_AS_UNICODE(result);
4324
4325 if (PyTuple_Check(args)) {
4326 arglen = PyTuple_Size(args);
4327 argidx = 0;
4328 }
4329 else {
4330 arglen = -1;
4331 argidx = -2;
4332 }
4333 if (args->ob_type->tp_as_mapping)
4334 dict = args;
4335
4336 while (--fmtcnt >= 0) {
4337 if (*fmt != '%') {
4338 if (--rescnt < 0) {
4339 rescnt = fmtcnt + 100;
4340 reslen += rescnt;
4341 if (_PyUnicode_Resize(result, reslen) < 0)
4342 return NULL;
4343 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4344 --rescnt;
4345 }
4346 *res++ = *fmt++;
4347 }
4348 else {
4349 /* Got a format specifier */
4350 int flags = 0;
4351 int width = -1;
4352 int prec = -1;
4353 int size = 0;
4354 Py_UNICODE c = '\0';
4355 Py_UNICODE fill;
4356 PyObject *v = NULL;
4357 PyObject *temp = NULL;
4358 Py_UNICODE *buf;
4359 Py_UNICODE sign;
4360 int len;
4361 Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
4362
4363 fmt++;
4364 if (*fmt == '(') {
4365 Py_UNICODE *keystart;
4366 int keylen;
4367 PyObject *key;
4368 int pcount = 1;
4369
4370 if (dict == NULL) {
4371 PyErr_SetString(PyExc_TypeError,
4372 "format requires a mapping");
4373 goto onError;
4374 }
4375 ++fmt;
4376 --fmtcnt;
4377 keystart = fmt;
4378 /* Skip over balanced parentheses */
4379 while (pcount > 0 && --fmtcnt >= 0) {
4380 if (*fmt == ')')
4381 --pcount;
4382 else if (*fmt == '(')
4383 ++pcount;
4384 fmt++;
4385 }
4386 keylen = fmt - keystart - 1;
4387 if (fmtcnt < 0 || pcount > 0) {
4388 PyErr_SetString(PyExc_ValueError,
4389 "incomplete format key");
4390 goto onError;
4391 }
Fred Drakee4315f52000-05-09 19:53:39 +00004392 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004393 then looked up since Python uses strings to hold
4394 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004395 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004396 key = PyUnicode_EncodeUTF8(keystart,
4397 keylen,
4398 NULL);
4399 if (key == NULL)
4400 goto onError;
4401 if (args_owned) {
4402 Py_DECREF(args);
4403 args_owned = 0;
4404 }
4405 args = PyObject_GetItem(dict, key);
4406 Py_DECREF(key);
4407 if (args == NULL) {
4408 goto onError;
4409 }
4410 args_owned = 1;
4411 arglen = -1;
4412 argidx = -2;
4413 }
4414 while (--fmtcnt >= 0) {
4415 switch (c = *fmt++) {
4416 case '-': flags |= F_LJUST; continue;
4417 case '+': flags |= F_SIGN; continue;
4418 case ' ': flags |= F_BLANK; continue;
4419 case '#': flags |= F_ALT; continue;
4420 case '0': flags |= F_ZERO; continue;
4421 }
4422 break;
4423 }
4424 if (c == '*') {
4425 v = getnextarg(args, arglen, &argidx);
4426 if (v == NULL)
4427 goto onError;
4428 if (!PyInt_Check(v)) {
4429 PyErr_SetString(PyExc_TypeError,
4430 "* wants int");
4431 goto onError;
4432 }
4433 width = PyInt_AsLong(v);
4434 if (width < 0) {
4435 flags |= F_LJUST;
4436 width = -width;
4437 }
4438 if (--fmtcnt >= 0)
4439 c = *fmt++;
4440 }
4441 else if (c >= '0' && c <= '9') {
4442 width = c - '0';
4443 while (--fmtcnt >= 0) {
4444 c = *fmt++;
4445 if (c < '0' || c > '9')
4446 break;
4447 if ((width*10) / 10 != width) {
4448 PyErr_SetString(PyExc_ValueError,
4449 "width too big");
4450 goto onError;
4451 }
4452 width = width*10 + (c - '0');
4453 }
4454 }
4455 if (c == '.') {
4456 prec = 0;
4457 if (--fmtcnt >= 0)
4458 c = *fmt++;
4459 if (c == '*') {
4460 v = getnextarg(args, arglen, &argidx);
4461 if (v == NULL)
4462 goto onError;
4463 if (!PyInt_Check(v)) {
4464 PyErr_SetString(PyExc_TypeError,
4465 "* wants int");
4466 goto onError;
4467 }
4468 prec = PyInt_AsLong(v);
4469 if (prec < 0)
4470 prec = 0;
4471 if (--fmtcnt >= 0)
4472 c = *fmt++;
4473 }
4474 else if (c >= '0' && c <= '9') {
4475 prec = c - '0';
4476 while (--fmtcnt >= 0) {
4477 c = Py_CHARMASK(*fmt++);
4478 if (c < '0' || c > '9')
4479 break;
4480 if ((prec*10) / 10 != prec) {
4481 PyErr_SetString(PyExc_ValueError,
4482 "prec too big");
4483 goto onError;
4484 }
4485 prec = prec*10 + (c - '0');
4486 }
4487 }
4488 } /* prec */
4489 if (fmtcnt >= 0) {
4490 if (c == 'h' || c == 'l' || c == 'L') {
4491 size = c;
4492 if (--fmtcnt >= 0)
4493 c = *fmt++;
4494 }
4495 }
4496 if (fmtcnt < 0) {
4497 PyErr_SetString(PyExc_ValueError,
4498 "incomplete format");
4499 goto onError;
4500 }
4501 if (c != '%') {
4502 v = getnextarg(args, arglen, &argidx);
4503 if (v == NULL)
4504 goto onError;
4505 }
4506 sign = 0;
4507 fill = ' ';
4508 switch (c) {
4509
4510 case '%':
4511 buf = tmpbuf;
4512 buf[0] = '%';
4513 len = 1;
4514 break;
4515
4516 case 's':
4517 case 'r':
4518 if (PyUnicode_Check(v) && c == 's') {
4519 temp = v;
4520 Py_INCREF(temp);
4521 }
4522 else {
4523 PyObject *unicode;
4524 if (c == 's')
4525 temp = PyObject_Str(v);
4526 else
4527 temp = PyObject_Repr(v);
4528 if (temp == NULL)
4529 goto onError;
4530 if (!PyString_Check(temp)) {
4531 /* XXX Note: this should never happen, since
4532 PyObject_Repr() and PyObject_Str() assure
4533 this */
4534 Py_DECREF(temp);
4535 PyErr_SetString(PyExc_TypeError,
4536 "%s argument has non-string str()");
4537 goto onError;
4538 }
Fred Drakee4315f52000-05-09 19:53:39 +00004539 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00004540 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00004541 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004542 "strict");
4543 Py_DECREF(temp);
4544 temp = unicode;
4545 if (temp == NULL)
4546 goto onError;
4547 }
4548 buf = PyUnicode_AS_UNICODE(temp);
4549 len = PyUnicode_GET_SIZE(temp);
4550 if (prec >= 0 && len > prec)
4551 len = prec;
4552 break;
4553
4554 case 'i':
4555 case 'd':
4556 case 'u':
4557 case 'o':
4558 case 'x':
4559 case 'X':
4560 if (c == 'i')
4561 c = 'd';
4562 buf = tmpbuf;
4563 len = formatint(buf, flags, prec, c, v);
4564 if (len < 0)
4565 goto onError;
4566 sign = (c == 'd');
4567 if (flags & F_ZERO) {
4568 fill = '0';
4569 if ((flags&F_ALT) &&
4570 (c == 'x' || c == 'X') &&
4571 buf[0] == '0' && buf[1] == c) {
4572 *res++ = *buf++;
4573 *res++ = *buf++;
4574 rescnt -= 2;
4575 len -= 2;
4576 width -= 2;
4577 if (width < 0)
4578 width = 0;
4579 }
4580 }
4581 break;
4582
4583 case 'e':
4584 case 'E':
4585 case 'f':
4586 case 'g':
4587 case 'G':
4588 buf = tmpbuf;
4589 len = formatfloat(buf, flags, prec, c, v);
4590 if (len < 0)
4591 goto onError;
4592 sign = 1;
4593 if (flags&F_ZERO)
4594 fill = '0';
4595 break;
4596
4597 case 'c':
4598 buf = tmpbuf;
4599 len = formatchar(buf, v);
4600 if (len < 0)
4601 goto onError;
4602 break;
4603
4604 default:
4605 PyErr_Format(PyExc_ValueError,
4606 "unsupported format character '%c' (0x%x)",
4607 c, c);
4608 goto onError;
4609 }
4610 if (sign) {
4611 if (*buf == '-' || *buf == '+') {
4612 sign = *buf++;
4613 len--;
4614 }
4615 else if (flags & F_SIGN)
4616 sign = '+';
4617 else if (flags & F_BLANK)
4618 sign = ' ';
4619 else
4620 sign = 0;
4621 }
4622 if (width < len)
4623 width = len;
4624 if (rescnt < width + (sign != 0)) {
4625 reslen -= rescnt;
4626 rescnt = width + fmtcnt + 100;
4627 reslen += rescnt;
4628 if (_PyUnicode_Resize(result, reslen) < 0)
4629 return NULL;
4630 res = PyUnicode_AS_UNICODE(result)
4631 + reslen - rescnt;
4632 }
4633 if (sign) {
4634 if (fill != ' ')
4635 *res++ = sign;
4636 rescnt--;
4637 if (width > len)
4638 width--;
4639 }
4640 if (width > len && !(flags & F_LJUST)) {
4641 do {
4642 --rescnt;
4643 *res++ = fill;
4644 } while (--width > len);
4645 }
4646 if (sign && fill == ' ')
4647 *res++ = sign;
4648 memcpy(res, buf, len * sizeof(Py_UNICODE));
4649 res += len;
4650 rescnt -= len;
4651 while (--width >= len) {
4652 --rescnt;
4653 *res++ = ' ';
4654 }
4655 if (dict && (argidx < arglen) && c != '%') {
4656 PyErr_SetString(PyExc_TypeError,
4657 "not all arguments converted");
4658 goto onError;
4659 }
4660 Py_XDECREF(temp);
4661 } /* '%' */
4662 } /* until end */
4663 if (argidx < arglen && !dict) {
4664 PyErr_SetString(PyExc_TypeError,
4665 "not all arguments converted");
4666 goto onError;
4667 }
4668
4669 if (args_owned) {
4670 Py_DECREF(args);
4671 }
4672 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004673 if (_PyUnicode_Resize(result, reslen - rescnt))
4674 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004675 return (PyObject *)result;
4676
4677 onError:
4678 Py_XDECREF(result);
4679 Py_DECREF(uformat);
4680 if (args_owned) {
4681 Py_DECREF(args);
4682 }
4683 return NULL;
4684}
4685
4686static PyBufferProcs unicode_as_buffer = {
4687 (getreadbufferproc) unicode_buffer_getreadbuf,
4688 (getwritebufferproc) unicode_buffer_getwritebuf,
4689 (getsegcountproc) unicode_buffer_getsegcount,
4690 (getcharbufferproc) unicode_buffer_getcharbuf,
4691};
4692
4693PyTypeObject PyUnicode_Type = {
4694 PyObject_HEAD_INIT(&PyType_Type)
4695 0, /* ob_size */
4696 "unicode", /* tp_name */
4697 sizeof(PyUnicodeObject), /* tp_size */
4698 0, /* tp_itemsize */
4699 /* Slots */
4700 (destructor)_PyUnicode_Free, /* tp_dealloc */
4701 0, /* tp_print */
4702 (getattrfunc)unicode_getattr, /* tp_getattr */
4703 0, /* tp_setattr */
4704 (cmpfunc) unicode_compare, /* tp_compare */
4705 (reprfunc) unicode_repr, /* tp_repr */
4706 0, /* tp_as_number */
4707 &unicode_as_sequence, /* tp_as_sequence */
4708 0, /* tp_as_mapping */
4709 (hashfunc) unicode_hash, /* tp_hash*/
4710 0, /* tp_call*/
4711 (reprfunc) unicode_str, /* tp_str */
4712 (getattrofunc) NULL, /* tp_getattro */
4713 (setattrofunc) NULL, /* tp_setattro */
4714 &unicode_as_buffer, /* tp_as_buffer */
4715 Py_TPFLAGS_DEFAULT, /* tp_flags */
4716};
4717
4718/* Initialize the Unicode implementation */
4719
4720void _PyUnicode_Init()
4721{
4722 /* Doublecheck the configuration... */
4723 if (sizeof(Py_UNICODE) != 2)
4724 Py_FatalError("Unicode configuration error: "
4725 "sizeof(Py_UNICODE) != 2 bytes");
4726
Fred Drakee4315f52000-05-09 19:53:39 +00004727 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004728 unicode_freelist = NULL;
4729 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004730 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00004731 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004732}
4733
4734/* Finalize the Unicode implementation */
4735
4736void
4737_PyUnicode_Fini()
4738{
4739 PyUnicodeObject *u = unicode_freelist;
4740
4741 while (u != NULL) {
4742 PyUnicodeObject *v = u;
4743 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004744 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00004745 PyMem_DEL(v->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004746 Py_XDECREF(v->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +00004747 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004748 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004749 unicode_freelist = NULL;
4750 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004751 Py_XDECREF(unicode_empty);
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004752 unicode_empty = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004753}