blob: bfc59dd97a1c0b28582b5e2c085bf5f459ea3c17 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "mymath.h"
68#include "unicodeobject.h"
69
70#if defined(HAVE_LIMITS_H)
71#include <limits.h>
72#else
73#define INT_MAX 2147483647
74#endif
75
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000076#ifdef MS_WIN32
77#include <windows.h>
78#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000079
Guido van Rossumd57fd912000-03-10 22:53:23 +000080/* Limit for the Unicode object free list */
81
82#define MAX_UNICODE_FREELIST_SIZE 1024
83
84/* Limit for the Unicode object free list stay alive optimization.
85
86 The implementation will keep allocated Unicode memory intact for
87 all objects on the free list having a size less than this
88 limit. This reduces malloc() overhead for small Unicode objects.
89
Barry Warsaw51ac5802000-03-20 16:36:48 +000090 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000091 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000092 malloc()-overhead) bytes of unused garbage.
93
94 Setting the limit to 0 effectively turns the feature off.
95
Guido van Rossumfd4b9572000-04-10 13:51:10 +000096 Note: This is an experimental feature ! If you get core dumps when
97 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
99*/
100
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000101#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +0000102
103/* Endianness switches; defaults to little endian */
104
105#ifdef WORDS_BIGENDIAN
106# define BYTEORDER_IS_BIG_ENDIAN
107#else
108# define BYTEORDER_IS_LITTLE_ENDIAN
109#endif
110
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000111/* --- Globals ------------------------------------------------------------
112
113 The globals are initialized by the _PyUnicode_Init() API and should
114 not be used before calling that API.
115
116*/
Guido van Rossumd57fd912000-03-10 22:53:23 +0000117
118/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000119static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000120
121/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000122static PyUnicodeObject *unicode_freelist;
123static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000124
Fred Drakee4315f52000-05-09 19:53:39 +0000125/* Default encoding to use and assume when NULL is passed as encoding
126 parameter; it is initialized by _PyUnicode_Init().
127
128 Always use the PyUnicode_SetDefaultEncoding() and
129 PyUnicode_GetDefaultEncoding() APIs to access this global.
130
131*/
132
133static char unicode_default_encoding[100];
134
Guido van Rossumd57fd912000-03-10 22:53:23 +0000135/* --- Unicode Object ----------------------------------------------------- */
136
137static
138int _PyUnicode_Resize(register PyUnicodeObject *unicode,
139 int length)
140{
141 void *oldstr;
142
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000143 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000145 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000146
147 /* Resizing unicode_empty is not allowed. */
148 if (unicode == unicode_empty) {
149 PyErr_SetString(PyExc_SystemError,
150 "can't resize empty unicode object");
151 return -1;
152 }
153
154 /* We allocate one more byte to make sure the string is
155 Ux0000 terminated -- XXX is this needed ? */
156 oldstr = unicode->str;
157 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
158 if (!unicode->str) {
159 unicode->str = oldstr;
160 PyErr_NoMemory();
161 return -1;
162 }
163 unicode->str[length] = 0;
164 unicode->length = length;
165
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000166 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 /* Reset the object caches */
168 if (unicode->utf8str) {
169 Py_DECREF(unicode->utf8str);
170 unicode->utf8str = NULL;
171 }
172 unicode->hash = -1;
173
174 return 0;
175}
176
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000177int PyUnicode_Resize(PyObject **unicode,
178 int length)
179{
180 PyUnicodeObject *v;
181
182 if (unicode == NULL) {
183 PyErr_BadInternalCall();
184 return -1;
185 }
186 v = (PyUnicodeObject *)*unicode;
187 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
188 PyErr_BadInternalCall();
189 return -1;
190 }
191 return _PyUnicode_Resize(v, length);
192}
193
/* One extra Py_UNICODE element (not merely one byte) is allocated so
   that the buffer is always terminated by a 0 code unit -- XXX is this
   needed ?

   XXX This allocator could be enhanced further by ensuring that the
   free list never shrinks below a size of 1.

*/
201
202static
203PyUnicodeObject *_PyUnicode_New(int length)
204{
205 register PyUnicodeObject *unicode;
206
207 /* Optimization for empty strings */
208 if (length == 0 && unicode_empty != NULL) {
209 Py_INCREF(unicode_empty);
210 return unicode_empty;
211 }
212
213 /* Unicode freelist & memory allocation */
214 if (unicode_freelist) {
215 unicode = unicode_freelist;
216 unicode_freelist = *(PyUnicodeObject **)unicode_freelist;
217 unicode_freelist_size--;
Guido van Rossumb18618d2000-05-03 23:44:39 +0000218 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000220 /* Keep-Alive optimization: we only upsize the buffer,
221 never downsize it. */
222 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000223 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000224 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000225 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000226 }
227 }
228 else
229 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
230 }
231 else {
232 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
233 if (unicode == NULL)
234 return NULL;
235 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
236 }
237
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000238 if (!unicode->str) {
239 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000240 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000241 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000242 unicode->str[length] = 0;
243 unicode->length = length;
244 unicode->hash = -1;
245 unicode->utf8str = NULL;
246 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000247
248 onError:
249 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000250 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000251 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252}
253
254static
255void _PyUnicode_Free(register PyUnicodeObject *unicode)
256{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000257 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000258 /* Keep-Alive optimization */
259 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000260 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000261 unicode->str = NULL;
262 unicode->length = 0;
263 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000264 if (unicode->utf8str) {
265 Py_DECREF(unicode->utf8str);
266 unicode->utf8str = NULL;
267 }
268 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000269 *(PyUnicodeObject **)unicode = unicode_freelist;
270 unicode_freelist = unicode;
271 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 }
273 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000274 PyMem_DEL(unicode->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000275 Py_XDECREF(unicode->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000276 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000277 }
278}
279
280PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
281 int size)
282{
283 PyUnicodeObject *unicode;
284
285 unicode = _PyUnicode_New(size);
286 if (!unicode)
287 return NULL;
288
289 /* Copy the Unicode data into the new object */
290 if (u != NULL)
291 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
292
293 return (PyObject *)unicode;
294}
295
296#ifdef HAVE_WCHAR_H
297
298PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
299 int size)
300{
301 PyUnicodeObject *unicode;
302
303 if (w == NULL) {
304 PyErr_BadInternalCall();
305 return NULL;
306 }
307
308 unicode = _PyUnicode_New(size);
309 if (!unicode)
310 return NULL;
311
312 /* Copy the wchar_t data into the new object */
313#ifdef HAVE_USABLE_WCHAR_T
314 memcpy(unicode->str, w, size * sizeof(wchar_t));
315#else
316 {
317 register Py_UNICODE *u;
318 register int i;
319 u = PyUnicode_AS_UNICODE(unicode);
320 for (i = size; i >= 0; i--)
321 *u++ = *w++;
322 }
323#endif
324
325 return (PyObject *)unicode;
326}
327
328int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
329 register wchar_t *w,
330 int size)
331{
332 if (unicode == NULL) {
333 PyErr_BadInternalCall();
334 return -1;
335 }
336 if (size > PyUnicode_GET_SIZE(unicode))
337 size = PyUnicode_GET_SIZE(unicode);
338#ifdef HAVE_USABLE_WCHAR_T
339 memcpy(w, unicode->str, size * sizeof(wchar_t));
340#else
341 {
342 register Py_UNICODE *u;
343 register int i;
344 u = PyUnicode_AS_UNICODE(unicode);
345 for (i = size; i >= 0; i--)
346 *w++ = *u++;
347 }
348#endif
349
350 return size;
351}
352
353#endif
354
355PyObject *PyUnicode_FromObject(register PyObject *obj)
356{
357 const char *s;
358 int len;
359
360 if (obj == NULL) {
361 PyErr_BadInternalCall();
362 return NULL;
363 }
364 else if (PyUnicode_Check(obj)) {
365 Py_INCREF(obj);
366 return obj;
367 }
368 else if (PyString_Check(obj)) {
369 s = PyString_AS_STRING(obj);
370 len = PyString_GET_SIZE(obj);
371 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000372 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
373 /* Overwrite the error message with something more useful in
374 case of a TypeError. */
375 if (PyErr_ExceptionMatches(PyExc_TypeError))
376 PyErr_SetString(PyExc_TypeError,
377 "coercing to Unicode: need string or charbuffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000378 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000379 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000380 if (len == 0) {
381 Py_INCREF(unicode_empty);
382 return (PyObject *)unicode_empty;
383 }
Fred Drakee4315f52000-05-09 19:53:39 +0000384 return PyUnicode_Decode(s, len, NULL, "strict");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385}
386
387PyObject *PyUnicode_Decode(const char *s,
388 int size,
389 const char *encoding,
390 const char *errors)
391{
392 PyObject *buffer = NULL, *unicode;
393
Fred Drakee4315f52000-05-09 19:53:39 +0000394 if (encoding == NULL)
395 encoding = PyUnicode_GetDefaultEncoding();
396
397 /* Shortcuts for common default encodings */
398 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000399 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000400 else if (strcmp(encoding, "latin-1") == 0)
401 return PyUnicode_DecodeLatin1(s, size, errors);
402 else if (strcmp(encoding, "ascii") == 0)
403 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000404
405 /* Decode via the codec registry */
406 buffer = PyBuffer_FromMemory((void *)s, size);
407 if (buffer == NULL)
408 goto onError;
409 unicode = PyCodec_Decode(buffer, encoding, errors);
410 if (unicode == NULL)
411 goto onError;
412 if (!PyUnicode_Check(unicode)) {
413 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000414 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000415 unicode->ob_type->tp_name);
416 Py_DECREF(unicode);
417 goto onError;
418 }
419 Py_DECREF(buffer);
420 return unicode;
421
422 onError:
423 Py_XDECREF(buffer);
424 return NULL;
425}
426
427PyObject *PyUnicode_Encode(const Py_UNICODE *s,
428 int size,
429 const char *encoding,
430 const char *errors)
431{
432 PyObject *v, *unicode;
433
434 unicode = PyUnicode_FromUnicode(s, size);
435 if (unicode == NULL)
436 return NULL;
437 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
438 Py_DECREF(unicode);
439 return v;
440}
441
442PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
443 const char *encoding,
444 const char *errors)
445{
446 PyObject *v;
447
448 if (!PyUnicode_Check(unicode)) {
449 PyErr_BadArgument();
450 goto onError;
451 }
Fred Drakee4315f52000-05-09 19:53:39 +0000452
453 if (encoding == NULL)
454 encoding = PyUnicode_GetDefaultEncoding();
455
456 /* Shortcuts for common default encodings */
457 if (errors == NULL) {
458 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000459 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000460 else if (strcmp(encoding, "latin-1") == 0)
461 return PyUnicode_AsLatin1String(unicode);
462 else if (strcmp(encoding, "ascii") == 0)
463 return PyUnicode_AsASCIIString(unicode);
464 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000465
466 /* Encode via the codec registry */
467 v = PyCodec_Encode(unicode, encoding, errors);
468 if (v == NULL)
469 goto onError;
470 /* XXX Should we really enforce this ? */
471 if (!PyString_Check(v)) {
472 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000473 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000474 v->ob_type->tp_name);
475 Py_DECREF(v);
476 goto onError;
477 }
478 return v;
479
480 onError:
481 return NULL;
482}
483
484Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
485{
486 if (!PyUnicode_Check(unicode)) {
487 PyErr_BadArgument();
488 goto onError;
489 }
490 return PyUnicode_AS_UNICODE(unicode);
491
492 onError:
493 return NULL;
494}
495
496int PyUnicode_GetSize(PyObject *unicode)
497{
498 if (!PyUnicode_Check(unicode)) {
499 PyErr_BadArgument();
500 goto onError;
501 }
502 return PyUnicode_GET_SIZE(unicode);
503
504 onError:
505 return -1;
506}
507
Fred Drakee4315f52000-05-09 19:53:39 +0000508const char *PyUnicode_GetDefaultEncoding()
509{
510 return unicode_default_encoding;
511}
512
513int PyUnicode_SetDefaultEncoding(const char *encoding)
514{
515 PyObject *v;
516
517 /* Make sure the encoding is valid. As side effect, this also
518 loads the encoding into the codec registry cache. */
519 v = _PyCodec_Lookup(encoding);
520 if (v == NULL)
521 goto onError;
522 Py_DECREF(v);
523 strncpy(unicode_default_encoding,
524 encoding,
525 sizeof(unicode_default_encoding));
526 return 0;
527
528 onError:
529 return -1;
530}
531
Guido van Rossumd57fd912000-03-10 22:53:23 +0000532/* --- UTF-8 Codec -------------------------------------------------------- */
533
/* Sequence length implied by the first byte of a UTF-8 sequence,
   indexed by that byte's value.  0 marks a byte that cannot start a
   sequence.  See RFC 2279 for details. */
static char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80 - 0xBF: not valid as a first byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0 - 0xDF: two bytes */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0 - 0xEF: three bytes */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0 - 0xF7: four, 0xF8 - 0xFB: five, 0xFC - 0xFD: six,
       0xFE - 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0
};
555
556static
557int utf8_decoding_error(const char **source,
558 Py_UNICODE **dest,
559 const char *errors,
560 const char *details)
561{
562 if ((errors == NULL) ||
563 (strcmp(errors,"strict") == 0)) {
564 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000565 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000566 details);
567 return -1;
568 }
569 else if (strcmp(errors,"ignore") == 0) {
570 (*source)++;
571 return 0;
572 }
573 else if (strcmp(errors,"replace") == 0) {
574 (*source)++;
575 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
576 (*dest)++;
577 return 0;
578 }
579 else {
580 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000581 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000582 errors);
583 return -1;
584 }
585}
586
587#define UTF8_ERROR(details) do { \
588 if (utf8_decoding_error(&s, &p, errors, details)) \
589 goto onError; \
590 continue; \
591} while (0)
592
593PyObject *PyUnicode_DecodeUTF8(const char *s,
594 int size,
595 const char *errors)
596{
597 int n;
598 const char *e;
599 PyUnicodeObject *unicode;
600 Py_UNICODE *p;
601
602 /* Note: size will always be longer than the resulting Unicode
603 character count */
604 unicode = _PyUnicode_New(size);
605 if (!unicode)
606 return NULL;
607 if (size == 0)
608 return (PyObject *)unicode;
609
610 /* Unpack UTF-8 encoded data */
611 p = unicode->str;
612 e = s + size;
613
614 while (s < e) {
615 register Py_UNICODE ch = (unsigned char)*s;
616
617 if (ch < 0x80) {
618 *p++ = ch;
619 s++;
620 continue;
621 }
622
623 n = utf8_code_length[ch];
624
625 if (s + n > e)
626 UTF8_ERROR("unexpected end of data");
627
628 switch (n) {
629
630 case 0:
631 UTF8_ERROR("unexpected code byte");
632 break;
633
634 case 1:
635 UTF8_ERROR("internal error");
636 break;
637
638 case 2:
639 if ((s[1] & 0xc0) != 0x80)
640 UTF8_ERROR("invalid data");
641 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
642 if (ch < 0x80)
643 UTF8_ERROR("illegal encoding");
644 else
645 *p++ = ch;
646 break;
647
648 case 3:
649 if ((s[1] & 0xc0) != 0x80 ||
650 (s[2] & 0xc0) != 0x80)
651 UTF8_ERROR("invalid data");
652 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
653 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
654 UTF8_ERROR("illegal encoding");
655 else
656 *p++ = ch;
657 break;
658
659 default:
660 /* Other sizes are only needed for UCS-4 */
661 UTF8_ERROR("unsupported Unicode code range");
662 }
663 s += n;
664 }
665
666 /* Adjust length */
667 if (_PyUnicode_Resize(unicode, p - unicode->str))
668 goto onError;
669
670 return (PyObject *)unicode;
671
672onError:
673 Py_DECREF(unicode);
674 return NULL;
675}
676
677#undef UTF8_ERROR
678
679static
680int utf8_encoding_error(const Py_UNICODE **source,
681 char **dest,
682 const char *errors,
683 const char *details)
684{
685 if ((errors == NULL) ||
686 (strcmp(errors,"strict") == 0)) {
687 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000688 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000689 details);
690 return -1;
691 }
692 else if (strcmp(errors,"ignore") == 0) {
693 return 0;
694 }
695 else if (strcmp(errors,"replace") == 0) {
696 **dest = '?';
697 (*dest)++;
698 return 0;
699 }
700 else {
701 PyErr_Format(PyExc_ValueError,
702 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000703 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000704 errors);
705 return -1;
706 }
707}
708
709PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
710 int size,
711 const char *errors)
712{
713 PyObject *v;
714 char *p;
715 char *q;
716
717 v = PyString_FromStringAndSize(NULL, 3 * size);
718 if (v == NULL)
719 return NULL;
720 if (size == 0)
721 goto done;
722
723 p = q = PyString_AS_STRING(v);
724 while (size-- > 0) {
725 Py_UNICODE ch = *s++;
726 if (ch < 0x80)
727 *p++ = (char) ch;
728 else if (ch < 0x0800) {
729 *p++ = 0xc0 | (ch >> 6);
730 *p++ = 0x80 | (ch & 0x3f);
731 } else if (0xD800 <= ch && ch <= 0xDFFF) {
732 /* These byte ranges are reserved for UTF-16 surrogate
733 bytes which the Python implementation currently does
734 not support. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000735 if (utf8_encoding_error(&s, &p, errors,
736 "unsupported code range"))
737 goto onError;
738 } else {
739 *p++ = 0xe0 | (ch >> 12);
740 *p++ = 0x80 | ((ch >> 6) & 0x3f);
741 *p++ = 0x80 | (ch & 0x3f);
742 }
743 }
744 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000745 if (_PyString_Resize(&v, p - q))
746 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000747
748 done:
749 return v;
750
751 onError:
752 Py_DECREF(v);
753 return NULL;
754}
755
756/* Return a Python string holding the UTF-8 encoded value of the
757 Unicode object.
758
759 The resulting string is cached in the Unicode object for subsequent
760 usage by this function. The cached version is needed to implement
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000761 the character buffer interface and will live (at least) as long as
762 the Unicode object itself.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000763
764 The refcount of the string is *not* incremented.
765
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000766 *** Exported for internal use by the interpreter only !!! ***
767
Guido van Rossumd57fd912000-03-10 22:53:23 +0000768*/
769
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000770PyObject *_PyUnicode_AsUTF8String(PyObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000771 const char *errors)
772{
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000773 PyObject *v = ((PyUnicodeObject *)unicode)->utf8str;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000774
775 if (v)
776 return v;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000777 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
778 PyUnicode_GET_SIZE(unicode),
Guido van Rossumd57fd912000-03-10 22:53:23 +0000779 errors);
780 if (v && errors == NULL)
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000781 ((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000782 return v;
783}
784
785PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
786{
787 PyObject *str;
788
789 if (!PyUnicode_Check(unicode)) {
790 PyErr_BadArgument();
791 return NULL;
792 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000793 str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000794 if (str == NULL)
795 return NULL;
796 Py_INCREF(str);
797 return str;
798}
799
800/* --- UTF-16 Codec ------------------------------------------------------- */
801
802static
803int utf16_decoding_error(const Py_UNICODE **source,
804 Py_UNICODE **dest,
805 const char *errors,
806 const char *details)
807{
808 if ((errors == NULL) ||
809 (strcmp(errors,"strict") == 0)) {
810 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000811 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000812 details);
813 return -1;
814 }
815 else if (strcmp(errors,"ignore") == 0) {
816 return 0;
817 }
818 else if (strcmp(errors,"replace") == 0) {
819 if (dest) {
820 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
821 (*dest)++;
822 }
823 return 0;
824 }
825 else {
826 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000827 "UTF-16 decoding error; "
828 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000829 errors);
830 return -1;
831 }
832}
833
834#define UTF16_ERROR(details) do { \
835 if (utf16_decoding_error(&q, &p, errors, details)) \
836 goto onError; \
837 continue; \
838} while(0)
839
840PyObject *PyUnicode_DecodeUTF16(const char *s,
841 int size,
842 const char *errors,
843 int *byteorder)
844{
845 PyUnicodeObject *unicode;
846 Py_UNICODE *p;
847 const Py_UNICODE *q, *e;
848 int bo = 0;
849
850 /* size should be an even number */
851 if (size % sizeof(Py_UNICODE) != 0) {
852 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
853 return NULL;
854 /* The remaining input chars are ignored if we fall through
855 here... */
856 }
857
858 /* Note: size will always be longer than the resulting Unicode
859 character count */
860 unicode = _PyUnicode_New(size);
861 if (!unicode)
862 return NULL;
863 if (size == 0)
864 return (PyObject *)unicode;
865
866 /* Unpack UTF-16 encoded data */
867 p = unicode->str;
868 q = (Py_UNICODE *)s;
869 e = q + (size / sizeof(Py_UNICODE));
870
871 if (byteorder)
872 bo = *byteorder;
873
874 while (q < e) {
875 register Py_UNICODE ch = *q++;
876
877 /* Check for BOM marks (U+FEFF) in the input and adjust
878 current byte order setting accordingly. Swap input
879 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
880 !) */
881#ifdef BYTEORDER_IS_LITTLE_ENDIAN
882 if (ch == 0xFEFF) {
883 bo = -1;
884 continue;
885 } else if (ch == 0xFFFE) {
886 bo = 1;
887 continue;
888 }
889 if (bo == 1)
890 ch = (ch >> 8) | (ch << 8);
891#else
892 if (ch == 0xFEFF) {
893 bo = 1;
894 continue;
895 } else if (ch == 0xFFFE) {
896 bo = -1;
897 continue;
898 }
899 if (bo == -1)
900 ch = (ch >> 8) | (ch << 8);
901#endif
902 if (ch < 0xD800 || ch > 0xDFFF) {
903 *p++ = ch;
904 continue;
905 }
906
907 /* UTF-16 code pair: */
908 if (q >= e)
909 UTF16_ERROR("unexpected end of data");
910 if (0xDC00 <= *q && *q <= 0xDFFF) {
911 q++;
912 if (0xD800 <= *q && *q <= 0xDBFF)
913 /* This is valid data (a UTF-16 surrogate pair), but
914 we are not able to store this information since our
915 Py_UNICODE type only has 16 bits... this might
916 change someday, even though it's unlikely. */
917 UTF16_ERROR("code pairs are not supported");
918 else
919 continue;
920 }
921 UTF16_ERROR("illegal encoding");
922 }
923
924 if (byteorder)
925 *byteorder = bo;
926
927 /* Adjust length */
928 if (_PyUnicode_Resize(unicode, p - unicode->str))
929 goto onError;
930
931 return (PyObject *)unicode;
932
933onError:
934 Py_DECREF(unicode);
935 return NULL;
936}
937
938#undef UTF16_ERROR
939
940PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
941 int size,
942 const char *errors,
943 int byteorder)
944{
945 PyObject *v;
946 Py_UNICODE *p;
947 char *q;
948
949 /* We don't create UTF-16 pairs... */
950 v = PyString_FromStringAndSize(NULL,
951 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
952 if (v == NULL)
953 return NULL;
954 if (size == 0)
955 goto done;
956
957 q = PyString_AS_STRING(v);
958 p = (Py_UNICODE *)q;
959
960 if (byteorder == 0)
961 *p++ = 0xFEFF;
962 if (byteorder == 0 ||
963#ifdef BYTEORDER_IS_LITTLE_ENDIAN
964 byteorder == -1
965#else
966 byteorder == 1
967#endif
968 )
969 memcpy(p, s, size * sizeof(Py_UNICODE));
970 else
971 while (size-- > 0) {
972 Py_UNICODE ch = *s++;
973 *p++ = (ch >> 8) | (ch << 8);
974 }
975 done:
976 return v;
977}
978
979PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
980{
981 if (!PyUnicode_Check(unicode)) {
982 PyErr_BadArgument();
983 return NULL;
984 }
985 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
986 PyUnicode_GET_SIZE(unicode),
987 NULL,
988 0);
989}
990
991/* --- Unicode Escape Codec ----------------------------------------------- */
992
993static
994int unicodeescape_decoding_error(const char **source,
995 unsigned int *x,
996 const char *errors,
997 const char *details)
998{
999 if ((errors == NULL) ||
1000 (strcmp(errors,"strict") == 0)) {
1001 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001002 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001003 details);
1004 return -1;
1005 }
1006 else if (strcmp(errors,"ignore") == 0) {
1007 return 0;
1008 }
1009 else if (strcmp(errors,"replace") == 0) {
1010 *x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
1011 return 0;
1012 }
1013 else {
1014 PyErr_Format(PyExc_ValueError,
1015 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001016 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001017 errors);
1018 return -1;
1019 }
1020}
1021
1022PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1023 int size,
1024 const char *errors)
1025{
1026 PyUnicodeObject *v;
1027 Py_UNICODE *p = NULL, *buf = NULL;
1028 const char *end;
1029
1030 /* Escaped strings will always be longer than the resulting
1031 Unicode string, so we start with size here and then reduce the
1032 length after conversion to the true value. */
1033 v = _PyUnicode_New(size);
1034 if (v == NULL)
1035 goto onError;
1036 if (size == 0)
1037 return (PyObject *)v;
1038 p = buf = PyUnicode_AS_UNICODE(v);
1039 end = s + size;
1040 while (s < end) {
1041 unsigned char c;
1042 unsigned int x;
1043 int i;
1044
1045 /* Non-escape characters are interpreted as Unicode ordinals */
1046 if (*s != '\\') {
1047 *p++ = (unsigned char)*s++;
1048 continue;
1049 }
1050
1051 /* \ - Escapes */
1052 s++;
1053 switch (*s++) {
1054
1055 /* \x escapes */
1056 case '\n': break;
1057 case '\\': *p++ = '\\'; break;
1058 case '\'': *p++ = '\''; break;
1059 case '\"': *p++ = '\"'; break;
1060 case 'b': *p++ = '\b'; break;
1061 case 'f': *p++ = '\014'; break; /* FF */
1062 case 't': *p++ = '\t'; break;
1063 case 'n': *p++ = '\n'; break;
1064 case 'r': *p++ = '\r'; break;
1065 case 'v': *p++ = '\013'; break; /* VT */
1066 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1067
1068 /* \OOO (octal) escapes */
1069 case '0': case '1': case '2': case '3':
1070 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001071 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001072 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001073 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001074 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001075 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001076 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001077 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001078 break;
1079
1080 /* \xXXXX escape with 0-4 hex digits */
1081 case 'x':
1082 x = 0;
1083 c = (unsigned char)*s;
1084 if (isxdigit(c)) {
1085 do {
1086 x = (x<<4) & ~0xF;
1087 if ('0' <= c && c <= '9')
1088 x += c - '0';
1089 else if ('a' <= c && c <= 'f')
1090 x += 10 + c - 'a';
1091 else
1092 x += 10 + c - 'A';
1093 c = (unsigned char)*++s;
1094 } while (isxdigit(c));
1095 *p++ = x;
1096 } else {
1097 *p++ = '\\';
1098 *p++ = (unsigned char)s[-1];
1099 }
1100 break;
1101
1102 /* \uXXXX with 4 hex digits */
1103 case 'u':
1104 for (x = 0, i = 0; i < 4; i++) {
1105 c = (unsigned char)s[i];
1106 if (!isxdigit(c)) {
1107 if (unicodeescape_decoding_error(&s, &x, errors,
1108 "truncated \\uXXXX"))
1109 goto onError;
1110 i++;
1111 break;
1112 }
1113 x = (x<<4) & ~0xF;
1114 if (c >= '0' && c <= '9')
1115 x += c - '0';
1116 else if (c >= 'a' && c <= 'f')
1117 x += 10 + c - 'a';
1118 else
1119 x += 10 + c - 'A';
1120 }
1121 s += i;
1122 *p++ = x;
1123 break;
1124
1125 default:
1126 *p++ = '\\';
1127 *p++ = (unsigned char)s[-1];
1128 break;
1129 }
1130 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001131 if (_PyUnicode_Resize(v, (int)(p - buf)))
1132 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001133 return (PyObject *)v;
1134
1135 onError:
1136 Py_XDECREF(v);
1137 return NULL;
1138}
1139
1140/* Return a Unicode-Escape string version of the Unicode object.
1141
1142 If quotes is true, the string is enclosed in u"" or u'' quotes as
1143 appropriate.
1144
1145*/
1146
Barry Warsaw51ac5802000-03-20 16:36:48 +00001147static const Py_UNICODE *findchar(const Py_UNICODE *s,
1148 int size,
1149 Py_UNICODE ch);
1150
Guido van Rossumd57fd912000-03-10 22:53:23 +00001151static
1152PyObject *unicodeescape_string(const Py_UNICODE *s,
1153 int size,
1154 int quotes)
1155{
1156 PyObject *repr;
1157 char *p;
1158 char *q;
1159
1160 static const char *hexdigit = "0123456789ABCDEF";
1161
1162 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1163 if (repr == NULL)
1164 return NULL;
1165
1166 p = q = PyString_AS_STRING(repr);
1167
1168 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001169 *p++ = 'u';
1170 *p++ = (findchar(s, size, '\'') &&
1171 !findchar(s, size, '"')) ? '"' : '\'';
1172 }
1173 while (size-- > 0) {
1174 Py_UNICODE ch = *s++;
1175 /* Escape quotes */
1176 if (quotes && (ch == q[1] || ch == '\\')) {
1177 *p++ = '\\';
1178 *p++ = (char) ch;
1179 }
1180 /* Map 16-bit characters to '\uxxxx' */
1181 else if (ch >= 256) {
1182 *p++ = '\\';
1183 *p++ = 'u';
1184 *p++ = hexdigit[(ch >> 12) & 0xf];
1185 *p++ = hexdigit[(ch >> 8) & 0xf];
1186 *p++ = hexdigit[(ch >> 4) & 0xf];
1187 *p++ = hexdigit[ch & 15];
1188 }
1189 /* Map non-printable US ASCII to '\ooo' */
1190 else if (ch < ' ' || ch >= 128) {
1191 *p++ = '\\';
1192 *p++ = hexdigit[(ch >> 6) & 7];
1193 *p++ = hexdigit[(ch >> 3) & 7];
1194 *p++ = hexdigit[ch & 7];
1195 }
1196 /* Copy everything else as-is */
1197 else
1198 *p++ = (char) ch;
1199 }
1200 if (quotes)
1201 *p++ = q[1];
1202
1203 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001204 if (_PyString_Resize(&repr, p - q))
1205 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001206
1207 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001208
1209 onError:
1210 Py_DECREF(repr);
1211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001212}
1213
1214PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1215 int size)
1216{
1217 return unicodeescape_string(s, size, 0);
1218}
1219
1220PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1221{
1222 if (!PyUnicode_Check(unicode)) {
1223 PyErr_BadArgument();
1224 return NULL;
1225 }
1226 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1227 PyUnicode_GET_SIZE(unicode));
1228}
1229
1230/* --- Raw Unicode Escape Codec ------------------------------------------- */
1231
1232PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1233 int size,
1234 const char *errors)
1235{
1236 PyUnicodeObject *v;
1237 Py_UNICODE *p, *buf;
1238 const char *end;
1239 const char *bs;
1240
1241 /* Escaped strings will always be longer than the resulting
1242 Unicode string, so we start with size here and then reduce the
1243 length after conversion to the true value. */
1244 v = _PyUnicode_New(size);
1245 if (v == NULL)
1246 goto onError;
1247 if (size == 0)
1248 return (PyObject *)v;
1249 p = buf = PyUnicode_AS_UNICODE(v);
1250 end = s + size;
1251 while (s < end) {
1252 unsigned char c;
1253 unsigned int x;
1254 int i;
1255
1256 /* Non-escape characters are interpreted as Unicode ordinals */
1257 if (*s != '\\') {
1258 *p++ = (unsigned char)*s++;
1259 continue;
1260 }
1261
1262 /* \u-escapes are only interpreted iff the number of leading
1263 backslashes if odd */
1264 bs = s;
1265 for (;s < end;) {
1266 if (*s != '\\')
1267 break;
1268 *p++ = (unsigned char)*s++;
1269 }
1270 if (((s - bs) & 1) == 0 ||
1271 s >= end ||
1272 *s != 'u') {
1273 continue;
1274 }
1275 p--;
1276 s++;
1277
1278 /* \uXXXX with 4 hex digits */
1279 for (x = 0, i = 0; i < 4; i++) {
1280 c = (unsigned char)s[i];
1281 if (!isxdigit(c)) {
1282 if (unicodeescape_decoding_error(&s, &x, errors,
1283 "truncated \\uXXXX"))
1284 goto onError;
1285 i++;
1286 break;
1287 }
1288 x = (x<<4) & ~0xF;
1289 if (c >= '0' && c <= '9')
1290 x += c - '0';
1291 else if (c >= 'a' && c <= 'f')
1292 x += 10 + c - 'a';
1293 else
1294 x += 10 + c - 'A';
1295 }
1296 s += i;
1297 *p++ = x;
1298 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001299 if (_PyUnicode_Resize(v, (int)(p - buf)))
1300 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001301 return (PyObject *)v;
1302
1303 onError:
1304 Py_XDECREF(v);
1305 return NULL;
1306}
1307
1308PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1309 int size)
1310{
1311 PyObject *repr;
1312 char *p;
1313 char *q;
1314
1315 static const char *hexdigit = "0123456789ABCDEF";
1316
1317 repr = PyString_FromStringAndSize(NULL, 6 * size);
1318 if (repr == NULL)
1319 return NULL;
1320
1321 p = q = PyString_AS_STRING(repr);
1322 while (size-- > 0) {
1323 Py_UNICODE ch = *s++;
1324 /* Map 16-bit characters to '\uxxxx' */
1325 if (ch >= 256) {
1326 *p++ = '\\';
1327 *p++ = 'u';
1328 *p++ = hexdigit[(ch >> 12) & 0xf];
1329 *p++ = hexdigit[(ch >> 8) & 0xf];
1330 *p++ = hexdigit[(ch >> 4) & 0xf];
1331 *p++ = hexdigit[ch & 15];
1332 }
1333 /* Copy everything else as-is */
1334 else
1335 *p++ = (char) ch;
1336 }
1337 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001338 if (_PyString_Resize(&repr, p - q))
1339 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001340
1341 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001342
1343 onError:
1344 Py_DECREF(repr);
1345 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001346}
1347
1348PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1349{
1350 if (!PyUnicode_Check(unicode)) {
1351 PyErr_BadArgument();
1352 return NULL;
1353 }
1354 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1355 PyUnicode_GET_SIZE(unicode));
1356}
1357
1358/* --- Latin-1 Codec ------------------------------------------------------ */
1359
1360PyObject *PyUnicode_DecodeLatin1(const char *s,
1361 int size,
1362 const char *errors)
1363{
1364 PyUnicodeObject *v;
1365 Py_UNICODE *p;
1366
1367 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1368 v = _PyUnicode_New(size);
1369 if (v == NULL)
1370 goto onError;
1371 if (size == 0)
1372 return (PyObject *)v;
1373 p = PyUnicode_AS_UNICODE(v);
1374 while (size-- > 0)
1375 *p++ = (unsigned char)*s++;
1376 return (PyObject *)v;
1377
1378 onError:
1379 Py_XDECREF(v);
1380 return NULL;
1381}
1382
1383static
1384int latin1_encoding_error(const Py_UNICODE **source,
1385 char **dest,
1386 const char *errors,
1387 const char *details)
1388{
1389 if ((errors == NULL) ||
1390 (strcmp(errors,"strict") == 0)) {
1391 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001392 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001393 details);
1394 return -1;
1395 }
1396 else if (strcmp(errors,"ignore") == 0) {
1397 return 0;
1398 }
1399 else if (strcmp(errors,"replace") == 0) {
1400 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001401 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001402 return 0;
1403 }
1404 else {
1405 PyErr_Format(PyExc_ValueError,
1406 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001407 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001408 errors);
1409 return -1;
1410 }
1411}
1412
1413PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1414 int size,
1415 const char *errors)
1416{
1417 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001418 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001419 repr = PyString_FromStringAndSize(NULL, size);
1420 if (repr == NULL)
1421 return NULL;
1422
1423 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001424 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001425 while (size-- > 0) {
1426 Py_UNICODE ch = *p++;
1427 if (ch >= 256) {
1428 if (latin1_encoding_error(&p, &s, errors,
1429 "ordinal not in range(256)"))
1430 goto onError;
1431 }
1432 else
1433 *s++ = (char)ch;
1434 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001435 /* Resize if error handling skipped some characters */
1436 if (s - start < PyString_GET_SIZE(repr))
1437 if (_PyString_Resize(&repr, s - start))
1438 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001439 return repr;
1440
1441 onError:
1442 Py_DECREF(repr);
1443 return NULL;
1444}
1445
1446PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1447{
1448 if (!PyUnicode_Check(unicode)) {
1449 PyErr_BadArgument();
1450 return NULL;
1451 }
1452 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1453 PyUnicode_GET_SIZE(unicode),
1454 NULL);
1455}
1456
1457/* --- 7-bit ASCII Codec -------------------------------------------------- */
1458
1459static
1460int ascii_decoding_error(const char **source,
1461 Py_UNICODE **dest,
1462 const char *errors,
1463 const char *details)
1464{
1465 if ((errors == NULL) ||
1466 (strcmp(errors,"strict") == 0)) {
1467 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001468 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001469 details);
1470 return -1;
1471 }
1472 else if (strcmp(errors,"ignore") == 0) {
1473 return 0;
1474 }
1475 else if (strcmp(errors,"replace") == 0) {
1476 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1477 (*dest)++;
1478 return 0;
1479 }
1480 else {
1481 PyErr_Format(PyExc_ValueError,
1482 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001483 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001484 errors);
1485 return -1;
1486 }
1487}
1488
1489PyObject *PyUnicode_DecodeASCII(const char *s,
1490 int size,
1491 const char *errors)
1492{
1493 PyUnicodeObject *v;
1494 Py_UNICODE *p;
1495
1496 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1497 v = _PyUnicode_New(size);
1498 if (v == NULL)
1499 goto onError;
1500 if (size == 0)
1501 return (PyObject *)v;
1502 p = PyUnicode_AS_UNICODE(v);
1503 while (size-- > 0) {
1504 register unsigned char c;
1505
1506 c = (unsigned char)*s++;
1507 if (c < 128)
1508 *p++ = c;
1509 else if (ascii_decoding_error(&s, &p, errors,
1510 "ordinal not in range(128)"))
1511 goto onError;
1512 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001513 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1514 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1515 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001516 return (PyObject *)v;
1517
1518 onError:
1519 Py_XDECREF(v);
1520 return NULL;
1521}
1522
1523static
1524int ascii_encoding_error(const Py_UNICODE **source,
1525 char **dest,
1526 const char *errors,
1527 const char *details)
1528{
1529 if ((errors == NULL) ||
1530 (strcmp(errors,"strict") == 0)) {
1531 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001532 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001533 details);
1534 return -1;
1535 }
1536 else if (strcmp(errors,"ignore") == 0) {
1537 return 0;
1538 }
1539 else if (strcmp(errors,"replace") == 0) {
1540 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001541 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001542 return 0;
1543 }
1544 else {
1545 PyErr_Format(PyExc_ValueError,
1546 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001547 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001548 errors);
1549 return -1;
1550 }
1551}
1552
1553PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1554 int size,
1555 const char *errors)
1556{
1557 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001558 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001559 repr = PyString_FromStringAndSize(NULL, size);
1560 if (repr == NULL)
1561 return NULL;
1562
1563 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001564 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001565 while (size-- > 0) {
1566 Py_UNICODE ch = *p++;
1567 if (ch >= 128) {
1568 if (ascii_encoding_error(&p, &s, errors,
1569 "ordinal not in range(128)"))
1570 goto onError;
1571 }
1572 else
1573 *s++ = (char)ch;
1574 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001575 /* Resize if error handling skipped some characters */
1576 if (s - start < PyString_GET_SIZE(repr))
1577 if (_PyString_Resize(&repr, s - start))
1578 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001579 return repr;
1580
1581 onError:
1582 Py_DECREF(repr);
1583 return NULL;
1584}
1585
1586PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1587{
1588 if (!PyUnicode_Check(unicode)) {
1589 PyErr_BadArgument();
1590 return NULL;
1591 }
1592 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1593 PyUnicode_GET_SIZE(unicode),
1594 NULL);
1595}
1596
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001597#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001598
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001599/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001600
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001601PyObject *PyUnicode_DecodeMBCS(const char *s,
1602 int size,
1603 const char *errors)
1604{
1605 PyUnicodeObject *v;
1606 Py_UNICODE *p;
1607
1608 /* First get the size of the result */
1609 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001610 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001611 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1612
1613 v = _PyUnicode_New(usize);
1614 if (v == NULL)
1615 return NULL;
1616 if (usize == 0)
1617 return (PyObject *)v;
1618 p = PyUnicode_AS_UNICODE(v);
1619 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1620 Py_DECREF(v);
1621 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1622 }
1623
1624 return (PyObject *)v;
1625}
1626
1627PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1628 int size,
1629 const char *errors)
1630{
1631 PyObject *repr;
1632 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001633 DWORD mbcssize;
1634
1635 /* If there are no characters, bail now! */
1636 if (size==0)
1637 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001638
1639 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001640 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001641 if (mbcssize==0)
1642 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1643
1644 repr = PyString_FromStringAndSize(NULL, mbcssize);
1645 if (repr == NULL)
1646 return NULL;
1647 if (mbcssize==0)
1648 return repr;
1649
1650 /* Do the conversion */
1651 s = PyString_AS_STRING(repr);
1652 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1653 Py_DECREF(repr);
1654 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1655 }
1656 return repr;
1657}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001658
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001659#endif /* MS_WIN32 */
1660
Guido van Rossumd57fd912000-03-10 22:53:23 +00001661/* --- Character Mapping Codec -------------------------------------------- */
1662
1663static
1664int charmap_decoding_error(const char **source,
1665 Py_UNICODE **dest,
1666 const char *errors,
1667 const char *details)
1668{
1669 if ((errors == NULL) ||
1670 (strcmp(errors,"strict") == 0)) {
1671 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001672 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 details);
1674 return -1;
1675 }
1676 else if (strcmp(errors,"ignore") == 0) {
1677 return 0;
1678 }
1679 else if (strcmp(errors,"replace") == 0) {
1680 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1681 (*dest)++;
1682 return 0;
1683 }
1684 else {
1685 PyErr_Format(PyExc_ValueError,
1686 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001687 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001688 errors);
1689 return -1;
1690 }
1691}
1692
1693PyObject *PyUnicode_DecodeCharmap(const char *s,
1694 int size,
1695 PyObject *mapping,
1696 const char *errors)
1697{
1698 PyUnicodeObject *v;
1699 Py_UNICODE *p;
1700
1701 /* Default to Latin-1 */
1702 if (mapping == NULL)
1703 return PyUnicode_DecodeLatin1(s, size, errors);
1704
1705 v = _PyUnicode_New(size);
1706 if (v == NULL)
1707 goto onError;
1708 if (size == 0)
1709 return (PyObject *)v;
1710 p = PyUnicode_AS_UNICODE(v);
1711 while (size-- > 0) {
1712 unsigned char ch = *s++;
1713 PyObject *w, *x;
1714
1715 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1716 w = PyInt_FromLong((long)ch);
1717 if (w == NULL)
1718 goto onError;
1719 x = PyObject_GetItem(mapping, w);
1720 Py_DECREF(w);
1721 if (x == NULL) {
1722 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1723 /* No mapping found: default to Latin-1 mapping */
1724 PyErr_Clear();
1725 *p++ = (Py_UNICODE)ch;
1726 continue;
1727 }
1728 goto onError;
1729 }
1730
1731 /* Apply mapping */
1732 if (PyInt_Check(x)) {
1733 int value = PyInt_AS_LONG(x);
1734 if (value < 0 || value > 65535) {
1735 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001736 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001737 Py_DECREF(x);
1738 goto onError;
1739 }
1740 *p++ = (Py_UNICODE)value;
1741 }
1742 else if (x == Py_None) {
1743 /* undefined mapping */
1744 if (charmap_decoding_error(&s, &p, errors,
1745 "character maps to <undefined>")) {
1746 Py_DECREF(x);
1747 goto onError;
1748 }
1749 }
1750 else if (PyUnicode_Check(x)) {
1751 if (PyUnicode_GET_SIZE(x) != 1) {
1752 /* 1-n mapping */
1753 PyErr_SetString(PyExc_NotImplementedError,
1754 "1-n mappings are currently not implemented");
1755 Py_DECREF(x);
1756 goto onError;
1757 }
1758 *p++ = *PyUnicode_AS_UNICODE(x);
1759 }
1760 else {
1761 /* wrong return value */
1762 PyErr_SetString(PyExc_TypeError,
1763 "character mapping must return integer, None or unicode");
1764 Py_DECREF(x);
1765 goto onError;
1766 }
1767 Py_DECREF(x);
1768 }
1769 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1770 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1771 goto onError;
1772 return (PyObject *)v;
1773
1774 onError:
1775 Py_XDECREF(v);
1776 return NULL;
1777}
1778
1779static
1780int charmap_encoding_error(const Py_UNICODE **source,
1781 char **dest,
1782 const char *errors,
1783 const char *details)
1784{
1785 if ((errors == NULL) ||
1786 (strcmp(errors,"strict") == 0)) {
1787 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001788 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001789 details);
1790 return -1;
1791 }
1792 else if (strcmp(errors,"ignore") == 0) {
1793 return 0;
1794 }
1795 else if (strcmp(errors,"replace") == 0) {
1796 **dest = '?';
1797 (*dest)++;
1798 return 0;
1799 }
1800 else {
1801 PyErr_Format(PyExc_ValueError,
1802 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001803 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001804 errors);
1805 return -1;
1806 }
1807}
1808
1809PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
1810 int size,
1811 PyObject *mapping,
1812 const char *errors)
1813{
1814 PyObject *v;
1815 char *s;
1816
1817 /* Default to Latin-1 */
1818 if (mapping == NULL)
1819 return PyUnicode_EncodeLatin1(p, size, errors);
1820
1821 v = PyString_FromStringAndSize(NULL, size);
1822 if (v == NULL)
1823 return NULL;
1824 s = PyString_AS_STRING(v);
1825 while (size-- > 0) {
1826 Py_UNICODE ch = *p++;
1827 PyObject *w, *x;
1828
1829 /* Get mapping (Unicode ordinal -> string char, integer or None) */
1830 w = PyInt_FromLong((long)ch);
1831 if (w == NULL)
1832 goto onError;
1833 x = PyObject_GetItem(mapping, w);
1834 Py_DECREF(w);
1835 if (x == NULL) {
1836 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1837 /* No mapping found: default to Latin-1 mapping if possible */
1838 PyErr_Clear();
1839 if (ch < 256) {
1840 *s++ = (char)ch;
1841 continue;
1842 }
1843 else if (!charmap_encoding_error(&p, &s, errors,
1844 "missing character mapping"))
1845 continue;
1846 }
1847 goto onError;
1848 }
1849
1850 /* Apply mapping */
1851 if (PyInt_Check(x)) {
1852 int value = PyInt_AS_LONG(x);
1853 if (value < 0 || value > 255) {
1854 PyErr_SetString(PyExc_TypeError,
1855 "character mapping must be in range(256)");
1856 Py_DECREF(x);
1857 goto onError;
1858 }
1859 *s++ = (char)value;
1860 }
1861 else if (x == Py_None) {
1862 /* undefined mapping */
1863 if (charmap_encoding_error(&p, &s, errors,
1864 "character maps to <undefined>")) {
1865 Py_DECREF(x);
1866 goto onError;
1867 }
1868 }
1869 else if (PyString_Check(x)) {
1870 if (PyString_GET_SIZE(x) != 1) {
1871 /* 1-n mapping */
1872 PyErr_SetString(PyExc_NotImplementedError,
1873 "1-n mappings are currently not implemented");
1874 Py_DECREF(x);
1875 goto onError;
1876 }
1877 *s++ = *PyString_AS_STRING(x);
1878 }
1879 else {
1880 /* wrong return value */
1881 PyErr_SetString(PyExc_TypeError,
1882 "character mapping must return integer, None or unicode");
1883 Py_DECREF(x);
1884 goto onError;
1885 }
1886 Py_DECREF(x);
1887 }
1888 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
1889 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
1890 goto onError;
1891 return v;
1892
1893 onError:
1894 Py_DECREF(v);
1895 return NULL;
1896}
1897
1898PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
1899 PyObject *mapping)
1900{
1901 if (!PyUnicode_Check(unicode) || mapping == NULL) {
1902 PyErr_BadArgument();
1903 return NULL;
1904 }
1905 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
1906 PyUnicode_GET_SIZE(unicode),
1907 mapping,
1908 NULL);
1909}
1910
1911static
1912int translate_error(const Py_UNICODE **source,
1913 Py_UNICODE **dest,
1914 const char *errors,
1915 const char *details)
1916{
1917 if ((errors == NULL) ||
1918 (strcmp(errors,"strict") == 0)) {
1919 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001920 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001921 details);
1922 return -1;
1923 }
1924 else if (strcmp(errors,"ignore") == 0) {
1925 return 0;
1926 }
1927 else if (strcmp(errors,"replace") == 0) {
1928 **dest = '?';
1929 (*dest)++;
1930 return 0;
1931 }
1932 else {
1933 PyErr_Format(PyExc_ValueError,
1934 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001935 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001936 errors);
1937 return -1;
1938 }
1939}
1940
1941PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
1942 int size,
1943 PyObject *mapping,
1944 const char *errors)
1945{
1946 PyUnicodeObject *v;
1947 Py_UNICODE *p;
1948
1949 if (mapping == NULL) {
1950 PyErr_BadArgument();
1951 return NULL;
1952 }
1953
1954 /* Output will never be longer than input */
1955 v = _PyUnicode_New(size);
1956 if (v == NULL)
1957 goto onError;
1958 if (size == 0)
1959 goto done;
1960 p = PyUnicode_AS_UNICODE(v);
1961 while (size-- > 0) {
1962 Py_UNICODE ch = *s++;
1963 PyObject *w, *x;
1964
1965 /* Get mapping */
1966 w = PyInt_FromLong(ch);
1967 if (w == NULL)
1968 goto onError;
1969 x = PyObject_GetItem(mapping, w);
1970 Py_DECREF(w);
1971 if (x == NULL) {
1972 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1973 /* No mapping found: default to 1-1 mapping */
1974 PyErr_Clear();
1975 *p++ = ch;
1976 continue;
1977 }
1978 goto onError;
1979 }
1980
1981 /* Apply mapping */
1982 if (PyInt_Check(x))
1983 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
1984 else if (x == Py_None) {
1985 /* undefined mapping */
1986 if (translate_error(&s, &p, errors,
1987 "character maps to <undefined>")) {
1988 Py_DECREF(x);
1989 goto onError;
1990 }
1991 }
1992 else if (PyUnicode_Check(x)) {
1993 if (PyUnicode_GET_SIZE(x) != 1) {
1994 /* 1-n mapping */
1995 PyErr_SetString(PyExc_NotImplementedError,
1996 "1-n mappings are currently not implemented");
1997 Py_DECREF(x);
1998 goto onError;
1999 }
2000 *p++ = *PyUnicode_AS_UNICODE(x);
2001 }
2002 else {
2003 /* wrong return value */
2004 PyErr_SetString(PyExc_TypeError,
2005 "translate mapping must return integer, None or unicode");
2006 Py_DECREF(x);
2007 goto onError;
2008 }
2009 Py_DECREF(x);
2010 }
2011 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002012 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2013 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002014
2015 done:
2016 return (PyObject *)v;
2017
2018 onError:
2019 Py_XDECREF(v);
2020 return NULL;
2021}
2022
2023PyObject *PyUnicode_Translate(PyObject *str,
2024 PyObject *mapping,
2025 const char *errors)
2026{
2027 PyObject *result;
2028
2029 str = PyUnicode_FromObject(str);
2030 if (str == NULL)
2031 goto onError;
2032 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2033 PyUnicode_GET_SIZE(str),
2034 mapping,
2035 errors);
2036 Py_DECREF(str);
2037 return result;
2038
2039 onError:
2040 Py_XDECREF(str);
2041 return NULL;
2042}
2043
Guido van Rossum9e896b32000-04-05 20:11:21 +00002044/* --- Decimal Encoder ---------------------------------------------------- */
2045
2046int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2047 int length,
2048 char *output,
2049 const char *errors)
2050{
2051 Py_UNICODE *p, *end;
2052
2053 if (output == NULL) {
2054 PyErr_BadArgument();
2055 return -1;
2056 }
2057
2058 p = s;
2059 end = s + length;
2060 while (p < end) {
2061 register Py_UNICODE ch = *p++;
2062 int decimal;
2063
2064 if (Py_UNICODE_ISSPACE(ch)) {
2065 *output++ = ' ';
2066 continue;
2067 }
2068 decimal = Py_UNICODE_TODECIMAL(ch);
2069 if (decimal >= 0) {
2070 *output++ = '0' + decimal;
2071 continue;
2072 }
Guido van Rossumba477042000-04-06 18:18:10 +00002073 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002074 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002075 continue;
2076 }
2077 /* All other characters are considered invalid */
2078 if (errors == NULL || strcmp(errors, "strict") == 0) {
2079 PyErr_SetString(PyExc_ValueError,
2080 "invalid decimal Unicode string");
2081 goto onError;
2082 }
2083 else if (strcmp(errors, "ignore") == 0)
2084 continue;
2085 else if (strcmp(errors, "replace") == 0) {
2086 *output++ = '?';
2087 continue;
2088 }
2089 }
2090 /* 0-terminate the output string */
2091 *output++ = '\0';
2092 return 0;
2093
2094 onError:
2095 return -1;
2096}
2097
Guido van Rossumd57fd912000-03-10 22:53:23 +00002098/* --- Helpers ------------------------------------------------------------ */
2099
2100static
2101int count(PyUnicodeObject *self,
2102 int start,
2103 int end,
2104 PyUnicodeObject *substring)
2105{
2106 int count = 0;
2107
2108 end -= substring->length;
2109
2110 while (start <= end)
2111 if (Py_UNICODE_MATCH(self, start, substring)) {
2112 count++;
2113 start += substring->length;
2114 } else
2115 start++;
2116
2117 return count;
2118}
2119
2120int PyUnicode_Count(PyObject *str,
2121 PyObject *substr,
2122 int start,
2123 int end)
2124{
2125 int result;
2126
2127 str = PyUnicode_FromObject(str);
2128 if (str == NULL)
2129 return -1;
2130 substr = PyUnicode_FromObject(substr);
2131 if (substr == NULL) {
2132 Py_DECREF(substr);
2133 return -1;
2134 }
2135
2136 result = count((PyUnicodeObject *)str,
2137 start, end,
2138 (PyUnicodeObject *)substr);
2139
2140 Py_DECREF(str);
2141 Py_DECREF(substr);
2142 return result;
2143}
2144
2145static
2146int findstring(PyUnicodeObject *self,
2147 PyUnicodeObject *substring,
2148 int start,
2149 int end,
2150 int direction)
2151{
2152 if (start < 0)
2153 start += self->length;
2154 if (start < 0)
2155 start = 0;
2156
2157 if (substring->length == 0)
2158 return start;
2159
2160 if (end > self->length)
2161 end = self->length;
2162 if (end < 0)
2163 end += self->length;
2164 if (end < 0)
2165 end = 0;
2166
2167 end -= substring->length;
2168
2169 if (direction < 0) {
2170 for (; end >= start; end--)
2171 if (Py_UNICODE_MATCH(self, end, substring))
2172 return end;
2173 } else {
2174 for (; start <= end; start++)
2175 if (Py_UNICODE_MATCH(self, start, substring))
2176 return start;
2177 }
2178
2179 return -1;
2180}
2181
2182int PyUnicode_Find(PyObject *str,
2183 PyObject *substr,
2184 int start,
2185 int end,
2186 int direction)
2187{
2188 int result;
2189
2190 str = PyUnicode_FromObject(str);
2191 if (str == NULL)
2192 return -1;
2193 substr = PyUnicode_FromObject(substr);
2194 if (substr == NULL) {
2195 Py_DECREF(substr);
2196 return -1;
2197 }
2198
2199 result = findstring((PyUnicodeObject *)str,
2200 (PyUnicodeObject *)substr,
2201 start, end, direction);
2202 Py_DECREF(str);
2203 Py_DECREF(substr);
2204 return result;
2205}
2206
2207static
2208int tailmatch(PyUnicodeObject *self,
2209 PyUnicodeObject *substring,
2210 int start,
2211 int end,
2212 int direction)
2213{
2214 if (start < 0)
2215 start += self->length;
2216 if (start < 0)
2217 start = 0;
2218
2219 if (substring->length == 0)
2220 return 1;
2221
2222 if (end > self->length)
2223 end = self->length;
2224 if (end < 0)
2225 end += self->length;
2226 if (end < 0)
2227 end = 0;
2228
2229 end -= substring->length;
2230 if (end < start)
2231 return 0;
2232
2233 if (direction > 0) {
2234 if (Py_UNICODE_MATCH(self, end, substring))
2235 return 1;
2236 } else {
2237 if (Py_UNICODE_MATCH(self, start, substring))
2238 return 1;
2239 }
2240
2241 return 0;
2242}
2243
2244int PyUnicode_Tailmatch(PyObject *str,
2245 PyObject *substr,
2246 int start,
2247 int end,
2248 int direction)
2249{
2250 int result;
2251
2252 str = PyUnicode_FromObject(str);
2253 if (str == NULL)
2254 return -1;
2255 substr = PyUnicode_FromObject(substr);
2256 if (substr == NULL) {
2257 Py_DECREF(substr);
2258 return -1;
2259 }
2260
2261 result = tailmatch((PyUnicodeObject *)str,
2262 (PyUnicodeObject *)substr,
2263 start, end, direction);
2264 Py_DECREF(str);
2265 Py_DECREF(substr);
2266 return result;
2267}
2268
2269static
2270const Py_UNICODE *findchar(const Py_UNICODE *s,
2271 int size,
2272 Py_UNICODE ch)
2273{
2274 /* like wcschr, but doesn't stop at NULL characters */
2275
2276 while (size-- > 0) {
2277 if (*s == ch)
2278 return s;
2279 s++;
2280 }
2281
2282 return NULL;
2283}
2284
2285/* Apply fixfct filter to the Unicode object self and return a
2286 reference to the modified object */
2287
2288static
2289PyObject *fixup(PyUnicodeObject *self,
2290 int (*fixfct)(PyUnicodeObject *s))
2291{
2292
2293 PyUnicodeObject *u;
2294
2295 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2296 self->length);
2297 if (u == NULL)
2298 return NULL;
2299 if (!fixfct(u)) {
2300 /* fixfct should return TRUE if it modified the buffer. If
2301 FALSE, return a reference to the original buffer instead
2302 (to save space, not time) */
2303 Py_INCREF(self);
2304 Py_DECREF(u);
2305 return (PyObject*) self;
2306 }
2307 return (PyObject*) u;
2308}
2309
2310static
2311int fixupper(PyUnicodeObject *self)
2312{
2313 int len = self->length;
2314 Py_UNICODE *s = self->str;
2315 int status = 0;
2316
2317 while (len-- > 0) {
2318 register Py_UNICODE ch;
2319
2320 ch = Py_UNICODE_TOUPPER(*s);
2321 if (ch != *s) {
2322 status = 1;
2323 *s = ch;
2324 }
2325 s++;
2326 }
2327
2328 return status;
2329}
2330
2331static
2332int fixlower(PyUnicodeObject *self)
2333{
2334 int len = self->length;
2335 Py_UNICODE *s = self->str;
2336 int status = 0;
2337
2338 while (len-- > 0) {
2339 register Py_UNICODE ch;
2340
2341 ch = Py_UNICODE_TOLOWER(*s);
2342 if (ch != *s) {
2343 status = 1;
2344 *s = ch;
2345 }
2346 s++;
2347 }
2348
2349 return status;
2350}
2351
2352static
2353int fixswapcase(PyUnicodeObject *self)
2354{
2355 int len = self->length;
2356 Py_UNICODE *s = self->str;
2357 int status = 0;
2358
2359 while (len-- > 0) {
2360 if (Py_UNICODE_ISUPPER(*s)) {
2361 *s = Py_UNICODE_TOLOWER(*s);
2362 status = 1;
2363 } else if (Py_UNICODE_ISLOWER(*s)) {
2364 *s = Py_UNICODE_TOUPPER(*s);
2365 status = 1;
2366 }
2367 s++;
2368 }
2369
2370 return status;
2371}
2372
2373static
2374int fixcapitalize(PyUnicodeObject *self)
2375{
2376 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2377 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2378 return 1;
2379 }
2380 return 0;
2381}
2382
2383static
2384int fixtitle(PyUnicodeObject *self)
2385{
2386 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2387 register Py_UNICODE *e;
2388 int previous_is_cased;
2389
2390 /* Shortcut for single character strings */
2391 if (PyUnicode_GET_SIZE(self) == 1) {
2392 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2393 if (*p != ch) {
2394 *p = ch;
2395 return 1;
2396 }
2397 else
2398 return 0;
2399 }
2400
2401 e = p + PyUnicode_GET_SIZE(self);
2402 previous_is_cased = 0;
2403 for (; p < e; p++) {
2404 register const Py_UNICODE ch = *p;
2405
2406 if (previous_is_cased)
2407 *p = Py_UNICODE_TOLOWER(ch);
2408 else
2409 *p = Py_UNICODE_TOTITLE(ch);
2410
2411 if (Py_UNICODE_ISLOWER(ch) ||
2412 Py_UNICODE_ISUPPER(ch) ||
2413 Py_UNICODE_ISTITLE(ch))
2414 previous_is_cased = 1;
2415 else
2416 previous_is_cased = 0;
2417 }
2418 return 1;
2419}
2420
2421PyObject *PyUnicode_Join(PyObject *separator,
2422 PyObject *seq)
2423{
2424 Py_UNICODE *sep;
2425 int seplen;
2426 PyUnicodeObject *res = NULL;
2427 int reslen = 0;
2428 Py_UNICODE *p;
2429 int seqlen = 0;
2430 int sz = 100;
2431 int i;
2432
2433 seqlen = PySequence_Length(seq);
2434 if (seqlen < 0 && PyErr_Occurred())
2435 return NULL;
2436
2437 if (separator == NULL) {
2438 Py_UNICODE blank = ' ';
2439 sep = &blank;
2440 seplen = 1;
2441 }
2442 else {
2443 separator = PyUnicode_FromObject(separator);
2444 if (separator == NULL)
2445 return NULL;
2446 sep = PyUnicode_AS_UNICODE(separator);
2447 seplen = PyUnicode_GET_SIZE(separator);
2448 }
2449
2450 res = _PyUnicode_New(sz);
2451 if (res == NULL)
2452 goto onError;
2453 p = PyUnicode_AS_UNICODE(res);
2454 reslen = 0;
2455
2456 for (i = 0; i < seqlen; i++) {
2457 int itemlen;
2458 PyObject *item;
2459
2460 item = PySequence_GetItem(seq, i);
2461 if (item == NULL)
2462 goto onError;
2463 if (!PyUnicode_Check(item)) {
2464 PyObject *v;
2465 v = PyUnicode_FromObject(item);
2466 Py_DECREF(item);
2467 item = v;
2468 if (item == NULL)
2469 goto onError;
2470 }
2471 itemlen = PyUnicode_GET_SIZE(item);
2472 while (reslen + itemlen + seplen >= sz) {
2473 if (_PyUnicode_Resize(res, sz*2))
2474 goto onError;
2475 sz *= 2;
2476 p = PyUnicode_AS_UNICODE(res) + reslen;
2477 }
2478 if (i > 0) {
2479 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2480 p += seplen;
2481 reslen += seplen;
2482 }
2483 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2484 p += itemlen;
2485 reslen += itemlen;
2486 Py_DECREF(item);
2487 }
2488 if (_PyUnicode_Resize(res, reslen))
2489 goto onError;
2490
2491 Py_XDECREF(separator);
2492 return (PyObject *)res;
2493
2494 onError:
2495 Py_XDECREF(separator);
2496 Py_DECREF(res);
2497 return NULL;
2498}
2499
2500static
2501PyUnicodeObject *pad(PyUnicodeObject *self,
2502 int left,
2503 int right,
2504 Py_UNICODE fill)
2505{
2506 PyUnicodeObject *u;
2507
2508 if (left < 0)
2509 left = 0;
2510 if (right < 0)
2511 right = 0;
2512
2513 if (left == 0 && right == 0) {
2514 Py_INCREF(self);
2515 return self;
2516 }
2517
2518 u = _PyUnicode_New(left + self->length + right);
2519 if (u) {
2520 if (left)
2521 Py_UNICODE_FILL(u->str, fill, left);
2522 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2523 if (right)
2524 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2525 }
2526
2527 return u;
2528}
2529
/* Append the slice data[left:right] as a new Unicode object to the
   list being built.

   The macro relies on the expansion site providing a local
   `PyObject *str`, the target `list`, and an `onError` label, to
   which it jumps if creating or appending the slice fails.  It is
   wrapped in do { } while (0) so that it behaves as a single
   statement, and its arguments are parenthesized so that arbitrary
   expressions can be passed. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2540
2541static
2542PyObject *split_whitespace(PyUnicodeObject *self,
2543 PyObject *list,
2544 int maxcount)
2545{
2546 register int i;
2547 register int j;
2548 int len = self->length;
2549 PyObject *str;
2550
2551 for (i = j = 0; i < len; ) {
2552 /* find a token */
2553 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2554 i++;
2555 j = i;
2556 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2557 i++;
2558 if (j < i) {
2559 if (maxcount-- <= 0)
2560 break;
2561 SPLIT_APPEND(self->str, j, i);
2562 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2563 i++;
2564 j = i;
2565 }
2566 }
2567 if (j < len) {
2568 SPLIT_APPEND(self->str, j, len);
2569 }
2570 return list;
2571
2572 onError:
2573 Py_DECREF(list);
2574 return NULL;
2575}
2576
2577PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002578 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002579{
2580 register int i;
2581 register int j;
2582 int len;
2583 PyObject *list;
2584 PyObject *str;
2585 Py_UNICODE *data;
2586
2587 string = PyUnicode_FromObject(string);
2588 if (string == NULL)
2589 return NULL;
2590 data = PyUnicode_AS_UNICODE(string);
2591 len = PyUnicode_GET_SIZE(string);
2592
Guido van Rossumd57fd912000-03-10 22:53:23 +00002593 list = PyList_New(0);
2594 if (!list)
2595 goto onError;
2596
2597 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002598 int eol;
2599
Guido van Rossumd57fd912000-03-10 22:53:23 +00002600 /* Find a line and append it */
2601 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2602 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002603
2604 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002605 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002606 if (i < len) {
2607 if (data[i] == '\r' && i + 1 < len &&
2608 data[i+1] == '\n')
2609 i += 2;
2610 else
2611 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002612 if (keepends)
2613 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002614 }
Guido van Rossum86662912000-04-11 15:38:46 +00002615 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002616 j = i;
2617 }
2618 if (j < len) {
2619 SPLIT_APPEND(data, j, len);
2620 }
2621
2622 Py_DECREF(string);
2623 return list;
2624
2625 onError:
2626 Py_DECREF(list);
2627 Py_DECREF(string);
2628 return NULL;
2629}
2630
2631static
2632PyObject *split_char(PyUnicodeObject *self,
2633 PyObject *list,
2634 Py_UNICODE ch,
2635 int maxcount)
2636{
2637 register int i;
2638 register int j;
2639 int len = self->length;
2640 PyObject *str;
2641
2642 for (i = j = 0; i < len; ) {
2643 if (self->str[i] == ch) {
2644 if (maxcount-- <= 0)
2645 break;
2646 SPLIT_APPEND(self->str, j, i);
2647 i = j = i + 1;
2648 } else
2649 i++;
2650 }
2651 if (j <= len) {
2652 SPLIT_APPEND(self->str, j, len);
2653 }
2654 return list;
2655
2656 onError:
2657 Py_DECREF(list);
2658 return NULL;
2659}
2660
2661static
2662PyObject *split_substring(PyUnicodeObject *self,
2663 PyObject *list,
2664 PyUnicodeObject *substring,
2665 int maxcount)
2666{
2667 register int i;
2668 register int j;
2669 int len = self->length;
2670 int sublen = substring->length;
2671 PyObject *str;
2672
2673 for (i = j = 0; i < len - sublen; ) {
2674 if (Py_UNICODE_MATCH(self, i, substring)) {
2675 if (maxcount-- <= 0)
2676 break;
2677 SPLIT_APPEND(self->str, j, i);
2678 i = j = i + sublen;
2679 } else
2680 i++;
2681 }
2682 if (j <= len) {
2683 SPLIT_APPEND(self->str, j, len);
2684 }
2685 return list;
2686
2687 onError:
2688 Py_DECREF(list);
2689 return NULL;
2690}
2691
2692#undef SPLIT_APPEND
2693
2694static
2695PyObject *split(PyUnicodeObject *self,
2696 PyUnicodeObject *substring,
2697 int maxcount)
2698{
2699 PyObject *list;
2700
2701 if (maxcount < 0)
2702 maxcount = INT_MAX;
2703
2704 list = PyList_New(0);
2705 if (!list)
2706 return NULL;
2707
2708 if (substring == NULL)
2709 return split_whitespace(self,list,maxcount);
2710
2711 else if (substring->length == 1)
2712 return split_char(self,list,substring->str[0],maxcount);
2713
2714 else if (substring->length == 0) {
2715 Py_DECREF(list);
2716 PyErr_SetString(PyExc_ValueError, "empty separator");
2717 return NULL;
2718 }
2719 else
2720 return split_substring(self,list,substring,maxcount);
2721}
2722
2723static
2724PyObject *strip(PyUnicodeObject *self,
2725 int left,
2726 int right)
2727{
2728 Py_UNICODE *p = self->str;
2729 int start = 0;
2730 int end = self->length;
2731
2732 if (left)
2733 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2734 start++;
2735
2736 if (right)
2737 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2738 end--;
2739
2740 if (start == 0 && end == self->length) {
2741 /* couldn't strip anything off, return original string */
2742 Py_INCREF(self);
2743 return (PyObject*) self;
2744 }
2745
2746 return (PyObject*) PyUnicode_FromUnicode(
2747 self->str + start,
2748 end - start
2749 );
2750}
2751
2752static
2753PyObject *replace(PyUnicodeObject *self,
2754 PyUnicodeObject *str1,
2755 PyUnicodeObject *str2,
2756 int maxcount)
2757{
2758 PyUnicodeObject *u;
2759
2760 if (maxcount < 0)
2761 maxcount = INT_MAX;
2762
2763 if (str1->length == 1 && str2->length == 1) {
2764 int i;
2765
2766 /* replace characters */
2767 if (!findchar(self->str, self->length, str1->str[0])) {
2768 /* nothing to replace, return original string */
2769 Py_INCREF(self);
2770 u = self;
2771 } else {
2772 Py_UNICODE u1 = str1->str[0];
2773 Py_UNICODE u2 = str2->str[0];
2774
2775 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
2776 self->str,
2777 self->length
2778 );
2779 if (u)
2780 for (i = 0; i < u->length; i++)
2781 if (u->str[i] == u1) {
2782 if (--maxcount < 0)
2783 break;
2784 u->str[i] = u2;
2785 }
2786 }
2787
2788 } else {
2789 int n, i;
2790 Py_UNICODE *p;
2791
2792 /* replace strings */
2793 n = count(self, 0, self->length, str1);
2794 if (n > maxcount)
2795 n = maxcount;
2796 if (n == 0) {
2797 /* nothing to replace, return original string */
2798 Py_INCREF(self);
2799 u = self;
2800 } else {
2801 u = _PyUnicode_New(
2802 self->length + n * (str2->length - str1->length));
2803 if (u) {
2804 i = 0;
2805 p = u->str;
2806 while (i <= self->length - str1->length)
2807 if (Py_UNICODE_MATCH(self, i, str1)) {
2808 /* replace string segment */
2809 Py_UNICODE_COPY(p, str2->str, str2->length);
2810 p += str2->length;
2811 i += str1->length;
2812 if (--n <= 0) {
2813 /* copy remaining part */
2814 Py_UNICODE_COPY(p, self->str+i, self->length-i);
2815 break;
2816 }
2817 } else
2818 *p++ = self->str[i++];
2819 }
2820 }
2821 }
2822
2823 return (PyObject *) u;
2824}
2825
2826/* --- Unicode Object Methods --------------------------------------------- */
2827
2828static char title__doc__[] =
2829"S.title() -> unicode\n\
2830\n\
2831Return a titlecased version of S, i.e. words start with title case\n\
2832characters, all remaining cased characters have lower case.";
2833
2834static PyObject*
2835unicode_title(PyUnicodeObject *self, PyObject *args)
2836{
2837 if (!PyArg_NoArgs(args))
2838 return NULL;
2839 return fixup(self, fixtitle);
2840}
2841
2842static char capitalize__doc__[] =
2843"S.capitalize() -> unicode\n\
2844\n\
2845Return a capitalized version of S, i.e. make the first character\n\
2846have upper case.";
2847
2848static PyObject*
2849unicode_capitalize(PyUnicodeObject *self, PyObject *args)
2850{
2851 if (!PyArg_NoArgs(args))
2852 return NULL;
2853 return fixup(self, fixcapitalize);
2854}
2855
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* S.capwords() (currently compiled out): split at whitespace,
   capitalize each word and join the words again with single
   blanks. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result;
    int n;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry in place */
    for (n = 0; n < PyList_GET_SIZE(words); n++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, n),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, n));
        PyList_SET_ITEM(words, n, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
2896
2897static char center__doc__[] =
2898"S.center(width) -> unicode\n\
2899\n\
2900Return S centered in a Unicode string of length width. Padding is done\n\
2901using spaces.";
2902
2903static PyObject *
2904unicode_center(PyUnicodeObject *self, PyObject *args)
2905{
2906 int marg, left;
2907 int width;
2908
2909 if (!PyArg_ParseTuple(args, "i:center", &width))
2910 return NULL;
2911
2912 if (self->length >= width) {
2913 Py_INCREF(self);
2914 return (PyObject*) self;
2915 }
2916
2917 marg = width - self->length;
2918 left = marg / 2 + (marg & width & 1);
2919
2920 return (PyObject*) pad(self, left, marg - left, ' ');
2921}
2922
2923static int
2924unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
2925{
2926 int len1, len2;
2927 Py_UNICODE *s1 = str1->str;
2928 Py_UNICODE *s2 = str2->str;
2929
2930 len1 = str1->length;
2931 len2 = str2->length;
2932
2933 while (len1 > 0 && len2 > 0) {
2934 int cmp = (*s1++) - (*s2++);
2935 if (cmp)
2936 /* This should make Christian happy! */
2937 return (cmp < 0) ? -1 : (cmp != 0);
2938 len1--, len2--;
2939 }
2940
2941 return (len1 < len2) ? -1 : (len1 != len2);
2942}
2943
2944int PyUnicode_Compare(PyObject *left,
2945 PyObject *right)
2946{
2947 PyUnicodeObject *u = NULL, *v = NULL;
2948 int result;
2949
2950 /* Coerce the two arguments */
2951 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2952 if (u == NULL)
2953 goto onError;
2954 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2955 if (v == NULL)
2956 goto onError;
2957
2958 /* Shortcut for emtpy or interned objects */
2959 if (v == u) {
2960 Py_DECREF(u);
2961 Py_DECREF(v);
2962 return 0;
2963 }
2964
2965 result = unicode_compare(u, v);
2966
2967 Py_DECREF(u);
2968 Py_DECREF(v);
2969 return result;
2970
2971onError:
2972 Py_XDECREF(u);
2973 Py_XDECREF(v);
2974 return -1;
2975}
2976
Guido van Rossum403d68b2000-03-13 15:55:09 +00002977int PyUnicode_Contains(PyObject *container,
2978 PyObject *element)
2979{
2980 PyUnicodeObject *u = NULL, *v = NULL;
2981 int result;
2982 register const Py_UNICODE *p, *e;
2983 register Py_UNICODE ch;
2984
2985 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00002986 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
2987 if (v == NULL)
2988 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002989 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
2990 if (u == NULL) {
2991 Py_DECREF(v);
2992 goto onError;
2993 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00002994
2995 /* Check v in u */
2996 if (PyUnicode_GET_SIZE(v) != 1) {
2997 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00002998 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00002999 goto onError;
3000 }
3001 ch = *PyUnicode_AS_UNICODE(v);
3002 p = PyUnicode_AS_UNICODE(u);
3003 e = p + PyUnicode_GET_SIZE(u);
3004 result = 0;
3005 while (p < e) {
3006 if (*p++ == ch) {
3007 result = 1;
3008 break;
3009 }
3010 }
3011
3012 Py_DECREF(u);
3013 Py_DECREF(v);
3014 return result;
3015
3016onError:
3017 Py_XDECREF(u);
3018 Py_XDECREF(v);
3019 return -1;
3020}
3021
Guido van Rossumd57fd912000-03-10 22:53:23 +00003022/* Concat to string or Unicode object giving a new Unicode object. */
3023
3024PyObject *PyUnicode_Concat(PyObject *left,
3025 PyObject *right)
3026{
3027 PyUnicodeObject *u = NULL, *v = NULL, *w;
3028
3029 /* Coerce the two arguments */
3030 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3031 if (u == NULL)
3032 goto onError;
3033 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3034 if (v == NULL)
3035 goto onError;
3036
3037 /* Shortcuts */
3038 if (v == unicode_empty) {
3039 Py_DECREF(v);
3040 return (PyObject *)u;
3041 }
3042 if (u == unicode_empty) {
3043 Py_DECREF(u);
3044 return (PyObject *)v;
3045 }
3046
3047 /* Concat the two Unicode strings */
3048 w = _PyUnicode_New(u->length + v->length);
3049 if (w == NULL)
3050 goto onError;
3051 Py_UNICODE_COPY(w->str, u->str, u->length);
3052 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3053
3054 Py_DECREF(u);
3055 Py_DECREF(v);
3056 return (PyObject *)w;
3057
3058onError:
3059 Py_XDECREF(u);
3060 Py_XDECREF(v);
3061 return NULL;
3062}
3063
3064static char count__doc__[] =
3065"S.count(sub[, start[, end]]) -> int\n\
3066\n\
3067Return the number of occurrences of substring sub in Unicode string\n\
3068S[start:end]. Optional arguments start and end are\n\
3069interpreted as in slice notation.";
3070
3071static PyObject *
3072unicode_count(PyUnicodeObject *self, PyObject *args)
3073{
3074 PyUnicodeObject *substring;
3075 int start = 0;
3076 int end = INT_MAX;
3077 PyObject *result;
3078
Guido van Rossumb8872e62000-05-09 14:14:27 +00003079 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3080 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003081 return NULL;
3082
3083 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3084 (PyObject *)substring);
3085 if (substring == NULL)
3086 return NULL;
3087
3088 if (substring->length == 0) {
3089 Py_DECREF(substring);
3090 return PyInt_FromLong((long) 0);
3091 }
3092
3093 if (start < 0)
3094 start += self->length;
3095 if (start < 0)
3096 start = 0;
3097 if (end > self->length)
3098 end = self->length;
3099 if (end < 0)
3100 end += self->length;
3101 if (end < 0)
3102 end = 0;
3103
3104 result = PyInt_FromLong((long) count(self, start, end, substring));
3105
3106 Py_DECREF(substring);
3107 return result;
3108}
3109
3110static char encode__doc__[] =
3111"S.encode([encoding[,errors]]) -> string\n\
3112\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003113Return an encoded string version of S. Default encoding is the current\n\
3114default string encoding. errors may be given to set a different error\n\
3115handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3116a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003117
3118static PyObject *
3119unicode_encode(PyUnicodeObject *self, PyObject *args)
3120{
3121 char *encoding = NULL;
3122 char *errors = NULL;
3123 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3124 return NULL;
3125 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3126}
3127
3128static char expandtabs__doc__[] =
3129"S.expandtabs([tabsize]) -> unicode\n\
3130\n\
3131Return a copy of S where all tab characters are expanded using spaces.\n\
3132If tabsize is not given, a tab size of 8 characters is assumed.";
3133
3134static PyObject*
3135unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3136{
3137 Py_UNICODE *e;
3138 Py_UNICODE *p;
3139 Py_UNICODE *q;
3140 int i, j;
3141 PyUnicodeObject *u;
3142 int tabsize = 8;
3143
3144 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3145 return NULL;
3146
3147 /* First pass: determine size of ouput string */
3148 i = j = 0;
3149 e = self->str + self->length;
3150 for (p = self->str; p < e; p++)
3151 if (*p == '\t') {
3152 if (tabsize > 0)
3153 j += tabsize - (j % tabsize);
3154 }
3155 else {
3156 j++;
3157 if (*p == '\n' || *p == '\r') {
3158 i += j;
3159 j = 0;
3160 }
3161 }
3162
3163 /* Second pass: create output string and fill it */
3164 u = _PyUnicode_New(i + j);
3165 if (!u)
3166 return NULL;
3167
3168 j = 0;
3169 q = u->str;
3170
3171 for (p = self->str; p < e; p++)
3172 if (*p == '\t') {
3173 if (tabsize > 0) {
3174 i = tabsize - (j % tabsize);
3175 j += i;
3176 while (i--)
3177 *q++ = ' ';
3178 }
3179 }
3180 else {
3181 j++;
3182 *q++ = *p;
3183 if (*p == '\n' || *p == '\r')
3184 j = 0;
3185 }
3186
3187 return (PyObject*) u;
3188}
3189
3190static char find__doc__[] =
3191"S.find(sub [,start [,end]]) -> int\n\
3192\n\
3193Return the lowest index in S where substring sub is found,\n\
3194such that sub is contained within s[start,end]. Optional\n\
3195arguments start and end are interpreted as in slice notation.\n\
3196\n\
3197Return -1 on failure.";
3198
3199static PyObject *
3200unicode_find(PyUnicodeObject *self, PyObject *args)
3201{
3202 PyUnicodeObject *substring;
3203 int start = 0;
3204 int end = INT_MAX;
3205 PyObject *result;
3206
Guido van Rossumb8872e62000-05-09 14:14:27 +00003207 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3208 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003209 return NULL;
3210 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3211 (PyObject *)substring);
3212 if (substring == NULL)
3213 return NULL;
3214
3215 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3216
3217 Py_DECREF(substring);
3218 return result;
3219}
3220
3221static PyObject *
3222unicode_getitem(PyUnicodeObject *self, int index)
3223{
3224 if (index < 0 || index >= self->length) {
3225 PyErr_SetString(PyExc_IndexError, "string index out of range");
3226 return NULL;
3227 }
3228
3229 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3230}
3231
3232static long
3233unicode_hash(PyUnicodeObject *self)
3234{
3235 long hash;
3236 PyObject *utf8;
3237
3238 /* Since Unicode objects compare equal to their UTF-8 string
3239 counterparts, they should also use the UTF-8 strings as basis
3240 for their hash value. This is needed to assure that strings and
3241 Unicode objects behave in the same way as dictionary
3242 keys. Unfortunately, this costs some performance and also some
3243 memory if the cached UTF-8 representation is not used later
3244 on. */
3245 if (self->hash != -1)
3246 return self->hash;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00003247 utf8 = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248 if (utf8 == NULL)
3249 return -1;
3250 hash = PyObject_Hash(utf8);
3251 if (hash == -1)
3252 return -1;
3253 self->hash = hash;
3254 return hash;
3255}
3256
3257static char index__doc__[] =
3258"S.index(sub [,start [,end]]) -> int\n\
3259\n\
3260Like S.find() but raise ValueError when the substring is not found.";
3261
3262static PyObject *
3263unicode_index(PyUnicodeObject *self, PyObject *args)
3264{
3265 int result;
3266 PyUnicodeObject *substring;
3267 int start = 0;
3268 int end = INT_MAX;
3269
Guido van Rossumb8872e62000-05-09 14:14:27 +00003270 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3271 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003272 return NULL;
3273
3274 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3275 (PyObject *)substring);
3276 if (substring == NULL)
3277 return NULL;
3278
3279 result = findstring(self, substring, start, end, 1);
3280
3281 Py_DECREF(substring);
3282 if (result < 0) {
3283 PyErr_SetString(PyExc_ValueError, "substring not found");
3284 return NULL;
3285 }
3286 return PyInt_FromLong(result);
3287}
3288
3289static char islower__doc__[] =
3290"S.islower() -> int\n\
3291\n\
3292Return 1 if all cased characters in S are lowercase and there is\n\
3293at least one cased character in S, 0 otherwise.";
3294
3295static PyObject*
3296unicode_islower(PyUnicodeObject *self, PyObject *args)
3297{
3298 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3299 register const Py_UNICODE *e;
3300 int cased;
3301
3302 if (!PyArg_NoArgs(args))
3303 return NULL;
3304
3305 /* Shortcut for single character strings */
3306 if (PyUnicode_GET_SIZE(self) == 1)
3307 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3308
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003309 /* Special case for empty strings */
3310 if (PyString_GET_SIZE(self) == 0)
3311 return PyInt_FromLong(0);
3312
Guido van Rossumd57fd912000-03-10 22:53:23 +00003313 e = p + PyUnicode_GET_SIZE(self);
3314 cased = 0;
3315 for (; p < e; p++) {
3316 register const Py_UNICODE ch = *p;
3317
3318 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3319 return PyInt_FromLong(0);
3320 else if (!cased && Py_UNICODE_ISLOWER(ch))
3321 cased = 1;
3322 }
3323 return PyInt_FromLong(cased);
3324}
3325
3326static char isupper__doc__[] =
3327"S.isupper() -> int\n\
3328\n\
3329Return 1 if all cased characters in S are uppercase and there is\n\
3330at least one cased character in S, 0 otherwise.";
3331
3332static PyObject*
3333unicode_isupper(PyUnicodeObject *self, PyObject *args)
3334{
3335 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3336 register const Py_UNICODE *e;
3337 int cased;
3338
3339 if (!PyArg_NoArgs(args))
3340 return NULL;
3341
3342 /* Shortcut for single character strings */
3343 if (PyUnicode_GET_SIZE(self) == 1)
3344 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3345
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003346 /* Special case for empty strings */
3347 if (PyString_GET_SIZE(self) == 0)
3348 return PyInt_FromLong(0);
3349
Guido van Rossumd57fd912000-03-10 22:53:23 +00003350 e = p + PyUnicode_GET_SIZE(self);
3351 cased = 0;
3352 for (; p < e; p++) {
3353 register const Py_UNICODE ch = *p;
3354
3355 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3356 return PyInt_FromLong(0);
3357 else if (!cased && Py_UNICODE_ISUPPER(ch))
3358 cased = 1;
3359 }
3360 return PyInt_FromLong(cased);
3361}
3362
3363static char istitle__doc__[] =
3364"S.istitle() -> int\n\
3365\n\
3366Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3367may only follow uncased characters and lowercase characters only cased\n\
3368ones. Return 0 otherwise.";
3369
3370static PyObject*
3371unicode_istitle(PyUnicodeObject *self, PyObject *args)
3372{
3373 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3374 register const Py_UNICODE *e;
3375 int cased, previous_is_cased;
3376
3377 if (!PyArg_NoArgs(args))
3378 return NULL;
3379
3380 /* Shortcut for single character strings */
3381 if (PyUnicode_GET_SIZE(self) == 1)
3382 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3383 (Py_UNICODE_ISUPPER(*p) != 0));
3384
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003385 /* Special case for empty strings */
3386 if (PyString_GET_SIZE(self) == 0)
3387 return PyInt_FromLong(0);
3388
Guido van Rossumd57fd912000-03-10 22:53:23 +00003389 e = p + PyUnicode_GET_SIZE(self);
3390 cased = 0;
3391 previous_is_cased = 0;
3392 for (; p < e; p++) {
3393 register const Py_UNICODE ch = *p;
3394
3395 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3396 if (previous_is_cased)
3397 return PyInt_FromLong(0);
3398 previous_is_cased = 1;
3399 cased = 1;
3400 }
3401 else if (Py_UNICODE_ISLOWER(ch)) {
3402 if (!previous_is_cased)
3403 return PyInt_FromLong(0);
3404 previous_is_cased = 1;
3405 cased = 1;
3406 }
3407 else
3408 previous_is_cased = 0;
3409 }
3410 return PyInt_FromLong(cased);
3411}
3412
3413static char isspace__doc__[] =
3414"S.isspace() -> int\n\
3415\n\
3416Return 1 if there are only whitespace characters in S,\n\
34170 otherwise.";
3418
3419static PyObject*
3420unicode_isspace(PyUnicodeObject *self, PyObject *args)
3421{
3422 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3423 register const Py_UNICODE *e;
3424
3425 if (!PyArg_NoArgs(args))
3426 return NULL;
3427
3428 /* Shortcut for single character strings */
3429 if (PyUnicode_GET_SIZE(self) == 1 &&
3430 Py_UNICODE_ISSPACE(*p))
3431 return PyInt_FromLong(1);
3432
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003433 /* Special case for empty strings */
3434 if (PyString_GET_SIZE(self) == 0)
3435 return PyInt_FromLong(0);
3436
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437 e = p + PyUnicode_GET_SIZE(self);
3438 for (; p < e; p++) {
3439 if (!Py_UNICODE_ISSPACE(*p))
3440 return PyInt_FromLong(0);
3441 }
3442 return PyInt_FromLong(1);
3443}
3444
3445static char isdecimal__doc__[] =
3446"S.isdecimal() -> int\n\
3447\n\
3448Return 1 if there are only decimal characters in S,\n\
34490 otherwise.";
3450
3451static PyObject*
3452unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3453{
3454 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3455 register const Py_UNICODE *e;
3456
3457 if (!PyArg_NoArgs(args))
3458 return NULL;
3459
3460 /* Shortcut for single character strings */
3461 if (PyUnicode_GET_SIZE(self) == 1 &&
3462 Py_UNICODE_ISDECIMAL(*p))
3463 return PyInt_FromLong(1);
3464
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003465 /* Special case for empty strings */
3466 if (PyString_GET_SIZE(self) == 0)
3467 return PyInt_FromLong(0);
3468
Guido van Rossumd57fd912000-03-10 22:53:23 +00003469 e = p + PyUnicode_GET_SIZE(self);
3470 for (; p < e; p++) {
3471 if (!Py_UNICODE_ISDECIMAL(*p))
3472 return PyInt_FromLong(0);
3473 }
3474 return PyInt_FromLong(1);
3475}
3476
3477static char isdigit__doc__[] =
3478"S.isdigit() -> int\n\
3479\n\
3480Return 1 if there are only digit characters in S,\n\
34810 otherwise.";
3482
3483static PyObject*
3484unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3485{
3486 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3487 register const Py_UNICODE *e;
3488
3489 if (!PyArg_NoArgs(args))
3490 return NULL;
3491
3492 /* Shortcut for single character strings */
3493 if (PyUnicode_GET_SIZE(self) == 1 &&
3494 Py_UNICODE_ISDIGIT(*p))
3495 return PyInt_FromLong(1);
3496
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003497 /* Special case for empty strings */
3498 if (PyString_GET_SIZE(self) == 0)
3499 return PyInt_FromLong(0);
3500
Guido van Rossumd57fd912000-03-10 22:53:23 +00003501 e = p + PyUnicode_GET_SIZE(self);
3502 for (; p < e; p++) {
3503 if (!Py_UNICODE_ISDIGIT(*p))
3504 return PyInt_FromLong(0);
3505 }
3506 return PyInt_FromLong(1);
3507}
3508
3509static char isnumeric__doc__[] =
3510"S.isnumeric() -> int\n\
3511\n\
3512Return 1 if there are only numeric characters in S,\n\
35130 otherwise.";
3514
3515static PyObject*
3516unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3517{
3518 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3519 register const Py_UNICODE *e;
3520
3521 if (!PyArg_NoArgs(args))
3522 return NULL;
3523
3524 /* Shortcut for single character strings */
3525 if (PyUnicode_GET_SIZE(self) == 1 &&
3526 Py_UNICODE_ISNUMERIC(*p))
3527 return PyInt_FromLong(1);
3528
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003529 /* Special case for empty strings */
3530 if (PyString_GET_SIZE(self) == 0)
3531 return PyInt_FromLong(0);
3532
Guido van Rossumd57fd912000-03-10 22:53:23 +00003533 e = p + PyUnicode_GET_SIZE(self);
3534 for (; p < e; p++) {
3535 if (!Py_UNICODE_ISNUMERIC(*p))
3536 return PyInt_FromLong(0);
3537 }
3538 return PyInt_FromLong(1);
3539}
3540
3541static char join__doc__[] =
3542"S.join(sequence) -> unicode\n\
3543\n\
3544Return a string which is the concatenation of the strings in the\n\
3545sequence. The separator between elements is S.";
3546
3547static PyObject*
3548unicode_join(PyUnicodeObject *self, PyObject *args)
3549{
3550 PyObject *data;
3551 if (!PyArg_ParseTuple(args, "O:join", &data))
3552 return NULL;
3553
3554 return PyUnicode_Join((PyObject *)self, data);
3555}
3556
3557static int
3558unicode_length(PyUnicodeObject *self)
3559{
3560 return self->length;
3561}
3562
3563static char ljust__doc__[] =
3564"S.ljust(width) -> unicode\n\
3565\n\
3566Return S left justified in a Unicode string of length width. Padding is\n\
3567done using spaces.";
3568
3569static PyObject *
3570unicode_ljust(PyUnicodeObject *self, PyObject *args)
3571{
3572 int width;
3573 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3574 return NULL;
3575
3576 if (self->length >= width) {
3577 Py_INCREF(self);
3578 return (PyObject*) self;
3579 }
3580
3581 return (PyObject*) pad(self, 0, width - self->length, ' ');
3582}
3583
3584static char lower__doc__[] =
3585"S.lower() -> unicode\n\
3586\n\
3587Return a copy of the string S converted to lowercase.";
3588
3589static PyObject*
3590unicode_lower(PyUnicodeObject *self, PyObject *args)
3591{
3592 if (!PyArg_NoArgs(args))
3593 return NULL;
3594 return fixup(self, fixlower);
3595}
3596
3597static char lstrip__doc__[] =
3598"S.lstrip() -> unicode\n\
3599\n\
3600Return a copy of the string S with leading whitespace removed.";
3601
3602static PyObject *
3603unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3604{
3605 if (!PyArg_NoArgs(args))
3606 return NULL;
3607 return strip(self, 1, 0);
3608}
3609
3610static PyObject*
3611unicode_repeat(PyUnicodeObject *str, int len)
3612{
3613 PyUnicodeObject *u;
3614 Py_UNICODE *p;
3615
3616 if (len < 0)
3617 len = 0;
3618
3619 if (len == 1) {
3620 /* no repeat, return original string */
3621 Py_INCREF(str);
3622 return (PyObject*) str;
3623 }
3624
3625 u = _PyUnicode_New(len * str->length);
3626 if (!u)
3627 return NULL;
3628
3629 p = u->str;
3630
3631 while (len-- > 0) {
3632 Py_UNICODE_COPY(p, str->str, str->length);
3633 p += str->length;
3634 }
3635
3636 return (PyObject*) u;
3637}
3638
3639PyObject *PyUnicode_Replace(PyObject *obj,
3640 PyObject *subobj,
3641 PyObject *replobj,
3642 int maxcount)
3643{
3644 PyObject *self;
3645 PyObject *str1;
3646 PyObject *str2;
3647 PyObject *result;
3648
3649 self = PyUnicode_FromObject(obj);
3650 if (self == NULL)
3651 return NULL;
3652 str1 = PyUnicode_FromObject(subobj);
3653 if (str1 == NULL) {
3654 Py_DECREF(self);
3655 return NULL;
3656 }
3657 str2 = PyUnicode_FromObject(replobj);
3658 if (str2 == NULL) {
3659 Py_DECREF(self);
3660 Py_DECREF(str1);
3661 return NULL;
3662 }
3663 result = replace((PyUnicodeObject *)self,
3664 (PyUnicodeObject *)str1,
3665 (PyUnicodeObject *)str2,
3666 maxcount);
3667 Py_DECREF(self);
3668 Py_DECREF(str1);
3669 Py_DECREF(str2);
3670 return result;
3671}
3672
3673static char replace__doc__[] =
3674"S.replace (old, new[, maxsplit]) -> unicode\n\
3675\n\
3676Return a copy of S with all occurrences of substring\n\
3677old replaced by new. If the optional argument maxsplit is\n\
3678given, only the first maxsplit occurrences are replaced.";
3679
3680static PyObject*
3681unicode_replace(PyUnicodeObject *self, PyObject *args)
3682{
3683 PyUnicodeObject *str1;
3684 PyUnicodeObject *str2;
3685 int maxcount = -1;
3686 PyObject *result;
3687
3688 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
3689 return NULL;
3690 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
3691 if (str1 == NULL)
3692 return NULL;
3693 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
3694 if (str2 == NULL)
3695 return NULL;
3696
3697 result = replace(self, str1, str2, maxcount);
3698
3699 Py_DECREF(str1);
3700 Py_DECREF(str2);
3701 return result;
3702}
3703
3704static
3705PyObject *unicode_repr(PyObject *unicode)
3706{
3707 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
3708 PyUnicode_GET_SIZE(unicode),
3709 1);
3710}
3711
3712static char rfind__doc__[] =
3713"S.rfind(sub [,start [,end]]) -> int\n\
3714\n\
3715Return the highest index in S where substring sub is found,\n\
3716such that sub is contained within s[start,end]. Optional\n\
3717arguments start and end are interpreted as in slice notation.\n\
3718\n\
3719Return -1 on failure.";
3720
3721static PyObject *
3722unicode_rfind(PyUnicodeObject *self, PyObject *args)
3723{
3724 PyUnicodeObject *substring;
3725 int start = 0;
3726 int end = INT_MAX;
3727 PyObject *result;
3728
Guido van Rossumb8872e62000-05-09 14:14:27 +00003729 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
3730 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003731 return NULL;
3732 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3733 (PyObject *)substring);
3734 if (substring == NULL)
3735 return NULL;
3736
3737 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
3738
3739 Py_DECREF(substring);
3740 return result;
3741}
3742
3743static char rindex__doc__[] =
3744"S.rindex(sub [,start [,end]]) -> int\n\
3745\n\
3746Like S.rfind() but raise ValueError when the substring is not found.";
3747
3748static PyObject *
3749unicode_rindex(PyUnicodeObject *self, PyObject *args)
3750{
3751 int result;
3752 PyUnicodeObject *substring;
3753 int start = 0;
3754 int end = INT_MAX;
3755
Guido van Rossumb8872e62000-05-09 14:14:27 +00003756 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
3757 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003758 return NULL;
3759 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3760 (PyObject *)substring);
3761 if (substring == NULL)
3762 return NULL;
3763
3764 result = findstring(self, substring, start, end, -1);
3765
3766 Py_DECREF(substring);
3767 if (result < 0) {
3768 PyErr_SetString(PyExc_ValueError, "substring not found");
3769 return NULL;
3770 }
3771 return PyInt_FromLong(result);
3772}
3773
3774static char rjust__doc__[] =
3775"S.rjust(width) -> unicode\n\
3776\n\
3777Return S right justified in a Unicode string of length width. Padding is\n\
3778done using spaces.";
3779
3780static PyObject *
3781unicode_rjust(PyUnicodeObject *self, PyObject *args)
3782{
3783 int width;
3784 if (!PyArg_ParseTuple(args, "i:rjust", &width))
3785 return NULL;
3786
3787 if (self->length >= width) {
3788 Py_INCREF(self);
3789 return (PyObject*) self;
3790 }
3791
3792 return (PyObject*) pad(self, width - self->length, 0, ' ');
3793}
3794
3795static char rstrip__doc__[] =
3796"S.rstrip() -> unicode\n\
3797\n\
3798Return a copy of the string S with trailing whitespace removed.";
3799
3800static PyObject *
3801unicode_rstrip(PyUnicodeObject *self, PyObject *args)
3802{
3803 if (!PyArg_NoArgs(args))
3804 return NULL;
3805 return strip(self, 0, 1);
3806}
3807
3808static PyObject*
3809unicode_slice(PyUnicodeObject *self, int start, int end)
3810{
3811 /* standard clamping */
3812 if (start < 0)
3813 start = 0;
3814 if (end < 0)
3815 end = 0;
3816 if (end > self->length)
3817 end = self->length;
3818 if (start == 0 && end == self->length) {
3819 /* full slice, return original string */
3820 Py_INCREF(self);
3821 return (PyObject*) self;
3822 }
3823 if (start > end)
3824 start = end;
3825 /* copy slice */
3826 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
3827 end - start);
3828}
3829
3830PyObject *PyUnicode_Split(PyObject *s,
3831 PyObject *sep,
3832 int maxsplit)
3833{
3834 PyObject *result;
3835
3836 s = PyUnicode_FromObject(s);
3837 if (s == NULL)
3838 return NULL;
3839 if (sep != NULL) {
3840 sep = PyUnicode_FromObject(sep);
3841 if (sep == NULL) {
3842 Py_DECREF(s);
3843 return NULL;
3844 }
3845 }
3846
3847 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
3848
3849 Py_DECREF(s);
3850 Py_XDECREF(sep);
3851 return result;
3852}
3853
3854static char split__doc__[] =
3855"S.split([sep [,maxsplit]]) -> list of strings\n\
3856\n\
3857Return a list of the words in S, using sep as the\n\
3858delimiter string. If maxsplit is given, at most maxsplit\n\
3859splits are done. If sep is not specified, any whitespace string\n\
3860is a separator.";
3861
3862static PyObject*
3863unicode_split(PyUnicodeObject *self, PyObject *args)
3864{
3865 PyObject *substring = Py_None;
3866 int maxcount = -1;
3867
3868 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
3869 return NULL;
3870
3871 if (substring == Py_None)
3872 return split(self, NULL, maxcount);
3873 else if (PyUnicode_Check(substring))
3874 return split(self, (PyUnicodeObject *)substring, maxcount);
3875 else
3876 return PyUnicode_Split((PyObject *)self, substring, maxcount);
3877}
3878
3879static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00003880"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00003881\n\
3882Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00003883Line breaks are not included in the resulting list unless keepends\n\
3884is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003885
3886static PyObject*
3887unicode_splitlines(PyUnicodeObject *self, PyObject *args)
3888{
Guido van Rossum86662912000-04-11 15:38:46 +00003889 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003890
Guido van Rossum86662912000-04-11 15:38:46 +00003891 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003892 return NULL;
3893
Guido van Rossum86662912000-04-11 15:38:46 +00003894 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003895}
3896
3897static
3898PyObject *unicode_str(PyUnicodeObject *self)
3899{
Fred Drakee4315f52000-05-09 19:53:39 +00003900 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003901}
3902
3903static char strip__doc__[] =
3904"S.strip() -> unicode\n\
3905\n\
3906Return a copy of S with leading and trailing whitespace removed.";
3907
3908static PyObject *
3909unicode_strip(PyUnicodeObject *self, PyObject *args)
3910{
3911 if (!PyArg_NoArgs(args))
3912 return NULL;
3913 return strip(self, 1, 1);
3914}
3915
3916static char swapcase__doc__[] =
3917"S.swapcase() -> unicode\n\
3918\n\
3919Return a copy of S with uppercase characters converted to lowercase\n\
3920and vice versa.";
3921
3922static PyObject*
3923unicode_swapcase(PyUnicodeObject *self, PyObject *args)
3924{
3925 if (!PyArg_NoArgs(args))
3926 return NULL;
3927 return fixup(self, fixswapcase);
3928}
3929
3930static char translate__doc__[] =
3931"S.translate(table) -> unicode\n\
3932\n\
3933Return a copy of the string S, where all characters have been mapped\n\
3934through the given translation table, which must be a mapping of\n\
3935Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
3936are left untouched. Characters mapped to None are deleted.";
3937
3938static PyObject*
3939unicode_translate(PyUnicodeObject *self, PyObject *args)
3940{
3941 PyObject *table;
3942
3943 if (!PyArg_ParseTuple(args, "O:translate", &table))
3944 return NULL;
3945 return PyUnicode_TranslateCharmap(self->str,
3946 self->length,
3947 table,
3948 "ignore");
3949}
3950
3951static char upper__doc__[] =
3952"S.upper() -> unicode\n\
3953\n\
3954Return a copy of S converted to uppercase.";
3955
3956static PyObject*
3957unicode_upper(PyUnicodeObject *self, PyObject *args)
3958{
3959 if (!PyArg_NoArgs(args))
3960 return NULL;
3961 return fixup(self, fixupper);
3962}
3963
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* S.zfill(width) -- currently compiled out.

   Returns S itself (new reference) when it is already at least width
   long.  Otherwise left-pads with '0' via pad() and, if the original
   text began with '+' or '-', moves that sign to the front of the
   padded result.  Returns NULL on parse failure or if pad() fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* Do not touch u->str below if no result object was produced */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original text */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
3999
#if 0
/* Debugging aid (compiled out): report the current value of
   unicode_freelist_size as a Python int.  Returns NULL when
   PyArg_NoArgs() rejects the argument list. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);

    return NULL;
}
#endif
4009
4010static char startswith__doc__[] =
4011"S.startswith(prefix[, start[, end]]) -> int\n\
4012\n\
4013Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4014optional start, test S beginning at that position. With optional end, stop\n\
4015comparing S at that position.";
4016
4017static PyObject *
4018unicode_startswith(PyUnicodeObject *self,
4019 PyObject *args)
4020{
4021 PyUnicodeObject *substring;
4022 int start = 0;
4023 int end = INT_MAX;
4024 PyObject *result;
4025
Guido van Rossumb8872e62000-05-09 14:14:27 +00004026 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4027 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004028 return NULL;
4029 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4030 (PyObject *)substring);
4031 if (substring == NULL)
4032 return NULL;
4033
4034 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4035
4036 Py_DECREF(substring);
4037 return result;
4038}
4039
4040
4041static char endswith__doc__[] =
4042"S.endswith(suffix[, start[, end]]) -> int\n\
4043\n\
4044Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4045optional start, test S beginning at that position. With optional end, stop\n\
4046comparing S at that position.";
4047
4048static PyObject *
4049unicode_endswith(PyUnicodeObject *self,
4050 PyObject *args)
4051{
4052 PyUnicodeObject *substring;
4053 int start = 0;
4054 int end = INT_MAX;
4055 PyObject *result;
4056
Guido van Rossumb8872e62000-05-09 14:14:27 +00004057 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4058 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059 return NULL;
4060 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4061 (PyObject *)substring);
4062 if (substring == NULL)
4063 return NULL;
4064
4065 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4066
4067 Py_DECREF(substring);
4068 return result;
4069}
4070
4071
/* Method table for Unicode objects, consulted by unicode_getattr() via
   Py_FindMethod().  Each entry is {name, C function, flags, docstring}.
   For the implementations visible in this file, entries flagged 1 parse
   their arguments with PyArg_ParseTuple() and entries flagged 0 use
   PyArg_NoArgs().  The {NULL, NULL} entry terminates the table. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
#if 0
    /* Disabled methods; unicode_zfill is likewise compiled out above */
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    {NULL, NULL}
};
4122
4123static PyObject *
4124unicode_getattr(PyUnicodeObject *self, char *name)
4125{
4126 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4127}
4128
/* Sequence protocol slots for Unicode objects.  Item and slice
   assignment slots are 0 (not provided). */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, /* sq_length */
    (binaryfunc) PyUnicode_Concat, /* sq_concat */
    (intargfunc) unicode_repeat, /* sq_repeat */
    (intargfunc) unicode_getitem, /* sq_item */
    (intintargfunc) unicode_slice, /* sq_slice */
    0, /* sq_ass_item */
    0, /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, /*sq_contains*/
};
4139
4140static int
4141unicode_buffer_getreadbuf(PyUnicodeObject *self,
4142 int index,
4143 const void **ptr)
4144{
4145 if (index != 0) {
4146 PyErr_SetString(PyExc_SystemError,
4147 "accessing non-existent unicode segment");
4148 return -1;
4149 }
4150 *ptr = (void *) self->str;
4151 return PyUnicode_GET_DATA_SIZE(self);
4152}
4153
4154static int
4155unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4156 const void **ptr)
4157{
4158 PyErr_SetString(PyExc_TypeError,
4159 "cannot use unicode as modifyable buffer");
4160 return -1;
4161}
4162
4163static int
4164unicode_buffer_getsegcount(PyUnicodeObject *self,
4165 int *lenp)
4166{
4167 if (lenp)
4168 *lenp = PyUnicode_GET_DATA_SIZE(self);
4169 return 1;
4170}
4171
4172static int
4173unicode_buffer_getcharbuf(PyUnicodeObject *self,
4174 int index,
4175 const void **ptr)
4176{
4177 PyObject *str;
4178
4179 if (index != 0) {
4180 PyErr_SetString(PyExc_SystemError,
4181 "accessing non-existent unicode segment");
4182 return -1;
4183 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +00004184 str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004185 if (str == NULL)
4186 return -1;
4187 *ptr = (void *) PyString_AS_STRING(str);
4188 return PyString_GET_SIZE(str);
4189}
4190
4191/* Helpers for PyUnicode_Format() */
4192
4193static PyObject *
4194getnextarg(args, arglen, p_argidx)
4195 PyObject *args;
4196int arglen;
4197int *p_argidx;
4198{
4199 int argidx = *p_argidx;
4200 if (argidx < arglen) {
4201 (*p_argidx)++;
4202 if (arglen < 0)
4203 return args;
4204 else
4205 return PyTuple_GetItem(args, argidx);
4206 }
4207 PyErr_SetString(PyExc_TypeError,
4208 "not enough arguments for format string");
4209 return NULL;
4210}
4211
/* Conversion flag bits collected by PyUnicode_Format() while scanning a
   format specifier; each is set by the flag character noted. */
#define F_LJUST (1<<0)  /* '-' (also set for a negative '*' width) */
#define F_SIGN (1<<1)   /* '+' */
#define F_BLANK (1<<2)  /* ' ' */
#define F_ALT (1<<3)    /* '#' */
#define F_ZERO (1<<4)   /* '0' */
4217
4218static
4219#ifdef HAVE_STDARG_PROTOTYPES
4220int usprintf(register Py_UNICODE *buffer, char *format, ...)
4221#else
4222int usprintf(va_alist) va_dcl
4223#endif
4224{
4225 register int i;
4226 int len;
4227 va_list va;
4228 char *charbuffer;
4229#ifdef HAVE_STDARG_PROTOTYPES
4230 va_start(va, format);
4231#else
4232 Py_UNICODE *args;
4233 char *format;
4234
4235 va_start(va);
4236 buffer = va_arg(va, Py_UNICODE *);
4237 format = va_arg(va, char *);
4238#endif
4239
4240 /* First, format the string as char array, then expand to Py_UNICODE
4241 array. */
4242 charbuffer = (char *)buffer;
4243 len = vsprintf(charbuffer, format, va);
4244 for (i = len - 1; i >= 0; i--)
4245 buffer[i] = (Py_UNICODE) charbuffer[i];
4246
4247 va_end(va);
4248 return len;
4249}
4250
4251static int
4252formatfloat(Py_UNICODE *buf,
4253 int flags,
4254 int prec,
4255 int type,
4256 PyObject *v)
4257{
4258 char fmt[20];
4259 double x;
4260
4261 x = PyFloat_AsDouble(v);
4262 if (x == -1.0 && PyErr_Occurred())
4263 return -1;
4264 if (prec < 0)
4265 prec = 6;
4266 if (prec > 50)
4267 prec = 50; /* Arbitrary limitation */
4268 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4269 type = 'g';
4270 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4271 return usprintf(buf, fmt, x);
4272}
4273
4274static int
4275formatint(Py_UNICODE *buf,
4276 int flags,
4277 int prec,
4278 int type,
4279 PyObject *v)
4280{
4281 char fmt[20];
4282 long x;
4283
4284 x = PyInt_AsLong(v);
4285 if (x == -1 && PyErr_Occurred())
4286 return -1;
4287 if (prec < 0)
4288 prec = 1;
4289 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4290 return usprintf(buf, fmt, x);
4291}
4292
4293static int
4294formatchar(Py_UNICODE *buf,
4295 PyObject *v)
4296{
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004297 if (PyUnicode_Check(v)) {
4298 if (PyUnicode_GET_SIZE(v) != 1)
4299 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004300 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004301 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004302
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004303 else if (PyString_Check(v)) {
4304 if (PyString_GET_SIZE(v) != 1)
4305 goto onError;
4306 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4307 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004308
4309 else {
4310 /* Integer input truncated to a character */
4311 long x;
4312 x = PyInt_AsLong(v);
4313 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004314 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004315 buf[0] = (char) x;
4316 }
4317 buf[1] = '\0';
4318 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004319
4320 onError:
4321 PyErr_SetString(PyExc_TypeError,
4322 "%c requires int or char");
4323 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324}
4325
4326PyObject *PyUnicode_Format(PyObject *format,
4327 PyObject *args)
4328{
4329 Py_UNICODE *fmt, *res;
4330 int fmtcnt, rescnt, reslen, arglen, argidx;
4331 int args_owned = 0;
4332 PyUnicodeObject *result = NULL;
4333 PyObject *dict = NULL;
4334 PyObject *uformat;
4335
4336 if (format == NULL || args == NULL) {
4337 PyErr_BadInternalCall();
4338 return NULL;
4339 }
4340 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004341 if (uformat == NULL)
4342 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004343 fmt = PyUnicode_AS_UNICODE(uformat);
4344 fmtcnt = PyUnicode_GET_SIZE(uformat);
4345
4346 reslen = rescnt = fmtcnt + 100;
4347 result = _PyUnicode_New(reslen);
4348 if (result == NULL)
4349 goto onError;
4350 res = PyUnicode_AS_UNICODE(result);
4351
4352 if (PyTuple_Check(args)) {
4353 arglen = PyTuple_Size(args);
4354 argidx = 0;
4355 }
4356 else {
4357 arglen = -1;
4358 argidx = -2;
4359 }
4360 if (args->ob_type->tp_as_mapping)
4361 dict = args;
4362
4363 while (--fmtcnt >= 0) {
4364 if (*fmt != '%') {
4365 if (--rescnt < 0) {
4366 rescnt = fmtcnt + 100;
4367 reslen += rescnt;
4368 if (_PyUnicode_Resize(result, reslen) < 0)
4369 return NULL;
4370 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4371 --rescnt;
4372 }
4373 *res++ = *fmt++;
4374 }
4375 else {
4376 /* Got a format specifier */
4377 int flags = 0;
4378 int width = -1;
4379 int prec = -1;
4380 int size = 0;
4381 Py_UNICODE c = '\0';
4382 Py_UNICODE fill;
4383 PyObject *v = NULL;
4384 PyObject *temp = NULL;
4385 Py_UNICODE *buf;
4386 Py_UNICODE sign;
4387 int len;
4388 Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
4389
4390 fmt++;
4391 if (*fmt == '(') {
4392 Py_UNICODE *keystart;
4393 int keylen;
4394 PyObject *key;
4395 int pcount = 1;
4396
4397 if (dict == NULL) {
4398 PyErr_SetString(PyExc_TypeError,
4399 "format requires a mapping");
4400 goto onError;
4401 }
4402 ++fmt;
4403 --fmtcnt;
4404 keystart = fmt;
4405 /* Skip over balanced parentheses */
4406 while (pcount > 0 && --fmtcnt >= 0) {
4407 if (*fmt == ')')
4408 --pcount;
4409 else if (*fmt == '(')
4410 ++pcount;
4411 fmt++;
4412 }
4413 keylen = fmt - keystart - 1;
4414 if (fmtcnt < 0 || pcount > 0) {
4415 PyErr_SetString(PyExc_ValueError,
4416 "incomplete format key");
4417 goto onError;
4418 }
Fred Drakee4315f52000-05-09 19:53:39 +00004419 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004420 then looked up since Python uses strings to hold
4421 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004422 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004423 key = PyUnicode_EncodeUTF8(keystart,
4424 keylen,
4425 NULL);
4426 if (key == NULL)
4427 goto onError;
4428 if (args_owned) {
4429 Py_DECREF(args);
4430 args_owned = 0;
4431 }
4432 args = PyObject_GetItem(dict, key);
4433 Py_DECREF(key);
4434 if (args == NULL) {
4435 goto onError;
4436 }
4437 args_owned = 1;
4438 arglen = -1;
4439 argidx = -2;
4440 }
4441 while (--fmtcnt >= 0) {
4442 switch (c = *fmt++) {
4443 case '-': flags |= F_LJUST; continue;
4444 case '+': flags |= F_SIGN; continue;
4445 case ' ': flags |= F_BLANK; continue;
4446 case '#': flags |= F_ALT; continue;
4447 case '0': flags |= F_ZERO; continue;
4448 }
4449 break;
4450 }
4451 if (c == '*') {
4452 v = getnextarg(args, arglen, &argidx);
4453 if (v == NULL)
4454 goto onError;
4455 if (!PyInt_Check(v)) {
4456 PyErr_SetString(PyExc_TypeError,
4457 "* wants int");
4458 goto onError;
4459 }
4460 width = PyInt_AsLong(v);
4461 if (width < 0) {
4462 flags |= F_LJUST;
4463 width = -width;
4464 }
4465 if (--fmtcnt >= 0)
4466 c = *fmt++;
4467 }
4468 else if (c >= '0' && c <= '9') {
4469 width = c - '0';
4470 while (--fmtcnt >= 0) {
4471 c = *fmt++;
4472 if (c < '0' || c > '9')
4473 break;
4474 if ((width*10) / 10 != width) {
4475 PyErr_SetString(PyExc_ValueError,
4476 "width too big");
4477 goto onError;
4478 }
4479 width = width*10 + (c - '0');
4480 }
4481 }
4482 if (c == '.') {
4483 prec = 0;
4484 if (--fmtcnt >= 0)
4485 c = *fmt++;
4486 if (c == '*') {
4487 v = getnextarg(args, arglen, &argidx);
4488 if (v == NULL)
4489 goto onError;
4490 if (!PyInt_Check(v)) {
4491 PyErr_SetString(PyExc_TypeError,
4492 "* wants int");
4493 goto onError;
4494 }
4495 prec = PyInt_AsLong(v);
4496 if (prec < 0)
4497 prec = 0;
4498 if (--fmtcnt >= 0)
4499 c = *fmt++;
4500 }
4501 else if (c >= '0' && c <= '9') {
4502 prec = c - '0';
4503 while (--fmtcnt >= 0) {
4504 c = Py_CHARMASK(*fmt++);
4505 if (c < '0' || c > '9')
4506 break;
4507 if ((prec*10) / 10 != prec) {
4508 PyErr_SetString(PyExc_ValueError,
4509 "prec too big");
4510 goto onError;
4511 }
4512 prec = prec*10 + (c - '0');
4513 }
4514 }
4515 } /* prec */
4516 if (fmtcnt >= 0) {
4517 if (c == 'h' || c == 'l' || c == 'L') {
4518 size = c;
4519 if (--fmtcnt >= 0)
4520 c = *fmt++;
4521 }
4522 }
4523 if (fmtcnt < 0) {
4524 PyErr_SetString(PyExc_ValueError,
4525 "incomplete format");
4526 goto onError;
4527 }
4528 if (c != '%') {
4529 v = getnextarg(args, arglen, &argidx);
4530 if (v == NULL)
4531 goto onError;
4532 }
4533 sign = 0;
4534 fill = ' ';
4535 switch (c) {
4536
4537 case '%':
4538 buf = tmpbuf;
4539 buf[0] = '%';
4540 len = 1;
4541 break;
4542
4543 case 's':
4544 case 'r':
4545 if (PyUnicode_Check(v) && c == 's') {
4546 temp = v;
4547 Py_INCREF(temp);
4548 }
4549 else {
4550 PyObject *unicode;
4551 if (c == 's')
4552 temp = PyObject_Str(v);
4553 else
4554 temp = PyObject_Repr(v);
4555 if (temp == NULL)
4556 goto onError;
4557 if (!PyString_Check(temp)) {
4558 /* XXX Note: this should never happen, since
4559 PyObject_Repr() and PyObject_Str() assure
4560 this */
4561 Py_DECREF(temp);
4562 PyErr_SetString(PyExc_TypeError,
4563 "%s argument has non-string str()");
4564 goto onError;
4565 }
Fred Drakee4315f52000-05-09 19:53:39 +00004566 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00004567 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00004568 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004569 "strict");
4570 Py_DECREF(temp);
4571 temp = unicode;
4572 if (temp == NULL)
4573 goto onError;
4574 }
4575 buf = PyUnicode_AS_UNICODE(temp);
4576 len = PyUnicode_GET_SIZE(temp);
4577 if (prec >= 0 && len > prec)
4578 len = prec;
4579 break;
4580
4581 case 'i':
4582 case 'd':
4583 case 'u':
4584 case 'o':
4585 case 'x':
4586 case 'X':
4587 if (c == 'i')
4588 c = 'd';
4589 buf = tmpbuf;
4590 len = formatint(buf, flags, prec, c, v);
4591 if (len < 0)
4592 goto onError;
4593 sign = (c == 'd');
4594 if (flags & F_ZERO) {
4595 fill = '0';
4596 if ((flags&F_ALT) &&
4597 (c == 'x' || c == 'X') &&
4598 buf[0] == '0' && buf[1] == c) {
4599 *res++ = *buf++;
4600 *res++ = *buf++;
4601 rescnt -= 2;
4602 len -= 2;
4603 width -= 2;
4604 if (width < 0)
4605 width = 0;
4606 }
4607 }
4608 break;
4609
4610 case 'e':
4611 case 'E':
4612 case 'f':
4613 case 'g':
4614 case 'G':
4615 buf = tmpbuf;
4616 len = formatfloat(buf, flags, prec, c, v);
4617 if (len < 0)
4618 goto onError;
4619 sign = 1;
4620 if (flags&F_ZERO)
4621 fill = '0';
4622 break;
4623
4624 case 'c':
4625 buf = tmpbuf;
4626 len = formatchar(buf, v);
4627 if (len < 0)
4628 goto onError;
4629 break;
4630
4631 default:
4632 PyErr_Format(PyExc_ValueError,
4633 "unsupported format character '%c' (0x%x)",
4634 c, c);
4635 goto onError;
4636 }
4637 if (sign) {
4638 if (*buf == '-' || *buf == '+') {
4639 sign = *buf++;
4640 len--;
4641 }
4642 else if (flags & F_SIGN)
4643 sign = '+';
4644 else if (flags & F_BLANK)
4645 sign = ' ';
4646 else
4647 sign = 0;
4648 }
4649 if (width < len)
4650 width = len;
4651 if (rescnt < width + (sign != 0)) {
4652 reslen -= rescnt;
4653 rescnt = width + fmtcnt + 100;
4654 reslen += rescnt;
4655 if (_PyUnicode_Resize(result, reslen) < 0)
4656 return NULL;
4657 res = PyUnicode_AS_UNICODE(result)
4658 + reslen - rescnt;
4659 }
4660 if (sign) {
4661 if (fill != ' ')
4662 *res++ = sign;
4663 rescnt--;
4664 if (width > len)
4665 width--;
4666 }
4667 if (width > len && !(flags & F_LJUST)) {
4668 do {
4669 --rescnt;
4670 *res++ = fill;
4671 } while (--width > len);
4672 }
4673 if (sign && fill == ' ')
4674 *res++ = sign;
4675 memcpy(res, buf, len * sizeof(Py_UNICODE));
4676 res += len;
4677 rescnt -= len;
4678 while (--width >= len) {
4679 --rescnt;
4680 *res++ = ' ';
4681 }
4682 if (dict && (argidx < arglen) && c != '%') {
4683 PyErr_SetString(PyExc_TypeError,
4684 "not all arguments converted");
4685 goto onError;
4686 }
4687 Py_XDECREF(temp);
4688 } /* '%' */
4689 } /* until end */
4690 if (argidx < arglen && !dict) {
4691 PyErr_SetString(PyExc_TypeError,
4692 "not all arguments converted");
4693 goto onError;
4694 }
4695
4696 if (args_owned) {
4697 Py_DECREF(args);
4698 }
4699 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004700 if (_PyUnicode_Resize(result, reslen - rescnt))
4701 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004702 return (PyObject *)result;
4703
4704 onError:
4705 Py_XDECREF(result);
4706 Py_DECREF(uformat);
4707 if (args_owned) {
4708 Py_DECREF(args);
4709 }
4710 return NULL;
4711}
4712
4713static PyBufferProcs unicode_as_buffer = {
4714 (getreadbufferproc) unicode_buffer_getreadbuf,
4715 (getwritebufferproc) unicode_buffer_getwritebuf,
4716 (getsegcountproc) unicode_buffer_getsegcount,
4717 (getcharbufferproc) unicode_buffer_getcharbuf,
4718};
4719
4720PyTypeObject PyUnicode_Type = {
4721 PyObject_HEAD_INIT(&PyType_Type)
4722 0, /* ob_size */
4723 "unicode", /* tp_name */
4724 sizeof(PyUnicodeObject), /* tp_size */
4725 0, /* tp_itemsize */
4726 /* Slots */
4727 (destructor)_PyUnicode_Free, /* tp_dealloc */
4728 0, /* tp_print */
4729 (getattrfunc)unicode_getattr, /* tp_getattr */
4730 0, /* tp_setattr */
4731 (cmpfunc) unicode_compare, /* tp_compare */
4732 (reprfunc) unicode_repr, /* tp_repr */
4733 0, /* tp_as_number */
4734 &unicode_as_sequence, /* tp_as_sequence */
4735 0, /* tp_as_mapping */
4736 (hashfunc) unicode_hash, /* tp_hash*/
4737 0, /* tp_call*/
4738 (reprfunc) unicode_str, /* tp_str */
4739 (getattrofunc) NULL, /* tp_getattro */
4740 (setattrofunc) NULL, /* tp_setattro */
4741 &unicode_as_buffer, /* tp_as_buffer */
4742 Py_TPFLAGS_DEFAULT, /* tp_flags */
4743};
4744
4745/* Initialize the Unicode implementation */
4746
4747void _PyUnicode_Init()
4748{
4749 /* Doublecheck the configuration... */
4750 if (sizeof(Py_UNICODE) != 2)
4751 Py_FatalError("Unicode configuration error: "
4752 "sizeof(Py_UNICODE) != 2 bytes");
4753
Fred Drakee4315f52000-05-09 19:53:39 +00004754 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004755 unicode_freelist = NULL;
4756 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00004758 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004759}
4760
4761/* Finalize the Unicode implementation */
4762
4763void
4764_PyUnicode_Fini()
4765{
4766 PyUnicodeObject *u = unicode_freelist;
4767
4768 while (u != NULL) {
4769 PyUnicodeObject *v = u;
4770 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004771 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00004772 PyMem_DEL(v->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004773 Py_XDECREF(v->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +00004774 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004776 unicode_freelist = NULL;
4777 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778 Py_XDECREF(unicode_empty);
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004779 unicode_empty = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004780}