blob: 3157cd89c516aa0339efba516ba62cfd97744eae [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
7(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
8
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
67#include "mymath.h"
68#include "unicodeobject.h"
69
70#if defined(HAVE_LIMITS_H)
71#include <limits.h>
72#else
73#define INT_MAX 2147483647
74#endif
75
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000076#ifdef MS_WIN32
77#include <windows.h>
78#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000079
Guido van Rossumd57fd912000-03-10 22:53:23 +000080/* Limit for the Unicode object free list */
81
82#define MAX_UNICODE_FREELIST_SIZE 1024
83
84/* Limit for the Unicode object free list stay alive optimization.
85
86 The implementation will keep allocated Unicode memory intact for
87 all objects on the free list having a size less than this
88 limit. This reduces malloc() overhead for small Unicode objects.
89
Barry Warsaw51ac5802000-03-20 16:36:48 +000090 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000091 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000092 malloc()-overhead) bytes of unused garbage.
93
94 Setting the limit to 0 effectively turns the feature off.
95
Guido van Rossumfd4b9572000-04-10 13:51:10 +000096 Note: This is an experimental feature ! If you get core dumps when
97 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000098
99*/
100
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000101#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +0000102
103/* Endianness switches; defaults to little endian */
104
105#ifdef WORDS_BIGENDIAN
106# define BYTEORDER_IS_BIG_ENDIAN
107#else
108# define BYTEORDER_IS_LITTLE_ENDIAN
109#endif
110
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000111/* --- Globals ------------------------------------------------------------
112
113 The globals are initialized by the _PyUnicode_Init() API and should
114 not be used before calling that API.
115
116*/
Guido van Rossumd57fd912000-03-10 22:53:23 +0000117
118/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000119static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000120
121/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000122static PyUnicodeObject *unicode_freelist;
123static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000124
Fred Drakee4315f52000-05-09 19:53:39 +0000125/* Default encoding to use and assume when NULL is passed as encoding
126 parameter; it is initialized by _PyUnicode_Init().
127
128 Always use the PyUnicode_SetDefaultEncoding() and
129 PyUnicode_GetDefaultEncoding() APIs to access this global.
130
131*/
132
133static char unicode_default_encoding[100];
134
Guido van Rossumd57fd912000-03-10 22:53:23 +0000135/* --- Unicode Object ----------------------------------------------------- */
136
137static
138int _PyUnicode_Resize(register PyUnicodeObject *unicode,
139 int length)
140{
141 void *oldstr;
142
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000143 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000144 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000145 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000146
147 /* Resizing unicode_empty is not allowed. */
148 if (unicode == unicode_empty) {
149 PyErr_SetString(PyExc_SystemError,
150 "can't resize empty unicode object");
151 return -1;
152 }
153
154 /* We allocate one more byte to make sure the string is
155 Ux0000 terminated -- XXX is this needed ? */
156 oldstr = unicode->str;
157 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
158 if (!unicode->str) {
159 unicode->str = oldstr;
160 PyErr_NoMemory();
161 return -1;
162 }
163 unicode->str[length] = 0;
164 unicode->length = length;
165
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000166 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000167 /* Reset the object caches */
168 if (unicode->utf8str) {
169 Py_DECREF(unicode->utf8str);
170 unicode->utf8str = NULL;
171 }
172 unicode->hash = -1;
173
174 return 0;
175}
176
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000177int PyUnicode_Resize(PyObject **unicode,
178 int length)
179{
180 PyUnicodeObject *v;
181
182 if (unicode == NULL) {
183 PyErr_BadInternalCall();
184 return -1;
185 }
186 v = (PyUnicodeObject *)*unicode;
187 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
188 PyErr_BadInternalCall();
189 return -1;
190 }
191 return _PyUnicode_Resize(v, length);
192}
193
/* We allocate one more Py_UNICODE element than requested to make sure
   the string is Ux0000 terminated -- XXX is this needed ?

   XXX This allocator could be further enhanced by ensuring that the
   free list never shrinks below a size of 1.

*/
201
202static
203PyUnicodeObject *_PyUnicode_New(int length)
204{
205 register PyUnicodeObject *unicode;
206
207 /* Optimization for empty strings */
208 if (length == 0 && unicode_empty != NULL) {
209 Py_INCREF(unicode_empty);
210 return unicode_empty;
211 }
212
213 /* Unicode freelist & memory allocation */
214 if (unicode_freelist) {
215 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000216 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000217 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000218 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000219 /* Keep-Alive optimization: we only upsize the buffer,
220 never downsize it. */
221 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000222 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000223 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000224 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 }
226 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000227 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000228 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000229 }
230 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000231 }
232 else {
233 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
234 if (unicode == NULL)
235 return NULL;
236 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
237 }
238
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000239 if (!unicode->str) {
240 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000241 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000243 unicode->str[length] = 0;
244 unicode->length = length;
245 unicode->hash = -1;
246 unicode->utf8str = NULL;
247 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000248
249 onError:
250 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000251 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000253}
254
255static
256void _PyUnicode_Free(register PyUnicodeObject *unicode)
257{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000258 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000259 /* Keep-Alive optimization */
260 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000261 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000262 unicode->str = NULL;
263 unicode->length = 0;
264 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000265 if (unicode->utf8str) {
266 Py_DECREF(unicode->utf8str);
267 unicode->utf8str = NULL;
268 }
269 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000270 *(PyUnicodeObject **)unicode = unicode_freelist;
271 unicode_freelist = unicode;
272 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000273 }
274 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000275 PyMem_DEL(unicode->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000276 Py_XDECREF(unicode->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000277 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000278 }
279}
280
281PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
282 int size)
283{
284 PyUnicodeObject *unicode;
285
286 unicode = _PyUnicode_New(size);
287 if (!unicode)
288 return NULL;
289
290 /* Copy the Unicode data into the new object */
291 if (u != NULL)
292 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
293
294 return (PyObject *)unicode;
295}
296
297#ifdef HAVE_WCHAR_H
298
299PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
300 int size)
301{
302 PyUnicodeObject *unicode;
303
304 if (w == NULL) {
305 PyErr_BadInternalCall();
306 return NULL;
307 }
308
309 unicode = _PyUnicode_New(size);
310 if (!unicode)
311 return NULL;
312
313 /* Copy the wchar_t data into the new object */
314#ifdef HAVE_USABLE_WCHAR_T
315 memcpy(unicode->str, w, size * sizeof(wchar_t));
316#else
317 {
318 register Py_UNICODE *u;
319 register int i;
320 u = PyUnicode_AS_UNICODE(unicode);
321 for (i = size; i >= 0; i--)
322 *u++ = *w++;
323 }
324#endif
325
326 return (PyObject *)unicode;
327}
328
329int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
330 register wchar_t *w,
331 int size)
332{
333 if (unicode == NULL) {
334 PyErr_BadInternalCall();
335 return -1;
336 }
337 if (size > PyUnicode_GET_SIZE(unicode))
338 size = PyUnicode_GET_SIZE(unicode);
339#ifdef HAVE_USABLE_WCHAR_T
340 memcpy(w, unicode->str, size * sizeof(wchar_t));
341#else
342 {
343 register Py_UNICODE *u;
344 register int i;
345 u = PyUnicode_AS_UNICODE(unicode);
346 for (i = size; i >= 0; i--)
347 *w++ = *u++;
348 }
349#endif
350
351 return size;
352}
353
354#endif
355
356PyObject *PyUnicode_FromObject(register PyObject *obj)
357{
358 const char *s;
359 int len;
360
361 if (obj == NULL) {
362 PyErr_BadInternalCall();
363 return NULL;
364 }
365 else if (PyUnicode_Check(obj)) {
366 Py_INCREF(obj);
367 return obj;
368 }
369 else if (PyString_Check(obj)) {
370 s = PyString_AS_STRING(obj);
371 len = PyString_GET_SIZE(obj);
372 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000373 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
374 /* Overwrite the error message with something more useful in
375 case of a TypeError. */
376 if (PyErr_ExceptionMatches(PyExc_TypeError))
377 PyErr_SetString(PyExc_TypeError,
378 "coercing to Unicode: need string or charbuffer");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000379 return NULL;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000380 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000381 if (len == 0) {
382 Py_INCREF(unicode_empty);
383 return (PyObject *)unicode_empty;
384 }
Fred Drakee4315f52000-05-09 19:53:39 +0000385 return PyUnicode_Decode(s, len, NULL, "strict");
Guido van Rossumd57fd912000-03-10 22:53:23 +0000386}
387
388PyObject *PyUnicode_Decode(const char *s,
389 int size,
390 const char *encoding,
391 const char *errors)
392{
393 PyObject *buffer = NULL, *unicode;
394
Fred Drakee4315f52000-05-09 19:53:39 +0000395 if (encoding == NULL)
396 encoding = PyUnicode_GetDefaultEncoding();
397
398 /* Shortcuts for common default encodings */
399 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000400 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000401 else if (strcmp(encoding, "latin-1") == 0)
402 return PyUnicode_DecodeLatin1(s, size, errors);
403 else if (strcmp(encoding, "ascii") == 0)
404 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000405
406 /* Decode via the codec registry */
407 buffer = PyBuffer_FromMemory((void *)s, size);
408 if (buffer == NULL)
409 goto onError;
410 unicode = PyCodec_Decode(buffer, encoding, errors);
411 if (unicode == NULL)
412 goto onError;
413 if (!PyUnicode_Check(unicode)) {
414 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000415 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000416 unicode->ob_type->tp_name);
417 Py_DECREF(unicode);
418 goto onError;
419 }
420 Py_DECREF(buffer);
421 return unicode;
422
423 onError:
424 Py_XDECREF(buffer);
425 return NULL;
426}
427
428PyObject *PyUnicode_Encode(const Py_UNICODE *s,
429 int size,
430 const char *encoding,
431 const char *errors)
432{
433 PyObject *v, *unicode;
434
435 unicode = PyUnicode_FromUnicode(s, size);
436 if (unicode == NULL)
437 return NULL;
438 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
439 Py_DECREF(unicode);
440 return v;
441}
442
443PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
444 const char *encoding,
445 const char *errors)
446{
447 PyObject *v;
448
449 if (!PyUnicode_Check(unicode)) {
450 PyErr_BadArgument();
451 goto onError;
452 }
Fred Drakee4315f52000-05-09 19:53:39 +0000453
454 if (encoding == NULL)
455 encoding = PyUnicode_GetDefaultEncoding();
456
457 /* Shortcuts for common default encodings */
458 if (errors == NULL) {
459 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000460 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000461 else if (strcmp(encoding, "latin-1") == 0)
462 return PyUnicode_AsLatin1String(unicode);
463 else if (strcmp(encoding, "ascii") == 0)
464 return PyUnicode_AsASCIIString(unicode);
465 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000466
467 /* Encode via the codec registry */
468 v = PyCodec_Encode(unicode, encoding, errors);
469 if (v == NULL)
470 goto onError;
471 /* XXX Should we really enforce this ? */
472 if (!PyString_Check(v)) {
473 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000474 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000475 v->ob_type->tp_name);
476 Py_DECREF(v);
477 goto onError;
478 }
479 return v;
480
481 onError:
482 return NULL;
483}
484
485Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
486{
487 if (!PyUnicode_Check(unicode)) {
488 PyErr_BadArgument();
489 goto onError;
490 }
491 return PyUnicode_AS_UNICODE(unicode);
492
493 onError:
494 return NULL;
495}
496
497int PyUnicode_GetSize(PyObject *unicode)
498{
499 if (!PyUnicode_Check(unicode)) {
500 PyErr_BadArgument();
501 goto onError;
502 }
503 return PyUnicode_GET_SIZE(unicode);
504
505 onError:
506 return -1;
507}
508
Fred Drakee4315f52000-05-09 19:53:39 +0000509const char *PyUnicode_GetDefaultEncoding()
510{
511 return unicode_default_encoding;
512}
513
514int PyUnicode_SetDefaultEncoding(const char *encoding)
515{
516 PyObject *v;
517
518 /* Make sure the encoding is valid. As side effect, this also
519 loads the encoding into the codec registry cache. */
520 v = _PyCodec_Lookup(encoding);
521 if (v == NULL)
522 goto onError;
523 Py_DECREF(v);
524 strncpy(unicode_default_encoding,
525 encoding,
526 sizeof(unicode_default_encoding));
527 return 0;
528
529 onError:
530 return -1;
531}
532
Guido van Rossumd57fd912000-03-10 22:53:23 +0000533/* --- UTF-8 Codec -------------------------------------------------------- */
534
/* Lookup table mapping the first byte of a UTF-8 sequence to the total
   length of that sequence in bytes.  A value of zero marks a byte that
   cannot start a sequence.  See RFC 2279 for details. */

static
char utf8_code_length[256] = {
    /* 0x00 - 0x7F: single byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x00 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x10 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x20 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x30 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x40 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x50 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x60 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,     /* 0x70 */
    /* 0x80 - 0xBF: continuation bytes, illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0x80 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0x90 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0xA0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,     /* 0xB0 */
    /* 0xC0 - 0xDF: two byte sequences */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,     /* 0xC0 */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,     /* 0xD0 */
    /* 0xE0 - 0xEF: three byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,     /* 0xE0 */
    /* 0xF0 - 0xFD: four to six byte sequences; 0xFE, 0xFF: illegal */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0      /* 0xF0 */
};
556
557static
558int utf8_decoding_error(const char **source,
559 Py_UNICODE **dest,
560 const char *errors,
561 const char *details)
562{
563 if ((errors == NULL) ||
564 (strcmp(errors,"strict") == 0)) {
565 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000566 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000567 details);
568 return -1;
569 }
570 else if (strcmp(errors,"ignore") == 0) {
571 (*source)++;
572 return 0;
573 }
574 else if (strcmp(errors,"replace") == 0) {
575 (*source)++;
576 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
577 (*dest)++;
578 return 0;
579 }
580 else {
581 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000582 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000583 errors);
584 return -1;
585 }
586}
587
588#define UTF8_ERROR(details) do { \
589 if (utf8_decoding_error(&s, &p, errors, details)) \
590 goto onError; \
591 continue; \
592} while (0)
593
594PyObject *PyUnicode_DecodeUTF8(const char *s,
595 int size,
596 const char *errors)
597{
598 int n;
599 const char *e;
600 PyUnicodeObject *unicode;
601 Py_UNICODE *p;
602
603 /* Note: size will always be longer than the resulting Unicode
604 character count */
605 unicode = _PyUnicode_New(size);
606 if (!unicode)
607 return NULL;
608 if (size == 0)
609 return (PyObject *)unicode;
610
611 /* Unpack UTF-8 encoded data */
612 p = unicode->str;
613 e = s + size;
614
615 while (s < e) {
616 register Py_UNICODE ch = (unsigned char)*s;
617
618 if (ch < 0x80) {
619 *p++ = ch;
620 s++;
621 continue;
622 }
623
624 n = utf8_code_length[ch];
625
626 if (s + n > e)
627 UTF8_ERROR("unexpected end of data");
628
629 switch (n) {
630
631 case 0:
632 UTF8_ERROR("unexpected code byte");
633 break;
634
635 case 1:
636 UTF8_ERROR("internal error");
637 break;
638
639 case 2:
640 if ((s[1] & 0xc0) != 0x80)
641 UTF8_ERROR("invalid data");
642 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
643 if (ch < 0x80)
644 UTF8_ERROR("illegal encoding");
645 else
646 *p++ = ch;
647 break;
648
649 case 3:
650 if ((s[1] & 0xc0) != 0x80 ||
651 (s[2] & 0xc0) != 0x80)
652 UTF8_ERROR("invalid data");
653 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
654 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000))
655 UTF8_ERROR("illegal encoding");
656 else
657 *p++ = ch;
658 break;
659
660 default:
661 /* Other sizes are only needed for UCS-4 */
662 UTF8_ERROR("unsupported Unicode code range");
663 }
664 s += n;
665 }
666
667 /* Adjust length */
668 if (_PyUnicode_Resize(unicode, p - unicode->str))
669 goto onError;
670
671 return (PyObject *)unicode;
672
673onError:
674 Py_DECREF(unicode);
675 return NULL;
676}
677
678#undef UTF8_ERROR
679
680static
681int utf8_encoding_error(const Py_UNICODE **source,
682 char **dest,
683 const char *errors,
684 const char *details)
685{
686 if ((errors == NULL) ||
687 (strcmp(errors,"strict") == 0)) {
688 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000689 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000690 details);
691 return -1;
692 }
693 else if (strcmp(errors,"ignore") == 0) {
694 return 0;
695 }
696 else if (strcmp(errors,"replace") == 0) {
697 **dest = '?';
698 (*dest)++;
699 return 0;
700 }
701 else {
702 PyErr_Format(PyExc_ValueError,
703 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000704 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000705 errors);
706 return -1;
707 }
708}
709
710PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
711 int size,
712 const char *errors)
713{
714 PyObject *v;
715 char *p;
716 char *q;
717
718 v = PyString_FromStringAndSize(NULL, 3 * size);
719 if (v == NULL)
720 return NULL;
721 if (size == 0)
722 goto done;
723
724 p = q = PyString_AS_STRING(v);
725 while (size-- > 0) {
726 Py_UNICODE ch = *s++;
727 if (ch < 0x80)
728 *p++ = (char) ch;
729 else if (ch < 0x0800) {
730 *p++ = 0xc0 | (ch >> 6);
731 *p++ = 0x80 | (ch & 0x3f);
732 } else if (0xD800 <= ch && ch <= 0xDFFF) {
733 /* These byte ranges are reserved for UTF-16 surrogate
734 bytes which the Python implementation currently does
735 not support. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000736 if (utf8_encoding_error(&s, &p, errors,
737 "unsupported code range"))
738 goto onError;
739 } else {
740 *p++ = 0xe0 | (ch >> 12);
741 *p++ = 0x80 | ((ch >> 6) & 0x3f);
742 *p++ = 0x80 | (ch & 0x3f);
743 }
744 }
745 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000746 if (_PyString_Resize(&v, p - q))
747 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000748
749 done:
750 return v;
751
752 onError:
753 Py_DECREF(v);
754 return NULL;
755}
756
757/* Return a Python string holding the UTF-8 encoded value of the
758 Unicode object.
759
760 The resulting string is cached in the Unicode object for subsequent
761 usage by this function. The cached version is needed to implement
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000762 the character buffer interface and will live (at least) as long as
763 the Unicode object itself.
Guido van Rossumd57fd912000-03-10 22:53:23 +0000764
765 The refcount of the string is *not* incremented.
766
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000767 *** Exported for internal use by the interpreter only !!! ***
768
Guido van Rossumd57fd912000-03-10 22:53:23 +0000769*/
770
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000771PyObject *_PyUnicode_AsUTF8String(PyObject *unicode,
Guido van Rossumd57fd912000-03-10 22:53:23 +0000772 const char *errors)
773{
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000774 PyObject *v = ((PyUnicodeObject *)unicode)->utf8str;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000775
776 if (v)
777 return v;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000778 v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
779 PyUnicode_GET_SIZE(unicode),
Guido van Rossumd57fd912000-03-10 22:53:23 +0000780 errors);
781 if (v && errors == NULL)
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000782 ((PyUnicodeObject *)unicode)->utf8str = v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000783 return v;
784}
785
786PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
787{
788 PyObject *str;
789
790 if (!PyUnicode_Check(unicode)) {
791 PyErr_BadArgument();
792 return NULL;
793 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000794 str = _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000795 if (str == NULL)
796 return NULL;
797 Py_INCREF(str);
798 return str;
799}
800
801/* --- UTF-16 Codec ------------------------------------------------------- */
802
803static
804int utf16_decoding_error(const Py_UNICODE **source,
805 Py_UNICODE **dest,
806 const char *errors,
807 const char *details)
808{
809 if ((errors == NULL) ||
810 (strcmp(errors,"strict") == 0)) {
811 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000812 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000813 details);
814 return -1;
815 }
816 else if (strcmp(errors,"ignore") == 0) {
817 return 0;
818 }
819 else if (strcmp(errors,"replace") == 0) {
820 if (dest) {
821 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
822 (*dest)++;
823 }
824 return 0;
825 }
826 else {
827 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000828 "UTF-16 decoding error; "
829 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000830 errors);
831 return -1;
832 }
833}
834
835#define UTF16_ERROR(details) do { \
836 if (utf16_decoding_error(&q, &p, errors, details)) \
837 goto onError; \
838 continue; \
839} while(0)
840
841PyObject *PyUnicode_DecodeUTF16(const char *s,
842 int size,
843 const char *errors,
844 int *byteorder)
845{
846 PyUnicodeObject *unicode;
847 Py_UNICODE *p;
848 const Py_UNICODE *q, *e;
849 int bo = 0;
850
851 /* size should be an even number */
852 if (size % sizeof(Py_UNICODE) != 0) {
853 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
854 return NULL;
855 /* The remaining input chars are ignored if we fall through
856 here... */
857 }
858
859 /* Note: size will always be longer than the resulting Unicode
860 character count */
861 unicode = _PyUnicode_New(size);
862 if (!unicode)
863 return NULL;
864 if (size == 0)
865 return (PyObject *)unicode;
866
867 /* Unpack UTF-16 encoded data */
868 p = unicode->str;
869 q = (Py_UNICODE *)s;
870 e = q + (size / sizeof(Py_UNICODE));
871
872 if (byteorder)
873 bo = *byteorder;
874
875 while (q < e) {
876 register Py_UNICODE ch = *q++;
877
878 /* Check for BOM marks (U+FEFF) in the input and adjust
879 current byte order setting accordingly. Swap input
880 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
881 !) */
882#ifdef BYTEORDER_IS_LITTLE_ENDIAN
883 if (ch == 0xFEFF) {
884 bo = -1;
885 continue;
886 } else if (ch == 0xFFFE) {
887 bo = 1;
888 continue;
889 }
890 if (bo == 1)
891 ch = (ch >> 8) | (ch << 8);
892#else
893 if (ch == 0xFEFF) {
894 bo = 1;
895 continue;
896 } else if (ch == 0xFFFE) {
897 bo = -1;
898 continue;
899 }
900 if (bo == -1)
901 ch = (ch >> 8) | (ch << 8);
902#endif
903 if (ch < 0xD800 || ch > 0xDFFF) {
904 *p++ = ch;
905 continue;
906 }
907
908 /* UTF-16 code pair: */
909 if (q >= e)
910 UTF16_ERROR("unexpected end of data");
911 if (0xDC00 <= *q && *q <= 0xDFFF) {
912 q++;
913 if (0xD800 <= *q && *q <= 0xDBFF)
914 /* This is valid data (a UTF-16 surrogate pair), but
915 we are not able to store this information since our
916 Py_UNICODE type only has 16 bits... this might
917 change someday, even though it's unlikely. */
918 UTF16_ERROR("code pairs are not supported");
919 else
920 continue;
921 }
922 UTF16_ERROR("illegal encoding");
923 }
924
925 if (byteorder)
926 *byteorder = bo;
927
928 /* Adjust length */
929 if (_PyUnicode_Resize(unicode, p - unicode->str))
930 goto onError;
931
932 return (PyObject *)unicode;
933
934onError:
935 Py_DECREF(unicode);
936 return NULL;
937}
938
939#undef UTF16_ERROR
940
941PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
942 int size,
943 const char *errors,
944 int byteorder)
945{
946 PyObject *v;
947 Py_UNICODE *p;
948 char *q;
949
950 /* We don't create UTF-16 pairs... */
951 v = PyString_FromStringAndSize(NULL,
952 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
953 if (v == NULL)
954 return NULL;
955 if (size == 0)
956 goto done;
957
958 q = PyString_AS_STRING(v);
959 p = (Py_UNICODE *)q;
960
961 if (byteorder == 0)
962 *p++ = 0xFEFF;
963 if (byteorder == 0 ||
964#ifdef BYTEORDER_IS_LITTLE_ENDIAN
965 byteorder == -1
966#else
967 byteorder == 1
968#endif
969 )
970 memcpy(p, s, size * sizeof(Py_UNICODE));
971 else
972 while (size-- > 0) {
973 Py_UNICODE ch = *s++;
974 *p++ = (ch >> 8) | (ch << 8);
975 }
976 done:
977 return v;
978}
979
980PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
981{
982 if (!PyUnicode_Check(unicode)) {
983 PyErr_BadArgument();
984 return NULL;
985 }
986 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
987 PyUnicode_GET_SIZE(unicode),
988 NULL,
989 0);
990}
991
992/* --- Unicode Escape Codec ----------------------------------------------- */
993
994static
995int unicodeescape_decoding_error(const char **source,
996 unsigned int *x,
997 const char *errors,
998 const char *details)
999{
1000 if ((errors == NULL) ||
1001 (strcmp(errors,"strict") == 0)) {
1002 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001003 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001004 details);
1005 return -1;
1006 }
1007 else if (strcmp(errors,"ignore") == 0) {
1008 return 0;
1009 }
1010 else if (strcmp(errors,"replace") == 0) {
1011 *x = (unsigned int)Py_UNICODE_REPLACEMENT_CHARACTER;
1012 return 0;
1013 }
1014 else {
1015 PyErr_Format(PyExc_ValueError,
1016 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001017 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001018 errors);
1019 return -1;
1020 }
1021}
1022
1023PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1024 int size,
1025 const char *errors)
1026{
1027 PyUnicodeObject *v;
1028 Py_UNICODE *p = NULL, *buf = NULL;
1029 const char *end;
1030
1031 /* Escaped strings will always be longer than the resulting
1032 Unicode string, so we start with size here and then reduce the
1033 length after conversion to the true value. */
1034 v = _PyUnicode_New(size);
1035 if (v == NULL)
1036 goto onError;
1037 if (size == 0)
1038 return (PyObject *)v;
1039 p = buf = PyUnicode_AS_UNICODE(v);
1040 end = s + size;
1041 while (s < end) {
1042 unsigned char c;
1043 unsigned int x;
1044 int i;
1045
1046 /* Non-escape characters are interpreted as Unicode ordinals */
1047 if (*s != '\\') {
1048 *p++ = (unsigned char)*s++;
1049 continue;
1050 }
1051
1052 /* \ - Escapes */
1053 s++;
1054 switch (*s++) {
1055
1056 /* \x escapes */
1057 case '\n': break;
1058 case '\\': *p++ = '\\'; break;
1059 case '\'': *p++ = '\''; break;
1060 case '\"': *p++ = '\"'; break;
1061 case 'b': *p++ = '\b'; break;
1062 case 'f': *p++ = '\014'; break; /* FF */
1063 case 't': *p++ = '\t'; break;
1064 case 'n': *p++ = '\n'; break;
1065 case 'r': *p++ = '\r'; break;
1066 case 'v': *p++ = '\013'; break; /* VT */
1067 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1068
1069 /* \OOO (octal) escapes */
1070 case '0': case '1': case '2': case '3':
1071 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001072 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001073 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001074 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001076 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001077 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001078 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079 break;
1080
1081 /* \xXXXX escape with 0-4 hex digits */
1082 case 'x':
1083 x = 0;
1084 c = (unsigned char)*s;
1085 if (isxdigit(c)) {
1086 do {
1087 x = (x<<4) & ~0xF;
1088 if ('0' <= c && c <= '9')
1089 x += c - '0';
1090 else if ('a' <= c && c <= 'f')
1091 x += 10 + c - 'a';
1092 else
1093 x += 10 + c - 'A';
1094 c = (unsigned char)*++s;
1095 } while (isxdigit(c));
1096 *p++ = x;
1097 } else {
1098 *p++ = '\\';
1099 *p++ = (unsigned char)s[-1];
1100 }
1101 break;
1102
1103 /* \uXXXX with 4 hex digits */
1104 case 'u':
1105 for (x = 0, i = 0; i < 4; i++) {
1106 c = (unsigned char)s[i];
1107 if (!isxdigit(c)) {
1108 if (unicodeescape_decoding_error(&s, &x, errors,
1109 "truncated \\uXXXX"))
1110 goto onError;
1111 i++;
1112 break;
1113 }
1114 x = (x<<4) & ~0xF;
1115 if (c >= '0' && c <= '9')
1116 x += c - '0';
1117 else if (c >= 'a' && c <= 'f')
1118 x += 10 + c - 'a';
1119 else
1120 x += 10 + c - 'A';
1121 }
1122 s += i;
1123 *p++ = x;
1124 break;
1125
1126 default:
1127 *p++ = '\\';
1128 *p++ = (unsigned char)s[-1];
1129 break;
1130 }
1131 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001132 if (_PyUnicode_Resize(v, (int)(p - buf)))
1133 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001134 return (PyObject *)v;
1135
1136 onError:
1137 Py_XDECREF(v);
1138 return NULL;
1139}
1140
1141/* Return a Unicode-Escape string version of the Unicode object.
1142
1143 If quotes is true, the string is enclosed in u"" or u'' quotes as
1144 appropriate.
1145
1146*/
1147
Barry Warsaw51ac5802000-03-20 16:36:48 +00001148static const Py_UNICODE *findchar(const Py_UNICODE *s,
1149 int size,
1150 Py_UNICODE ch);
1151
Guido van Rossumd57fd912000-03-10 22:53:23 +00001152static
1153PyObject *unicodeescape_string(const Py_UNICODE *s,
1154 int size,
1155 int quotes)
1156{
1157 PyObject *repr;
1158 char *p;
1159 char *q;
1160
1161 static const char *hexdigit = "0123456789ABCDEF";
1162
1163 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1164 if (repr == NULL)
1165 return NULL;
1166
1167 p = q = PyString_AS_STRING(repr);
1168
1169 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001170 *p++ = 'u';
1171 *p++ = (findchar(s, size, '\'') &&
1172 !findchar(s, size, '"')) ? '"' : '\'';
1173 }
1174 while (size-- > 0) {
1175 Py_UNICODE ch = *s++;
1176 /* Escape quotes */
1177 if (quotes && (ch == q[1] || ch == '\\')) {
1178 *p++ = '\\';
1179 *p++ = (char) ch;
1180 }
1181 /* Map 16-bit characters to '\uxxxx' */
1182 else if (ch >= 256) {
1183 *p++ = '\\';
1184 *p++ = 'u';
1185 *p++ = hexdigit[(ch >> 12) & 0xf];
1186 *p++ = hexdigit[(ch >> 8) & 0xf];
1187 *p++ = hexdigit[(ch >> 4) & 0xf];
1188 *p++ = hexdigit[ch & 15];
1189 }
1190 /* Map non-printable US ASCII to '\ooo' */
1191 else if (ch < ' ' || ch >= 128) {
1192 *p++ = '\\';
1193 *p++ = hexdigit[(ch >> 6) & 7];
1194 *p++ = hexdigit[(ch >> 3) & 7];
1195 *p++ = hexdigit[ch & 7];
1196 }
1197 /* Copy everything else as-is */
1198 else
1199 *p++ = (char) ch;
1200 }
1201 if (quotes)
1202 *p++ = q[1];
1203
1204 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001205 if (_PyString_Resize(&repr, p - q))
1206 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001207
1208 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001209
1210 onError:
1211 Py_DECREF(repr);
1212 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213}
1214
1215PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1216 int size)
1217{
1218 return unicodeescape_string(s, size, 0);
1219}
1220
1221PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1222{
1223 if (!PyUnicode_Check(unicode)) {
1224 PyErr_BadArgument();
1225 return NULL;
1226 }
1227 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1228 PyUnicode_GET_SIZE(unicode));
1229}
1230
1231/* --- Raw Unicode Escape Codec ------------------------------------------- */
1232
1233PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1234 int size,
1235 const char *errors)
1236{
1237 PyUnicodeObject *v;
1238 Py_UNICODE *p, *buf;
1239 const char *end;
1240 const char *bs;
1241
1242 /* Escaped strings will always be longer than the resulting
1243 Unicode string, so we start with size here and then reduce the
1244 length after conversion to the true value. */
1245 v = _PyUnicode_New(size);
1246 if (v == NULL)
1247 goto onError;
1248 if (size == 0)
1249 return (PyObject *)v;
1250 p = buf = PyUnicode_AS_UNICODE(v);
1251 end = s + size;
1252 while (s < end) {
1253 unsigned char c;
1254 unsigned int x;
1255 int i;
1256
1257 /* Non-escape characters are interpreted as Unicode ordinals */
1258 if (*s != '\\') {
1259 *p++ = (unsigned char)*s++;
1260 continue;
1261 }
1262
1263 /* \u-escapes are only interpreted iff the number of leading
1264 backslashes if odd */
1265 bs = s;
1266 for (;s < end;) {
1267 if (*s != '\\')
1268 break;
1269 *p++ = (unsigned char)*s++;
1270 }
1271 if (((s - bs) & 1) == 0 ||
1272 s >= end ||
1273 *s != 'u') {
1274 continue;
1275 }
1276 p--;
1277 s++;
1278
1279 /* \uXXXX with 4 hex digits */
1280 for (x = 0, i = 0; i < 4; i++) {
1281 c = (unsigned char)s[i];
1282 if (!isxdigit(c)) {
1283 if (unicodeescape_decoding_error(&s, &x, errors,
1284 "truncated \\uXXXX"))
1285 goto onError;
1286 i++;
1287 break;
1288 }
1289 x = (x<<4) & ~0xF;
1290 if (c >= '0' && c <= '9')
1291 x += c - '0';
1292 else if (c >= 'a' && c <= 'f')
1293 x += 10 + c - 'a';
1294 else
1295 x += 10 + c - 'A';
1296 }
1297 s += i;
1298 *p++ = x;
1299 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001300 if (_PyUnicode_Resize(v, (int)(p - buf)))
1301 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001302 return (PyObject *)v;
1303
1304 onError:
1305 Py_XDECREF(v);
1306 return NULL;
1307}
1308
1309PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1310 int size)
1311{
1312 PyObject *repr;
1313 char *p;
1314 char *q;
1315
1316 static const char *hexdigit = "0123456789ABCDEF";
1317
1318 repr = PyString_FromStringAndSize(NULL, 6 * size);
1319 if (repr == NULL)
1320 return NULL;
1321
1322 p = q = PyString_AS_STRING(repr);
1323 while (size-- > 0) {
1324 Py_UNICODE ch = *s++;
1325 /* Map 16-bit characters to '\uxxxx' */
1326 if (ch >= 256) {
1327 *p++ = '\\';
1328 *p++ = 'u';
1329 *p++ = hexdigit[(ch >> 12) & 0xf];
1330 *p++ = hexdigit[(ch >> 8) & 0xf];
1331 *p++ = hexdigit[(ch >> 4) & 0xf];
1332 *p++ = hexdigit[ch & 15];
1333 }
1334 /* Copy everything else as-is */
1335 else
1336 *p++ = (char) ch;
1337 }
1338 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001339 if (_PyString_Resize(&repr, p - q))
1340 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001341
1342 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001343
1344 onError:
1345 Py_DECREF(repr);
1346 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001347}
1348
1349PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1350{
1351 if (!PyUnicode_Check(unicode)) {
1352 PyErr_BadArgument();
1353 return NULL;
1354 }
1355 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1356 PyUnicode_GET_SIZE(unicode));
1357}
1358
1359/* --- Latin-1 Codec ------------------------------------------------------ */
1360
1361PyObject *PyUnicode_DecodeLatin1(const char *s,
1362 int size,
1363 const char *errors)
1364{
1365 PyUnicodeObject *v;
1366 Py_UNICODE *p;
1367
1368 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1369 v = _PyUnicode_New(size);
1370 if (v == NULL)
1371 goto onError;
1372 if (size == 0)
1373 return (PyObject *)v;
1374 p = PyUnicode_AS_UNICODE(v);
1375 while (size-- > 0)
1376 *p++ = (unsigned char)*s++;
1377 return (PyObject *)v;
1378
1379 onError:
1380 Py_XDECREF(v);
1381 return NULL;
1382}
1383
1384static
1385int latin1_encoding_error(const Py_UNICODE **source,
1386 char **dest,
1387 const char *errors,
1388 const char *details)
1389{
1390 if ((errors == NULL) ||
1391 (strcmp(errors,"strict") == 0)) {
1392 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001393 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001394 details);
1395 return -1;
1396 }
1397 else if (strcmp(errors,"ignore") == 0) {
1398 return 0;
1399 }
1400 else if (strcmp(errors,"replace") == 0) {
1401 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001402 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001403 return 0;
1404 }
1405 else {
1406 PyErr_Format(PyExc_ValueError,
1407 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001408 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001409 errors);
1410 return -1;
1411 }
1412}
1413
1414PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1415 int size,
1416 const char *errors)
1417{
1418 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001419 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001420 repr = PyString_FromStringAndSize(NULL, size);
1421 if (repr == NULL)
1422 return NULL;
1423
1424 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001425 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001426 while (size-- > 0) {
1427 Py_UNICODE ch = *p++;
1428 if (ch >= 256) {
1429 if (latin1_encoding_error(&p, &s, errors,
1430 "ordinal not in range(256)"))
1431 goto onError;
1432 }
1433 else
1434 *s++ = (char)ch;
1435 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001436 /* Resize if error handling skipped some characters */
1437 if (s - start < PyString_GET_SIZE(repr))
1438 if (_PyString_Resize(&repr, s - start))
1439 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001440 return repr;
1441
1442 onError:
1443 Py_DECREF(repr);
1444 return NULL;
1445}
1446
1447PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1448{
1449 if (!PyUnicode_Check(unicode)) {
1450 PyErr_BadArgument();
1451 return NULL;
1452 }
1453 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1454 PyUnicode_GET_SIZE(unicode),
1455 NULL);
1456}
1457
1458/* --- 7-bit ASCII Codec -------------------------------------------------- */
1459
1460static
1461int ascii_decoding_error(const char **source,
1462 Py_UNICODE **dest,
1463 const char *errors,
1464 const char *details)
1465{
1466 if ((errors == NULL) ||
1467 (strcmp(errors,"strict") == 0)) {
1468 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001469 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001470 details);
1471 return -1;
1472 }
1473 else if (strcmp(errors,"ignore") == 0) {
1474 return 0;
1475 }
1476 else if (strcmp(errors,"replace") == 0) {
1477 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1478 (*dest)++;
1479 return 0;
1480 }
1481 else {
1482 PyErr_Format(PyExc_ValueError,
1483 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001484 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001485 errors);
1486 return -1;
1487 }
1488}
1489
1490PyObject *PyUnicode_DecodeASCII(const char *s,
1491 int size,
1492 const char *errors)
1493{
1494 PyUnicodeObject *v;
1495 Py_UNICODE *p;
1496
1497 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1498 v = _PyUnicode_New(size);
1499 if (v == NULL)
1500 goto onError;
1501 if (size == 0)
1502 return (PyObject *)v;
1503 p = PyUnicode_AS_UNICODE(v);
1504 while (size-- > 0) {
1505 register unsigned char c;
1506
1507 c = (unsigned char)*s++;
1508 if (c < 128)
1509 *p++ = c;
1510 else if (ascii_decoding_error(&s, &p, errors,
1511 "ordinal not in range(128)"))
1512 goto onError;
1513 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001514 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1515 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1516 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001517 return (PyObject *)v;
1518
1519 onError:
1520 Py_XDECREF(v);
1521 return NULL;
1522}
1523
1524static
1525int ascii_encoding_error(const Py_UNICODE **source,
1526 char **dest,
1527 const char *errors,
1528 const char *details)
1529{
1530 if ((errors == NULL) ||
1531 (strcmp(errors,"strict") == 0)) {
1532 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001533 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001534 details);
1535 return -1;
1536 }
1537 else if (strcmp(errors,"ignore") == 0) {
1538 return 0;
1539 }
1540 else if (strcmp(errors,"replace") == 0) {
1541 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001542 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001543 return 0;
1544 }
1545 else {
1546 PyErr_Format(PyExc_ValueError,
1547 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001548 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001549 errors);
1550 return -1;
1551 }
1552}
1553
1554PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1555 int size,
1556 const char *errors)
1557{
1558 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001559 char *s, *start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001560 repr = PyString_FromStringAndSize(NULL, size);
1561 if (repr == NULL)
1562 return NULL;
1563
1564 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001565 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001566 while (size-- > 0) {
1567 Py_UNICODE ch = *p++;
1568 if (ch >= 128) {
1569 if (ascii_encoding_error(&p, &s, errors,
1570 "ordinal not in range(128)"))
1571 goto onError;
1572 }
1573 else
1574 *s++ = (char)ch;
1575 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001576 /* Resize if error handling skipped some characters */
1577 if (s - start < PyString_GET_SIZE(repr))
1578 if (_PyString_Resize(&repr, s - start))
1579 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001580 return repr;
1581
1582 onError:
1583 Py_DECREF(repr);
1584 return NULL;
1585}
1586
1587PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1588{
1589 if (!PyUnicode_Check(unicode)) {
1590 PyErr_BadArgument();
1591 return NULL;
1592 }
1593 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1594 PyUnicode_GET_SIZE(unicode),
1595 NULL);
1596}
1597
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001598#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001599
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001600/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001601
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001602PyObject *PyUnicode_DecodeMBCS(const char *s,
1603 int size,
1604 const char *errors)
1605{
1606 PyUnicodeObject *v;
1607 Py_UNICODE *p;
1608
1609 /* First get the size of the result */
1610 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001611 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001612 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1613
1614 v = _PyUnicode_New(usize);
1615 if (v == NULL)
1616 return NULL;
1617 if (usize == 0)
1618 return (PyObject *)v;
1619 p = PyUnicode_AS_UNICODE(v);
1620 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1621 Py_DECREF(v);
1622 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1623 }
1624
1625 return (PyObject *)v;
1626}
1627
1628PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1629 int size,
1630 const char *errors)
1631{
1632 PyObject *repr;
1633 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001634 DWORD mbcssize;
1635
1636 /* If there are no characters, bail now! */
1637 if (size==0)
1638 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001639
1640 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001641 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001642 if (mbcssize==0)
1643 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1644
1645 repr = PyString_FromStringAndSize(NULL, mbcssize);
1646 if (repr == NULL)
1647 return NULL;
1648 if (mbcssize==0)
1649 return repr;
1650
1651 /* Do the conversion */
1652 s = PyString_AS_STRING(repr);
1653 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1654 Py_DECREF(repr);
1655 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1656 }
1657 return repr;
1658}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001659
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001660#endif /* MS_WIN32 */
1661
Guido van Rossumd57fd912000-03-10 22:53:23 +00001662/* --- Character Mapping Codec -------------------------------------------- */
1663
1664static
1665int charmap_decoding_error(const char **source,
1666 Py_UNICODE **dest,
1667 const char *errors,
1668 const char *details)
1669{
1670 if ((errors == NULL) ||
1671 (strcmp(errors,"strict") == 0)) {
1672 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001673 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001674 details);
1675 return -1;
1676 }
1677 else if (strcmp(errors,"ignore") == 0) {
1678 return 0;
1679 }
1680 else if (strcmp(errors,"replace") == 0) {
1681 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1682 (*dest)++;
1683 return 0;
1684 }
1685 else {
1686 PyErr_Format(PyExc_ValueError,
1687 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001688 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001689 errors);
1690 return -1;
1691 }
1692}
1693
1694PyObject *PyUnicode_DecodeCharmap(const char *s,
1695 int size,
1696 PyObject *mapping,
1697 const char *errors)
1698{
1699 PyUnicodeObject *v;
1700 Py_UNICODE *p;
1701
1702 /* Default to Latin-1 */
1703 if (mapping == NULL)
1704 return PyUnicode_DecodeLatin1(s, size, errors);
1705
1706 v = _PyUnicode_New(size);
1707 if (v == NULL)
1708 goto onError;
1709 if (size == 0)
1710 return (PyObject *)v;
1711 p = PyUnicode_AS_UNICODE(v);
1712 while (size-- > 0) {
1713 unsigned char ch = *s++;
1714 PyObject *w, *x;
1715
1716 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1717 w = PyInt_FromLong((long)ch);
1718 if (w == NULL)
1719 goto onError;
1720 x = PyObject_GetItem(mapping, w);
1721 Py_DECREF(w);
1722 if (x == NULL) {
1723 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1724 /* No mapping found: default to Latin-1 mapping */
1725 PyErr_Clear();
1726 *p++ = (Py_UNICODE)ch;
1727 continue;
1728 }
1729 goto onError;
1730 }
1731
1732 /* Apply mapping */
1733 if (PyInt_Check(x)) {
1734 int value = PyInt_AS_LONG(x);
1735 if (value < 0 || value > 65535) {
1736 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001737 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001738 Py_DECREF(x);
1739 goto onError;
1740 }
1741 *p++ = (Py_UNICODE)value;
1742 }
1743 else if (x == Py_None) {
1744 /* undefined mapping */
1745 if (charmap_decoding_error(&s, &p, errors,
1746 "character maps to <undefined>")) {
1747 Py_DECREF(x);
1748 goto onError;
1749 }
1750 }
1751 else if (PyUnicode_Check(x)) {
1752 if (PyUnicode_GET_SIZE(x) != 1) {
1753 /* 1-n mapping */
1754 PyErr_SetString(PyExc_NotImplementedError,
1755 "1-n mappings are currently not implemented");
1756 Py_DECREF(x);
1757 goto onError;
1758 }
1759 *p++ = *PyUnicode_AS_UNICODE(x);
1760 }
1761 else {
1762 /* wrong return value */
1763 PyErr_SetString(PyExc_TypeError,
1764 "character mapping must return integer, None or unicode");
1765 Py_DECREF(x);
1766 goto onError;
1767 }
1768 Py_DECREF(x);
1769 }
1770 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
1771 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1772 goto onError;
1773 return (PyObject *)v;
1774
1775 onError:
1776 Py_XDECREF(v);
1777 return NULL;
1778}
1779
1780static
1781int charmap_encoding_error(const Py_UNICODE **source,
1782 char **dest,
1783 const char *errors,
1784 const char *details)
1785{
1786 if ((errors == NULL) ||
1787 (strcmp(errors,"strict") == 0)) {
1788 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001789 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790 details);
1791 return -1;
1792 }
1793 else if (strcmp(errors,"ignore") == 0) {
1794 return 0;
1795 }
1796 else if (strcmp(errors,"replace") == 0) {
1797 **dest = '?';
1798 (*dest)++;
1799 return 0;
1800 }
1801 else {
1802 PyErr_Format(PyExc_ValueError,
1803 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001804 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001805 errors);
1806 return -1;
1807 }
1808}
1809
1810PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
1811 int size,
1812 PyObject *mapping,
1813 const char *errors)
1814{
1815 PyObject *v;
1816 char *s;
1817
1818 /* Default to Latin-1 */
1819 if (mapping == NULL)
1820 return PyUnicode_EncodeLatin1(p, size, errors);
1821
1822 v = PyString_FromStringAndSize(NULL, size);
1823 if (v == NULL)
1824 return NULL;
1825 s = PyString_AS_STRING(v);
1826 while (size-- > 0) {
1827 Py_UNICODE ch = *p++;
1828 PyObject *w, *x;
1829
1830 /* Get mapping (Unicode ordinal -> string char, integer or None) */
1831 w = PyInt_FromLong((long)ch);
1832 if (w == NULL)
1833 goto onError;
1834 x = PyObject_GetItem(mapping, w);
1835 Py_DECREF(w);
1836 if (x == NULL) {
1837 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1838 /* No mapping found: default to Latin-1 mapping if possible */
1839 PyErr_Clear();
1840 if (ch < 256) {
1841 *s++ = (char)ch;
1842 continue;
1843 }
1844 else if (!charmap_encoding_error(&p, &s, errors,
1845 "missing character mapping"))
1846 continue;
1847 }
1848 goto onError;
1849 }
1850
1851 /* Apply mapping */
1852 if (PyInt_Check(x)) {
1853 int value = PyInt_AS_LONG(x);
1854 if (value < 0 || value > 255) {
1855 PyErr_SetString(PyExc_TypeError,
1856 "character mapping must be in range(256)");
1857 Py_DECREF(x);
1858 goto onError;
1859 }
1860 *s++ = (char)value;
1861 }
1862 else if (x == Py_None) {
1863 /* undefined mapping */
1864 if (charmap_encoding_error(&p, &s, errors,
1865 "character maps to <undefined>")) {
1866 Py_DECREF(x);
1867 goto onError;
1868 }
1869 }
1870 else if (PyString_Check(x)) {
1871 if (PyString_GET_SIZE(x) != 1) {
1872 /* 1-n mapping */
1873 PyErr_SetString(PyExc_NotImplementedError,
1874 "1-n mappings are currently not implemented");
1875 Py_DECREF(x);
1876 goto onError;
1877 }
1878 *s++ = *PyString_AS_STRING(x);
1879 }
1880 else {
1881 /* wrong return value */
1882 PyErr_SetString(PyExc_TypeError,
1883 "character mapping must return integer, None or unicode");
1884 Py_DECREF(x);
1885 goto onError;
1886 }
1887 Py_DECREF(x);
1888 }
1889 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
1890 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
1891 goto onError;
1892 return v;
1893
1894 onError:
1895 Py_DECREF(v);
1896 return NULL;
1897}
1898
1899PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
1900 PyObject *mapping)
1901{
1902 if (!PyUnicode_Check(unicode) || mapping == NULL) {
1903 PyErr_BadArgument();
1904 return NULL;
1905 }
1906 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
1907 PyUnicode_GET_SIZE(unicode),
1908 mapping,
1909 NULL);
1910}
1911
1912static
1913int translate_error(const Py_UNICODE **source,
1914 Py_UNICODE **dest,
1915 const char *errors,
1916 const char *details)
1917{
1918 if ((errors == NULL) ||
1919 (strcmp(errors,"strict") == 0)) {
1920 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001921 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001922 details);
1923 return -1;
1924 }
1925 else if (strcmp(errors,"ignore") == 0) {
1926 return 0;
1927 }
1928 else if (strcmp(errors,"replace") == 0) {
1929 **dest = '?';
1930 (*dest)++;
1931 return 0;
1932 }
1933 else {
1934 PyErr_Format(PyExc_ValueError,
1935 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001936 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001937 errors);
1938 return -1;
1939 }
1940}
1941
1942PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
1943 int size,
1944 PyObject *mapping,
1945 const char *errors)
1946{
1947 PyUnicodeObject *v;
1948 Py_UNICODE *p;
1949
1950 if (mapping == NULL) {
1951 PyErr_BadArgument();
1952 return NULL;
1953 }
1954
1955 /* Output will never be longer than input */
1956 v = _PyUnicode_New(size);
1957 if (v == NULL)
1958 goto onError;
1959 if (size == 0)
1960 goto done;
1961 p = PyUnicode_AS_UNICODE(v);
1962 while (size-- > 0) {
1963 Py_UNICODE ch = *s++;
1964 PyObject *w, *x;
1965
1966 /* Get mapping */
1967 w = PyInt_FromLong(ch);
1968 if (w == NULL)
1969 goto onError;
1970 x = PyObject_GetItem(mapping, w);
1971 Py_DECREF(w);
1972 if (x == NULL) {
1973 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
1974 /* No mapping found: default to 1-1 mapping */
1975 PyErr_Clear();
1976 *p++ = ch;
1977 continue;
1978 }
1979 goto onError;
1980 }
1981
1982 /* Apply mapping */
1983 if (PyInt_Check(x))
1984 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
1985 else if (x == Py_None) {
1986 /* undefined mapping */
1987 if (translate_error(&s, &p, errors,
1988 "character maps to <undefined>")) {
1989 Py_DECREF(x);
1990 goto onError;
1991 }
1992 }
1993 else if (PyUnicode_Check(x)) {
1994 if (PyUnicode_GET_SIZE(x) != 1) {
1995 /* 1-n mapping */
1996 PyErr_SetString(PyExc_NotImplementedError,
1997 "1-n mappings are currently not implemented");
1998 Py_DECREF(x);
1999 goto onError;
2000 }
2001 *p++ = *PyUnicode_AS_UNICODE(x);
2002 }
2003 else {
2004 /* wrong return value */
2005 PyErr_SetString(PyExc_TypeError,
2006 "translate mapping must return integer, None or unicode");
2007 Py_DECREF(x);
2008 goto onError;
2009 }
2010 Py_DECREF(x);
2011 }
2012 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002013 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2014 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002015
2016 done:
2017 return (PyObject *)v;
2018
2019 onError:
2020 Py_XDECREF(v);
2021 return NULL;
2022}
2023
2024PyObject *PyUnicode_Translate(PyObject *str,
2025 PyObject *mapping,
2026 const char *errors)
2027{
2028 PyObject *result;
2029
2030 str = PyUnicode_FromObject(str);
2031 if (str == NULL)
2032 goto onError;
2033 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2034 PyUnicode_GET_SIZE(str),
2035 mapping,
2036 errors);
2037 Py_DECREF(str);
2038 return result;
2039
2040 onError:
2041 Py_XDECREF(str);
2042 return NULL;
2043}
2044
Guido van Rossum9e896b32000-04-05 20:11:21 +00002045/* --- Decimal Encoder ---------------------------------------------------- */
2046
2047int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2048 int length,
2049 char *output,
2050 const char *errors)
2051{
2052 Py_UNICODE *p, *end;
2053
2054 if (output == NULL) {
2055 PyErr_BadArgument();
2056 return -1;
2057 }
2058
2059 p = s;
2060 end = s + length;
2061 while (p < end) {
2062 register Py_UNICODE ch = *p++;
2063 int decimal;
2064
2065 if (Py_UNICODE_ISSPACE(ch)) {
2066 *output++ = ' ';
2067 continue;
2068 }
2069 decimal = Py_UNICODE_TODECIMAL(ch);
2070 if (decimal >= 0) {
2071 *output++ = '0' + decimal;
2072 continue;
2073 }
Guido van Rossumba477042000-04-06 18:18:10 +00002074 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002075 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002076 continue;
2077 }
2078 /* All other characters are considered invalid */
2079 if (errors == NULL || strcmp(errors, "strict") == 0) {
2080 PyErr_SetString(PyExc_ValueError,
2081 "invalid decimal Unicode string");
2082 goto onError;
2083 }
2084 else if (strcmp(errors, "ignore") == 0)
2085 continue;
2086 else if (strcmp(errors, "replace") == 0) {
2087 *output++ = '?';
2088 continue;
2089 }
2090 }
2091 /* 0-terminate the output string */
2092 *output++ = '\0';
2093 return 0;
2094
2095 onError:
2096 return -1;
2097}
2098
Guido van Rossumd57fd912000-03-10 22:53:23 +00002099/* --- Helpers ------------------------------------------------------------ */
2100
2101static
2102int count(PyUnicodeObject *self,
2103 int start,
2104 int end,
2105 PyUnicodeObject *substring)
2106{
2107 int count = 0;
2108
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002109 if (substring->length == 0)
2110 return (end - start + 1);
2111
Guido van Rossumd57fd912000-03-10 22:53:23 +00002112 end -= substring->length;
2113
2114 while (start <= end)
2115 if (Py_UNICODE_MATCH(self, start, substring)) {
2116 count++;
2117 start += substring->length;
2118 } else
2119 start++;
2120
2121 return count;
2122}
2123
2124int PyUnicode_Count(PyObject *str,
2125 PyObject *substr,
2126 int start,
2127 int end)
2128{
2129 int result;
2130
2131 str = PyUnicode_FromObject(str);
2132 if (str == NULL)
2133 return -1;
2134 substr = PyUnicode_FromObject(substr);
2135 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002136 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002137 return -1;
2138 }
2139
2140 result = count((PyUnicodeObject *)str,
2141 start, end,
2142 (PyUnicodeObject *)substr);
2143
2144 Py_DECREF(str);
2145 Py_DECREF(substr);
2146 return result;
2147}
2148
2149static
2150int findstring(PyUnicodeObject *self,
2151 PyUnicodeObject *substring,
2152 int start,
2153 int end,
2154 int direction)
2155{
2156 if (start < 0)
2157 start += self->length;
2158 if (start < 0)
2159 start = 0;
2160
2161 if (substring->length == 0)
2162 return start;
2163
2164 if (end > self->length)
2165 end = self->length;
2166 if (end < 0)
2167 end += self->length;
2168 if (end < 0)
2169 end = 0;
2170
2171 end -= substring->length;
2172
2173 if (direction < 0) {
2174 for (; end >= start; end--)
2175 if (Py_UNICODE_MATCH(self, end, substring))
2176 return end;
2177 } else {
2178 for (; start <= end; start++)
2179 if (Py_UNICODE_MATCH(self, start, substring))
2180 return start;
2181 }
2182
2183 return -1;
2184}
2185
2186int PyUnicode_Find(PyObject *str,
2187 PyObject *substr,
2188 int start,
2189 int end,
2190 int direction)
2191{
2192 int result;
2193
2194 str = PyUnicode_FromObject(str);
2195 if (str == NULL)
2196 return -1;
2197 substr = PyUnicode_FromObject(substr);
2198 if (substr == NULL) {
2199 Py_DECREF(substr);
2200 return -1;
2201 }
2202
2203 result = findstring((PyUnicodeObject *)str,
2204 (PyUnicodeObject *)substr,
2205 start, end, direction);
2206 Py_DECREF(str);
2207 Py_DECREF(substr);
2208 return result;
2209}
2210
2211static
2212int tailmatch(PyUnicodeObject *self,
2213 PyUnicodeObject *substring,
2214 int start,
2215 int end,
2216 int direction)
2217{
2218 if (start < 0)
2219 start += self->length;
2220 if (start < 0)
2221 start = 0;
2222
2223 if (substring->length == 0)
2224 return 1;
2225
2226 if (end > self->length)
2227 end = self->length;
2228 if (end < 0)
2229 end += self->length;
2230 if (end < 0)
2231 end = 0;
2232
2233 end -= substring->length;
2234 if (end < start)
2235 return 0;
2236
2237 if (direction > 0) {
2238 if (Py_UNICODE_MATCH(self, end, substring))
2239 return 1;
2240 } else {
2241 if (Py_UNICODE_MATCH(self, start, substring))
2242 return 1;
2243 }
2244
2245 return 0;
2246}
2247
2248int PyUnicode_Tailmatch(PyObject *str,
2249 PyObject *substr,
2250 int start,
2251 int end,
2252 int direction)
2253{
2254 int result;
2255
2256 str = PyUnicode_FromObject(str);
2257 if (str == NULL)
2258 return -1;
2259 substr = PyUnicode_FromObject(substr);
2260 if (substr == NULL) {
2261 Py_DECREF(substr);
2262 return -1;
2263 }
2264
2265 result = tailmatch((PyUnicodeObject *)str,
2266 (PyUnicodeObject *)substr,
2267 start, end, direction);
2268 Py_DECREF(str);
2269 Py_DECREF(substr);
2270 return result;
2271}
2272
2273static
2274const Py_UNICODE *findchar(const Py_UNICODE *s,
2275 int size,
2276 Py_UNICODE ch)
2277{
2278 /* like wcschr, but doesn't stop at NULL characters */
2279
2280 while (size-- > 0) {
2281 if (*s == ch)
2282 return s;
2283 s++;
2284 }
2285
2286 return NULL;
2287}
2288
2289/* Apply fixfct filter to the Unicode object self and return a
2290 reference to the modified object */
2291
2292static
2293PyObject *fixup(PyUnicodeObject *self,
2294 int (*fixfct)(PyUnicodeObject *s))
2295{
2296
2297 PyUnicodeObject *u;
2298
2299 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2300 self->length);
2301 if (u == NULL)
2302 return NULL;
2303 if (!fixfct(u)) {
2304 /* fixfct should return TRUE if it modified the buffer. If
2305 FALSE, return a reference to the original buffer instead
2306 (to save space, not time) */
2307 Py_INCREF(self);
2308 Py_DECREF(u);
2309 return (PyObject*) self;
2310 }
2311 return (PyObject*) u;
2312}
2313
2314static
2315int fixupper(PyUnicodeObject *self)
2316{
2317 int len = self->length;
2318 Py_UNICODE *s = self->str;
2319 int status = 0;
2320
2321 while (len-- > 0) {
2322 register Py_UNICODE ch;
2323
2324 ch = Py_UNICODE_TOUPPER(*s);
2325 if (ch != *s) {
2326 status = 1;
2327 *s = ch;
2328 }
2329 s++;
2330 }
2331
2332 return status;
2333}
2334
2335static
2336int fixlower(PyUnicodeObject *self)
2337{
2338 int len = self->length;
2339 Py_UNICODE *s = self->str;
2340 int status = 0;
2341
2342 while (len-- > 0) {
2343 register Py_UNICODE ch;
2344
2345 ch = Py_UNICODE_TOLOWER(*s);
2346 if (ch != *s) {
2347 status = 1;
2348 *s = ch;
2349 }
2350 s++;
2351 }
2352
2353 return status;
2354}
2355
2356static
2357int fixswapcase(PyUnicodeObject *self)
2358{
2359 int len = self->length;
2360 Py_UNICODE *s = self->str;
2361 int status = 0;
2362
2363 while (len-- > 0) {
2364 if (Py_UNICODE_ISUPPER(*s)) {
2365 *s = Py_UNICODE_TOLOWER(*s);
2366 status = 1;
2367 } else if (Py_UNICODE_ISLOWER(*s)) {
2368 *s = Py_UNICODE_TOUPPER(*s);
2369 status = 1;
2370 }
2371 s++;
2372 }
2373
2374 return status;
2375}
2376
2377static
2378int fixcapitalize(PyUnicodeObject *self)
2379{
2380 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2381 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2382 return 1;
2383 }
2384 return 0;
2385}
2386
2387static
2388int fixtitle(PyUnicodeObject *self)
2389{
2390 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2391 register Py_UNICODE *e;
2392 int previous_is_cased;
2393
2394 /* Shortcut for single character strings */
2395 if (PyUnicode_GET_SIZE(self) == 1) {
2396 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2397 if (*p != ch) {
2398 *p = ch;
2399 return 1;
2400 }
2401 else
2402 return 0;
2403 }
2404
2405 e = p + PyUnicode_GET_SIZE(self);
2406 previous_is_cased = 0;
2407 for (; p < e; p++) {
2408 register const Py_UNICODE ch = *p;
2409
2410 if (previous_is_cased)
2411 *p = Py_UNICODE_TOLOWER(ch);
2412 else
2413 *p = Py_UNICODE_TOTITLE(ch);
2414
2415 if (Py_UNICODE_ISLOWER(ch) ||
2416 Py_UNICODE_ISUPPER(ch) ||
2417 Py_UNICODE_ISTITLE(ch))
2418 previous_is_cased = 1;
2419 else
2420 previous_is_cased = 0;
2421 }
2422 return 1;
2423}
2424
2425PyObject *PyUnicode_Join(PyObject *separator,
2426 PyObject *seq)
2427{
2428 Py_UNICODE *sep;
2429 int seplen;
2430 PyUnicodeObject *res = NULL;
2431 int reslen = 0;
2432 Py_UNICODE *p;
2433 int seqlen = 0;
2434 int sz = 100;
2435 int i;
2436
2437 seqlen = PySequence_Length(seq);
2438 if (seqlen < 0 && PyErr_Occurred())
2439 return NULL;
2440
2441 if (separator == NULL) {
2442 Py_UNICODE blank = ' ';
2443 sep = &blank;
2444 seplen = 1;
2445 }
2446 else {
2447 separator = PyUnicode_FromObject(separator);
2448 if (separator == NULL)
2449 return NULL;
2450 sep = PyUnicode_AS_UNICODE(separator);
2451 seplen = PyUnicode_GET_SIZE(separator);
2452 }
2453
2454 res = _PyUnicode_New(sz);
2455 if (res == NULL)
2456 goto onError;
2457 p = PyUnicode_AS_UNICODE(res);
2458 reslen = 0;
2459
2460 for (i = 0; i < seqlen; i++) {
2461 int itemlen;
2462 PyObject *item;
2463
2464 item = PySequence_GetItem(seq, i);
2465 if (item == NULL)
2466 goto onError;
2467 if (!PyUnicode_Check(item)) {
2468 PyObject *v;
2469 v = PyUnicode_FromObject(item);
2470 Py_DECREF(item);
2471 item = v;
2472 if (item == NULL)
2473 goto onError;
2474 }
2475 itemlen = PyUnicode_GET_SIZE(item);
2476 while (reslen + itemlen + seplen >= sz) {
2477 if (_PyUnicode_Resize(res, sz*2))
2478 goto onError;
2479 sz *= 2;
2480 p = PyUnicode_AS_UNICODE(res) + reslen;
2481 }
2482 if (i > 0) {
2483 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2484 p += seplen;
2485 reslen += seplen;
2486 }
2487 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2488 p += itemlen;
2489 reslen += itemlen;
2490 Py_DECREF(item);
2491 }
2492 if (_PyUnicode_Resize(res, reslen))
2493 goto onError;
2494
2495 Py_XDECREF(separator);
2496 return (PyObject *)res;
2497
2498 onError:
2499 Py_XDECREF(separator);
2500 Py_DECREF(res);
2501 return NULL;
2502}
2503
2504static
2505PyUnicodeObject *pad(PyUnicodeObject *self,
2506 int left,
2507 int right,
2508 Py_UNICODE fill)
2509{
2510 PyUnicodeObject *u;
2511
2512 if (left < 0)
2513 left = 0;
2514 if (right < 0)
2515 right = 0;
2516
2517 if (left == 0 && right == 0) {
2518 Py_INCREF(self);
2519 return self;
2520 }
2521
2522 u = _PyUnicode_New(left + self->length + right);
2523 if (u) {
2524 if (left)
2525 Py_UNICODE_FILL(u->str, fill, left);
2526 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2527 if (right)
2528 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2529 }
2530
2531 return u;
2532}
2533
/* Append the slice data[left:right] to the list being built.

   Expects the expanding function to provide: a local `PyObject *str`
   used as scratch, a `list` object to append to, and an `onError`
   label to jump to on failure.  Wrapped in do { } while (0) so the
   macro behaves as one statement, with all arguments parenthesized. */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (!str)                                                       \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2544
2545static
2546PyObject *split_whitespace(PyUnicodeObject *self,
2547 PyObject *list,
2548 int maxcount)
2549{
2550 register int i;
2551 register int j;
2552 int len = self->length;
2553 PyObject *str;
2554
2555 for (i = j = 0; i < len; ) {
2556 /* find a token */
2557 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2558 i++;
2559 j = i;
2560 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2561 i++;
2562 if (j < i) {
2563 if (maxcount-- <= 0)
2564 break;
2565 SPLIT_APPEND(self->str, j, i);
2566 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2567 i++;
2568 j = i;
2569 }
2570 }
2571 if (j < len) {
2572 SPLIT_APPEND(self->str, j, len);
2573 }
2574 return list;
2575
2576 onError:
2577 Py_DECREF(list);
2578 return NULL;
2579}
2580
2581PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002582 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002583{
2584 register int i;
2585 register int j;
2586 int len;
2587 PyObject *list;
2588 PyObject *str;
2589 Py_UNICODE *data;
2590
2591 string = PyUnicode_FromObject(string);
2592 if (string == NULL)
2593 return NULL;
2594 data = PyUnicode_AS_UNICODE(string);
2595 len = PyUnicode_GET_SIZE(string);
2596
Guido van Rossumd57fd912000-03-10 22:53:23 +00002597 list = PyList_New(0);
2598 if (!list)
2599 goto onError;
2600
2601 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002602 int eol;
2603
Guido van Rossumd57fd912000-03-10 22:53:23 +00002604 /* Find a line and append it */
2605 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2606 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002607
2608 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002609 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002610 if (i < len) {
2611 if (data[i] == '\r' && i + 1 < len &&
2612 data[i+1] == '\n')
2613 i += 2;
2614 else
2615 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002616 if (keepends)
2617 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002618 }
Guido van Rossum86662912000-04-11 15:38:46 +00002619 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002620 j = i;
2621 }
2622 if (j < len) {
2623 SPLIT_APPEND(data, j, len);
2624 }
2625
2626 Py_DECREF(string);
2627 return list;
2628
2629 onError:
2630 Py_DECREF(list);
2631 Py_DECREF(string);
2632 return NULL;
2633}
2634
2635static
2636PyObject *split_char(PyUnicodeObject *self,
2637 PyObject *list,
2638 Py_UNICODE ch,
2639 int maxcount)
2640{
2641 register int i;
2642 register int j;
2643 int len = self->length;
2644 PyObject *str;
2645
2646 for (i = j = 0; i < len; ) {
2647 if (self->str[i] == ch) {
2648 if (maxcount-- <= 0)
2649 break;
2650 SPLIT_APPEND(self->str, j, i);
2651 i = j = i + 1;
2652 } else
2653 i++;
2654 }
2655 if (j <= len) {
2656 SPLIT_APPEND(self->str, j, len);
2657 }
2658 return list;
2659
2660 onError:
2661 Py_DECREF(list);
2662 return NULL;
2663}
2664
2665static
2666PyObject *split_substring(PyUnicodeObject *self,
2667 PyObject *list,
2668 PyUnicodeObject *substring,
2669 int maxcount)
2670{
2671 register int i;
2672 register int j;
2673 int len = self->length;
2674 int sublen = substring->length;
2675 PyObject *str;
2676
2677 for (i = j = 0; i < len - sublen; ) {
2678 if (Py_UNICODE_MATCH(self, i, substring)) {
2679 if (maxcount-- <= 0)
2680 break;
2681 SPLIT_APPEND(self->str, j, i);
2682 i = j = i + sublen;
2683 } else
2684 i++;
2685 }
2686 if (j <= len) {
2687 SPLIT_APPEND(self->str, j, len);
2688 }
2689 return list;
2690
2691 onError:
2692 Py_DECREF(list);
2693 return NULL;
2694}
2695
2696#undef SPLIT_APPEND
2697
2698static
2699PyObject *split(PyUnicodeObject *self,
2700 PyUnicodeObject *substring,
2701 int maxcount)
2702{
2703 PyObject *list;
2704
2705 if (maxcount < 0)
2706 maxcount = INT_MAX;
2707
2708 list = PyList_New(0);
2709 if (!list)
2710 return NULL;
2711
2712 if (substring == NULL)
2713 return split_whitespace(self,list,maxcount);
2714
2715 else if (substring->length == 1)
2716 return split_char(self,list,substring->str[0],maxcount);
2717
2718 else if (substring->length == 0) {
2719 Py_DECREF(list);
2720 PyErr_SetString(PyExc_ValueError, "empty separator");
2721 return NULL;
2722 }
2723 else
2724 return split_substring(self,list,substring,maxcount);
2725}
2726
2727static
2728PyObject *strip(PyUnicodeObject *self,
2729 int left,
2730 int right)
2731{
2732 Py_UNICODE *p = self->str;
2733 int start = 0;
2734 int end = self->length;
2735
2736 if (left)
2737 while (start < end && Py_UNICODE_ISSPACE(p[start]))
2738 start++;
2739
2740 if (right)
2741 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
2742 end--;
2743
2744 if (start == 0 && end == self->length) {
2745 /* couldn't strip anything off, return original string */
2746 Py_INCREF(self);
2747 return (PyObject*) self;
2748 }
2749
2750 return (PyObject*) PyUnicode_FromUnicode(
2751 self->str + start,
2752 end - start
2753 );
2754}
2755
2756static
2757PyObject *replace(PyUnicodeObject *self,
2758 PyUnicodeObject *str1,
2759 PyUnicodeObject *str2,
2760 int maxcount)
2761{
2762 PyUnicodeObject *u;
2763
2764 if (maxcount < 0)
2765 maxcount = INT_MAX;
2766
2767 if (str1->length == 1 && str2->length == 1) {
2768 int i;
2769
2770 /* replace characters */
2771 if (!findchar(self->str, self->length, str1->str[0])) {
2772 /* nothing to replace, return original string */
2773 Py_INCREF(self);
2774 u = self;
2775 } else {
2776 Py_UNICODE u1 = str1->str[0];
2777 Py_UNICODE u2 = str2->str[0];
2778
2779 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
2780 self->str,
2781 self->length
2782 );
2783 if (u)
2784 for (i = 0; i < u->length; i++)
2785 if (u->str[i] == u1) {
2786 if (--maxcount < 0)
2787 break;
2788 u->str[i] = u2;
2789 }
2790 }
2791
2792 } else {
2793 int n, i;
2794 Py_UNICODE *p;
2795
2796 /* replace strings */
2797 n = count(self, 0, self->length, str1);
2798 if (n > maxcount)
2799 n = maxcount;
2800 if (n == 0) {
2801 /* nothing to replace, return original string */
2802 Py_INCREF(self);
2803 u = self;
2804 } else {
2805 u = _PyUnicode_New(
2806 self->length + n * (str2->length - str1->length));
2807 if (u) {
2808 i = 0;
2809 p = u->str;
2810 while (i <= self->length - str1->length)
2811 if (Py_UNICODE_MATCH(self, i, str1)) {
2812 /* replace string segment */
2813 Py_UNICODE_COPY(p, str2->str, str2->length);
2814 p += str2->length;
2815 i += str1->length;
2816 if (--n <= 0) {
2817 /* copy remaining part */
2818 Py_UNICODE_COPY(p, self->str+i, self->length-i);
2819 break;
2820 }
2821 } else
2822 *p++ = self->str[i++];
2823 }
2824 }
2825 }
2826
2827 return (PyObject *) u;
2828}
2829
2830/* --- Unicode Object Methods --------------------------------------------- */
2831
2832static char title__doc__[] =
2833"S.title() -> unicode\n\
2834\n\
2835Return a titlecased version of S, i.e. words start with title case\n\
2836characters, all remaining cased characters have lower case.";
2837
2838static PyObject*
2839unicode_title(PyUnicodeObject *self, PyObject *args)
2840{
2841 if (!PyArg_NoArgs(args))
2842 return NULL;
2843 return fixup(self, fixtitle);
2844}
2845
2846static char capitalize__doc__[] =
2847"S.capitalize() -> unicode\n\
2848\n\
2849Return a capitalized version of S, i.e. make the first character\n\
2850have upper case.";
2851
2852static PyObject*
2853unicode_capitalize(PyUnicodeObject *self, PyObject *args)
2854{
2855 if (!PyArg_NoArgs(args))
2856 return NULL;
2857 return fixup(self, fixcapitalize);
2858}
2859
#if 0
static char capwords__doc__[] =
"S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method S.capwords() (currently compiled out): split S at
   whitespace, capitalize every word and join the words again with
   single blanks. */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *result;
    int k;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entry */
    for (k = 0; k < PyList_GET_SIZE(words); k++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, k),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, k));
        PyList_SET_ITEM(words, k, capitalized);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
2900
2901static char center__doc__[] =
2902"S.center(width) -> unicode\n\
2903\n\
2904Return S centered in a Unicode string of length width. Padding is done\n\
2905using spaces.";
2906
2907static PyObject *
2908unicode_center(PyUnicodeObject *self, PyObject *args)
2909{
2910 int marg, left;
2911 int width;
2912
2913 if (!PyArg_ParseTuple(args, "i:center", &width))
2914 return NULL;
2915
2916 if (self->length >= width) {
2917 Py_INCREF(self);
2918 return (PyObject*) self;
2919 }
2920
2921 marg = width - self->length;
2922 left = marg / 2 + (marg & width & 1);
2923
2924 return (PyObject*) pad(self, left, marg - left, ' ');
2925}
2926
2927static int
2928unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
2929{
2930 int len1, len2;
2931 Py_UNICODE *s1 = str1->str;
2932 Py_UNICODE *s2 = str2->str;
2933
2934 len1 = str1->length;
2935 len2 = str2->length;
2936
2937 while (len1 > 0 && len2 > 0) {
2938 int cmp = (*s1++) - (*s2++);
2939 if (cmp)
2940 /* This should make Christian happy! */
2941 return (cmp < 0) ? -1 : (cmp != 0);
2942 len1--, len2--;
2943 }
2944
2945 return (len1 < len2) ? -1 : (len1 != len2);
2946}
2947
2948int PyUnicode_Compare(PyObject *left,
2949 PyObject *right)
2950{
2951 PyUnicodeObject *u = NULL, *v = NULL;
2952 int result;
2953
2954 /* Coerce the two arguments */
2955 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
2956 if (u == NULL)
2957 goto onError;
2958 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
2959 if (v == NULL)
2960 goto onError;
2961
2962 /* Shortcut for emtpy or interned objects */
2963 if (v == u) {
2964 Py_DECREF(u);
2965 Py_DECREF(v);
2966 return 0;
2967 }
2968
2969 result = unicode_compare(u, v);
2970
2971 Py_DECREF(u);
2972 Py_DECREF(v);
2973 return result;
2974
2975onError:
2976 Py_XDECREF(u);
2977 Py_XDECREF(v);
2978 return -1;
2979}
2980
Guido van Rossum403d68b2000-03-13 15:55:09 +00002981int PyUnicode_Contains(PyObject *container,
2982 PyObject *element)
2983{
2984 PyUnicodeObject *u = NULL, *v = NULL;
2985 int result;
2986 register const Py_UNICODE *p, *e;
2987 register Py_UNICODE ch;
2988
2989 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00002990 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
2991 if (v == NULL)
2992 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002993 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
2994 if (u == NULL) {
2995 Py_DECREF(v);
2996 goto onError;
2997 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00002998
2999 /* Check v in u */
3000 if (PyUnicode_GET_SIZE(v) != 1) {
3001 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003002 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003003 goto onError;
3004 }
3005 ch = *PyUnicode_AS_UNICODE(v);
3006 p = PyUnicode_AS_UNICODE(u);
3007 e = p + PyUnicode_GET_SIZE(u);
3008 result = 0;
3009 while (p < e) {
3010 if (*p++ == ch) {
3011 result = 1;
3012 break;
3013 }
3014 }
3015
3016 Py_DECREF(u);
3017 Py_DECREF(v);
3018 return result;
3019
3020onError:
3021 Py_XDECREF(u);
3022 Py_XDECREF(v);
3023 return -1;
3024}
3025
Guido van Rossumd57fd912000-03-10 22:53:23 +00003026/* Concat to string or Unicode object giving a new Unicode object. */
3027
3028PyObject *PyUnicode_Concat(PyObject *left,
3029 PyObject *right)
3030{
3031 PyUnicodeObject *u = NULL, *v = NULL, *w;
3032
3033 /* Coerce the two arguments */
3034 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3035 if (u == NULL)
3036 goto onError;
3037 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3038 if (v == NULL)
3039 goto onError;
3040
3041 /* Shortcuts */
3042 if (v == unicode_empty) {
3043 Py_DECREF(v);
3044 return (PyObject *)u;
3045 }
3046 if (u == unicode_empty) {
3047 Py_DECREF(u);
3048 return (PyObject *)v;
3049 }
3050
3051 /* Concat the two Unicode strings */
3052 w = _PyUnicode_New(u->length + v->length);
3053 if (w == NULL)
3054 goto onError;
3055 Py_UNICODE_COPY(w->str, u->str, u->length);
3056 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3057
3058 Py_DECREF(u);
3059 Py_DECREF(v);
3060 return (PyObject *)w;
3061
3062onError:
3063 Py_XDECREF(u);
3064 Py_XDECREF(v);
3065 return NULL;
3066}
3067
3068static char count__doc__[] =
3069"S.count(sub[, start[, end]]) -> int\n\
3070\n\
3071Return the number of occurrences of substring sub in Unicode string\n\
3072S[start:end]. Optional arguments start and end are\n\
3073interpreted as in slice notation.";
3074
3075static PyObject *
3076unicode_count(PyUnicodeObject *self, PyObject *args)
3077{
3078 PyUnicodeObject *substring;
3079 int start = 0;
3080 int end = INT_MAX;
3081 PyObject *result;
3082
Guido van Rossumb8872e62000-05-09 14:14:27 +00003083 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3084 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003085 return NULL;
3086
3087 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3088 (PyObject *)substring);
3089 if (substring == NULL)
3090 return NULL;
3091
Guido van Rossumd57fd912000-03-10 22:53:23 +00003092 if (start < 0)
3093 start += self->length;
3094 if (start < 0)
3095 start = 0;
3096 if (end > self->length)
3097 end = self->length;
3098 if (end < 0)
3099 end += self->length;
3100 if (end < 0)
3101 end = 0;
3102
3103 result = PyInt_FromLong((long) count(self, start, end, substring));
3104
3105 Py_DECREF(substring);
3106 return result;
3107}
3108
3109static char encode__doc__[] =
3110"S.encode([encoding[,errors]]) -> string\n\
3111\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003112Return an encoded string version of S. Default encoding is the current\n\
3113default string encoding. errors may be given to set a different error\n\
3114handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3115a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003116
3117static PyObject *
3118unicode_encode(PyUnicodeObject *self, PyObject *args)
3119{
3120 char *encoding = NULL;
3121 char *errors = NULL;
3122 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3123 return NULL;
3124 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3125}
3126
3127static char expandtabs__doc__[] =
3128"S.expandtabs([tabsize]) -> unicode\n\
3129\n\
3130Return a copy of S where all tab characters are expanded using spaces.\n\
3131If tabsize is not given, a tab size of 8 characters is assumed.";
3132
3133static PyObject*
3134unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3135{
3136 Py_UNICODE *e;
3137 Py_UNICODE *p;
3138 Py_UNICODE *q;
3139 int i, j;
3140 PyUnicodeObject *u;
3141 int tabsize = 8;
3142
3143 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3144 return NULL;
3145
3146 /* First pass: determine size of ouput string */
3147 i = j = 0;
3148 e = self->str + self->length;
3149 for (p = self->str; p < e; p++)
3150 if (*p == '\t') {
3151 if (tabsize > 0)
3152 j += tabsize - (j % tabsize);
3153 }
3154 else {
3155 j++;
3156 if (*p == '\n' || *p == '\r') {
3157 i += j;
3158 j = 0;
3159 }
3160 }
3161
3162 /* Second pass: create output string and fill it */
3163 u = _PyUnicode_New(i + j);
3164 if (!u)
3165 return NULL;
3166
3167 j = 0;
3168 q = u->str;
3169
3170 for (p = self->str; p < e; p++)
3171 if (*p == '\t') {
3172 if (tabsize > 0) {
3173 i = tabsize - (j % tabsize);
3174 j += i;
3175 while (i--)
3176 *q++ = ' ';
3177 }
3178 }
3179 else {
3180 j++;
3181 *q++ = *p;
3182 if (*p == '\n' || *p == '\r')
3183 j = 0;
3184 }
3185
3186 return (PyObject*) u;
3187}
3188
3189static char find__doc__[] =
3190"S.find(sub [,start [,end]]) -> int\n\
3191\n\
3192Return the lowest index in S where substring sub is found,\n\
3193such that sub is contained within s[start,end]. Optional\n\
3194arguments start and end are interpreted as in slice notation.\n\
3195\n\
3196Return -1 on failure.";
3197
3198static PyObject *
3199unicode_find(PyUnicodeObject *self, PyObject *args)
3200{
3201 PyUnicodeObject *substring;
3202 int start = 0;
3203 int end = INT_MAX;
3204 PyObject *result;
3205
Guido van Rossumb8872e62000-05-09 14:14:27 +00003206 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3207 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003208 return NULL;
3209 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3210 (PyObject *)substring);
3211 if (substring == NULL)
3212 return NULL;
3213
3214 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3215
3216 Py_DECREF(substring);
3217 return result;
3218}
3219
3220static PyObject *
3221unicode_getitem(PyUnicodeObject *self, int index)
3222{
3223 if (index < 0 || index >= self->length) {
3224 PyErr_SetString(PyExc_IndexError, "string index out of range");
3225 return NULL;
3226 }
3227
3228 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3229}
3230
3231static long
3232unicode_hash(PyUnicodeObject *self)
3233{
3234 long hash;
3235 PyObject *utf8;
3236
3237 /* Since Unicode objects compare equal to their UTF-8 string
3238 counterparts, they should also use the UTF-8 strings as basis
3239 for their hash value. This is needed to assure that strings and
3240 Unicode objects behave in the same way as dictionary
3241 keys. Unfortunately, this costs some performance and also some
3242 memory if the cached UTF-8 representation is not used later
3243 on. */
3244 if (self->hash != -1)
3245 return self->hash;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00003246 utf8 = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003247 if (utf8 == NULL)
3248 return -1;
3249 hash = PyObject_Hash(utf8);
3250 if (hash == -1)
3251 return -1;
3252 self->hash = hash;
3253 return hash;
3254}
3255
3256static char index__doc__[] =
3257"S.index(sub [,start [,end]]) -> int\n\
3258\n\
3259Like S.find() but raise ValueError when the substring is not found.";
3260
3261static PyObject *
3262unicode_index(PyUnicodeObject *self, PyObject *args)
3263{
3264 int result;
3265 PyUnicodeObject *substring;
3266 int start = 0;
3267 int end = INT_MAX;
3268
Guido van Rossumb8872e62000-05-09 14:14:27 +00003269 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3270 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003271 return NULL;
3272
3273 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3274 (PyObject *)substring);
3275 if (substring == NULL)
3276 return NULL;
3277
3278 result = findstring(self, substring, start, end, 1);
3279
3280 Py_DECREF(substring);
3281 if (result < 0) {
3282 PyErr_SetString(PyExc_ValueError, "substring not found");
3283 return NULL;
3284 }
3285 return PyInt_FromLong(result);
3286}
3287
3288static char islower__doc__[] =
3289"S.islower() -> int\n\
3290\n\
3291Return 1 if all cased characters in S are lowercase and there is\n\
3292at least one cased character in S, 0 otherwise.";
3293
3294static PyObject*
3295unicode_islower(PyUnicodeObject *self, PyObject *args)
3296{
3297 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3298 register const Py_UNICODE *e;
3299 int cased;
3300
3301 if (!PyArg_NoArgs(args))
3302 return NULL;
3303
3304 /* Shortcut for single character strings */
3305 if (PyUnicode_GET_SIZE(self) == 1)
3306 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3307
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003308 /* Special case for empty strings */
3309 if (PyString_GET_SIZE(self) == 0)
3310 return PyInt_FromLong(0);
3311
Guido van Rossumd57fd912000-03-10 22:53:23 +00003312 e = p + PyUnicode_GET_SIZE(self);
3313 cased = 0;
3314 for (; p < e; p++) {
3315 register const Py_UNICODE ch = *p;
3316
3317 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3318 return PyInt_FromLong(0);
3319 else if (!cased && Py_UNICODE_ISLOWER(ch))
3320 cased = 1;
3321 }
3322 return PyInt_FromLong(cased);
3323}
3324
3325static char isupper__doc__[] =
3326"S.isupper() -> int\n\
3327\n\
3328Return 1 if all cased characters in S are uppercase and there is\n\
3329at least one cased character in S, 0 otherwise.";
3330
3331static PyObject*
3332unicode_isupper(PyUnicodeObject *self, PyObject *args)
3333{
3334 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3335 register const Py_UNICODE *e;
3336 int cased;
3337
3338 if (!PyArg_NoArgs(args))
3339 return NULL;
3340
3341 /* Shortcut for single character strings */
3342 if (PyUnicode_GET_SIZE(self) == 1)
3343 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3344
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003345 /* Special case for empty strings */
3346 if (PyString_GET_SIZE(self) == 0)
3347 return PyInt_FromLong(0);
3348
Guido van Rossumd57fd912000-03-10 22:53:23 +00003349 e = p + PyUnicode_GET_SIZE(self);
3350 cased = 0;
3351 for (; p < e; p++) {
3352 register const Py_UNICODE ch = *p;
3353
3354 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3355 return PyInt_FromLong(0);
3356 else if (!cased && Py_UNICODE_ISUPPER(ch))
3357 cased = 1;
3358 }
3359 return PyInt_FromLong(cased);
3360}
3361
3362static char istitle__doc__[] =
3363"S.istitle() -> int\n\
3364\n\
3365Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3366may only follow uncased characters and lowercase characters only cased\n\
3367ones. Return 0 otherwise.";
3368
3369static PyObject*
3370unicode_istitle(PyUnicodeObject *self, PyObject *args)
3371{
3372 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3373 register const Py_UNICODE *e;
3374 int cased, previous_is_cased;
3375
3376 if (!PyArg_NoArgs(args))
3377 return NULL;
3378
3379 /* Shortcut for single character strings */
3380 if (PyUnicode_GET_SIZE(self) == 1)
3381 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3382 (Py_UNICODE_ISUPPER(*p) != 0));
3383
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003384 /* Special case for empty strings */
3385 if (PyString_GET_SIZE(self) == 0)
3386 return PyInt_FromLong(0);
3387
Guido van Rossumd57fd912000-03-10 22:53:23 +00003388 e = p + PyUnicode_GET_SIZE(self);
3389 cased = 0;
3390 previous_is_cased = 0;
3391 for (; p < e; p++) {
3392 register const Py_UNICODE ch = *p;
3393
3394 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3395 if (previous_is_cased)
3396 return PyInt_FromLong(0);
3397 previous_is_cased = 1;
3398 cased = 1;
3399 }
3400 else if (Py_UNICODE_ISLOWER(ch)) {
3401 if (!previous_is_cased)
3402 return PyInt_FromLong(0);
3403 previous_is_cased = 1;
3404 cased = 1;
3405 }
3406 else
3407 previous_is_cased = 0;
3408 }
3409 return PyInt_FromLong(cased);
3410}
3411
3412static char isspace__doc__[] =
3413"S.isspace() -> int\n\
3414\n\
3415Return 1 if there are only whitespace characters in S,\n\
34160 otherwise.";
3417
3418static PyObject*
3419unicode_isspace(PyUnicodeObject *self, PyObject *args)
3420{
3421 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3422 register const Py_UNICODE *e;
3423
3424 if (!PyArg_NoArgs(args))
3425 return NULL;
3426
3427 /* Shortcut for single character strings */
3428 if (PyUnicode_GET_SIZE(self) == 1 &&
3429 Py_UNICODE_ISSPACE(*p))
3430 return PyInt_FromLong(1);
3431
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003432 /* Special case for empty strings */
3433 if (PyString_GET_SIZE(self) == 0)
3434 return PyInt_FromLong(0);
3435
Guido van Rossumd57fd912000-03-10 22:53:23 +00003436 e = p + PyUnicode_GET_SIZE(self);
3437 for (; p < e; p++) {
3438 if (!Py_UNICODE_ISSPACE(*p))
3439 return PyInt_FromLong(0);
3440 }
3441 return PyInt_FromLong(1);
3442}
3443
3444static char isdecimal__doc__[] =
3445"S.isdecimal() -> int\n\
3446\n\
3447Return 1 if there are only decimal characters in S,\n\
34480 otherwise.";
3449
3450static PyObject*
3451unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3452{
3453 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3454 register const Py_UNICODE *e;
3455
3456 if (!PyArg_NoArgs(args))
3457 return NULL;
3458
3459 /* Shortcut for single character strings */
3460 if (PyUnicode_GET_SIZE(self) == 1 &&
3461 Py_UNICODE_ISDECIMAL(*p))
3462 return PyInt_FromLong(1);
3463
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003464 /* Special case for empty strings */
3465 if (PyString_GET_SIZE(self) == 0)
3466 return PyInt_FromLong(0);
3467
Guido van Rossumd57fd912000-03-10 22:53:23 +00003468 e = p + PyUnicode_GET_SIZE(self);
3469 for (; p < e; p++) {
3470 if (!Py_UNICODE_ISDECIMAL(*p))
3471 return PyInt_FromLong(0);
3472 }
3473 return PyInt_FromLong(1);
3474}
3475
3476static char isdigit__doc__[] =
3477"S.isdigit() -> int\n\
3478\n\
3479Return 1 if there are only digit characters in S,\n\
34800 otherwise.";
3481
3482static PyObject*
3483unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3484{
3485 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3486 register const Py_UNICODE *e;
3487
3488 if (!PyArg_NoArgs(args))
3489 return NULL;
3490
3491 /* Shortcut for single character strings */
3492 if (PyUnicode_GET_SIZE(self) == 1 &&
3493 Py_UNICODE_ISDIGIT(*p))
3494 return PyInt_FromLong(1);
3495
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003496 /* Special case for empty strings */
3497 if (PyString_GET_SIZE(self) == 0)
3498 return PyInt_FromLong(0);
3499
Guido van Rossumd57fd912000-03-10 22:53:23 +00003500 e = p + PyUnicode_GET_SIZE(self);
3501 for (; p < e; p++) {
3502 if (!Py_UNICODE_ISDIGIT(*p))
3503 return PyInt_FromLong(0);
3504 }
3505 return PyInt_FromLong(1);
3506}
3507
3508static char isnumeric__doc__[] =
3509"S.isnumeric() -> int\n\
3510\n\
3511Return 1 if there are only numeric characters in S,\n\
35120 otherwise.";
3513
3514static PyObject*
3515unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3516{
3517 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3518 register const Py_UNICODE *e;
3519
3520 if (!PyArg_NoArgs(args))
3521 return NULL;
3522
3523 /* Shortcut for single character strings */
3524 if (PyUnicode_GET_SIZE(self) == 1 &&
3525 Py_UNICODE_ISNUMERIC(*p))
3526 return PyInt_FromLong(1);
3527
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003528 /* Special case for empty strings */
3529 if (PyString_GET_SIZE(self) == 0)
3530 return PyInt_FromLong(0);
3531
Guido van Rossumd57fd912000-03-10 22:53:23 +00003532 e = p + PyUnicode_GET_SIZE(self);
3533 for (; p < e; p++) {
3534 if (!Py_UNICODE_ISNUMERIC(*p))
3535 return PyInt_FromLong(0);
3536 }
3537 return PyInt_FromLong(1);
3538}
3539
3540static char join__doc__[] =
3541"S.join(sequence) -> unicode\n\
3542\n\
3543Return a string which is the concatenation of the strings in the\n\
3544sequence. The separator between elements is S.";
3545
3546static PyObject*
3547unicode_join(PyUnicodeObject *self, PyObject *args)
3548{
3549 PyObject *data;
3550 if (!PyArg_ParseTuple(args, "O:join", &data))
3551 return NULL;
3552
3553 return PyUnicode_Join((PyObject *)self, data);
3554}
3555
3556static int
3557unicode_length(PyUnicodeObject *self)
3558{
3559 return self->length;
3560}
3561
3562static char ljust__doc__[] =
3563"S.ljust(width) -> unicode\n\
3564\n\
3565Return S left justified in a Unicode string of length width. Padding is\n\
3566done using spaces.";
3567
3568static PyObject *
3569unicode_ljust(PyUnicodeObject *self, PyObject *args)
3570{
3571 int width;
3572 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3573 return NULL;
3574
3575 if (self->length >= width) {
3576 Py_INCREF(self);
3577 return (PyObject*) self;
3578 }
3579
3580 return (PyObject*) pad(self, 0, width - self->length, ' ');
3581}
3582
3583static char lower__doc__[] =
3584"S.lower() -> unicode\n\
3585\n\
3586Return a copy of the string S converted to lowercase.";
3587
3588static PyObject*
3589unicode_lower(PyUnicodeObject *self, PyObject *args)
3590{
3591 if (!PyArg_NoArgs(args))
3592 return NULL;
3593 return fixup(self, fixlower);
3594}
3595
3596static char lstrip__doc__[] =
3597"S.lstrip() -> unicode\n\
3598\n\
3599Return a copy of the string S with leading whitespace removed.";
3600
3601static PyObject *
3602unicode_lstrip(PyUnicodeObject *self, PyObject *args)
3603{
3604 if (!PyArg_NoArgs(args))
3605 return NULL;
3606 return strip(self, 1, 0);
3607}
3608
3609static PyObject*
3610unicode_repeat(PyUnicodeObject *str, int len)
3611{
3612 PyUnicodeObject *u;
3613 Py_UNICODE *p;
3614
3615 if (len < 0)
3616 len = 0;
3617
3618 if (len == 1) {
3619 /* no repeat, return original string */
3620 Py_INCREF(str);
3621 return (PyObject*) str;
3622 }
3623
3624 u = _PyUnicode_New(len * str->length);
3625 if (!u)
3626 return NULL;
3627
3628 p = u->str;
3629
3630 while (len-- > 0) {
3631 Py_UNICODE_COPY(p, str->str, str->length);
3632 p += str->length;
3633 }
3634
3635 return (PyObject*) u;
3636}
3637
3638PyObject *PyUnicode_Replace(PyObject *obj,
3639 PyObject *subobj,
3640 PyObject *replobj,
3641 int maxcount)
3642{
3643 PyObject *self;
3644 PyObject *str1;
3645 PyObject *str2;
3646 PyObject *result;
3647
3648 self = PyUnicode_FromObject(obj);
3649 if (self == NULL)
3650 return NULL;
3651 str1 = PyUnicode_FromObject(subobj);
3652 if (str1 == NULL) {
3653 Py_DECREF(self);
3654 return NULL;
3655 }
3656 str2 = PyUnicode_FromObject(replobj);
3657 if (str2 == NULL) {
3658 Py_DECREF(self);
3659 Py_DECREF(str1);
3660 return NULL;
3661 }
3662 result = replace((PyUnicodeObject *)self,
3663 (PyUnicodeObject *)str1,
3664 (PyUnicodeObject *)str2,
3665 maxcount);
3666 Py_DECREF(self);
3667 Py_DECREF(str1);
3668 Py_DECREF(str2);
3669 return result;
3670}
3671
3672static char replace__doc__[] =
3673"S.replace (old, new[, maxsplit]) -> unicode\n\
3674\n\
3675Return a copy of S with all occurrences of substring\n\
3676old replaced by new. If the optional argument maxsplit is\n\
3677given, only the first maxsplit occurrences are replaced.";
3678
3679static PyObject*
3680unicode_replace(PyUnicodeObject *self, PyObject *args)
3681{
3682 PyUnicodeObject *str1;
3683 PyUnicodeObject *str2;
3684 int maxcount = -1;
3685 PyObject *result;
3686
3687 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
3688 return NULL;
3689 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
3690 if (str1 == NULL)
3691 return NULL;
3692 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
3693 if (str2 == NULL)
3694 return NULL;
3695
3696 result = replace(self, str1, str2, maxcount);
3697
3698 Py_DECREF(str1);
3699 Py_DECREF(str2);
3700 return result;
3701}
3702
3703static
3704PyObject *unicode_repr(PyObject *unicode)
3705{
3706 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
3707 PyUnicode_GET_SIZE(unicode),
3708 1);
3709}
3710
3711static char rfind__doc__[] =
3712"S.rfind(sub [,start [,end]]) -> int\n\
3713\n\
3714Return the highest index in S where substring sub is found,\n\
3715such that sub is contained within s[start,end]. Optional\n\
3716arguments start and end are interpreted as in slice notation.\n\
3717\n\
3718Return -1 on failure.";
3719
3720static PyObject *
3721unicode_rfind(PyUnicodeObject *self, PyObject *args)
3722{
3723 PyUnicodeObject *substring;
3724 int start = 0;
3725 int end = INT_MAX;
3726 PyObject *result;
3727
Guido van Rossumb8872e62000-05-09 14:14:27 +00003728 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
3729 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003730 return NULL;
3731 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3732 (PyObject *)substring);
3733 if (substring == NULL)
3734 return NULL;
3735
3736 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
3737
3738 Py_DECREF(substring);
3739 return result;
3740}
3741
3742static char rindex__doc__[] =
3743"S.rindex(sub [,start [,end]]) -> int\n\
3744\n\
3745Like S.rfind() but raise ValueError when the substring is not found.";
3746
3747static PyObject *
3748unicode_rindex(PyUnicodeObject *self, PyObject *args)
3749{
3750 int result;
3751 PyUnicodeObject *substring;
3752 int start = 0;
3753 int end = INT_MAX;
3754
Guido van Rossumb8872e62000-05-09 14:14:27 +00003755 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
3756 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003757 return NULL;
3758 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3759 (PyObject *)substring);
3760 if (substring == NULL)
3761 return NULL;
3762
3763 result = findstring(self, substring, start, end, -1);
3764
3765 Py_DECREF(substring);
3766 if (result < 0) {
3767 PyErr_SetString(PyExc_ValueError, "substring not found");
3768 return NULL;
3769 }
3770 return PyInt_FromLong(result);
3771}
3772
3773static char rjust__doc__[] =
3774"S.rjust(width) -> unicode\n\
3775\n\
3776Return S right justified in a Unicode string of length width. Padding is\n\
3777done using spaces.";
3778
3779static PyObject *
3780unicode_rjust(PyUnicodeObject *self, PyObject *args)
3781{
3782 int width;
3783 if (!PyArg_ParseTuple(args, "i:rjust", &width))
3784 return NULL;
3785
3786 if (self->length >= width) {
3787 Py_INCREF(self);
3788 return (PyObject*) self;
3789 }
3790
3791 return (PyObject*) pad(self, width - self->length, 0, ' ');
3792}
3793
3794static char rstrip__doc__[] =
3795"S.rstrip() -> unicode\n\
3796\n\
3797Return a copy of the string S with trailing whitespace removed.";
3798
3799static PyObject *
3800unicode_rstrip(PyUnicodeObject *self, PyObject *args)
3801{
3802 if (!PyArg_NoArgs(args))
3803 return NULL;
3804 return strip(self, 0, 1);
3805}
3806
3807static PyObject*
3808unicode_slice(PyUnicodeObject *self, int start, int end)
3809{
3810 /* standard clamping */
3811 if (start < 0)
3812 start = 0;
3813 if (end < 0)
3814 end = 0;
3815 if (end > self->length)
3816 end = self->length;
3817 if (start == 0 && end == self->length) {
3818 /* full slice, return original string */
3819 Py_INCREF(self);
3820 return (PyObject*) self;
3821 }
3822 if (start > end)
3823 start = end;
3824 /* copy slice */
3825 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
3826 end - start);
3827}
3828
3829PyObject *PyUnicode_Split(PyObject *s,
3830 PyObject *sep,
3831 int maxsplit)
3832{
3833 PyObject *result;
3834
3835 s = PyUnicode_FromObject(s);
3836 if (s == NULL)
3837 return NULL;
3838 if (sep != NULL) {
3839 sep = PyUnicode_FromObject(sep);
3840 if (sep == NULL) {
3841 Py_DECREF(s);
3842 return NULL;
3843 }
3844 }
3845
3846 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
3847
3848 Py_DECREF(s);
3849 Py_XDECREF(sep);
3850 return result;
3851}
3852
3853static char split__doc__[] =
3854"S.split([sep [,maxsplit]]) -> list of strings\n\
3855\n\
3856Return a list of the words in S, using sep as the\n\
3857delimiter string. If maxsplit is given, at most maxsplit\n\
3858splits are done. If sep is not specified, any whitespace string\n\
3859is a separator.";
3860
3861static PyObject*
3862unicode_split(PyUnicodeObject *self, PyObject *args)
3863{
3864 PyObject *substring = Py_None;
3865 int maxcount = -1;
3866
3867 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
3868 return NULL;
3869
3870 if (substring == Py_None)
3871 return split(self, NULL, maxcount);
3872 else if (PyUnicode_Check(substring))
3873 return split(self, (PyUnicodeObject *)substring, maxcount);
3874 else
3875 return PyUnicode_Split((PyObject *)self, substring, maxcount);
3876}
3877
3878static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00003879"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00003880\n\
3881Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00003882Line breaks are not included in the resulting list unless keepends\n\
3883is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003884
3885static PyObject*
3886unicode_splitlines(PyUnicodeObject *self, PyObject *args)
3887{
Guido van Rossum86662912000-04-11 15:38:46 +00003888 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003889
Guido van Rossum86662912000-04-11 15:38:46 +00003890 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891 return NULL;
3892
Guido van Rossum86662912000-04-11 15:38:46 +00003893 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003894}
3895
3896static
3897PyObject *unicode_str(PyUnicodeObject *self)
3898{
Fred Drakee4315f52000-05-09 19:53:39 +00003899 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003900}
3901
3902static char strip__doc__[] =
3903"S.strip() -> unicode\n\
3904\n\
3905Return a copy of S with leading and trailing whitespace removed.";
3906
3907static PyObject *
3908unicode_strip(PyUnicodeObject *self, PyObject *args)
3909{
3910 if (!PyArg_NoArgs(args))
3911 return NULL;
3912 return strip(self, 1, 1);
3913}
3914
3915static char swapcase__doc__[] =
3916"S.swapcase() -> unicode\n\
3917\n\
3918Return a copy of S with uppercase characters converted to lowercase\n\
3919and vice versa.";
3920
3921static PyObject*
3922unicode_swapcase(PyUnicodeObject *self, PyObject *args)
3923{
3924 if (!PyArg_NoArgs(args))
3925 return NULL;
3926 return fixup(self, fixswapcase);
3927}
3928
3929static char translate__doc__[] =
3930"S.translate(table) -> unicode\n\
3931\n\
3932Return a copy of the string S, where all characters have been mapped\n\
3933through the given translation table, which must be a mapping of\n\
3934Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
3935are left untouched. Characters mapped to None are deleted.";
3936
3937static PyObject*
3938unicode_translate(PyUnicodeObject *self, PyObject *args)
3939{
3940 PyObject *table;
3941
3942 if (!PyArg_ParseTuple(args, "O:translate", &table))
3943 return NULL;
3944 return PyUnicode_TranslateCharmap(self->str,
3945 self->length,
3946 table,
3947 "ignore");
3948}
3949
3950static char upper__doc__[] =
3951"S.upper() -> unicode\n\
3952\n\
3953Return a copy of S converted to uppercase.";
3954
3955static PyObject*
3956unicode_upper(PyUnicodeObject *self, PyObject *args)
3957{
3958 if (!PyArg_NoArgs(args))
3959 return NULL;
3960 return fixup(self, fixupper);
3961}
3962
#if 0
static char zfill__doc__[] =
"S.zfill(width) -> unicode\n\
\n\
Pad a numeric string x with zeros on the left, to fill a field\n\
of the specified width. The string x is never truncated.";

/* S.zfill(width) -- currently compiled out.

   Returns a new reference to self when it is already at least width
   characters long.  Otherwise pads on the left with '0' via pad() and,
   if the original text started with '+' or '-', moves that sign to the
   front of the padded result.  Returns NULL if argument parsing or pad()
   fails. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    PyUnicodeObject *u;

    int width;
    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* u->str is dereferenced right below, so a failed pad() must be
       caught here */
    if (u == NULL)
        return NULL;

    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
3998
#if 0
/* Debugging aid (compiled out): return unicode_freelist_size as a
   Python int.  Returns NULL when PyArg_NoArgs() rejects the argument
   object. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
4008
4009static char startswith__doc__[] =
4010"S.startswith(prefix[, start[, end]]) -> int\n\
4011\n\
4012Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4013optional start, test S beginning at that position. With optional end, stop\n\
4014comparing S at that position.";
4015
4016static PyObject *
4017unicode_startswith(PyUnicodeObject *self,
4018 PyObject *args)
4019{
4020 PyUnicodeObject *substring;
4021 int start = 0;
4022 int end = INT_MAX;
4023 PyObject *result;
4024
Guido van Rossumb8872e62000-05-09 14:14:27 +00004025 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4026 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004027 return NULL;
4028 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4029 (PyObject *)substring);
4030 if (substring == NULL)
4031 return NULL;
4032
4033 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4034
4035 Py_DECREF(substring);
4036 return result;
4037}
4038
4039
4040static char endswith__doc__[] =
4041"S.endswith(suffix[, start[, end]]) -> int\n\
4042\n\
4043Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4044optional start, test S beginning at that position. With optional end, stop\n\
4045comparing S at that position.";
4046
4047static PyObject *
4048unicode_endswith(PyUnicodeObject *self,
4049 PyObject *args)
4050{
4051 PyUnicodeObject *substring;
4052 int start = 0;
4053 int end = INT_MAX;
4054 PyObject *result;
4055
Guido van Rossumb8872e62000-05-09 14:14:27 +00004056 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4057 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004058 return NULL;
4059 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4060 (PyObject *)substring);
4061 if (substring == NULL)
4062 return NULL;
4063
4064 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4065
4066 Py_DECREF(substring);
4067 return result;
4068}
4069
4070
/* Method table of the Unicode type, searched by unicode_getattr() via
   Py_FindMethod().

   Each entry is {name, C function, flags, doc string}.  For the methods
   defined in this part of the file, entries flagged 1 parse their
   arguments with PyArg_ParseTuple() and entries flagged 0 use
   PyArg_NoArgs(); presumably the same holds for the entries whose
   implementations live elsewhere in the file -- verify before changing
   a flag.  The table ends with a {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
    {"split", (PyCFunction) unicode_split, 1, split__doc__},
    {"join", (PyCFunction) unicode_join, 1, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, 0, title__doc__},
    {"center", (PyCFunction) unicode_center, 1, center__doc__},
    {"count", (PyCFunction) unicode_count, 1, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, 1, find__doc__},
    {"index", (PyCFunction) unicode_index, 1, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
#if 0
    /* Disabled methods; unicode_zfill itself is compiled out as well. */
    {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
    {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
#endif

    {NULL, NULL}
};
4121
4122static PyObject *
4123unicode_getattr(PyUnicodeObject *self, char *name)
4124{
4125 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4126}
4127
/* Sequence protocol table of the Unicode type.  The two assignment slots
   are left empty (0); no item or slice assignment function is
   installed. */
static PySequenceMethods unicode_as_sequence = {
    (inquiry) unicode_length, /* sq_length */
    (binaryfunc) PyUnicode_Concat, /* sq_concat */
    (intargfunc) unicode_repeat, /* sq_repeat */
    (intargfunc) unicode_getitem, /* sq_item */
    (intintargfunc) unicode_slice, /* sq_slice */
    0, /* sq_ass_item */
    0, /* sq_ass_slice */
    (objobjproc)PyUnicode_Contains, /*sq_contains*/
};
4138
4139static int
4140unicode_buffer_getreadbuf(PyUnicodeObject *self,
4141 int index,
4142 const void **ptr)
4143{
4144 if (index != 0) {
4145 PyErr_SetString(PyExc_SystemError,
4146 "accessing non-existent unicode segment");
4147 return -1;
4148 }
4149 *ptr = (void *) self->str;
4150 return PyUnicode_GET_DATA_SIZE(self);
4151}
4152
/* Buffer interface, write access: always refused.  Sets TypeError and
   returns -1 without looking at index or touching *ptr. */
static int
unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
                           const void **ptr)
{
    PyErr_SetString(PyExc_TypeError,
                    "cannot use unicode as modifyable buffer");
    return -1;
}
4161
4162static int
4163unicode_buffer_getsegcount(PyUnicodeObject *self,
4164 int *lenp)
4165{
4166 if (lenp)
4167 *lenp = PyUnicode_GET_DATA_SIZE(self);
4168 return 1;
4169}
4170
4171static int
4172unicode_buffer_getcharbuf(PyUnicodeObject *self,
4173 int index,
4174 const void **ptr)
4175{
4176 PyObject *str;
4177
4178 if (index != 0) {
4179 PyErr_SetString(PyExc_SystemError,
4180 "accessing non-existent unicode segment");
4181 return -1;
4182 }
Guido van Rossum3c1bb802000-04-27 20:13:50 +00004183 str = _PyUnicode_AsUTF8String((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004184 if (str == NULL)
4185 return -1;
4186 *ptr = (void *) PyString_AS_STRING(str);
4187 return PyString_GET_SIZE(str);
4188}
4189
4190/* Helpers for PyUnicode_Format() */
4191
4192static PyObject *
4193getnextarg(args, arglen, p_argidx)
4194 PyObject *args;
4195int arglen;
4196int *p_argidx;
4197{
4198 int argidx = *p_argidx;
4199 if (argidx < arglen) {
4200 (*p_argidx)++;
4201 if (arglen < 0)
4202 return args;
4203 else
4204 return PyTuple_GetItem(args, argidx);
4205 }
4206 PyErr_SetString(PyExc_TypeError,
4207 "not enough arguments for format string");
4208 return NULL;
4209}
4210
/* Conversion flag bits collected by PyUnicode_Format() while it scans
   the characters that follow a '%'. */
#define F_LJUST (1<<0)  /* '-' seen, or a negative '*' width */
#define F_SIGN (1<<1)   /* '+' seen */
#define F_BLANK (1<<2)  /* ' ' seen */
#define F_ALT (1<<3)    /* '#' seen */
#define F_ZERO (1<<4)   /* '0' seen */
4216
4217static
4218#ifdef HAVE_STDARG_PROTOTYPES
4219int usprintf(register Py_UNICODE *buffer, char *format, ...)
4220#else
4221int usprintf(va_alist) va_dcl
4222#endif
4223{
4224 register int i;
4225 int len;
4226 va_list va;
4227 char *charbuffer;
4228#ifdef HAVE_STDARG_PROTOTYPES
4229 va_start(va, format);
4230#else
4231 Py_UNICODE *args;
4232 char *format;
4233
4234 va_start(va);
4235 buffer = va_arg(va, Py_UNICODE *);
4236 format = va_arg(va, char *);
4237#endif
4238
4239 /* First, format the string as char array, then expand to Py_UNICODE
4240 array. */
4241 charbuffer = (char *)buffer;
4242 len = vsprintf(charbuffer, format, va);
4243 for (i = len - 1; i >= 0; i--)
4244 buffer[i] = (Py_UNICODE) charbuffer[i];
4245
4246 va_end(va);
4247 return len;
4248}
4249
4250static int
4251formatfloat(Py_UNICODE *buf,
4252 int flags,
4253 int prec,
4254 int type,
4255 PyObject *v)
4256{
4257 char fmt[20];
4258 double x;
4259
4260 x = PyFloat_AsDouble(v);
4261 if (x == -1.0 && PyErr_Occurred())
4262 return -1;
4263 if (prec < 0)
4264 prec = 6;
4265 if (prec > 50)
4266 prec = 50; /* Arbitrary limitation */
4267 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4268 type = 'g';
4269 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
4270 return usprintf(buf, fmt, x);
4271}
4272
4273static int
4274formatint(Py_UNICODE *buf,
4275 int flags,
4276 int prec,
4277 int type,
4278 PyObject *v)
4279{
4280 char fmt[20];
4281 long x;
4282
4283 x = PyInt_AsLong(v);
4284 if (x == -1 && PyErr_Occurred())
4285 return -1;
4286 if (prec < 0)
4287 prec = 1;
4288 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4289 return usprintf(buf, fmt, x);
4290}
4291
4292static int
4293formatchar(Py_UNICODE *buf,
4294 PyObject *v)
4295{
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004296 if (PyUnicode_Check(v)) {
4297 if (PyUnicode_GET_SIZE(v) != 1)
4298 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004299 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004300 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004301
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004302 else if (PyString_Check(v)) {
4303 if (PyString_GET_SIZE(v) != 1)
4304 goto onError;
4305 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4306 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004307
4308 else {
4309 /* Integer input truncated to a character */
4310 long x;
4311 x = PyInt_AsLong(v);
4312 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004313 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004314 buf[0] = (char) x;
4315 }
4316 buf[1] = '\0';
4317 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004318
4319 onError:
4320 PyErr_SetString(PyExc_TypeError,
4321 "%c requires int or char");
4322 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004323}
4324
4325PyObject *PyUnicode_Format(PyObject *format,
4326 PyObject *args)
4327{
4328 Py_UNICODE *fmt, *res;
4329 int fmtcnt, rescnt, reslen, arglen, argidx;
4330 int args_owned = 0;
4331 PyUnicodeObject *result = NULL;
4332 PyObject *dict = NULL;
4333 PyObject *uformat;
4334
4335 if (format == NULL || args == NULL) {
4336 PyErr_BadInternalCall();
4337 return NULL;
4338 }
4339 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004340 if (uformat == NULL)
4341 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004342 fmt = PyUnicode_AS_UNICODE(uformat);
4343 fmtcnt = PyUnicode_GET_SIZE(uformat);
4344
4345 reslen = rescnt = fmtcnt + 100;
4346 result = _PyUnicode_New(reslen);
4347 if (result == NULL)
4348 goto onError;
4349 res = PyUnicode_AS_UNICODE(result);
4350
4351 if (PyTuple_Check(args)) {
4352 arglen = PyTuple_Size(args);
4353 argidx = 0;
4354 }
4355 else {
4356 arglen = -1;
4357 argidx = -2;
4358 }
4359 if (args->ob_type->tp_as_mapping)
4360 dict = args;
4361
4362 while (--fmtcnt >= 0) {
4363 if (*fmt != '%') {
4364 if (--rescnt < 0) {
4365 rescnt = fmtcnt + 100;
4366 reslen += rescnt;
4367 if (_PyUnicode_Resize(result, reslen) < 0)
4368 return NULL;
4369 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4370 --rescnt;
4371 }
4372 *res++ = *fmt++;
4373 }
4374 else {
4375 /* Got a format specifier */
4376 int flags = 0;
4377 int width = -1;
4378 int prec = -1;
4379 int size = 0;
4380 Py_UNICODE c = '\0';
4381 Py_UNICODE fill;
4382 PyObject *v = NULL;
4383 PyObject *temp = NULL;
4384 Py_UNICODE *buf;
4385 Py_UNICODE sign;
4386 int len;
4387 Py_UNICODE tmpbuf[120]; /* For format{float,int,char}() */
4388
4389 fmt++;
4390 if (*fmt == '(') {
4391 Py_UNICODE *keystart;
4392 int keylen;
4393 PyObject *key;
4394 int pcount = 1;
4395
4396 if (dict == NULL) {
4397 PyErr_SetString(PyExc_TypeError,
4398 "format requires a mapping");
4399 goto onError;
4400 }
4401 ++fmt;
4402 --fmtcnt;
4403 keystart = fmt;
4404 /* Skip over balanced parentheses */
4405 while (pcount > 0 && --fmtcnt >= 0) {
4406 if (*fmt == ')')
4407 --pcount;
4408 else if (*fmt == '(')
4409 ++pcount;
4410 fmt++;
4411 }
4412 keylen = fmt - keystart - 1;
4413 if (fmtcnt < 0 || pcount > 0) {
4414 PyErr_SetString(PyExc_ValueError,
4415 "incomplete format key");
4416 goto onError;
4417 }
Fred Drakee4315f52000-05-09 19:53:39 +00004418 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004419 then looked up since Python uses strings to hold
4420 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004421 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004422 key = PyUnicode_EncodeUTF8(keystart,
4423 keylen,
4424 NULL);
4425 if (key == NULL)
4426 goto onError;
4427 if (args_owned) {
4428 Py_DECREF(args);
4429 args_owned = 0;
4430 }
4431 args = PyObject_GetItem(dict, key);
4432 Py_DECREF(key);
4433 if (args == NULL) {
4434 goto onError;
4435 }
4436 args_owned = 1;
4437 arglen = -1;
4438 argidx = -2;
4439 }
4440 while (--fmtcnt >= 0) {
4441 switch (c = *fmt++) {
4442 case '-': flags |= F_LJUST; continue;
4443 case '+': flags |= F_SIGN; continue;
4444 case ' ': flags |= F_BLANK; continue;
4445 case '#': flags |= F_ALT; continue;
4446 case '0': flags |= F_ZERO; continue;
4447 }
4448 break;
4449 }
4450 if (c == '*') {
4451 v = getnextarg(args, arglen, &argidx);
4452 if (v == NULL)
4453 goto onError;
4454 if (!PyInt_Check(v)) {
4455 PyErr_SetString(PyExc_TypeError,
4456 "* wants int");
4457 goto onError;
4458 }
4459 width = PyInt_AsLong(v);
4460 if (width < 0) {
4461 flags |= F_LJUST;
4462 width = -width;
4463 }
4464 if (--fmtcnt >= 0)
4465 c = *fmt++;
4466 }
4467 else if (c >= '0' && c <= '9') {
4468 width = c - '0';
4469 while (--fmtcnt >= 0) {
4470 c = *fmt++;
4471 if (c < '0' || c > '9')
4472 break;
4473 if ((width*10) / 10 != width) {
4474 PyErr_SetString(PyExc_ValueError,
4475 "width too big");
4476 goto onError;
4477 }
4478 width = width*10 + (c - '0');
4479 }
4480 }
4481 if (c == '.') {
4482 prec = 0;
4483 if (--fmtcnt >= 0)
4484 c = *fmt++;
4485 if (c == '*') {
4486 v = getnextarg(args, arglen, &argidx);
4487 if (v == NULL)
4488 goto onError;
4489 if (!PyInt_Check(v)) {
4490 PyErr_SetString(PyExc_TypeError,
4491 "* wants int");
4492 goto onError;
4493 }
4494 prec = PyInt_AsLong(v);
4495 if (prec < 0)
4496 prec = 0;
4497 if (--fmtcnt >= 0)
4498 c = *fmt++;
4499 }
4500 else if (c >= '0' && c <= '9') {
4501 prec = c - '0';
4502 while (--fmtcnt >= 0) {
4503 c = Py_CHARMASK(*fmt++);
4504 if (c < '0' || c > '9')
4505 break;
4506 if ((prec*10) / 10 != prec) {
4507 PyErr_SetString(PyExc_ValueError,
4508 "prec too big");
4509 goto onError;
4510 }
4511 prec = prec*10 + (c - '0');
4512 }
4513 }
4514 } /* prec */
4515 if (fmtcnt >= 0) {
4516 if (c == 'h' || c == 'l' || c == 'L') {
4517 size = c;
4518 if (--fmtcnt >= 0)
4519 c = *fmt++;
4520 }
4521 }
4522 if (fmtcnt < 0) {
4523 PyErr_SetString(PyExc_ValueError,
4524 "incomplete format");
4525 goto onError;
4526 }
4527 if (c != '%') {
4528 v = getnextarg(args, arglen, &argidx);
4529 if (v == NULL)
4530 goto onError;
4531 }
4532 sign = 0;
4533 fill = ' ';
4534 switch (c) {
4535
4536 case '%':
4537 buf = tmpbuf;
4538 buf[0] = '%';
4539 len = 1;
4540 break;
4541
4542 case 's':
4543 case 'r':
4544 if (PyUnicode_Check(v) && c == 's') {
4545 temp = v;
4546 Py_INCREF(temp);
4547 }
4548 else {
4549 PyObject *unicode;
4550 if (c == 's')
4551 temp = PyObject_Str(v);
4552 else
4553 temp = PyObject_Repr(v);
4554 if (temp == NULL)
4555 goto onError;
4556 if (!PyString_Check(temp)) {
4557 /* XXX Note: this should never happen, since
4558 PyObject_Repr() and PyObject_Str() assure
4559 this */
4560 Py_DECREF(temp);
4561 PyErr_SetString(PyExc_TypeError,
4562 "%s argument has non-string str()");
4563 goto onError;
4564 }
Fred Drakee4315f52000-05-09 19:53:39 +00004565 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00004566 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00004567 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004568 "strict");
4569 Py_DECREF(temp);
4570 temp = unicode;
4571 if (temp == NULL)
4572 goto onError;
4573 }
4574 buf = PyUnicode_AS_UNICODE(temp);
4575 len = PyUnicode_GET_SIZE(temp);
4576 if (prec >= 0 && len > prec)
4577 len = prec;
4578 break;
4579
4580 case 'i':
4581 case 'd':
4582 case 'u':
4583 case 'o':
4584 case 'x':
4585 case 'X':
4586 if (c == 'i')
4587 c = 'd';
4588 buf = tmpbuf;
4589 len = formatint(buf, flags, prec, c, v);
4590 if (len < 0)
4591 goto onError;
4592 sign = (c == 'd');
4593 if (flags & F_ZERO) {
4594 fill = '0';
4595 if ((flags&F_ALT) &&
4596 (c == 'x' || c == 'X') &&
4597 buf[0] == '0' && buf[1] == c) {
4598 *res++ = *buf++;
4599 *res++ = *buf++;
4600 rescnt -= 2;
4601 len -= 2;
4602 width -= 2;
4603 if (width < 0)
4604 width = 0;
4605 }
4606 }
4607 break;
4608
4609 case 'e':
4610 case 'E':
4611 case 'f':
4612 case 'g':
4613 case 'G':
4614 buf = tmpbuf;
4615 len = formatfloat(buf, flags, prec, c, v);
4616 if (len < 0)
4617 goto onError;
4618 sign = 1;
4619 if (flags&F_ZERO)
4620 fill = '0';
4621 break;
4622
4623 case 'c':
4624 buf = tmpbuf;
4625 len = formatchar(buf, v);
4626 if (len < 0)
4627 goto onError;
4628 break;
4629
4630 default:
4631 PyErr_Format(PyExc_ValueError,
4632 "unsupported format character '%c' (0x%x)",
4633 c, c);
4634 goto onError;
4635 }
4636 if (sign) {
4637 if (*buf == '-' || *buf == '+') {
4638 sign = *buf++;
4639 len--;
4640 }
4641 else if (flags & F_SIGN)
4642 sign = '+';
4643 else if (flags & F_BLANK)
4644 sign = ' ';
4645 else
4646 sign = 0;
4647 }
4648 if (width < len)
4649 width = len;
4650 if (rescnt < width + (sign != 0)) {
4651 reslen -= rescnt;
4652 rescnt = width + fmtcnt + 100;
4653 reslen += rescnt;
4654 if (_PyUnicode_Resize(result, reslen) < 0)
4655 return NULL;
4656 res = PyUnicode_AS_UNICODE(result)
4657 + reslen - rescnt;
4658 }
4659 if (sign) {
4660 if (fill != ' ')
4661 *res++ = sign;
4662 rescnt--;
4663 if (width > len)
4664 width--;
4665 }
4666 if (width > len && !(flags & F_LJUST)) {
4667 do {
4668 --rescnt;
4669 *res++ = fill;
4670 } while (--width > len);
4671 }
4672 if (sign && fill == ' ')
4673 *res++ = sign;
4674 memcpy(res, buf, len * sizeof(Py_UNICODE));
4675 res += len;
4676 rescnt -= len;
4677 while (--width >= len) {
4678 --rescnt;
4679 *res++ = ' ';
4680 }
4681 if (dict && (argidx < arglen) && c != '%') {
4682 PyErr_SetString(PyExc_TypeError,
4683 "not all arguments converted");
4684 goto onError;
4685 }
4686 Py_XDECREF(temp);
4687 } /* '%' */
4688 } /* until end */
4689 if (argidx < arglen && !dict) {
4690 PyErr_SetString(PyExc_TypeError,
4691 "not all arguments converted");
4692 goto onError;
4693 }
4694
4695 if (args_owned) {
4696 Py_DECREF(args);
4697 }
4698 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004699 if (_PyUnicode_Resize(result, reslen - rescnt))
4700 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004701 return (PyObject *)result;
4702
4703 onError:
4704 Py_XDECREF(result);
4705 Py_DECREF(uformat);
4706 if (args_owned) {
4707 Py_DECREF(args);
4708 }
4709 return NULL;
4710}
4711
4712static PyBufferProcs unicode_as_buffer = {
4713 (getreadbufferproc) unicode_buffer_getreadbuf,
4714 (getwritebufferproc) unicode_buffer_getwritebuf,
4715 (getsegcountproc) unicode_buffer_getsegcount,
4716 (getcharbufferproc) unicode_buffer_getcharbuf,
4717};
4718
4719PyTypeObject PyUnicode_Type = {
4720 PyObject_HEAD_INIT(&PyType_Type)
4721 0, /* ob_size */
4722 "unicode", /* tp_name */
4723 sizeof(PyUnicodeObject), /* tp_size */
4724 0, /* tp_itemsize */
4725 /* Slots */
4726 (destructor)_PyUnicode_Free, /* tp_dealloc */
4727 0, /* tp_print */
4728 (getattrfunc)unicode_getattr, /* tp_getattr */
4729 0, /* tp_setattr */
4730 (cmpfunc) unicode_compare, /* tp_compare */
4731 (reprfunc) unicode_repr, /* tp_repr */
4732 0, /* tp_as_number */
4733 &unicode_as_sequence, /* tp_as_sequence */
4734 0, /* tp_as_mapping */
4735 (hashfunc) unicode_hash, /* tp_hash*/
4736 0, /* tp_call*/
4737 (reprfunc) unicode_str, /* tp_str */
4738 (getattrofunc) NULL, /* tp_getattro */
4739 (setattrofunc) NULL, /* tp_setattro */
4740 &unicode_as_buffer, /* tp_as_buffer */
4741 Py_TPFLAGS_DEFAULT, /* tp_flags */
4742};
4743
4744/* Initialize the Unicode implementation */
4745
4746void _PyUnicode_Init()
4747{
4748 /* Doublecheck the configuration... */
4749 if (sizeof(Py_UNICODE) != 2)
4750 Py_FatalError("Unicode configuration error: "
4751 "sizeof(Py_UNICODE) != 2 bytes");
4752
Fred Drakee4315f52000-05-09 19:53:39 +00004753 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004754 unicode_freelist = NULL;
4755 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004756 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00004757 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00004758}
4759
4760/* Finalize the Unicode implementation */
4761
4762void
4763_PyUnicode_Fini()
4764{
4765 PyUnicodeObject *u = unicode_freelist;
4766
4767 while (u != NULL) {
4768 PyUnicodeObject *v = u;
4769 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004770 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00004771 PyMem_DEL(v->str);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00004772 Py_XDECREF(v->utf8str);
Guido van Rossumb18618d2000-05-03 23:44:39 +00004773 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004774 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004775 unicode_freelist = NULL;
4776 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004777 Py_XDECREF(unicode_empty);
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004778 unicode_empty = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004779}