blob: c1f3d5414f0cfc4112d576613e9c975246bea338 [file] [log] [blame]
Guido van Rossumd57fd912000-03-10 22:53:23 +00001/*
2
3Unicode implementation based on original code by Fredrik Lundh,
Fred Drake785d14f2000-05-09 19:54:43 +00004modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
Guido van Rossumd57fd912000-03-10 22:53:23 +00005Unicode Integration Proposal (see file Misc/unicode.txt).
6
Guido van Rossum16b1ad92000-08-03 16:24:25 +00007Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +00008
9
10 Original header:
11 --------------------------------------------------------------------
12
13 * Yet another Unicode string type for Python. This type supports the
14 * 16-bit Basic Multilingual Plane (BMP) only.
15 *
16 * Note that this string class supports embedded NULL characters. End
17 * of string is given by the length attribute. However, the internal
18 * representation always stores a trailing NULL to make it easier to
19 * use unicode strings with standard APIs.
20 *
21 * History:
22 * 1999-01-23 fl Created
23 * 1999-01-24 fl Added split, join, capwords; basic UTF-8 support
24 * 1999-01-24 fl Basic UCS-2 support, buffer interface, etc.
25 * 1999-03-06 fl Moved declarations to separate file, etc.
26 * 1999-06-13 fl Changed join method semantics according to Tim's proposal
27 * 1999-08-10 fl Some minor tweaks
28 *
29 * Written by Fredrik Lundh, January 1999.
30 *
31 * Copyright (c) 1999 by Secret Labs AB.
32 * Copyright (c) 1999 by Fredrik Lundh.
33 *
34 * fredrik@pythonware.com
35 * http://www.pythonware.com
36 *
37 * --------------------------------------------------------------------
38 * This Unicode String Type is
39 *
40 * Copyright (c) 1999 by Secret Labs AB
41 * Copyright (c) 1999 by Fredrik Lundh
42 *
43 * By obtaining, using, and/or copying this software and/or its
44 * associated documentation, you agree that you have read, understood,
45 * and will comply with the following terms and conditions:
46 *
47 * Permission to use, copy, modify, and distribute this software and its
48 * associated documentation for any purpose and without fee is hereby
49 * granted, provided that the above copyright notice appears in all
50 * copies, and that both that copyright notice and this permission notice
51 * appear in supporting documentation, and that the name of Secret Labs
52 * AB or the author not be used in advertising or publicity pertaining to
53 * distribution of the software without specific, written prior
54 * permission.
55 *
56 * SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
57 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
58 * FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
59 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
60 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
61 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
62 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
63 * -------------------------------------------------------------------- */
64
65#include "Python.h"
66
Guido van Rossumd57fd912000-03-10 22:53:23 +000067#include "unicodeobject.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000068#include "ucnhash.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000069
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000070#ifdef MS_WIN32
71#include <windows.h>
72#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000073
Guido van Rossumd57fd912000-03-10 22:53:23 +000074/* Limit for the Unicode object free list */
75
76#define MAX_UNICODE_FREELIST_SIZE 1024
77
78/* Limit for the Unicode object free list stay alive optimization.
79
80 The implementation will keep allocated Unicode memory intact for
81 all objects on the free list having a size less than this
82 limit. This reduces malloc() overhead for small Unicode objects.
83
Barry Warsaw51ac5802000-03-20 16:36:48 +000084 At worst this will result in MAX_UNICODE_FREELIST_SIZE *
Guido van Rossumfd4b9572000-04-10 13:51:10 +000085 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
Guido van Rossumd57fd912000-03-10 22:53:23 +000086 malloc()-overhead) bytes of unused garbage.
87
88 Setting the limit to 0 effectively turns the feature off.
89
Guido van Rossumfd4b9572000-04-10 13:51:10 +000090 Note: This is an experimental feature ! If you get core dumps when
91 using Unicode objects, turn this feature off.
Guido van Rossumd57fd912000-03-10 22:53:23 +000092
93*/
94
Guido van Rossumfd4b9572000-04-10 13:51:10 +000095#define KEEPALIVE_SIZE_LIMIT 9
Guido van Rossumd57fd912000-03-10 22:53:23 +000096
97/* Endianness switches; defaults to little endian */
98
99#ifdef WORDS_BIGENDIAN
100# define BYTEORDER_IS_BIG_ENDIAN
101#else
102# define BYTEORDER_IS_LITTLE_ENDIAN
103#endif
104
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000105/* --- Globals ------------------------------------------------------------
106
107 The globals are initialized by the _PyUnicode_Init() API and should
108 not be used before calling that API.
109
110*/
Guido van Rossumd57fd912000-03-10 22:53:23 +0000111
112/* The empty Unicode object */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000113static PyUnicodeObject *unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000114
115/* Free list for Unicode objects */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +0000116static PyUnicodeObject *unicode_freelist;
117static int unicode_freelist_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000118
Fred Drakee4315f52000-05-09 19:53:39 +0000119/* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
121
122 Always use the PyUnicode_SetDefaultEncoding() and
123 PyUnicode_GetDefaultEncoding() APIs to access this global.
124
125*/
126
127static char unicode_default_encoding[100];
128
Guido van Rossumd57fd912000-03-10 22:53:23 +0000129/* --- Unicode Object ----------------------------------------------------- */
130
131static
132int _PyUnicode_Resize(register PyUnicodeObject *unicode,
133 int length)
134{
135 void *oldstr;
136
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000137 /* Shortcut if there's nothing much to do. */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000138 if (unicode->length == length)
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000139 goto reset;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000140
141 /* Resizing unicode_empty is not allowed. */
142 if (unicode == unicode_empty) {
143 PyErr_SetString(PyExc_SystemError,
144 "can't resize empty unicode object");
145 return -1;
146 }
147
148 /* We allocate one more byte to make sure the string is
149 Ux0000 terminated -- XXX is this needed ? */
150 oldstr = unicode->str;
151 PyMem_RESIZE(unicode->str, Py_UNICODE, length + 1);
152 if (!unicode->str) {
153 unicode->str = oldstr;
154 PyErr_NoMemory();
155 return -1;
156 }
157 unicode->str[length] = 0;
158 unicode->length = length;
159
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000160 reset:
Guido van Rossumd57fd912000-03-10 22:53:23 +0000161 /* Reset the object caches */
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000162 if (unicode->defenc) {
163 Py_DECREF(unicode->defenc);
164 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000165 }
166 unicode->hash = -1;
167
168 return 0;
169}
170
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000171int PyUnicode_Resize(PyObject **unicode,
172 int length)
173{
174 PyUnicodeObject *v;
175
176 if (unicode == NULL) {
177 PyErr_BadInternalCall();
178 return -1;
179 }
180 v = (PyUnicodeObject *)*unicode;
181 if (v == NULL || !PyUnicode_Check(v) || v->ob_refcnt != 1) {
182 PyErr_BadInternalCall();
183 return -1;
184 }
185 return _PyUnicode_Resize(v, length);
186}
187
/* We allocate one more Py_UNICODE element (not merely one byte) to make
   sure the string is U+0000 terminated -- XXX is this needed ?

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
195
196static
197PyUnicodeObject *_PyUnicode_New(int length)
198{
199 register PyUnicodeObject *unicode;
200
201 /* Optimization for empty strings */
202 if (length == 0 && unicode_empty != NULL) {
203 Py_INCREF(unicode_empty);
204 return unicode_empty;
205 }
206
207 /* Unicode freelist & memory allocation */
208 if (unicode_freelist) {
209 unicode = unicode_freelist;
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000210 unicode_freelist = *(PyUnicodeObject **)unicode;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000211 unicode_freelist_size--;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000212 if (unicode->str) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000213 /* Keep-Alive optimization: we only upsize the buffer,
214 never downsize it. */
215 if ((unicode->length < length) &&
Guido van Rossumd57fd912000-03-10 22:53:23 +0000216 _PyUnicode_Resize(unicode, length)) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000217 PyMem_DEL(unicode->str);
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000218 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000219 }
220 }
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000221 else {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000222 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
Marc-André Lemburgbea47e72000-06-17 20:31:17 +0000223 }
224 PyObject_INIT(unicode, &PyUnicode_Type);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000225 }
226 else {
227 unicode = PyObject_NEW(PyUnicodeObject, &PyUnicode_Type);
228 if (unicode == NULL)
229 return NULL;
230 unicode->str = PyMem_NEW(Py_UNICODE, length + 1);
231 }
232
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000233 if (!unicode->str) {
234 PyErr_NoMemory();
Barry Warsaw51ac5802000-03-20 16:36:48 +0000235 goto onError;
Guido van Rossum3c1bb802000-04-27 20:13:50 +0000236 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000237 unicode->str[length] = 0;
238 unicode->length = length;
239 unicode->hash = -1;
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000240 unicode->defenc = NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000241 return unicode;
Barry Warsaw51ac5802000-03-20 16:36:48 +0000242
243 onError:
244 _Py_ForgetReference((PyObject *)unicode);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000245 PyObject_DEL(unicode);
Barry Warsaw51ac5802000-03-20 16:36:48 +0000246 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000247}
248
249static
250void _PyUnicode_Free(register PyUnicodeObject *unicode)
251{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000252 if (unicode_freelist_size < MAX_UNICODE_FREELIST_SIZE) {
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000253 /* Keep-Alive optimization */
254 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000255 PyMem_DEL(unicode->str);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000256 unicode->str = NULL;
257 unicode->length = 0;
258 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000259 if (unicode->defenc) {
260 Py_DECREF(unicode->defenc);
261 unicode->defenc = NULL;
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000262 }
263 /* Add to free list */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000264 *(PyUnicodeObject **)unicode = unicode_freelist;
265 unicode_freelist = unicode;
266 unicode_freelist_size++;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000267 }
268 else {
Guido van Rossumb18618d2000-05-03 23:44:39 +0000269 PyMem_DEL(unicode->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000270 Py_XDECREF(unicode->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +0000271 PyObject_DEL(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000272 }
273}
274
275PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
276 int size)
277{
278 PyUnicodeObject *unicode;
279
280 unicode = _PyUnicode_New(size);
281 if (!unicode)
282 return NULL;
283
284 /* Copy the Unicode data into the new object */
285 if (u != NULL)
286 memcpy(unicode->str, u, size * sizeof(Py_UNICODE));
287
288 return (PyObject *)unicode;
289}
290
291#ifdef HAVE_WCHAR_H
292
293PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
294 int size)
295{
296 PyUnicodeObject *unicode;
297
298 if (w == NULL) {
299 PyErr_BadInternalCall();
300 return NULL;
301 }
302
303 unicode = _PyUnicode_New(size);
304 if (!unicode)
305 return NULL;
306
307 /* Copy the wchar_t data into the new object */
308#ifdef HAVE_USABLE_WCHAR_T
309 memcpy(unicode->str, w, size * sizeof(wchar_t));
310#else
311 {
312 register Py_UNICODE *u;
313 register int i;
314 u = PyUnicode_AS_UNICODE(unicode);
315 for (i = size; i >= 0; i--)
316 *u++ = *w++;
317 }
318#endif
319
320 return (PyObject *)unicode;
321}
322
323int PyUnicode_AsWideChar(PyUnicodeObject *unicode,
324 register wchar_t *w,
325 int size)
326{
327 if (unicode == NULL) {
328 PyErr_BadInternalCall();
329 return -1;
330 }
331 if (size > PyUnicode_GET_SIZE(unicode))
332 size = PyUnicode_GET_SIZE(unicode);
333#ifdef HAVE_USABLE_WCHAR_T
334 memcpy(w, unicode->str, size * sizeof(wchar_t));
335#else
336 {
337 register Py_UNICODE *u;
338 register int i;
339 u = PyUnicode_AS_UNICODE(unicode);
340 for (i = size; i >= 0; i--)
341 *w++ = *u++;
342 }
343#endif
344
345 return size;
346}
347
348#endif
349
350PyObject *PyUnicode_FromObject(register PyObject *obj)
351{
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000352 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
353}
354
355PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
356 const char *encoding,
357 const char *errors)
358{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000359 const char *s;
360 int len;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000361 int owned = 0;
362 PyObject *v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000363
364 if (obj == NULL) {
365 PyErr_BadInternalCall();
366 return NULL;
367 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000368
369 /* Coerce object */
370 if (PyInstance_Check(obj)) {
371 PyObject *func;
372 func = PyObject_GetAttrString(obj, "__str__");
373 if (func == NULL) {
374 PyErr_SetString(PyExc_TypeError,
375 "coercing to Unicode: instance doesn't define __str__");
376 return NULL;
377 }
378 obj = PyEval_CallObject(func, NULL);
379 Py_DECREF(func);
380 if (obj == NULL)
381 return NULL;
382 owned = 1;
383 }
384 if (PyUnicode_Check(obj)) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000385 Py_INCREF(obj);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000386 v = obj;
387 if (encoding) {
388 PyErr_SetString(PyExc_TypeError,
389 "decoding Unicode is not supported");
390 return NULL;
391 }
392 goto done;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000393 }
394 else if (PyString_Check(obj)) {
395 s = PyString_AS_STRING(obj);
396 len = PyString_GET_SIZE(obj);
397 }
Guido van Rossum9e896b32000-04-05 20:11:21 +0000398 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
399 /* Overwrite the error message with something more useful in
400 case of a TypeError. */
401 if (PyErr_ExceptionMatches(PyExc_TypeError))
Marc-André Lemburg566d8a62000-07-11 09:47:04 +0000402 PyErr_Format(PyExc_TypeError,
403 "coercing to Unicode: need string or buffer, "
404 "%.80s found",
405 obj->ob_type->tp_name);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000406 goto onError;
Guido van Rossum9e896b32000-04-05 20:11:21 +0000407 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000408
409 /* Convert to Unicode */
Guido van Rossumd57fd912000-03-10 22:53:23 +0000410 if (len == 0) {
411 Py_INCREF(unicode_empty);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000412 v = (PyObject *)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000413 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000414 else
415 v = PyUnicode_Decode(s, len, encoding, errors);
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +0000416
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000417 done:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000418 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000419 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000420 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000421 return v;
422
423 onError:
Greg Steinaf36a3a2000-07-17 09:04:43 +0000424 if (owned) {
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000425 Py_DECREF(obj);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000426 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +0000427 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000428}
429
430PyObject *PyUnicode_Decode(const char *s,
431 int size,
432 const char *encoding,
433 const char *errors)
434{
435 PyObject *buffer = NULL, *unicode;
436
Fred Drakee4315f52000-05-09 19:53:39 +0000437 if (encoding == NULL)
438 encoding = PyUnicode_GetDefaultEncoding();
439
440 /* Shortcuts for common default encodings */
441 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000442 return PyUnicode_DecodeUTF8(s, size, errors);
Fred Drakee4315f52000-05-09 19:53:39 +0000443 else if (strcmp(encoding, "latin-1") == 0)
444 return PyUnicode_DecodeLatin1(s, size, errors);
445 else if (strcmp(encoding, "ascii") == 0)
446 return PyUnicode_DecodeASCII(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000447
448 /* Decode via the codec registry */
449 buffer = PyBuffer_FromMemory((void *)s, size);
450 if (buffer == NULL)
451 goto onError;
452 unicode = PyCodec_Decode(buffer, encoding, errors);
453 if (unicode == NULL)
454 goto onError;
455 if (!PyUnicode_Check(unicode)) {
456 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000457 "decoder did not return an unicode object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000458 unicode->ob_type->tp_name);
459 Py_DECREF(unicode);
460 goto onError;
461 }
462 Py_DECREF(buffer);
463 return unicode;
464
465 onError:
466 Py_XDECREF(buffer);
467 return NULL;
468}
469
470PyObject *PyUnicode_Encode(const Py_UNICODE *s,
471 int size,
472 const char *encoding,
473 const char *errors)
474{
475 PyObject *v, *unicode;
476
477 unicode = PyUnicode_FromUnicode(s, size);
478 if (unicode == NULL)
479 return NULL;
480 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
481 Py_DECREF(unicode);
482 return v;
483}
484
485PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
486 const char *encoding,
487 const char *errors)
488{
489 PyObject *v;
490
491 if (!PyUnicode_Check(unicode)) {
492 PyErr_BadArgument();
493 goto onError;
494 }
Fred Drakee4315f52000-05-09 19:53:39 +0000495
496 if (encoding == NULL)
497 encoding = PyUnicode_GetDefaultEncoding();
498
499 /* Shortcuts for common default encodings */
500 if (errors == NULL) {
501 if (strcmp(encoding, "utf-8") == 0)
Guido van Rossumd57fd912000-03-10 22:53:23 +0000502 return PyUnicode_AsUTF8String(unicode);
Fred Drakee4315f52000-05-09 19:53:39 +0000503 else if (strcmp(encoding, "latin-1") == 0)
504 return PyUnicode_AsLatin1String(unicode);
505 else if (strcmp(encoding, "ascii") == 0)
506 return PyUnicode_AsASCIIString(unicode);
507 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000508
509 /* Encode via the codec registry */
510 v = PyCodec_Encode(unicode, encoding, errors);
511 if (v == NULL)
512 goto onError;
513 /* XXX Should we really enforce this ? */
514 if (!PyString_Check(v)) {
515 PyErr_Format(PyExc_TypeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000516 "encoder did not return a string object (type=%.400s)",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000517 v->ob_type->tp_name);
518 Py_DECREF(v);
519 goto onError;
520 }
521 return v;
522
523 onError:
524 return NULL;
525}
526
Marc-André Lemburgbff879c2000-08-03 18:46:08 +0000527/* Return a Python string holding the default encoded value of the
528 Unicode object.
529
530 The resulting string is cached in the Unicode object for subsequent
531 usage by this function. The cached version is needed to implement
532 the character buffer interface and will live (at least) as long as
533 the Unicode object itself.
534
535 The refcount of the string is *not* incremented.
536
537 *** Exported for internal use by the interpreter only !!! ***
538
539*/
540
541PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
542 const char *errors)
543{
544 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
545
546 if (v)
547 return v;
548 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
549 if (v && errors == NULL)
550 ((PyUnicodeObject *)unicode)->defenc = v;
551 return v;
552}
553
Guido van Rossumd57fd912000-03-10 22:53:23 +0000554Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
555{
556 if (!PyUnicode_Check(unicode)) {
557 PyErr_BadArgument();
558 goto onError;
559 }
560 return PyUnicode_AS_UNICODE(unicode);
561
562 onError:
563 return NULL;
564}
565
566int PyUnicode_GetSize(PyObject *unicode)
567{
568 if (!PyUnicode_Check(unicode)) {
569 PyErr_BadArgument();
570 goto onError;
571 }
572 return PyUnicode_GET_SIZE(unicode);
573
574 onError:
575 return -1;
576}
577
Thomas Wouters78890102000-07-22 19:25:51 +0000578const char *PyUnicode_GetDefaultEncoding(void)
Fred Drakee4315f52000-05-09 19:53:39 +0000579{
580 return unicode_default_encoding;
581}
582
583int PyUnicode_SetDefaultEncoding(const char *encoding)
584{
585 PyObject *v;
586
587 /* Make sure the encoding is valid. As side effect, this also
588 loads the encoding into the codec registry cache. */
589 v = _PyCodec_Lookup(encoding);
590 if (v == NULL)
591 goto onError;
592 Py_DECREF(v);
593 strncpy(unicode_default_encoding,
594 encoding,
595 sizeof(unicode_default_encoding));
596 return 0;
597
598 onError:
599 return -1;
600}
601
Guido van Rossumd57fd912000-03-10 22:53:23 +0000602/* --- UTF-8 Codec -------------------------------------------------------- */
603
/* Map a UTF-8 lead byte to the total length of its sequence in bytes.
   Zero marks a byte that cannot start a sequence.  See RFC 2279 for
   details. */
static
char utf8_code_length[256] = {
    /* 0x00-0x7F: single byte (ASCII) */
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0x80-0xBF: continuation bytes, illegal as a prefix */
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0xC0-0xDF: two-byte sequences */
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
    2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,
    /* 0xE0-0xEF: three-byte sequences */
    3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3,
    /* 0xF0-0xF7: four; 0xF8-0xFB: five; 0xFC-0xFD: six;
       0xFE-0xFF: illegal */
    4,4,4,4,4,4,4,4, 5,5,5,5,6,6,0,0
};
625
626static
627int utf8_decoding_error(const char **source,
628 Py_UNICODE **dest,
629 const char *errors,
630 const char *details)
631{
632 if ((errors == NULL) ||
633 (strcmp(errors,"strict") == 0)) {
634 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000635 "UTF-8 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000636 details);
637 return -1;
638 }
639 else if (strcmp(errors,"ignore") == 0) {
640 (*source)++;
641 return 0;
642 }
643 else if (strcmp(errors,"replace") == 0) {
644 (*source)++;
645 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
646 (*dest)++;
647 return 0;
648 }
649 else {
650 PyErr_Format(PyExc_ValueError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000651 "UTF-8 decoding error; unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000652 errors);
653 return -1;
654 }
655}
656
Guido van Rossumd57fd912000-03-10 22:53:23 +0000657PyObject *PyUnicode_DecodeUTF8(const char *s,
658 int size,
659 const char *errors)
660{
661 int n;
662 const char *e;
663 PyUnicodeObject *unicode;
664 Py_UNICODE *p;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000665 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000666
667 /* Note: size will always be longer than the resulting Unicode
668 character count */
669 unicode = _PyUnicode_New(size);
670 if (!unicode)
671 return NULL;
672 if (size == 0)
673 return (PyObject *)unicode;
674
675 /* Unpack UTF-8 encoded data */
676 p = unicode->str;
677 e = s + size;
678
679 while (s < e) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000680 Py_UCS4 ch = (unsigned char)*s;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000681
682 if (ch < 0x80) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000683 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000684 s++;
685 continue;
686 }
687
688 n = utf8_code_length[ch];
689
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000690 if (s + n > e) {
691 errmsg = "unexpected end of data";
692 goto utf8Error;
693 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000694
695 switch (n) {
696
697 case 0:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000698 errmsg = "unexpected code byte";
699 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000700 break;
701
702 case 1:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000703 errmsg = "internal error";
704 goto utf8Error;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000705 break;
706
707 case 2:
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000708 if ((s[1] & 0xc0) != 0x80) {
709 errmsg = "invalid data";
710 goto utf8Error;
711 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000712 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000713 if (ch < 0x80) {
714 errmsg = "illegal encoding";
715 goto utf8Error;
716 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000717 else
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000718 *p++ = (Py_UNICODE)ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000719 break;
720
721 case 3:
722 if ((s[1] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000723 (s[2] & 0xc0) != 0x80) {
724 errmsg = "invalid data";
725 goto utf8Error;
726 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000727 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000728 if (ch < 0x800 || (ch >= 0xd800 && ch < 0xe000)) {
729 errmsg = "illegal encoding";
730 goto utf8Error;
731 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000732 else
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000733 *p++ = (Py_UNICODE)ch;
734 break;
735
736 case 4:
737 if ((s[1] & 0xc0) != 0x80 ||
738 (s[2] & 0xc0) != 0x80 ||
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000739 (s[3] & 0xc0) != 0x80) {
740 errmsg = "invalid data";
741 goto utf8Error;
742 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000743 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
744 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
745 /* validate and convert to UTF-16 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000746 if ((ch < 0x10000) || /* minimum value allowed for 4
747 byte encoding */
748 (ch > 0x10ffff)) { /* maximum value allowed for
749 UTF-16 */
750 errmsg = "illegal encoding";
751 goto utf8Error;
752 }
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000753 /* compute and append the two surrogates: */
754
755 /* translate from 10000..10FFFF to 0..FFFF */
756 ch -= 0x10000;
757
758 /* high surrogate = top 10 bits added to D800 */
759 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
760
761 /* low surrogate = bottom 10 bits added to DC00 */
762 *p++ = (Py_UNICODE)(0xDC00 + (ch & ~0xFC00));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000763 break;
764
765 default:
766 /* Other sizes are only needed for UCS-4 */
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000767 errmsg = "unsupported Unicode code range";
768 goto utf8Error;
769 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000770 }
771 s += n;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000772 continue;
773
774 utf8Error:
775 if (utf8_decoding_error(&s, &p, errors, errmsg))
776 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000777 }
778
779 /* Adjust length */
780 if (_PyUnicode_Resize(unicode, p - unicode->str))
781 goto onError;
782
783 return (PyObject *)unicode;
784
785onError:
786 Py_DECREF(unicode);
787 return NULL;
788}
789
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000790/* Not used anymore, now that the encoder supports UTF-16
791 surrogates. */
Greg Steinaf36a3a2000-07-17 09:04:43 +0000792#if 0
Guido van Rossumd57fd912000-03-10 22:53:23 +0000793static
794int utf8_encoding_error(const Py_UNICODE **source,
795 char **dest,
796 const char *errors,
797 const char *details)
798{
799 if ((errors == NULL) ||
800 (strcmp(errors,"strict") == 0)) {
801 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000802 "UTF-8 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000803 details);
804 return -1;
805 }
806 else if (strcmp(errors,"ignore") == 0) {
807 return 0;
808 }
809 else if (strcmp(errors,"replace") == 0) {
810 **dest = '?';
811 (*dest)++;
812 return 0;
813 }
814 else {
815 PyErr_Format(PyExc_ValueError,
816 "UTF-8 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +0000817 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000818 errors);
819 return -1;
820 }
821}
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000822#endif
Guido van Rossumd57fd912000-03-10 22:53:23 +0000823
824PyObject *PyUnicode_EncodeUTF8(const Py_UNICODE *s,
825 int size,
826 const char *errors)
827{
828 PyObject *v;
829 char *p;
830 char *q;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000831 Py_UCS4 ch2;
832 unsigned int cbAllocated = 3 * size;
833 unsigned int cbWritten = 0;
834 int i = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000835
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000836 v = PyString_FromStringAndSize(NULL, cbAllocated);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000837 if (v == NULL)
838 return NULL;
839 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +0000840 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000841
842 p = q = PyString_AS_STRING(v);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000843 while (i < size) {
844 Py_UCS4 ch = s[i++];
845 if (ch < 0x80) {
Guido van Rossumd57fd912000-03-10 22:53:23 +0000846 *p++ = (char) ch;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000847 cbWritten++;
848 }
Guido van Rossumd57fd912000-03-10 22:53:23 +0000849 else if (ch < 0x0800) {
850 *p++ = 0xc0 | (ch >> 6);
851 *p++ = 0x80 | (ch & 0x3f);
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000852 cbWritten += 2;
853 }
854 else {
855 /* Check for high surrogate */
856 if (0xD800 <= ch && ch <= 0xDBFF) {
857 if (i != size) {
858 ch2 = s[i];
859 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
860
861 if (cbWritten >= (cbAllocated - 4)) {
862 /* Provide enough room for some more
863 surrogates */
864 cbAllocated += 4*10;
865 if (_PyString_Resize(&v, cbAllocated))
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000866 goto onError;
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000867 }
868
869 /* combine the two values */
870 ch = ((ch - 0xD800)<<10 | (ch2-0xDC00))+0x10000;
871
872 *p++ = (char)((ch >> 18) | 0xf0);
Greg Steinaf36a3a2000-07-17 09:04:43 +0000873 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
Marc-André Lemburge12896e2000-07-07 17:51:08 +0000874 i++;
875 cbWritten += 4;
876 }
877 }
878 }
879 else {
880 *p++ = (char)(0xe0 | (ch >> 12));
881 cbWritten += 3;
882 }
883 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
884 *p++ = (char)(0x80 | (ch & 0x3f));
Guido van Rossumd57fd912000-03-10 22:53:23 +0000885 }
886 }
887 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +0000888 if (_PyString_Resize(&v, p - q))
889 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +0000890 return v;
891
892 onError:
893 Py_DECREF(v);
894 return NULL;
895}
896
Guido van Rossumd57fd912000-03-10 22:53:23 +0000897PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
898{
Guido van Rossumd57fd912000-03-10 22:53:23 +0000899 if (!PyUnicode_Check(unicode)) {
900 PyErr_BadArgument();
901 return NULL;
902 }
Barry Warsaw2dd4abf2000-08-18 06:58:15 +0000903 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
904 PyUnicode_GET_SIZE(unicode),
905 NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +0000906}
907
908/* --- UTF-16 Codec ------------------------------------------------------- */
909
910static
911int utf16_decoding_error(const Py_UNICODE **source,
912 Py_UNICODE **dest,
913 const char *errors,
914 const char *details)
915{
916 if ((errors == NULL) ||
917 (strcmp(errors,"strict") == 0)) {
918 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +0000919 "UTF-16 decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000920 details);
921 return -1;
922 }
923 else if (strcmp(errors,"ignore") == 0) {
924 return 0;
925 }
926 else if (strcmp(errors,"replace") == 0) {
927 if (dest) {
928 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
929 (*dest)++;
930 }
931 return 0;
932 }
933 else {
934 PyErr_Format(PyExc_ValueError,
Fred Drakee4315f52000-05-09 19:53:39 +0000935 "UTF-16 decoding error; "
936 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +0000937 errors);
938 return -1;
939 }
940}
941
Guido van Rossumd57fd912000-03-10 22:53:23 +0000942PyObject *PyUnicode_DecodeUTF16(const char *s,
943 int size,
944 const char *errors,
945 int *byteorder)
946{
947 PyUnicodeObject *unicode;
948 Py_UNICODE *p;
949 const Py_UNICODE *q, *e;
950 int bo = 0;
Marc-André Lemburg9542f482000-07-17 18:23:13 +0000951 const char *errmsg = "";
Guido van Rossumd57fd912000-03-10 22:53:23 +0000952
953 /* size should be an even number */
954 if (size % sizeof(Py_UNICODE) != 0) {
955 if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
956 return NULL;
957 /* The remaining input chars are ignored if we fall through
958 here... */
959 }
960
961 /* Note: size will always be longer than the resulting Unicode
962 character count */
963 unicode = _PyUnicode_New(size);
964 if (!unicode)
965 return NULL;
966 if (size == 0)
967 return (PyObject *)unicode;
968
969 /* Unpack UTF-16 encoded data */
970 p = unicode->str;
971 q = (Py_UNICODE *)s;
972 e = q + (size / sizeof(Py_UNICODE));
973
974 if (byteorder)
975 bo = *byteorder;
976
977 while (q < e) {
978 register Py_UNICODE ch = *q++;
979
980 /* Check for BOM marks (U+FEFF) in the input and adjust
981 current byte order setting accordingly. Swap input
982 bytes if needed. (This assumes sizeof(Py_UNICODE) == 2
983 !) */
984#ifdef BYTEORDER_IS_LITTLE_ENDIAN
985 if (ch == 0xFEFF) {
986 bo = -1;
987 continue;
988 } else if (ch == 0xFFFE) {
989 bo = 1;
990 continue;
991 }
992 if (bo == 1)
993 ch = (ch >> 8) | (ch << 8);
994#else
995 if (ch == 0xFEFF) {
996 bo = 1;
997 continue;
998 } else if (ch == 0xFFFE) {
999 bo = -1;
1000 continue;
1001 }
1002 if (bo == -1)
1003 ch = (ch >> 8) | (ch << 8);
1004#endif
1005 if (ch < 0xD800 || ch > 0xDFFF) {
1006 *p++ = ch;
1007 continue;
1008 }
1009
1010 /* UTF-16 code pair: */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001011 if (q >= e) {
1012 errmsg = "unexpected end of data";
1013 goto utf16Error;
1014 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001015 if (0xDC00 <= *q && *q <= 0xDFFF) {
1016 q++;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001017 if (0xD800 <= *q && *q <= 0xDBFF) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001018 /* This is valid data (a UTF-16 surrogate pair), but
1019 we are not able to store this information since our
1020 Py_UNICODE type only has 16 bits... this might
1021 change someday, even though it's unlikely. */
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001022 errmsg = "code pairs are not supported";
1023 goto utf16Error;
1024 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001025 else
1026 continue;
1027 }
Marc-André Lemburg9542f482000-07-17 18:23:13 +00001028 errmsg = "illegal encoding";
1029 /* Fall through to report the error */
1030
1031 utf16Error:
1032 if (utf16_decoding_error(&q, &p, errors, errmsg))
1033 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001034 }
1035
1036 if (byteorder)
1037 *byteorder = bo;
1038
1039 /* Adjust length */
1040 if (_PyUnicode_Resize(unicode, p - unicode->str))
1041 goto onError;
1042
1043 return (PyObject *)unicode;
1044
1045onError:
1046 Py_DECREF(unicode);
1047 return NULL;
1048}
1049
1050#undef UTF16_ERROR
1051
1052PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
1053 int size,
1054 const char *errors,
1055 int byteorder)
1056{
1057 PyObject *v;
1058 Py_UNICODE *p;
1059 char *q;
1060
1061 /* We don't create UTF-16 pairs... */
1062 v = PyString_FromStringAndSize(NULL,
1063 sizeof(Py_UNICODE) * (size + (byteorder == 0)));
1064 if (v == NULL)
1065 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001066
1067 q = PyString_AS_STRING(v);
1068 p = (Py_UNICODE *)q;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001069 if (byteorder == 0)
1070 *p++ = 0xFEFF;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001071 if (size == 0)
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001072 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001073 if (byteorder == 0 ||
1074#ifdef BYTEORDER_IS_LITTLE_ENDIAN
1075 byteorder == -1
1076#else
1077 byteorder == 1
1078#endif
1079 )
1080 memcpy(p, s, size * sizeof(Py_UNICODE));
1081 else
1082 while (size-- > 0) {
1083 Py_UNICODE ch = *s++;
1084 *p++ = (ch >> 8) | (ch << 8);
1085 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086 return v;
1087}
1088
1089PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
1090{
1091 if (!PyUnicode_Check(unicode)) {
1092 PyErr_BadArgument();
1093 return NULL;
1094 }
1095 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
1096 PyUnicode_GET_SIZE(unicode),
1097 NULL,
1098 0);
1099}
1100
1101/* --- Unicode Escape Codec ----------------------------------------------- */
1102
1103static
1104int unicodeescape_decoding_error(const char **source,
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001105 Py_UNICODE *x,
Guido van Rossumd57fd912000-03-10 22:53:23 +00001106 const char *errors,
1107 const char *details)
1108{
1109 if ((errors == NULL) ||
1110 (strcmp(errors,"strict") == 0)) {
1111 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001112 "Unicode-Escape decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001113 details);
1114 return -1;
1115 }
1116 else if (strcmp(errors,"ignore") == 0) {
1117 return 0;
1118 }
1119 else if (strcmp(errors,"replace") == 0) {
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001120 *x = Py_UNICODE_REPLACEMENT_CHARACTER;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001121 return 0;
1122 }
1123 else {
1124 PyErr_Format(PyExc_ValueError,
1125 "Unicode-Escape decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001126 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001127 errors);
1128 return -1;
1129 }
1130}
1131
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001132static _Py_UCNHashAPI *pucnHash = NULL;
1133
/* Case-insensitive comparison of at most `count` characters of s1 and s2.

   Returns 0 when the compared prefixes are equal ignoring case, otherwise
   the difference between the first pair of lowercased characters that
   differ.  count == 0 always compares equal.

   Characters are converted to unsigned char before being passed to
   tolower(); handing it a negative plain char is undefined behaviour.
   Comparison also stops at a NUL terminator shared by both strings so that
   neither buffer is read past its end. */
static
int mystrnicmp(const char *s1, const char *s2, size_t count)
{
    int c1, c2;

    if (count == 0)
        return 0;

    do {
        c1 = tolower((unsigned char)*s1++);
        c2 = tolower((unsigned char)*s2++);
    } while (--count && c1 == c2 && c1 != '\0');

    return c1 - c2;
}
1153
Guido van Rossumd57fd912000-03-10 22:53:23 +00001154PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
1155 int size,
1156 const char *errors)
1157{
1158 PyUnicodeObject *v;
1159 Py_UNICODE *p = NULL, *buf = NULL;
1160 const char *end;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001161 Py_UCS4 chr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001162
1163 /* Escaped strings will always be longer than the resulting
1164 Unicode string, so we start with size here and then reduce the
1165 length after conversion to the true value. */
1166 v = _PyUnicode_New(size);
1167 if (v == NULL)
1168 goto onError;
1169 if (size == 0)
1170 return (PyObject *)v;
1171 p = buf = PyUnicode_AS_UNICODE(v);
1172 end = s + size;
1173 while (s < end) {
1174 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001175 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001176 int i;
1177
1178 /* Non-escape characters are interpreted as Unicode ordinals */
1179 if (*s != '\\') {
1180 *p++ = (unsigned char)*s++;
1181 continue;
1182 }
1183
1184 /* \ - Escapes */
1185 s++;
1186 switch (*s++) {
1187
1188 /* \x escapes */
1189 case '\n': break;
1190 case '\\': *p++ = '\\'; break;
1191 case '\'': *p++ = '\''; break;
1192 case '\"': *p++ = '\"'; break;
1193 case 'b': *p++ = '\b'; break;
1194 case 'f': *p++ = '\014'; break; /* FF */
1195 case 't': *p++ = '\t'; break;
1196 case 'n': *p++ = '\n'; break;
1197 case 'r': *p++ = '\r'; break;
1198 case 'v': *p++ = '\013'; break; /* VT */
1199 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
1200
1201 /* \OOO (octal) escapes */
1202 case '0': case '1': case '2': case '3':
1203 case '4': case '5': case '6': case '7':
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001204 x = s[-1] - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205 if ('0' <= *s && *s <= '7') {
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001206 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001207 if ('0' <= *s && *s <= '7')
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001208 x = (x<<3) + *s++ - '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001209 }
Guido van Rossum0e4f6572000-05-01 21:27:20 +00001210 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001211 break;
1212
Fredrik Lundhdf846752000-09-03 11:29:49 +00001213 /* \xXX with two hex digits */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001214 case 'x':
Fredrik Lundhdf846752000-09-03 11:29:49 +00001215 for (x = 0, i = 0; i < 2; i++) {
1216 c = (unsigned char)s[i];
1217 if (!isxdigit(c)) {
1218 if (unicodeescape_decoding_error(&s, &x, errors,
1219 "truncated \\xXX"))
1220 goto onError;
1221 i++;
1222 break;
1223 }
1224 x = (x<<4) & ~0xF;
1225 if (c >= '0' && c <= '9')
1226 x += c - '0';
1227 else if (c >= 'a' && c <= 'f')
1228 x += 10 + c - 'a';
1229 else
1230 x += 10 + c - 'A';
Guido van Rossumd57fd912000-03-10 22:53:23 +00001231 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001232 s += i;
1233 *p++ = x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001234 break;
1235
1236 /* \uXXXX with 4 hex digits */
1237 case 'u':
1238 for (x = 0, i = 0; i < 4; i++) {
1239 c = (unsigned char)s[i];
1240 if (!isxdigit(c)) {
1241 if (unicodeescape_decoding_error(&s, &x, errors,
1242 "truncated \\uXXXX"))
1243 goto onError;
1244 i++;
1245 break;
1246 }
1247 x = (x<<4) & ~0xF;
1248 if (c >= '0' && c <= '9')
1249 x += c - '0';
1250 else if (c >= 'a' && c <= 'f')
1251 x += 10 + c - 'a';
1252 else
1253 x += 10 + c - 'A';
1254 }
1255 s += i;
1256 *p++ = x;
1257 break;
1258
Fredrik Lundhdf846752000-09-03 11:29:49 +00001259 /* \UXXXXXXXX with 8 hex digits */
1260 case 'U':
1261 for (chr = 0, i = 0; i < 8; i++) {
1262 c = (unsigned char)s[i];
1263 if (!isxdigit(c)) {
1264 if (unicodeescape_decoding_error(&s, &x, errors,
1265 "truncated \\uXXXX"))
1266 goto onError;
1267 i++;
1268 break;
1269 }
1270 chr = (chr<<4) & ~0xF;
1271 if (c >= '0' && c <= '9')
1272 chr += c - '0';
1273 else if (c >= 'a' && c <= 'f')
1274 chr += 10 + c - 'a';
1275 else
1276 chr += 10 + c - 'A';
1277 }
1278 s += i;
1279 goto store;
1280
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001281 case 'N':
1282 /* Ok, we need to deal with Unicode Character Names now,
1283 * make sure we've imported the hash table data...
1284 */
Fredrik Lundhdf846752000-09-03 11:29:49 +00001285 if (pucnHash == NULL) {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001286 PyObject *mod = 0, *v = 0;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001287 mod = PyImport_ImportModule("ucnhash");
1288 if (mod == NULL)
1289 goto onError;
1290 v = PyObject_GetAttrString(mod,"ucnhashAPI");
1291 Py_DECREF(mod);
1292 if (v == NULL)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001293 goto onError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001294 pucnHash = PyCObject_AsVoidPtr(v);
1295 Py_DECREF(v);
1296 if (pucnHash == NULL)
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001297 goto onError;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001298 }
1299
Fredrik Lundhdf846752000-09-03 11:29:49 +00001300 if (*s == '{') {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001301 const char *start = s + 1;
1302 const char *endBrace = start;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001303 unsigned long j;
1304
1305 /* look for either the closing brace, or we
1306 * exceed the maximum length of the unicode character names
1307 */
1308 while (*endBrace != '}' &&
1309 (unsigned int)(endBrace - start) <=
1310 pucnHash->cchMax &&
1311 endBrace < end)
1312 {
1313 endBrace++;
1314 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001315 if (endBrace != end && *endBrace == '}') {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001316 j = pucnHash->hash(start, endBrace - start);
1317 if (j > pucnHash->cKeys ||
1318 mystrnicmp(
1319 start,
1320 ((_Py_UnicodeCharacterName *)
1321 (pucnHash->getValue(j)))->pszUCN,
1322 (int)(endBrace - start)) != 0)
1323 {
1324 if (unicodeescape_decoding_error(
1325 &s, &x, errors,
1326 "Invalid Unicode Character Name"))
1327 {
1328 goto onError;
1329 }
1330 goto ucnFallthrough;
1331 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00001332 chr = ((_Py_UnicodeCharacterName *)
1333 (pucnHash->getValue(j)))->value;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001334 s = endBrace + 1;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001335 goto store;
1336 } else {
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00001337 if (unicodeescape_decoding_error(
1338 &s, &x, errors,
1339 "Unicode name missing closing brace"))
1340 goto onError;
1341 goto ucnFallthrough;
1342 }
1343 break;
1344 }
1345 if (unicodeescape_decoding_error(
1346 &s, &x, errors,
1347 "Missing opening brace for Unicode Character Name escape"))
1348 goto onError;
1349ucnFallthrough:
1350 /* fall through on purpose */
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001351 default:
Guido van Rossumd57fd912000-03-10 22:53:23 +00001352 *p++ = '\\';
1353 *p++ = (unsigned char)s[-1];
1354 break;
Fredrik Lundhdf846752000-09-03 11:29:49 +00001355store:
1356 /* when we get here, chr is a 32-bit unicode character */
1357 if (chr <= 0xffff)
1358 /* UCS-2 character */
1359 *p++ = (Py_UNICODE) chr;
1360 else if (chr <= 0x10ffff) {
1361 /* UCS-4 character. store as two surrogate characters */
1362 chr -= 0x10000L;
1363 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
1364 *p++ = 0xDC00 + (Py_UNICODE) (chr & ~0xFC00);
1365 } else {
1366 if (unicodeescape_decoding_error(
1367 &s, &x, errors,
1368 "Illegal Unicode character")
1369 )
1370 goto onError;
1371 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00001372 }
1373 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001374 if (_PyUnicode_Resize(v, (int)(p - buf)))
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00001375 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001376 return (PyObject *)v;
1377
1378 onError:
1379 Py_XDECREF(v);
1380 return NULL;
1381}
1382
1383/* Return a Unicode-Escape string version of the Unicode object.
1384
1385 If quotes is true, the string is enclosed in u"" or u'' quotes as
1386 appropriate.
1387
1388*/
1389
Barry Warsaw51ac5802000-03-20 16:36:48 +00001390static const Py_UNICODE *findchar(const Py_UNICODE *s,
1391 int size,
1392 Py_UNICODE ch);
1393
Guido van Rossumd57fd912000-03-10 22:53:23 +00001394static
1395PyObject *unicodeescape_string(const Py_UNICODE *s,
1396 int size,
1397 int quotes)
1398{
1399 PyObject *repr;
1400 char *p;
1401 char *q;
1402
1403 static const char *hexdigit = "0123456789ABCDEF";
1404
1405 repr = PyString_FromStringAndSize(NULL, 2 + 6*size + 1);
1406 if (repr == NULL)
1407 return NULL;
1408
1409 p = q = PyString_AS_STRING(repr);
1410
1411 if (quotes) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00001412 *p++ = 'u';
1413 *p++ = (findchar(s, size, '\'') &&
1414 !findchar(s, size, '"')) ? '"' : '\'';
1415 }
1416 while (size-- > 0) {
1417 Py_UNICODE ch = *s++;
1418 /* Escape quotes */
1419 if (quotes && (ch == q[1] || ch == '\\')) {
1420 *p++ = '\\';
1421 *p++ = (char) ch;
1422 }
1423 /* Map 16-bit characters to '\uxxxx' */
1424 else if (ch >= 256) {
1425 *p++ = '\\';
1426 *p++ = 'u';
1427 *p++ = hexdigit[(ch >> 12) & 0xf];
1428 *p++ = hexdigit[(ch >> 8) & 0xf];
1429 *p++ = hexdigit[(ch >> 4) & 0xf];
1430 *p++ = hexdigit[ch & 15];
1431 }
1432 /* Map non-printable US ASCII to '\ooo' */
1433 else if (ch < ' ' || ch >= 128) {
1434 *p++ = '\\';
1435 *p++ = hexdigit[(ch >> 6) & 7];
1436 *p++ = hexdigit[(ch >> 3) & 7];
1437 *p++ = hexdigit[ch & 7];
1438 }
1439 /* Copy everything else as-is */
1440 else
1441 *p++ = (char) ch;
1442 }
1443 if (quotes)
1444 *p++ = q[1];
1445
1446 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001447 if (_PyString_Resize(&repr, p - q))
1448 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001449
1450 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001451
1452 onError:
1453 Py_DECREF(repr);
1454 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001455}
1456
1457PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
1458 int size)
1459{
1460 return unicodeescape_string(s, size, 0);
1461}
1462
1463PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
1464{
1465 if (!PyUnicode_Check(unicode)) {
1466 PyErr_BadArgument();
1467 return NULL;
1468 }
1469 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1470 PyUnicode_GET_SIZE(unicode));
1471}
1472
1473/* --- Raw Unicode Escape Codec ------------------------------------------- */
1474
1475PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
1476 int size,
1477 const char *errors)
1478{
1479 PyUnicodeObject *v;
1480 Py_UNICODE *p, *buf;
1481 const char *end;
1482 const char *bs;
1483
1484 /* Escaped strings will always be longer than the resulting
1485 Unicode string, so we start with size here and then reduce the
1486 length after conversion to the true value. */
1487 v = _PyUnicode_New(size);
1488 if (v == NULL)
1489 goto onError;
1490 if (size == 0)
1491 return (PyObject *)v;
1492 p = buf = PyUnicode_AS_UNICODE(v);
1493 end = s + size;
1494 while (s < end) {
1495 unsigned char c;
Marc-André Lemburg063e0cb2000-07-07 11:27:45 +00001496 Py_UNICODE x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001497 int i;
1498
1499 /* Non-escape characters are interpreted as Unicode ordinals */
1500 if (*s != '\\') {
1501 *p++ = (unsigned char)*s++;
1502 continue;
1503 }
1504
1505 /* \u-escapes are only interpreted iff the number of leading
1506 backslashes if odd */
1507 bs = s;
1508 for (;s < end;) {
1509 if (*s != '\\')
1510 break;
1511 *p++ = (unsigned char)*s++;
1512 }
1513 if (((s - bs) & 1) == 0 ||
1514 s >= end ||
1515 *s != 'u') {
1516 continue;
1517 }
1518 p--;
1519 s++;
1520
1521 /* \uXXXX with 4 hex digits */
1522 for (x = 0, i = 0; i < 4; i++) {
1523 c = (unsigned char)s[i];
1524 if (!isxdigit(c)) {
1525 if (unicodeescape_decoding_error(&s, &x, errors,
1526 "truncated \\uXXXX"))
1527 goto onError;
1528 i++;
1529 break;
1530 }
1531 x = (x<<4) & ~0xF;
1532 if (c >= '0' && c <= '9')
1533 x += c - '0';
1534 else if (c >= 'a' && c <= 'f')
1535 x += 10 + c - 'a';
1536 else
1537 x += 10 + c - 'A';
1538 }
1539 s += i;
1540 *p++ = x;
1541 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001542 if (_PyUnicode_Resize(v, (int)(p - buf)))
1543 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001544 return (PyObject *)v;
1545
1546 onError:
1547 Py_XDECREF(v);
1548 return NULL;
1549}
1550
1551PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
1552 int size)
1553{
1554 PyObject *repr;
1555 char *p;
1556 char *q;
1557
1558 static const char *hexdigit = "0123456789ABCDEF";
1559
1560 repr = PyString_FromStringAndSize(NULL, 6 * size);
1561 if (repr == NULL)
1562 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001563 if (size == 0)
1564 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001565
1566 p = q = PyString_AS_STRING(repr);
1567 while (size-- > 0) {
1568 Py_UNICODE ch = *s++;
1569 /* Map 16-bit characters to '\uxxxx' */
1570 if (ch >= 256) {
1571 *p++ = '\\';
1572 *p++ = 'u';
1573 *p++ = hexdigit[(ch >> 12) & 0xf];
1574 *p++ = hexdigit[(ch >> 8) & 0xf];
1575 *p++ = hexdigit[(ch >> 4) & 0xf];
1576 *p++ = hexdigit[ch & 15];
1577 }
1578 /* Copy everything else as-is */
1579 else
1580 *p++ = (char) ch;
1581 }
1582 *p = '\0';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001583 if (_PyString_Resize(&repr, p - q))
1584 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001585
1586 return repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001587
1588 onError:
1589 Py_DECREF(repr);
1590 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001591}
1592
1593PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
1594{
1595 if (!PyUnicode_Check(unicode)) {
1596 PyErr_BadArgument();
1597 return NULL;
1598 }
1599 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
1600 PyUnicode_GET_SIZE(unicode));
1601}
1602
1603/* --- Latin-1 Codec ------------------------------------------------------ */
1604
1605PyObject *PyUnicode_DecodeLatin1(const char *s,
1606 int size,
1607 const char *errors)
1608{
1609 PyUnicodeObject *v;
1610 Py_UNICODE *p;
1611
1612 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
1613 v = _PyUnicode_New(size);
1614 if (v == NULL)
1615 goto onError;
1616 if (size == 0)
1617 return (PyObject *)v;
1618 p = PyUnicode_AS_UNICODE(v);
1619 while (size-- > 0)
1620 *p++ = (unsigned char)*s++;
1621 return (PyObject *)v;
1622
1623 onError:
1624 Py_XDECREF(v);
1625 return NULL;
1626}
1627
1628static
1629int latin1_encoding_error(const Py_UNICODE **source,
1630 char **dest,
1631 const char *errors,
1632 const char *details)
1633{
1634 if ((errors == NULL) ||
1635 (strcmp(errors,"strict") == 0)) {
1636 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001637 "Latin-1 encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001638 details);
1639 return -1;
1640 }
1641 else if (strcmp(errors,"ignore") == 0) {
1642 return 0;
1643 }
1644 else if (strcmp(errors,"replace") == 0) {
1645 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001646 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001647 return 0;
1648 }
1649 else {
1650 PyErr_Format(PyExc_ValueError,
1651 "Latin-1 encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001652 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001653 errors);
1654 return -1;
1655 }
1656}
1657
1658PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
1659 int size,
1660 const char *errors)
1661{
1662 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001663 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001664
Guido van Rossumd57fd912000-03-10 22:53:23 +00001665 repr = PyString_FromStringAndSize(NULL, size);
1666 if (repr == NULL)
1667 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001668 if (size == 0)
1669 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001670
1671 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001672 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001673 while (size-- > 0) {
1674 Py_UNICODE ch = *p++;
1675 if (ch >= 256) {
1676 if (latin1_encoding_error(&p, &s, errors,
1677 "ordinal not in range(256)"))
1678 goto onError;
1679 }
1680 else
1681 *s++ = (char)ch;
1682 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001683 /* Resize if error handling skipped some characters */
1684 if (s - start < PyString_GET_SIZE(repr))
1685 if (_PyString_Resize(&repr, s - start))
1686 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001687 return repr;
1688
1689 onError:
1690 Py_DECREF(repr);
1691 return NULL;
1692}
1693
1694PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
1695{
1696 if (!PyUnicode_Check(unicode)) {
1697 PyErr_BadArgument();
1698 return NULL;
1699 }
1700 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1701 PyUnicode_GET_SIZE(unicode),
1702 NULL);
1703}
1704
1705/* --- 7-bit ASCII Codec -------------------------------------------------- */
1706
1707static
1708int ascii_decoding_error(const char **source,
1709 Py_UNICODE **dest,
1710 const char *errors,
1711 const char *details)
1712{
1713 if ((errors == NULL) ||
1714 (strcmp(errors,"strict") == 0)) {
1715 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001716 "ASCII decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001717 details);
1718 return -1;
1719 }
1720 else if (strcmp(errors,"ignore") == 0) {
1721 return 0;
1722 }
1723 else if (strcmp(errors,"replace") == 0) {
1724 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1725 (*dest)++;
1726 return 0;
1727 }
1728 else {
1729 PyErr_Format(PyExc_ValueError,
1730 "ASCII decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001731 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001732 errors);
1733 return -1;
1734 }
1735}
1736
1737PyObject *PyUnicode_DecodeASCII(const char *s,
1738 int size,
1739 const char *errors)
1740{
1741 PyUnicodeObject *v;
1742 Py_UNICODE *p;
1743
1744 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
1745 v = _PyUnicode_New(size);
1746 if (v == NULL)
1747 goto onError;
1748 if (size == 0)
1749 return (PyObject *)v;
1750 p = PyUnicode_AS_UNICODE(v);
1751 while (size-- > 0) {
1752 register unsigned char c;
1753
1754 c = (unsigned char)*s++;
1755 if (c < 128)
1756 *p++ = c;
1757 else if (ascii_decoding_error(&s, &p, errors,
1758 "ordinal not in range(128)"))
1759 goto onError;
1760 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001761 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
1762 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
1763 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001764 return (PyObject *)v;
1765
1766 onError:
1767 Py_XDECREF(v);
1768 return NULL;
1769}
1770
1771static
1772int ascii_encoding_error(const Py_UNICODE **source,
1773 char **dest,
1774 const char *errors,
1775 const char *details)
1776{
1777 if ((errors == NULL) ||
1778 (strcmp(errors,"strict") == 0)) {
1779 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001780 "ASCII encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001781 details);
1782 return -1;
1783 }
1784 else if (strcmp(errors,"ignore") == 0) {
1785 return 0;
1786 }
1787 else if (strcmp(errors,"replace") == 0) {
1788 **dest = '?';
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001789 (*dest)++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001790 return 0;
1791 }
1792 else {
1793 PyErr_Format(PyExc_ValueError,
1794 "ASCII encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001795 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001796 errors);
1797 return -1;
1798 }
1799}
1800
1801PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
1802 int size,
1803 const char *errors)
1804{
1805 PyObject *repr;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001806 char *s, *start;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001807
Guido van Rossumd57fd912000-03-10 22:53:23 +00001808 repr = PyString_FromStringAndSize(NULL, size);
1809 if (repr == NULL)
1810 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001811 if (size == 0)
1812 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001813
1814 s = PyString_AS_STRING(repr);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001815 start = s;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001816 while (size-- > 0) {
1817 Py_UNICODE ch = *p++;
1818 if (ch >= 128) {
1819 if (ascii_encoding_error(&p, &s, errors,
1820 "ordinal not in range(128)"))
1821 goto onError;
1822 }
1823 else
1824 *s++ = (char)ch;
1825 }
Guido van Rossumfd4b9572000-04-10 13:51:10 +00001826 /* Resize if error handling skipped some characters */
1827 if (s - start < PyString_GET_SIZE(repr))
1828 if (_PyString_Resize(&repr, s - start))
1829 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001830 return repr;
1831
1832 onError:
1833 Py_DECREF(repr);
1834 return NULL;
1835}
1836
1837PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
1838{
1839 if (!PyUnicode_Check(unicode)) {
1840 PyErr_BadArgument();
1841 return NULL;
1842 }
1843 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1844 PyUnicode_GET_SIZE(unicode),
1845 NULL);
1846}
1847
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001848#ifdef MS_WIN32
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001849
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001850/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001851
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001852PyObject *PyUnicode_DecodeMBCS(const char *s,
1853 int size,
1854 const char *errors)
1855{
1856 PyUnicodeObject *v;
1857 Py_UNICODE *p;
1858
1859 /* First get the size of the result */
1860 DWORD usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
Guido van Rossum03e29f12000-05-04 15:52:20 +00001861 if (size > 0 && usize==0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001862 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1863
1864 v = _PyUnicode_New(usize);
1865 if (v == NULL)
1866 return NULL;
1867 if (usize == 0)
1868 return (PyObject *)v;
1869 p = PyUnicode_AS_UNICODE(v);
1870 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
1871 Py_DECREF(v);
1872 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1873 }
1874
1875 return (PyObject *)v;
1876}
1877
1878PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
1879 int size,
1880 const char *errors)
1881{
1882 PyObject *repr;
1883 char *s;
Guido van Rossum03e29f12000-05-04 15:52:20 +00001884 DWORD mbcssize;
1885
1886 /* If there are no characters, bail now! */
1887 if (size==0)
1888 return PyString_FromString("");
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001889
1890 /* First get the size of the result */
Guido van Rossum03e29f12000-05-04 15:52:20 +00001891 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001892 if (mbcssize==0)
1893 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1894
1895 repr = PyString_FromStringAndSize(NULL, mbcssize);
1896 if (repr == NULL)
1897 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00001898 if (mbcssize == 0)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001899 return repr;
1900
1901 /* Do the conversion */
1902 s = PyString_AS_STRING(repr);
1903 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
1904 Py_DECREF(repr);
1905 return PyErr_SetFromWindowsErrWithFilename(0, NULL);
1906 }
1907 return repr;
1908}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00001909
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00001910#endif /* MS_WIN32 */
1911
Guido van Rossumd57fd912000-03-10 22:53:23 +00001912/* --- Character Mapping Codec -------------------------------------------- */
1913
1914static
1915int charmap_decoding_error(const char **source,
1916 Py_UNICODE **dest,
1917 const char *errors,
1918 const char *details)
1919{
1920 if ((errors == NULL) ||
1921 (strcmp(errors,"strict") == 0)) {
1922 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00001923 "charmap decoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001924 details);
1925 return -1;
1926 }
1927 else if (strcmp(errors,"ignore") == 0) {
1928 return 0;
1929 }
1930 else if (strcmp(errors,"replace") == 0) {
1931 **dest = Py_UNICODE_REPLACEMENT_CHARACTER;
1932 (*dest)++;
1933 return 0;
1934 }
1935 else {
1936 PyErr_Format(PyExc_ValueError,
1937 "charmap decoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00001938 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939 errors);
1940 return -1;
1941 }
1942}
1943
1944PyObject *PyUnicode_DecodeCharmap(const char *s,
1945 int size,
1946 PyObject *mapping,
1947 const char *errors)
1948{
1949 PyUnicodeObject *v;
1950 Py_UNICODE *p;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00001951 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001952
1953 /* Default to Latin-1 */
1954 if (mapping == NULL)
1955 return PyUnicode_DecodeLatin1(s, size, errors);
1956
1957 v = _PyUnicode_New(size);
1958 if (v == NULL)
1959 goto onError;
1960 if (size == 0)
1961 return (PyObject *)v;
1962 p = PyUnicode_AS_UNICODE(v);
1963 while (size-- > 0) {
1964 unsigned char ch = *s++;
1965 PyObject *w, *x;
1966
1967 /* Get mapping (char ordinal -> integer, Unicode char or None) */
1968 w = PyInt_FromLong((long)ch);
1969 if (w == NULL)
1970 goto onError;
1971 x = PyObject_GetItem(mapping, w);
1972 Py_DECREF(w);
1973 if (x == NULL) {
1974 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00001975 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001976 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00001977 x = Py_None;
1978 Py_INCREF(x);
1979 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00001980 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001981 }
1982
1983 /* Apply mapping */
1984 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00001985 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001986 if (value < 0 || value > 65535) {
1987 PyErr_SetString(PyExc_TypeError,
Marc-André Lemburg07ceb672000-06-10 09:32:51 +00001988 "character mapping must be in range(65536)");
Guido van Rossumd57fd912000-03-10 22:53:23 +00001989 Py_DECREF(x);
1990 goto onError;
1991 }
1992 *p++ = (Py_UNICODE)value;
1993 }
1994 else if (x == Py_None) {
1995 /* undefined mapping */
1996 if (charmap_decoding_error(&s, &p, errors,
1997 "character maps to <undefined>")) {
1998 Py_DECREF(x);
1999 goto onError;
2000 }
2001 }
2002 else if (PyUnicode_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002003 int targetsize = PyUnicode_GET_SIZE(x);
2004
2005 if (targetsize == 1)
2006 /* 1-1 mapping */
2007 *p++ = *PyUnicode_AS_UNICODE(x);
2008
2009 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002010 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002011 if (targetsize > extrachars) {
2012 /* resize first */
2013 int oldpos = (int)(p - PyUnicode_AS_UNICODE(v));
2014 int needed = (targetsize - extrachars) + \
2015 (targetsize << 2);
2016 extrachars += needed;
2017 if (_PyUnicode_Resize(v, PyUnicode_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002018 Py_DECREF(x);
2019 goto onError;
2020 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002021 p = PyUnicode_AS_UNICODE(v) + oldpos;
2022 }
2023 Py_UNICODE_COPY(p,
2024 PyUnicode_AS_UNICODE(x),
2025 targetsize);
2026 p += targetsize;
2027 extrachars -= targetsize;
2028 }
2029 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002030 }
2031 else {
2032 /* wrong return value */
2033 PyErr_SetString(PyExc_TypeError,
2034 "character mapping must return integer, None or unicode");
2035 Py_DECREF(x);
2036 goto onError;
2037 }
2038 Py_DECREF(x);
2039 }
2040 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
2041 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2042 goto onError;
2043 return (PyObject *)v;
2044
2045 onError:
2046 Py_XDECREF(v);
2047 return NULL;
2048}
2049
2050static
2051int charmap_encoding_error(const Py_UNICODE **source,
2052 char **dest,
2053 const char *errors,
2054 const char *details)
2055{
2056 if ((errors == NULL) ||
2057 (strcmp(errors,"strict") == 0)) {
2058 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002059 "charmap encoding error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002060 details);
2061 return -1;
2062 }
2063 else if (strcmp(errors,"ignore") == 0) {
2064 return 0;
2065 }
2066 else if (strcmp(errors,"replace") == 0) {
2067 **dest = '?';
2068 (*dest)++;
2069 return 0;
2070 }
2071 else {
2072 PyErr_Format(PyExc_ValueError,
2073 "charmap encoding error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002074 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075 errors);
2076 return -1;
2077 }
2078}
2079
2080PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
2081 int size,
2082 PyObject *mapping,
2083 const char *errors)
2084{
2085 PyObject *v;
2086 char *s;
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002087 int extrachars = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002088
2089 /* Default to Latin-1 */
2090 if (mapping == NULL)
2091 return PyUnicode_EncodeLatin1(p, size, errors);
2092
2093 v = PyString_FromStringAndSize(NULL, size);
2094 if (v == NULL)
2095 return NULL;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00002096 if (size == 0)
2097 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002098 s = PyString_AS_STRING(v);
2099 while (size-- > 0) {
2100 Py_UNICODE ch = *p++;
2101 PyObject *w, *x;
2102
2103 /* Get mapping (Unicode ordinal -> string char, integer or None) */
2104 w = PyInt_FromLong((long)ch);
2105 if (w == NULL)
2106 goto onError;
2107 x = PyObject_GetItem(mapping, w);
2108 Py_DECREF(w);
2109 if (x == NULL) {
2110 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
Marc-André Lemburga866df82001-01-03 21:29:14 +00002111 /* No mapping found means: mapping is undefined. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002112 PyErr_Clear();
Marc-André Lemburga866df82001-01-03 21:29:14 +00002113 x = Py_None;
2114 Py_INCREF(x);
2115 } else
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002116 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002117 }
2118
2119 /* Apply mapping */
2120 if (PyInt_Check(x)) {
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00002121 long value = PyInt_AS_LONG(x);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002122 if (value < 0 || value > 255) {
2123 PyErr_SetString(PyExc_TypeError,
2124 "character mapping must be in range(256)");
2125 Py_DECREF(x);
2126 goto onError;
2127 }
2128 *s++ = (char)value;
2129 }
2130 else if (x == Py_None) {
2131 /* undefined mapping */
2132 if (charmap_encoding_error(&p, &s, errors,
2133 "character maps to <undefined>")) {
2134 Py_DECREF(x);
2135 goto onError;
2136 }
2137 }
2138 else if (PyString_Check(x)) {
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002139 int targetsize = PyString_GET_SIZE(x);
2140
2141 if (targetsize == 1)
2142 /* 1-1 mapping */
2143 *s++ = *PyString_AS_STRING(x);
2144
2145 else if (targetsize > 1) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002146 /* 1-n mapping */
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002147 if (targetsize > extrachars) {
2148 /* resize first */
2149 int oldpos = (int)(s - PyString_AS_STRING(v));
2150 int needed = (targetsize - extrachars) + \
2151 (targetsize << 2);
2152 extrachars += needed;
2153 if (_PyString_Resize(&v, PyString_GET_SIZE(v) + needed)) {
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002154 Py_DECREF(x);
2155 goto onError;
2156 }
Marc-André Lemburgec233e52001-01-06 14:59:58 +00002157 s = PyString_AS_STRING(v) + oldpos;
2158 }
2159 memcpy(s,
2160 PyString_AS_STRING(x),
2161 targetsize);
2162 s += targetsize;
2163 extrachars -= targetsize;
2164 }
2165 /* 1-0 mapping: skip the character */
Guido van Rossumd57fd912000-03-10 22:53:23 +00002166 }
2167 else {
2168 /* wrong return value */
2169 PyErr_SetString(PyExc_TypeError,
2170 "character mapping must return integer, None or unicode");
2171 Py_DECREF(x);
2172 goto onError;
2173 }
2174 Py_DECREF(x);
2175 }
2176 if (s - PyString_AS_STRING(v) < PyString_GET_SIZE(v))
2177 if (_PyString_Resize(&v, (int)(s - PyString_AS_STRING(v))))
2178 goto onError;
2179 return v;
2180
2181 onError:
2182 Py_DECREF(v);
2183 return NULL;
2184}
2185
2186PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
2187 PyObject *mapping)
2188{
2189 if (!PyUnicode_Check(unicode) || mapping == NULL) {
2190 PyErr_BadArgument();
2191 return NULL;
2192 }
2193 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
2194 PyUnicode_GET_SIZE(unicode),
2195 mapping,
2196 NULL);
2197}
2198
2199static
2200int translate_error(const Py_UNICODE **source,
2201 Py_UNICODE **dest,
2202 const char *errors,
2203 const char *details)
2204{
2205 if ((errors == NULL) ||
2206 (strcmp(errors,"strict") == 0)) {
2207 PyErr_Format(PyExc_UnicodeError,
Guido van Rossum5db862d2000-04-10 12:46:51 +00002208 "translate error: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002209 details);
2210 return -1;
2211 }
2212 else if (strcmp(errors,"ignore") == 0) {
2213 return 0;
2214 }
2215 else if (strcmp(errors,"replace") == 0) {
2216 **dest = '?';
2217 (*dest)++;
2218 return 0;
2219 }
2220 else {
2221 PyErr_Format(PyExc_ValueError,
2222 "translate error; "
Guido van Rossum5db862d2000-04-10 12:46:51 +00002223 "unknown error handling code: %.400s",
Guido van Rossumd57fd912000-03-10 22:53:23 +00002224 errors);
2225 return -1;
2226 }
2227}
2228
2229PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *s,
2230 int size,
2231 PyObject *mapping,
2232 const char *errors)
2233{
2234 PyUnicodeObject *v;
2235 Py_UNICODE *p;
2236
2237 if (mapping == NULL) {
2238 PyErr_BadArgument();
2239 return NULL;
2240 }
2241
2242 /* Output will never be longer than input */
2243 v = _PyUnicode_New(size);
2244 if (v == NULL)
2245 goto onError;
2246 if (size == 0)
2247 goto done;
2248 p = PyUnicode_AS_UNICODE(v);
2249 while (size-- > 0) {
2250 Py_UNICODE ch = *s++;
2251 PyObject *w, *x;
2252
2253 /* Get mapping */
2254 w = PyInt_FromLong(ch);
2255 if (w == NULL)
2256 goto onError;
2257 x = PyObject_GetItem(mapping, w);
2258 Py_DECREF(w);
2259 if (x == NULL) {
2260 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
2261 /* No mapping found: default to 1-1 mapping */
2262 PyErr_Clear();
2263 *p++ = ch;
2264 continue;
2265 }
2266 goto onError;
2267 }
2268
2269 /* Apply mapping */
2270 if (PyInt_Check(x))
2271 *p++ = (Py_UNICODE)PyInt_AS_LONG(x);
2272 else if (x == Py_None) {
2273 /* undefined mapping */
2274 if (translate_error(&s, &p, errors,
2275 "character maps to <undefined>")) {
2276 Py_DECREF(x);
2277 goto onError;
2278 }
2279 }
2280 else if (PyUnicode_Check(x)) {
2281 if (PyUnicode_GET_SIZE(x) != 1) {
2282 /* 1-n mapping */
2283 PyErr_SetString(PyExc_NotImplementedError,
2284 "1-n mappings are currently not implemented");
2285 Py_DECREF(x);
2286 goto onError;
2287 }
2288 *p++ = *PyUnicode_AS_UNICODE(x);
2289 }
2290 else {
2291 /* wrong return value */
2292 PyErr_SetString(PyExc_TypeError,
2293 "translate mapping must return integer, None or unicode");
2294 Py_DECREF(x);
2295 goto onError;
2296 }
2297 Py_DECREF(x);
2298 }
2299 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
Guido van Rossumfd4b9572000-04-10 13:51:10 +00002300 if (_PyUnicode_Resize(v, (int)(p - PyUnicode_AS_UNICODE(v))))
2301 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002302
2303 done:
2304 return (PyObject *)v;
2305
2306 onError:
2307 Py_XDECREF(v);
2308 return NULL;
2309}
2310
2311PyObject *PyUnicode_Translate(PyObject *str,
2312 PyObject *mapping,
2313 const char *errors)
2314{
2315 PyObject *result;
2316
2317 str = PyUnicode_FromObject(str);
2318 if (str == NULL)
2319 goto onError;
2320 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
2321 PyUnicode_GET_SIZE(str),
2322 mapping,
2323 errors);
2324 Py_DECREF(str);
2325 return result;
2326
2327 onError:
2328 Py_XDECREF(str);
2329 return NULL;
2330}
2331
Guido van Rossum9e896b32000-04-05 20:11:21 +00002332/* --- Decimal Encoder ---------------------------------------------------- */
2333
2334int PyUnicode_EncodeDecimal(Py_UNICODE *s,
2335 int length,
2336 char *output,
2337 const char *errors)
2338{
2339 Py_UNICODE *p, *end;
2340
2341 if (output == NULL) {
2342 PyErr_BadArgument();
2343 return -1;
2344 }
2345
2346 p = s;
2347 end = s + length;
2348 while (p < end) {
2349 register Py_UNICODE ch = *p++;
2350 int decimal;
2351
2352 if (Py_UNICODE_ISSPACE(ch)) {
2353 *output++ = ' ';
2354 continue;
2355 }
2356 decimal = Py_UNICODE_TODECIMAL(ch);
2357 if (decimal >= 0) {
2358 *output++ = '0' + decimal;
2359 continue;
2360 }
Guido van Rossumba477042000-04-06 18:18:10 +00002361 if (0 < ch && ch < 256) {
Guido van Rossum42c29aa2000-05-03 23:58:29 +00002362 *output++ = (char)ch;
Guido van Rossum9e896b32000-04-05 20:11:21 +00002363 continue;
2364 }
2365 /* All other characters are considered invalid */
2366 if (errors == NULL || strcmp(errors, "strict") == 0) {
2367 PyErr_SetString(PyExc_ValueError,
2368 "invalid decimal Unicode string");
2369 goto onError;
2370 }
2371 else if (strcmp(errors, "ignore") == 0)
2372 continue;
2373 else if (strcmp(errors, "replace") == 0) {
2374 *output++ = '?';
2375 continue;
2376 }
2377 }
2378 /* 0-terminate the output string */
2379 *output++ = '\0';
2380 return 0;
2381
2382 onError:
2383 return -1;
2384}
2385
Guido van Rossumd57fd912000-03-10 22:53:23 +00002386/* --- Helpers ------------------------------------------------------------ */
2387
2388static
2389int count(PyUnicodeObject *self,
2390 int start,
2391 int end,
2392 PyUnicodeObject *substring)
2393{
2394 int count = 0;
2395
Marc-André Lemburg3a645e42001-01-16 11:54:12 +00002396 if (start < 0)
2397 start += self->length;
2398 if (start < 0)
2399 start = 0;
2400 if (end > self->length)
2401 end = self->length;
2402 if (end < 0)
2403 end += self->length;
2404 if (end < 0)
2405 end = 0;
2406
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002407 if (substring->length == 0)
2408 return (end - start + 1);
2409
Guido van Rossumd57fd912000-03-10 22:53:23 +00002410 end -= substring->length;
2411
2412 while (start <= end)
2413 if (Py_UNICODE_MATCH(self, start, substring)) {
2414 count++;
2415 start += substring->length;
2416 } else
2417 start++;
2418
2419 return count;
2420}
2421
2422int PyUnicode_Count(PyObject *str,
2423 PyObject *substr,
2424 int start,
2425 int end)
2426{
2427 int result;
2428
2429 str = PyUnicode_FromObject(str);
2430 if (str == NULL)
2431 return -1;
2432 substr = PyUnicode_FromObject(substr);
2433 if (substr == NULL) {
Marc-André Lemburg49ef6dc2000-06-18 22:25:22 +00002434 Py_DECREF(str);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002435 return -1;
2436 }
2437
2438 result = count((PyUnicodeObject *)str,
2439 start, end,
2440 (PyUnicodeObject *)substr);
2441
2442 Py_DECREF(str);
2443 Py_DECREF(substr);
2444 return result;
2445}
2446
2447static
2448int findstring(PyUnicodeObject *self,
2449 PyUnicodeObject *substring,
2450 int start,
2451 int end,
2452 int direction)
2453{
2454 if (start < 0)
2455 start += self->length;
2456 if (start < 0)
2457 start = 0;
2458
2459 if (substring->length == 0)
2460 return start;
2461
2462 if (end > self->length)
2463 end = self->length;
2464 if (end < 0)
2465 end += self->length;
2466 if (end < 0)
2467 end = 0;
2468
2469 end -= substring->length;
2470
2471 if (direction < 0) {
2472 for (; end >= start; end--)
2473 if (Py_UNICODE_MATCH(self, end, substring))
2474 return end;
2475 } else {
2476 for (; start <= end; start++)
2477 if (Py_UNICODE_MATCH(self, start, substring))
2478 return start;
2479 }
2480
2481 return -1;
2482}
2483
2484int PyUnicode_Find(PyObject *str,
2485 PyObject *substr,
2486 int start,
2487 int end,
2488 int direction)
2489{
2490 int result;
2491
2492 str = PyUnicode_FromObject(str);
2493 if (str == NULL)
2494 return -1;
2495 substr = PyUnicode_FromObject(substr);
2496 if (substr == NULL) {
2497 Py_DECREF(substr);
2498 return -1;
2499 }
2500
2501 result = findstring((PyUnicodeObject *)str,
2502 (PyUnicodeObject *)substr,
2503 start, end, direction);
2504 Py_DECREF(str);
2505 Py_DECREF(substr);
2506 return result;
2507}
2508
2509static
2510int tailmatch(PyUnicodeObject *self,
2511 PyUnicodeObject *substring,
2512 int start,
2513 int end,
2514 int direction)
2515{
2516 if (start < 0)
2517 start += self->length;
2518 if (start < 0)
2519 start = 0;
2520
2521 if (substring->length == 0)
2522 return 1;
2523
2524 if (end > self->length)
2525 end = self->length;
2526 if (end < 0)
2527 end += self->length;
2528 if (end < 0)
2529 end = 0;
2530
2531 end -= substring->length;
2532 if (end < start)
2533 return 0;
2534
2535 if (direction > 0) {
2536 if (Py_UNICODE_MATCH(self, end, substring))
2537 return 1;
2538 } else {
2539 if (Py_UNICODE_MATCH(self, start, substring))
2540 return 1;
2541 }
2542
2543 return 0;
2544}
2545
2546int PyUnicode_Tailmatch(PyObject *str,
2547 PyObject *substr,
2548 int start,
2549 int end,
2550 int direction)
2551{
2552 int result;
2553
2554 str = PyUnicode_FromObject(str);
2555 if (str == NULL)
2556 return -1;
2557 substr = PyUnicode_FromObject(substr);
2558 if (substr == NULL) {
2559 Py_DECREF(substr);
2560 return -1;
2561 }
2562
2563 result = tailmatch((PyUnicodeObject *)str,
2564 (PyUnicodeObject *)substr,
2565 start, end, direction);
2566 Py_DECREF(str);
2567 Py_DECREF(substr);
2568 return result;
2569}
2570
2571static
2572const Py_UNICODE *findchar(const Py_UNICODE *s,
2573 int size,
2574 Py_UNICODE ch)
2575{
2576 /* like wcschr, but doesn't stop at NULL characters */
2577
2578 while (size-- > 0) {
2579 if (*s == ch)
2580 return s;
2581 s++;
2582 }
2583
2584 return NULL;
2585}
2586
2587/* Apply fixfct filter to the Unicode object self and return a
2588 reference to the modified object */
2589
2590static
2591PyObject *fixup(PyUnicodeObject *self,
2592 int (*fixfct)(PyUnicodeObject *s))
2593{
2594
2595 PyUnicodeObject *u;
2596
2597 u = (PyUnicodeObject*) PyUnicode_FromUnicode(self->str,
2598 self->length);
2599 if (u == NULL)
2600 return NULL;
2601 if (!fixfct(u)) {
2602 /* fixfct should return TRUE if it modified the buffer. If
2603 FALSE, return a reference to the original buffer instead
2604 (to save space, not time) */
2605 Py_INCREF(self);
2606 Py_DECREF(u);
2607 return (PyObject*) self;
2608 }
2609 return (PyObject*) u;
2610}
2611
2612static
2613int fixupper(PyUnicodeObject *self)
2614{
2615 int len = self->length;
2616 Py_UNICODE *s = self->str;
2617 int status = 0;
2618
2619 while (len-- > 0) {
2620 register Py_UNICODE ch;
2621
2622 ch = Py_UNICODE_TOUPPER(*s);
2623 if (ch != *s) {
2624 status = 1;
2625 *s = ch;
2626 }
2627 s++;
2628 }
2629
2630 return status;
2631}
2632
2633static
2634int fixlower(PyUnicodeObject *self)
2635{
2636 int len = self->length;
2637 Py_UNICODE *s = self->str;
2638 int status = 0;
2639
2640 while (len-- > 0) {
2641 register Py_UNICODE ch;
2642
2643 ch = Py_UNICODE_TOLOWER(*s);
2644 if (ch != *s) {
2645 status = 1;
2646 *s = ch;
2647 }
2648 s++;
2649 }
2650
2651 return status;
2652}
2653
2654static
2655int fixswapcase(PyUnicodeObject *self)
2656{
2657 int len = self->length;
2658 Py_UNICODE *s = self->str;
2659 int status = 0;
2660
2661 while (len-- > 0) {
2662 if (Py_UNICODE_ISUPPER(*s)) {
2663 *s = Py_UNICODE_TOLOWER(*s);
2664 status = 1;
2665 } else if (Py_UNICODE_ISLOWER(*s)) {
2666 *s = Py_UNICODE_TOUPPER(*s);
2667 status = 1;
2668 }
2669 s++;
2670 }
2671
2672 return status;
2673}
2674
2675static
2676int fixcapitalize(PyUnicodeObject *self)
2677{
2678 if (self->length > 0 && Py_UNICODE_ISLOWER(self->str[0])) {
2679 self->str[0] = Py_UNICODE_TOUPPER(self->str[0]);
2680 return 1;
2681 }
2682 return 0;
2683}
2684
2685static
2686int fixtitle(PyUnicodeObject *self)
2687{
2688 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
2689 register Py_UNICODE *e;
2690 int previous_is_cased;
2691
2692 /* Shortcut for single character strings */
2693 if (PyUnicode_GET_SIZE(self) == 1) {
2694 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
2695 if (*p != ch) {
2696 *p = ch;
2697 return 1;
2698 }
2699 else
2700 return 0;
2701 }
2702
2703 e = p + PyUnicode_GET_SIZE(self);
2704 previous_is_cased = 0;
2705 for (; p < e; p++) {
2706 register const Py_UNICODE ch = *p;
2707
2708 if (previous_is_cased)
2709 *p = Py_UNICODE_TOLOWER(ch);
2710 else
2711 *p = Py_UNICODE_TOTITLE(ch);
2712
2713 if (Py_UNICODE_ISLOWER(ch) ||
2714 Py_UNICODE_ISUPPER(ch) ||
2715 Py_UNICODE_ISTITLE(ch))
2716 previous_is_cased = 1;
2717 else
2718 previous_is_cased = 0;
2719 }
2720 return 1;
2721}
2722
2723PyObject *PyUnicode_Join(PyObject *separator,
2724 PyObject *seq)
2725{
2726 Py_UNICODE *sep;
2727 int seplen;
2728 PyUnicodeObject *res = NULL;
2729 int reslen = 0;
2730 Py_UNICODE *p;
2731 int seqlen = 0;
2732 int sz = 100;
2733 int i;
2734
Jeremy Hylton03657cf2000-07-12 13:05:33 +00002735 seqlen = PySequence_Size(seq);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002736 if (seqlen < 0 && PyErr_Occurred())
2737 return NULL;
2738
2739 if (separator == NULL) {
2740 Py_UNICODE blank = ' ';
2741 sep = &blank;
2742 seplen = 1;
2743 }
2744 else {
2745 separator = PyUnicode_FromObject(separator);
2746 if (separator == NULL)
2747 return NULL;
2748 sep = PyUnicode_AS_UNICODE(separator);
2749 seplen = PyUnicode_GET_SIZE(separator);
2750 }
2751
2752 res = _PyUnicode_New(sz);
2753 if (res == NULL)
2754 goto onError;
2755 p = PyUnicode_AS_UNICODE(res);
2756 reslen = 0;
2757
2758 for (i = 0; i < seqlen; i++) {
2759 int itemlen;
2760 PyObject *item;
2761
2762 item = PySequence_GetItem(seq, i);
2763 if (item == NULL)
2764 goto onError;
2765 if (!PyUnicode_Check(item)) {
2766 PyObject *v;
2767 v = PyUnicode_FromObject(item);
2768 Py_DECREF(item);
2769 item = v;
2770 if (item == NULL)
2771 goto onError;
2772 }
2773 itemlen = PyUnicode_GET_SIZE(item);
2774 while (reslen + itemlen + seplen >= sz) {
2775 if (_PyUnicode_Resize(res, sz*2))
2776 goto onError;
2777 sz *= 2;
2778 p = PyUnicode_AS_UNICODE(res) + reslen;
2779 }
2780 if (i > 0) {
2781 memcpy(p, sep, seplen * sizeof(Py_UNICODE));
2782 p += seplen;
2783 reslen += seplen;
2784 }
2785 memcpy(p, PyUnicode_AS_UNICODE(item), itemlen * sizeof(Py_UNICODE));
2786 p += itemlen;
2787 reslen += itemlen;
2788 Py_DECREF(item);
2789 }
2790 if (_PyUnicode_Resize(res, reslen))
2791 goto onError;
2792
2793 Py_XDECREF(separator);
2794 return (PyObject *)res;
2795
2796 onError:
2797 Py_XDECREF(separator);
2798 Py_DECREF(res);
2799 return NULL;
2800}
2801
2802static
2803PyUnicodeObject *pad(PyUnicodeObject *self,
2804 int left,
2805 int right,
2806 Py_UNICODE fill)
2807{
2808 PyUnicodeObject *u;
2809
2810 if (left < 0)
2811 left = 0;
2812 if (right < 0)
2813 right = 0;
2814
2815 if (left == 0 && right == 0) {
2816 Py_INCREF(self);
2817 return self;
2818 }
2819
2820 u = _PyUnicode_New(left + self->length + right);
2821 if (u) {
2822 if (left)
2823 Py_UNICODE_FILL(u->str, fill, left);
2824 Py_UNICODE_COPY(u->str + left, self->str, self->length);
2825 if (right)
2826 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
2827 }
2828
2829 return u;
2830}
2831
/* Append data[left:right] as a new Unicode object to list.

   Expects the expanding function to provide a PyObject *str scratch
   variable, a PyObject *list and an onError label; control jumps to
   onError when creating or appending the slice fails.  The temporary
   reference held in str is dropped in every case.  Use it as a single
   statement: SPLIT_APPEND(data, left, right); */
#define SPLIT_APPEND(data, left, right)                                 \
    do {                                                                \
        str = PyUnicode_FromUnicode((data) + (left), (right) - (left)); \
        if (str == NULL)                                                \
            goto onError;                                               \
        if (PyList_Append(list, str)) {                                 \
            Py_DECREF(str);                                             \
            goto onError;                                               \
        }                                                               \
        Py_DECREF(str);                                                 \
    } while (0)
2842
2843static
2844PyObject *split_whitespace(PyUnicodeObject *self,
2845 PyObject *list,
2846 int maxcount)
2847{
2848 register int i;
2849 register int j;
2850 int len = self->length;
2851 PyObject *str;
2852
2853 for (i = j = 0; i < len; ) {
2854 /* find a token */
2855 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2856 i++;
2857 j = i;
2858 while (i < len && !Py_UNICODE_ISSPACE(self->str[i]))
2859 i++;
2860 if (j < i) {
2861 if (maxcount-- <= 0)
2862 break;
2863 SPLIT_APPEND(self->str, j, i);
2864 while (i < len && Py_UNICODE_ISSPACE(self->str[i]))
2865 i++;
2866 j = i;
2867 }
2868 }
2869 if (j < len) {
2870 SPLIT_APPEND(self->str, j, len);
2871 }
2872 return list;
2873
2874 onError:
2875 Py_DECREF(list);
2876 return NULL;
2877}
2878
2879PyObject *PyUnicode_Splitlines(PyObject *string,
Guido van Rossum86662912000-04-11 15:38:46 +00002880 int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002881{
2882 register int i;
2883 register int j;
2884 int len;
2885 PyObject *list;
2886 PyObject *str;
2887 Py_UNICODE *data;
2888
2889 string = PyUnicode_FromObject(string);
2890 if (string == NULL)
2891 return NULL;
2892 data = PyUnicode_AS_UNICODE(string);
2893 len = PyUnicode_GET_SIZE(string);
2894
Guido van Rossumd57fd912000-03-10 22:53:23 +00002895 list = PyList_New(0);
2896 if (!list)
2897 goto onError;
2898
2899 for (i = j = 0; i < len; ) {
Guido van Rossum86662912000-04-11 15:38:46 +00002900 int eol;
2901
Guido van Rossumd57fd912000-03-10 22:53:23 +00002902 /* Find a line and append it */
2903 while (i < len && !Py_UNICODE_ISLINEBREAK(data[i]))
2904 i++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002905
2906 /* Skip the line break reading CRLF as one line break */
Guido van Rossum86662912000-04-11 15:38:46 +00002907 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002908 if (i < len) {
2909 if (data[i] == '\r' && i + 1 < len &&
2910 data[i+1] == '\n')
2911 i += 2;
2912 else
2913 i++;
Guido van Rossum86662912000-04-11 15:38:46 +00002914 if (keepends)
2915 eol = i;
Guido van Rossumd57fd912000-03-10 22:53:23 +00002916 }
Guido van Rossum86662912000-04-11 15:38:46 +00002917 SPLIT_APPEND(data, j, eol);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002918 j = i;
2919 }
2920 if (j < len) {
2921 SPLIT_APPEND(data, j, len);
2922 }
2923
2924 Py_DECREF(string);
2925 return list;
2926
2927 onError:
2928 Py_DECREF(list);
2929 Py_DECREF(string);
2930 return NULL;
2931}
2932
2933static
2934PyObject *split_char(PyUnicodeObject *self,
2935 PyObject *list,
2936 Py_UNICODE ch,
2937 int maxcount)
2938{
2939 register int i;
2940 register int j;
2941 int len = self->length;
2942 PyObject *str;
2943
2944 for (i = j = 0; i < len; ) {
2945 if (self->str[i] == ch) {
2946 if (maxcount-- <= 0)
2947 break;
2948 SPLIT_APPEND(self->str, j, i);
2949 i = j = i + 1;
2950 } else
2951 i++;
2952 }
2953 if (j <= len) {
2954 SPLIT_APPEND(self->str, j, len);
2955 }
2956 return list;
2957
2958 onError:
2959 Py_DECREF(list);
2960 return NULL;
2961}
2962
2963static
2964PyObject *split_substring(PyUnicodeObject *self,
2965 PyObject *list,
2966 PyUnicodeObject *substring,
2967 int maxcount)
2968{
2969 register int i;
2970 register int j;
2971 int len = self->length;
2972 int sublen = substring->length;
2973 PyObject *str;
2974
Guido van Rossumcda4f9a2000-12-19 02:23:19 +00002975 for (i = j = 0; i <= len - sublen; ) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00002976 if (Py_UNICODE_MATCH(self, i, substring)) {
2977 if (maxcount-- <= 0)
2978 break;
2979 SPLIT_APPEND(self->str, j, i);
2980 i = j = i + sublen;
2981 } else
2982 i++;
2983 }
2984 if (j <= len) {
2985 SPLIT_APPEND(self->str, j, len);
2986 }
2987 return list;
2988
2989 onError:
2990 Py_DECREF(list);
2991 return NULL;
2992}
2993
2994#undef SPLIT_APPEND
2995
2996static
2997PyObject *split(PyUnicodeObject *self,
2998 PyUnicodeObject *substring,
2999 int maxcount)
3000{
3001 PyObject *list;
3002
3003 if (maxcount < 0)
3004 maxcount = INT_MAX;
3005
3006 list = PyList_New(0);
3007 if (!list)
3008 return NULL;
3009
3010 if (substring == NULL)
3011 return split_whitespace(self,list,maxcount);
3012
3013 else if (substring->length == 1)
3014 return split_char(self,list,substring->str[0],maxcount);
3015
3016 else if (substring->length == 0) {
3017 Py_DECREF(list);
3018 PyErr_SetString(PyExc_ValueError, "empty separator");
3019 return NULL;
3020 }
3021 else
3022 return split_substring(self,list,substring,maxcount);
3023}
3024
3025static
3026PyObject *strip(PyUnicodeObject *self,
3027 int left,
3028 int right)
3029{
3030 Py_UNICODE *p = self->str;
3031 int start = 0;
3032 int end = self->length;
3033
3034 if (left)
3035 while (start < end && Py_UNICODE_ISSPACE(p[start]))
3036 start++;
3037
3038 if (right)
3039 while (end > start && Py_UNICODE_ISSPACE(p[end-1]))
3040 end--;
3041
3042 if (start == 0 && end == self->length) {
3043 /* couldn't strip anything off, return original string */
3044 Py_INCREF(self);
3045 return (PyObject*) self;
3046 }
3047
3048 return (PyObject*) PyUnicode_FromUnicode(
3049 self->str + start,
3050 end - start
3051 );
3052}
3053
3054static
3055PyObject *replace(PyUnicodeObject *self,
3056 PyUnicodeObject *str1,
3057 PyUnicodeObject *str2,
3058 int maxcount)
3059{
3060 PyUnicodeObject *u;
3061
3062 if (maxcount < 0)
3063 maxcount = INT_MAX;
3064
3065 if (str1->length == 1 && str2->length == 1) {
3066 int i;
3067
3068 /* replace characters */
3069 if (!findchar(self->str, self->length, str1->str[0])) {
3070 /* nothing to replace, return original string */
3071 Py_INCREF(self);
3072 u = self;
3073 } else {
3074 Py_UNICODE u1 = str1->str[0];
3075 Py_UNICODE u2 = str2->str[0];
3076
3077 u = (PyUnicodeObject*) PyUnicode_FromUnicode(
3078 self->str,
3079 self->length
3080 );
3081 if (u)
3082 for (i = 0; i < u->length; i++)
3083 if (u->str[i] == u1) {
3084 if (--maxcount < 0)
3085 break;
3086 u->str[i] = u2;
3087 }
3088 }
3089
3090 } else {
3091 int n, i;
3092 Py_UNICODE *p;
3093
3094 /* replace strings */
3095 n = count(self, 0, self->length, str1);
3096 if (n > maxcount)
3097 n = maxcount;
3098 if (n == 0) {
3099 /* nothing to replace, return original string */
3100 Py_INCREF(self);
3101 u = self;
3102 } else {
3103 u = _PyUnicode_New(
3104 self->length + n * (str2->length - str1->length));
3105 if (u) {
3106 i = 0;
3107 p = u->str;
3108 while (i <= self->length - str1->length)
3109 if (Py_UNICODE_MATCH(self, i, str1)) {
3110 /* replace string segment */
3111 Py_UNICODE_COPY(p, str2->str, str2->length);
3112 p += str2->length;
3113 i += str1->length;
3114 if (--n <= 0) {
3115 /* copy remaining part */
3116 Py_UNICODE_COPY(p, self->str+i, self->length-i);
3117 break;
3118 }
3119 } else
3120 *p++ = self->str[i++];
3121 }
3122 }
3123 }
3124
3125 return (PyObject *) u;
3126}
3127
3128/* --- Unicode Object Methods --------------------------------------------- */
3129
3130static char title__doc__[] =
3131"S.title() -> unicode\n\
3132\n\
3133Return a titlecased version of S, i.e. words start with title case\n\
3134characters, all remaining cased characters have lower case.";
3135
3136static PyObject*
3137unicode_title(PyUnicodeObject *self, PyObject *args)
3138{
3139 if (!PyArg_NoArgs(args))
3140 return NULL;
3141 return fixup(self, fixtitle);
3142}
3143
3144static char capitalize__doc__[] =
3145"S.capitalize() -> unicode\n\
3146\n\
3147Return a capitalized version of S, i.e. make the first character\n\
3148have upper case.";
3149
3150static PyObject*
3151unicode_capitalize(PyUnicodeObject *self, PyObject *args)
3152{
3153 if (!PyArg_NoArgs(args))
3154 return NULL;
3155 return fixup(self, fixcapitalize);
3156}
3157
#if 0
/* Disabled: this whole section is compiled out. */
static char capwords__doc__[] =
"S.capwords() -> unicode\n"
"\n"
"Apply .capitalize() to all words in S and return the result with\n"
"normalized whitespace (all whitespace strings are replaced by ' ').";

/* Method implementation: split self at whitespace, capitalize every
   word through fixup()/fixcapitalize and join the words again with the
   default separator of PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self, PyObject *args)
{
    PyObject *words;
    PyObject *joined;
    int i;

    if (!PyArg_NoArgs(args))
        return NULL;

    /* Split into words */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(words); i++) {
        PyObject *old = PyList_GET_ITEM(words, i);
        PyObject *fixed = fixup((PyUnicodeObject *)old, fixcapitalize);

        if (fixed == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* swap the capitalized word in for the original list entry */
        Py_DECREF(old);
        PyList_SET_ITEM(words, i, fixed);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
3198
3199static char center__doc__[] =
3200"S.center(width) -> unicode\n\
3201\n\
3202Return S centered in a Unicode string of length width. Padding is done\n\
3203using spaces.";
3204
3205static PyObject *
3206unicode_center(PyUnicodeObject *self, PyObject *args)
3207{
3208 int marg, left;
3209 int width;
3210
3211 if (!PyArg_ParseTuple(args, "i:center", &width))
3212 return NULL;
3213
3214 if (self->length >= width) {
3215 Py_INCREF(self);
3216 return (PyObject*) self;
3217 }
3218
3219 marg = width - self->length;
3220 left = marg / 2 + (marg & width & 1);
3221
3222 return (PyObject*) pad(self, left, marg - left, ' ');
3223}
3224
Marc-André Lemburge5034372000-08-08 08:04:29 +00003225#if 0
3226
3227/* This code should go into some future Unicode collation support
3228 module. The basic comparison should compare ordinals on a naive
Trent Mick20abf572000-08-12 22:14:34 +00003229 basis (this is what Java does and thus JPython too). */
Marc-André Lemburge5034372000-08-08 08:04:29 +00003230
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003231/* speedy UTF-16 code point order comparison */
3232/* gleaned from: */
3233/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
3234
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003235static short utf16Fixup[32] =
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003236{
3237 0, 0, 0, 0, 0, 0, 0, 0,
3238 0, 0, 0, 0, 0, 0, 0, 0,
3239 0, 0, 0, 0, 0, 0, 0, 0,
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003240 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003241};
3242
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243static int
3244unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3245{
3246 int len1, len2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003247
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248 Py_UNICODE *s1 = str1->str;
3249 Py_UNICODE *s2 = str2->str;
3250
3251 len1 = str1->length;
3252 len2 = str2->length;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003253
Guido van Rossumd57fd912000-03-10 22:53:23 +00003254 while (len1 > 0 && len2 > 0) {
Marc-André Lemburge12896e2000-07-07 17:51:08 +00003255 Py_UNICODE c1, c2;
Marc-André Lemburg449c3252000-07-06 20:13:23 +00003256 long diff;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003257
3258 c1 = *s1++;
3259 c2 = *s2++;
3260 if (c1 > (1<<11) * 26)
3261 c1 += utf16Fixup[c1>>11];
3262 if (c2 > (1<<11) * 26)
3263 c2 += utf16Fixup[c2>>11];
3264
3265 /* now c1 and c2 are in UTF-32-compatible order */
Marc-André Lemburg85cc4d82000-07-06 19:43:31 +00003266 diff = (long)c1 - (long)c2;
Marc-André Lemburg1e7205a2000-07-04 09:51:07 +00003267 if (diff)
3268 return (diff < 0) ? -1 : (diff != 0);
3269 len1--; len2--;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003270 }
3271
3272 return (len1 < len2) ? -1 : (len1 != len2);
3273}
3274
Marc-André Lemburge5034372000-08-08 08:04:29 +00003275#else
3276
3277static int
3278unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
3279{
3280 register int len1, len2;
3281
3282 Py_UNICODE *s1 = str1->str;
3283 Py_UNICODE *s2 = str2->str;
3284
3285 len1 = str1->length;
3286 len2 = str2->length;
3287
3288 while (len1 > 0 && len2 > 0) {
3289 register long diff;
3290
3291 diff = (long)*s1++ - (long)*s2++;
3292 if (diff)
3293 return (diff < 0) ? -1 : (diff != 0);
3294 len1--; len2--;
3295 }
3296
3297 return (len1 < len2) ? -1 : (len1 != len2);
3298}
3299
3300#endif
3301
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302int PyUnicode_Compare(PyObject *left,
3303 PyObject *right)
3304{
3305 PyUnicodeObject *u = NULL, *v = NULL;
3306 int result;
3307
3308 /* Coerce the two arguments */
3309 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3310 if (u == NULL)
3311 goto onError;
3312 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3313 if (v == NULL)
3314 goto onError;
3315
Thomas Wouters7e474022000-07-16 12:04:32 +00003316 /* Shortcut for empty or interned objects */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003317 if (v == u) {
3318 Py_DECREF(u);
3319 Py_DECREF(v);
3320 return 0;
3321 }
3322
3323 result = unicode_compare(u, v);
3324
3325 Py_DECREF(u);
3326 Py_DECREF(v);
3327 return result;
3328
3329onError:
3330 Py_XDECREF(u);
3331 Py_XDECREF(v);
3332 return -1;
3333}
3334
Guido van Rossum403d68b2000-03-13 15:55:09 +00003335int PyUnicode_Contains(PyObject *container,
3336 PyObject *element)
3337{
3338 PyUnicodeObject *u = NULL, *v = NULL;
3339 int result;
3340 register const Py_UNICODE *p, *e;
3341 register Py_UNICODE ch;
3342
3343 /* Coerce the two arguments */
Guido van Rossum403d68b2000-03-13 15:55:09 +00003344 v = (PyUnicodeObject *)PyUnicode_FromObject(element);
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003345 if (v == NULL) {
3346 PyErr_SetString(PyExc_TypeError,
3347 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003348 goto onError;
Marc-André Lemburg7c014682000-06-28 08:11:47 +00003349 }
Guido van Rossum9e896b32000-04-05 20:11:21 +00003350 u = (PyUnicodeObject *)PyUnicode_FromObject(container);
3351 if (u == NULL) {
3352 Py_DECREF(v);
3353 goto onError;
3354 }
Guido van Rossum403d68b2000-03-13 15:55:09 +00003355
3356 /* Check v in u */
3357 if (PyUnicode_GET_SIZE(v) != 1) {
3358 PyErr_SetString(PyExc_TypeError,
Andrew M. Kuchlingcb95a142000-06-09 14:04:53 +00003359 "'in <string>' requires character as left operand");
Guido van Rossum403d68b2000-03-13 15:55:09 +00003360 goto onError;
3361 }
3362 ch = *PyUnicode_AS_UNICODE(v);
3363 p = PyUnicode_AS_UNICODE(u);
3364 e = p + PyUnicode_GET_SIZE(u);
3365 result = 0;
3366 while (p < e) {
3367 if (*p++ == ch) {
3368 result = 1;
3369 break;
3370 }
3371 }
3372
3373 Py_DECREF(u);
3374 Py_DECREF(v);
3375 return result;
3376
3377onError:
3378 Py_XDECREF(u);
3379 Py_XDECREF(v);
3380 return -1;
3381}
3382
Guido van Rossumd57fd912000-03-10 22:53:23 +00003383/* Concat to string or Unicode object giving a new Unicode object. */
3384
3385PyObject *PyUnicode_Concat(PyObject *left,
3386 PyObject *right)
3387{
3388 PyUnicodeObject *u = NULL, *v = NULL, *w;
3389
3390 /* Coerce the two arguments */
3391 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
3392 if (u == NULL)
3393 goto onError;
3394 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
3395 if (v == NULL)
3396 goto onError;
3397
3398 /* Shortcuts */
3399 if (v == unicode_empty) {
3400 Py_DECREF(v);
3401 return (PyObject *)u;
3402 }
3403 if (u == unicode_empty) {
3404 Py_DECREF(u);
3405 return (PyObject *)v;
3406 }
3407
3408 /* Concat the two Unicode strings */
3409 w = _PyUnicode_New(u->length + v->length);
3410 if (w == NULL)
3411 goto onError;
3412 Py_UNICODE_COPY(w->str, u->str, u->length);
3413 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
3414
3415 Py_DECREF(u);
3416 Py_DECREF(v);
3417 return (PyObject *)w;
3418
3419onError:
3420 Py_XDECREF(u);
3421 Py_XDECREF(v);
3422 return NULL;
3423}
3424
3425static char count__doc__[] =
3426"S.count(sub[, start[, end]]) -> int\n\
3427\n\
3428Return the number of occurrences of substring sub in Unicode string\n\
3429S[start:end]. Optional arguments start and end are\n\
3430interpreted as in slice notation.";
3431
3432static PyObject *
3433unicode_count(PyUnicodeObject *self, PyObject *args)
3434{
3435 PyUnicodeObject *substring;
3436 int start = 0;
3437 int end = INT_MAX;
3438 PyObject *result;
3439
Guido van Rossumb8872e62000-05-09 14:14:27 +00003440 if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
3441 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003442 return NULL;
3443
3444 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3445 (PyObject *)substring);
3446 if (substring == NULL)
3447 return NULL;
3448
Guido van Rossumd57fd912000-03-10 22:53:23 +00003449 if (start < 0)
3450 start += self->length;
3451 if (start < 0)
3452 start = 0;
3453 if (end > self->length)
3454 end = self->length;
3455 if (end < 0)
3456 end += self->length;
3457 if (end < 0)
3458 end = 0;
3459
3460 result = PyInt_FromLong((long) count(self, start, end, substring));
3461
3462 Py_DECREF(substring);
3463 return result;
3464}
3465
3466static char encode__doc__[] =
3467"S.encode([encoding[,errors]]) -> string\n\
3468\n\
Fred Drakee4315f52000-05-09 19:53:39 +00003469Return an encoded string version of S. Default encoding is the current\n\
3470default string encoding. errors may be given to set a different error\n\
3471handling scheme. Default is 'strict' meaning that encoding errors raise\n\
3472a ValueError. Other possible values are 'ignore' and 'replace'.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00003473
3474static PyObject *
3475unicode_encode(PyUnicodeObject *self, PyObject *args)
3476{
3477 char *encoding = NULL;
3478 char *errors = NULL;
3479 if (!PyArg_ParseTuple(args, "|ss:encode", &encoding, &errors))
3480 return NULL;
3481 return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
3482}
3483
3484static char expandtabs__doc__[] =
3485"S.expandtabs([tabsize]) -> unicode\n\
3486\n\
3487Return a copy of S where all tab characters are expanded using spaces.\n\
3488If tabsize is not given, a tab size of 8 characters is assumed.";
3489
3490static PyObject*
3491unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
3492{
3493 Py_UNICODE *e;
3494 Py_UNICODE *p;
3495 Py_UNICODE *q;
3496 int i, j;
3497 PyUnicodeObject *u;
3498 int tabsize = 8;
3499
3500 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
3501 return NULL;
3502
Thomas Wouters7e474022000-07-16 12:04:32 +00003503 /* First pass: determine size of output string */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003504 i = j = 0;
3505 e = self->str + self->length;
3506 for (p = self->str; p < e; p++)
3507 if (*p == '\t') {
3508 if (tabsize > 0)
3509 j += tabsize - (j % tabsize);
3510 }
3511 else {
3512 j++;
3513 if (*p == '\n' || *p == '\r') {
3514 i += j;
3515 j = 0;
3516 }
3517 }
3518
3519 /* Second pass: create output string and fill it */
3520 u = _PyUnicode_New(i + j);
3521 if (!u)
3522 return NULL;
3523
3524 j = 0;
3525 q = u->str;
3526
3527 for (p = self->str; p < e; p++)
3528 if (*p == '\t') {
3529 if (tabsize > 0) {
3530 i = tabsize - (j % tabsize);
3531 j += i;
3532 while (i--)
3533 *q++ = ' ';
3534 }
3535 }
3536 else {
3537 j++;
3538 *q++ = *p;
3539 if (*p == '\n' || *p == '\r')
3540 j = 0;
3541 }
3542
3543 return (PyObject*) u;
3544}
3545
3546static char find__doc__[] =
3547"S.find(sub [,start [,end]]) -> int\n\
3548\n\
3549Return the lowest index in S where substring sub is found,\n\
3550such that sub is contained within s[start,end]. Optional\n\
3551arguments start and end are interpreted as in slice notation.\n\
3552\n\
3553Return -1 on failure.";
3554
3555static PyObject *
3556unicode_find(PyUnicodeObject *self, PyObject *args)
3557{
3558 PyUnicodeObject *substring;
3559 int start = 0;
3560 int end = INT_MAX;
3561 PyObject *result;
3562
Guido van Rossumb8872e62000-05-09 14:14:27 +00003563 if (!PyArg_ParseTuple(args, "O|O&O&:find", &substring,
3564 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003565 return NULL;
3566 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3567 (PyObject *)substring);
3568 if (substring == NULL)
3569 return NULL;
3570
3571 result = PyInt_FromLong(findstring(self, substring, start, end, 1));
3572
3573 Py_DECREF(substring);
3574 return result;
3575}
3576
3577static PyObject *
3578unicode_getitem(PyUnicodeObject *self, int index)
3579{
3580 if (index < 0 || index >= self->length) {
3581 PyErr_SetString(PyExc_IndexError, "string index out of range");
3582 return NULL;
3583 }
3584
3585 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
3586}
3587
3588static long
3589unicode_hash(PyUnicodeObject *self)
3590{
Fredrik Lundhdde61642000-07-10 18:27:47 +00003591 /* Since Unicode objects compare equal to their ASCII string
3592 counterparts, they should use the individual character values
3593 as basis for their hash value. This is needed to assure that
3594 strings and Unicode objects behave in the same way as
3595 dictionary keys. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596
Fredrik Lundhdde61642000-07-10 18:27:47 +00003597 register int len;
3598 register Py_UNICODE *p;
3599 register long x;
3600
Guido van Rossumd57fd912000-03-10 22:53:23 +00003601 if (self->hash != -1)
3602 return self->hash;
Fredrik Lundhdde61642000-07-10 18:27:47 +00003603 len = PyUnicode_GET_SIZE(self);
3604 p = PyUnicode_AS_UNICODE(self);
3605 x = *p << 7;
3606 while (--len >= 0)
3607 x = (1000003*x) ^ *p++;
3608 x ^= PyUnicode_GET_SIZE(self);
3609 if (x == -1)
3610 x = -2;
3611 self->hash = x;
3612 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003613}
3614
3615static char index__doc__[] =
3616"S.index(sub [,start [,end]]) -> int\n\
3617\n\
3618Like S.find() but raise ValueError when the substring is not found.";
3619
3620static PyObject *
3621unicode_index(PyUnicodeObject *self, PyObject *args)
3622{
3623 int result;
3624 PyUnicodeObject *substring;
3625 int start = 0;
3626 int end = INT_MAX;
3627
Guido van Rossumb8872e62000-05-09 14:14:27 +00003628 if (!PyArg_ParseTuple(args, "O|O&O&:index", &substring,
3629 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00003630 return NULL;
3631
3632 substring = (PyUnicodeObject *)PyUnicode_FromObject(
3633 (PyObject *)substring);
3634 if (substring == NULL)
3635 return NULL;
3636
3637 result = findstring(self, substring, start, end, 1);
3638
3639 Py_DECREF(substring);
3640 if (result < 0) {
3641 PyErr_SetString(PyExc_ValueError, "substring not found");
3642 return NULL;
3643 }
3644 return PyInt_FromLong(result);
3645}
3646
3647static char islower__doc__[] =
3648"S.islower() -> int\n\
3649\n\
3650Return 1 if all cased characters in S are lowercase and there is\n\
3651at least one cased character in S, 0 otherwise.";
3652
3653static PyObject*
3654unicode_islower(PyUnicodeObject *self, PyObject *args)
3655{
3656 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3657 register const Py_UNICODE *e;
3658 int cased;
3659
3660 if (!PyArg_NoArgs(args))
3661 return NULL;
3662
3663 /* Shortcut for single character strings */
3664 if (PyUnicode_GET_SIZE(self) == 1)
3665 return PyInt_FromLong(Py_UNICODE_ISLOWER(*p) != 0);
3666
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003667 /* Special case for empty strings */
3668 if (PyString_GET_SIZE(self) == 0)
3669 return PyInt_FromLong(0);
3670
Guido van Rossumd57fd912000-03-10 22:53:23 +00003671 e = p + PyUnicode_GET_SIZE(self);
3672 cased = 0;
3673 for (; p < e; p++) {
3674 register const Py_UNICODE ch = *p;
3675
3676 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
3677 return PyInt_FromLong(0);
3678 else if (!cased && Py_UNICODE_ISLOWER(ch))
3679 cased = 1;
3680 }
3681 return PyInt_FromLong(cased);
3682}
3683
3684static char isupper__doc__[] =
3685"S.isupper() -> int\n\
3686\n\
3687Return 1 if all cased characters in S are uppercase and there is\n\
3688at least one cased character in S, 0 otherwise.";
3689
3690static PyObject*
3691unicode_isupper(PyUnicodeObject *self, PyObject *args)
3692{
3693 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3694 register const Py_UNICODE *e;
3695 int cased;
3696
3697 if (!PyArg_NoArgs(args))
3698 return NULL;
3699
3700 /* Shortcut for single character strings */
3701 if (PyUnicode_GET_SIZE(self) == 1)
3702 return PyInt_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
3703
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003704 /* Special case for empty strings */
3705 if (PyString_GET_SIZE(self) == 0)
3706 return PyInt_FromLong(0);
3707
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 e = p + PyUnicode_GET_SIZE(self);
3709 cased = 0;
3710 for (; p < e; p++) {
3711 register const Py_UNICODE ch = *p;
3712
3713 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
3714 return PyInt_FromLong(0);
3715 else if (!cased && Py_UNICODE_ISUPPER(ch))
3716 cased = 1;
3717 }
3718 return PyInt_FromLong(cased);
3719}
3720
3721static char istitle__doc__[] =
3722"S.istitle() -> int\n\
3723\n\
3724Return 1 if S is a titlecased string, i.e. upper- and titlecase characters\n\
3725may only follow uncased characters and lowercase characters only cased\n\
3726ones. Return 0 otherwise.";
3727
3728static PyObject*
3729unicode_istitle(PyUnicodeObject *self, PyObject *args)
3730{
3731 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3732 register const Py_UNICODE *e;
3733 int cased, previous_is_cased;
3734
3735 if (!PyArg_NoArgs(args))
3736 return NULL;
3737
3738 /* Shortcut for single character strings */
3739 if (PyUnicode_GET_SIZE(self) == 1)
3740 return PyInt_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
3741 (Py_UNICODE_ISUPPER(*p) != 0));
3742
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003743 /* Special case for empty strings */
3744 if (PyString_GET_SIZE(self) == 0)
3745 return PyInt_FromLong(0);
3746
Guido van Rossumd57fd912000-03-10 22:53:23 +00003747 e = p + PyUnicode_GET_SIZE(self);
3748 cased = 0;
3749 previous_is_cased = 0;
3750 for (; p < e; p++) {
3751 register const Py_UNICODE ch = *p;
3752
3753 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
3754 if (previous_is_cased)
3755 return PyInt_FromLong(0);
3756 previous_is_cased = 1;
3757 cased = 1;
3758 }
3759 else if (Py_UNICODE_ISLOWER(ch)) {
3760 if (!previous_is_cased)
3761 return PyInt_FromLong(0);
3762 previous_is_cased = 1;
3763 cased = 1;
3764 }
3765 else
3766 previous_is_cased = 0;
3767 }
3768 return PyInt_FromLong(cased);
3769}
3770
3771static char isspace__doc__[] =
3772"S.isspace() -> int\n\
3773\n\
3774Return 1 if there are only whitespace characters in S,\n\
37750 otherwise.";
3776
3777static PyObject*
3778unicode_isspace(PyUnicodeObject *self, PyObject *args)
3779{
3780 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3781 register const Py_UNICODE *e;
3782
3783 if (!PyArg_NoArgs(args))
3784 return NULL;
3785
3786 /* Shortcut for single character strings */
3787 if (PyUnicode_GET_SIZE(self) == 1 &&
3788 Py_UNICODE_ISSPACE(*p))
3789 return PyInt_FromLong(1);
3790
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003791 /* Special case for empty strings */
3792 if (PyString_GET_SIZE(self) == 0)
3793 return PyInt_FromLong(0);
3794
Guido van Rossumd57fd912000-03-10 22:53:23 +00003795 e = p + PyUnicode_GET_SIZE(self);
3796 for (; p < e; p++) {
3797 if (!Py_UNICODE_ISSPACE(*p))
3798 return PyInt_FromLong(0);
3799 }
3800 return PyInt_FromLong(1);
3801}
3802
Marc-André Lemburga7acf422000-07-05 09:49:44 +00003803static char isalpha__doc__[] =
3804"S.isalpha() -> int\n\
3805\n\
3806Return 1 if all characters in S are alphabetic\n\
3807and there is at least one character in S, 0 otherwise.";
3808
3809static PyObject*
3810unicode_isalpha(PyUnicodeObject *self, PyObject *args)
3811{
3812 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3813 register const Py_UNICODE *e;
3814
3815 if (!PyArg_NoArgs(args))
3816 return NULL;
3817
3818 /* Shortcut for single character strings */
3819 if (PyUnicode_GET_SIZE(self) == 1 &&
3820 Py_UNICODE_ISALPHA(*p))
3821 return PyInt_FromLong(1);
3822
3823 /* Special case for empty strings */
3824 if (PyString_GET_SIZE(self) == 0)
3825 return PyInt_FromLong(0);
3826
3827 e = p + PyUnicode_GET_SIZE(self);
3828 for (; p < e; p++) {
3829 if (!Py_UNICODE_ISALPHA(*p))
3830 return PyInt_FromLong(0);
3831 }
3832 return PyInt_FromLong(1);
3833}
3834
3835static char isalnum__doc__[] =
3836"S.isalnum() -> int\n\
3837\n\
3838Return 1 if all characters in S are alphanumeric\n\
3839and there is at least one character in S, 0 otherwise.";
3840
3841static PyObject*
3842unicode_isalnum(PyUnicodeObject *self, PyObject *args)
3843{
3844 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3845 register const Py_UNICODE *e;
3846
3847 if (!PyArg_NoArgs(args))
3848 return NULL;
3849
3850 /* Shortcut for single character strings */
3851 if (PyUnicode_GET_SIZE(self) == 1 &&
3852 Py_UNICODE_ISALNUM(*p))
3853 return PyInt_FromLong(1);
3854
3855 /* Special case for empty strings */
3856 if (PyString_GET_SIZE(self) == 0)
3857 return PyInt_FromLong(0);
3858
3859 e = p + PyUnicode_GET_SIZE(self);
3860 for (; p < e; p++) {
3861 if (!Py_UNICODE_ISALNUM(*p))
3862 return PyInt_FromLong(0);
3863 }
3864 return PyInt_FromLong(1);
3865}
3866
Guido van Rossumd57fd912000-03-10 22:53:23 +00003867static char isdecimal__doc__[] =
3868"S.isdecimal() -> int\n\
3869\n\
3870Return 1 if there are only decimal characters in S,\n\
38710 otherwise.";
3872
3873static PyObject*
3874unicode_isdecimal(PyUnicodeObject *self, PyObject *args)
3875{
3876 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3877 register const Py_UNICODE *e;
3878
3879 if (!PyArg_NoArgs(args))
3880 return NULL;
3881
3882 /* Shortcut for single character strings */
3883 if (PyUnicode_GET_SIZE(self) == 1 &&
3884 Py_UNICODE_ISDECIMAL(*p))
3885 return PyInt_FromLong(1);
3886
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003887 /* Special case for empty strings */
3888 if (PyString_GET_SIZE(self) == 0)
3889 return PyInt_FromLong(0);
3890
Guido van Rossumd57fd912000-03-10 22:53:23 +00003891 e = p + PyUnicode_GET_SIZE(self);
3892 for (; p < e; p++) {
3893 if (!Py_UNICODE_ISDECIMAL(*p))
3894 return PyInt_FromLong(0);
3895 }
3896 return PyInt_FromLong(1);
3897}
3898
3899static char isdigit__doc__[] =
3900"S.isdigit() -> int\n\
3901\n\
3902Return 1 if there are only digit characters in S,\n\
39030 otherwise.";
3904
3905static PyObject*
3906unicode_isdigit(PyUnicodeObject *self, PyObject *args)
3907{
3908 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3909 register const Py_UNICODE *e;
3910
3911 if (!PyArg_NoArgs(args))
3912 return NULL;
3913
3914 /* Shortcut for single character strings */
3915 if (PyUnicode_GET_SIZE(self) == 1 &&
3916 Py_UNICODE_ISDIGIT(*p))
3917 return PyInt_FromLong(1);
3918
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003919 /* Special case for empty strings */
3920 if (PyString_GET_SIZE(self) == 0)
3921 return PyInt_FromLong(0);
3922
Guido van Rossumd57fd912000-03-10 22:53:23 +00003923 e = p + PyUnicode_GET_SIZE(self);
3924 for (; p < e; p++) {
3925 if (!Py_UNICODE_ISDIGIT(*p))
3926 return PyInt_FromLong(0);
3927 }
3928 return PyInt_FromLong(1);
3929}
3930
3931static char isnumeric__doc__[] =
3932"S.isnumeric() -> int\n\
3933\n\
3934Return 1 if there are only numeric characters in S,\n\
39350 otherwise.";
3936
3937static PyObject*
3938unicode_isnumeric(PyUnicodeObject *self, PyObject *args)
3939{
3940 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
3941 register const Py_UNICODE *e;
3942
3943 if (!PyArg_NoArgs(args))
3944 return NULL;
3945
3946 /* Shortcut for single character strings */
3947 if (PyUnicode_GET_SIZE(self) == 1 &&
3948 Py_UNICODE_ISNUMERIC(*p))
3949 return PyInt_FromLong(1);
3950
Marc-André Lemburg60bc8092000-06-14 09:18:32 +00003951 /* Special case for empty strings */
3952 if (PyString_GET_SIZE(self) == 0)
3953 return PyInt_FromLong(0);
3954
Guido van Rossumd57fd912000-03-10 22:53:23 +00003955 e = p + PyUnicode_GET_SIZE(self);
3956 for (; p < e; p++) {
3957 if (!Py_UNICODE_ISNUMERIC(*p))
3958 return PyInt_FromLong(0);
3959 }
3960 return PyInt_FromLong(1);
3961}
3962
3963static char join__doc__[] =
3964"S.join(sequence) -> unicode\n\
3965\n\
3966Return a string which is the concatenation of the strings in the\n\
3967sequence. The separator between elements is S.";
3968
3969static PyObject*
3970unicode_join(PyUnicodeObject *self, PyObject *args)
3971{
3972 PyObject *data;
3973 if (!PyArg_ParseTuple(args, "O:join", &data))
3974 return NULL;
3975
3976 return PyUnicode_Join((PyObject *)self, data);
3977}
3978
3979static int
3980unicode_length(PyUnicodeObject *self)
3981{
3982 return self->length;
3983}
3984
3985static char ljust__doc__[] =
3986"S.ljust(width) -> unicode\n\
3987\n\
3988Return S left justified in a Unicode string of length width. Padding is\n\
3989done using spaces.";
3990
3991static PyObject *
3992unicode_ljust(PyUnicodeObject *self, PyObject *args)
3993{
3994 int width;
3995 if (!PyArg_ParseTuple(args, "i:ljust", &width))
3996 return NULL;
3997
3998 if (self->length >= width) {
3999 Py_INCREF(self);
4000 return (PyObject*) self;
4001 }
4002
4003 return (PyObject*) pad(self, 0, width - self->length, ' ');
4004}
4005
4006static char lower__doc__[] =
4007"S.lower() -> unicode\n\
4008\n\
4009Return a copy of the string S converted to lowercase.";
4010
4011static PyObject*
4012unicode_lower(PyUnicodeObject *self, PyObject *args)
4013{
4014 if (!PyArg_NoArgs(args))
4015 return NULL;
4016 return fixup(self, fixlower);
4017}
4018
4019static char lstrip__doc__[] =
4020"S.lstrip() -> unicode\n\
4021\n\
4022Return a copy of the string S with leading whitespace removed.";
4023
4024static PyObject *
4025unicode_lstrip(PyUnicodeObject *self, PyObject *args)
4026{
4027 if (!PyArg_NoArgs(args))
4028 return NULL;
4029 return strip(self, 1, 0);
4030}
4031
4032static PyObject*
4033unicode_repeat(PyUnicodeObject *str, int len)
4034{
4035 PyUnicodeObject *u;
4036 Py_UNICODE *p;
Tim Peters8f422462000-09-09 06:13:41 +00004037 int nchars;
4038 size_t nbytes;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004039
4040 if (len < 0)
4041 len = 0;
4042
4043 if (len == 1) {
4044 /* no repeat, return original string */
4045 Py_INCREF(str);
4046 return (PyObject*) str;
4047 }
Tim Peters8f422462000-09-09 06:13:41 +00004048
4049 /* ensure # of chars needed doesn't overflow int and # of bytes
4050 * needed doesn't overflow size_t
4051 */
4052 nchars = len * str->length;
4053 if (len && nchars / len != str->length) {
4054 PyErr_SetString(PyExc_OverflowError,
4055 "repeated string is too long");
4056 return NULL;
4057 }
4058 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
4059 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
4060 PyErr_SetString(PyExc_OverflowError,
4061 "repeated string is too long");
4062 return NULL;
4063 }
4064 u = _PyUnicode_New(nchars);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004065 if (!u)
4066 return NULL;
4067
4068 p = u->str;
4069
4070 while (len-- > 0) {
4071 Py_UNICODE_COPY(p, str->str, str->length);
4072 p += str->length;
4073 }
4074
4075 return (PyObject*) u;
4076}
4077
4078PyObject *PyUnicode_Replace(PyObject *obj,
4079 PyObject *subobj,
4080 PyObject *replobj,
4081 int maxcount)
4082{
4083 PyObject *self;
4084 PyObject *str1;
4085 PyObject *str2;
4086 PyObject *result;
4087
4088 self = PyUnicode_FromObject(obj);
4089 if (self == NULL)
4090 return NULL;
4091 str1 = PyUnicode_FromObject(subobj);
4092 if (str1 == NULL) {
4093 Py_DECREF(self);
4094 return NULL;
4095 }
4096 str2 = PyUnicode_FromObject(replobj);
4097 if (str2 == NULL) {
4098 Py_DECREF(self);
4099 Py_DECREF(str1);
4100 return NULL;
4101 }
4102 result = replace((PyUnicodeObject *)self,
4103 (PyUnicodeObject *)str1,
4104 (PyUnicodeObject *)str2,
4105 maxcount);
4106 Py_DECREF(self);
4107 Py_DECREF(str1);
4108 Py_DECREF(str2);
4109 return result;
4110}
4111
4112static char replace__doc__[] =
4113"S.replace (old, new[, maxsplit]) -> unicode\n\
4114\n\
4115Return a copy of S with all occurrences of substring\n\
4116old replaced by new. If the optional argument maxsplit is\n\
4117given, only the first maxsplit occurrences are replaced.";
4118
4119static PyObject*
4120unicode_replace(PyUnicodeObject *self, PyObject *args)
4121{
4122 PyUnicodeObject *str1;
4123 PyUnicodeObject *str2;
4124 int maxcount = -1;
4125 PyObject *result;
4126
4127 if (!PyArg_ParseTuple(args, "OO|i:replace", &str1, &str2, &maxcount))
4128 return NULL;
4129 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
4130 if (str1 == NULL)
4131 return NULL;
4132 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
4133 if (str2 == NULL)
4134 return NULL;
4135
4136 result = replace(self, str1, str2, maxcount);
4137
4138 Py_DECREF(str1);
4139 Py_DECREF(str2);
4140 return result;
4141}
4142
4143static
4144PyObject *unicode_repr(PyObject *unicode)
4145{
4146 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
4147 PyUnicode_GET_SIZE(unicode),
4148 1);
4149}
4150
4151static char rfind__doc__[] =
4152"S.rfind(sub [,start [,end]]) -> int\n\
4153\n\
4154Return the highest index in S where substring sub is found,\n\
4155such that sub is contained within s[start,end]. Optional\n\
4156arguments start and end are interpreted as in slice notation.\n\
4157\n\
4158Return -1 on failure.";
4159
4160static PyObject *
4161unicode_rfind(PyUnicodeObject *self, PyObject *args)
4162{
4163 PyUnicodeObject *substring;
4164 int start = 0;
4165 int end = INT_MAX;
4166 PyObject *result;
4167
Guido van Rossumb8872e62000-05-09 14:14:27 +00004168 if (!PyArg_ParseTuple(args, "O|O&O&:rfind", &substring,
4169 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004170 return NULL;
4171 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4172 (PyObject *)substring);
4173 if (substring == NULL)
4174 return NULL;
4175
4176 result = PyInt_FromLong(findstring(self, substring, start, end, -1));
4177
4178 Py_DECREF(substring);
4179 return result;
4180}
4181
4182static char rindex__doc__[] =
4183"S.rindex(sub [,start [,end]]) -> int\n\
4184\n\
4185Like S.rfind() but raise ValueError when the substring is not found.";
4186
4187static PyObject *
4188unicode_rindex(PyUnicodeObject *self, PyObject *args)
4189{
4190 int result;
4191 PyUnicodeObject *substring;
4192 int start = 0;
4193 int end = INT_MAX;
4194
Guido van Rossumb8872e62000-05-09 14:14:27 +00004195 if (!PyArg_ParseTuple(args, "O|O&O&:rindex", &substring,
4196 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004197 return NULL;
4198 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4199 (PyObject *)substring);
4200 if (substring == NULL)
4201 return NULL;
4202
4203 result = findstring(self, substring, start, end, -1);
4204
4205 Py_DECREF(substring);
4206 if (result < 0) {
4207 PyErr_SetString(PyExc_ValueError, "substring not found");
4208 return NULL;
4209 }
4210 return PyInt_FromLong(result);
4211}
4212
4213static char rjust__doc__[] =
4214"S.rjust(width) -> unicode\n\
4215\n\
4216Return S right justified in a Unicode string of length width. Padding is\n\
4217done using spaces.";
4218
4219static PyObject *
4220unicode_rjust(PyUnicodeObject *self, PyObject *args)
4221{
4222 int width;
4223 if (!PyArg_ParseTuple(args, "i:rjust", &width))
4224 return NULL;
4225
4226 if (self->length >= width) {
4227 Py_INCREF(self);
4228 return (PyObject*) self;
4229 }
4230
4231 return (PyObject*) pad(self, width - self->length, 0, ' ');
4232}
4233
4234static char rstrip__doc__[] =
4235"S.rstrip() -> unicode\n\
4236\n\
4237Return a copy of the string S with trailing whitespace removed.";
4238
4239static PyObject *
4240unicode_rstrip(PyUnicodeObject *self, PyObject *args)
4241{
4242 if (!PyArg_NoArgs(args))
4243 return NULL;
4244 return strip(self, 0, 1);
4245}
4246
4247static PyObject*
4248unicode_slice(PyUnicodeObject *self, int start, int end)
4249{
4250 /* standard clamping */
4251 if (start < 0)
4252 start = 0;
4253 if (end < 0)
4254 end = 0;
4255 if (end > self->length)
4256 end = self->length;
4257 if (start == 0 && end == self->length) {
4258 /* full slice, return original string */
4259 Py_INCREF(self);
4260 return (PyObject*) self;
4261 }
4262 if (start > end)
4263 start = end;
4264 /* copy slice */
4265 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
4266 end - start);
4267}
4268
4269PyObject *PyUnicode_Split(PyObject *s,
4270 PyObject *sep,
4271 int maxsplit)
4272{
4273 PyObject *result;
4274
4275 s = PyUnicode_FromObject(s);
4276 if (s == NULL)
4277 return NULL;
4278 if (sep != NULL) {
4279 sep = PyUnicode_FromObject(sep);
4280 if (sep == NULL) {
4281 Py_DECREF(s);
4282 return NULL;
4283 }
4284 }
4285
4286 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
4287
4288 Py_DECREF(s);
4289 Py_XDECREF(sep);
4290 return result;
4291}
4292
4293static char split__doc__[] =
4294"S.split([sep [,maxsplit]]) -> list of strings\n\
4295\n\
4296Return a list of the words in S, using sep as the\n\
4297delimiter string. If maxsplit is given, at most maxsplit\n\
4298splits are done. If sep is not specified, any whitespace string\n\
4299is a separator.";
4300
4301static PyObject*
4302unicode_split(PyUnicodeObject *self, PyObject *args)
4303{
4304 PyObject *substring = Py_None;
4305 int maxcount = -1;
4306
4307 if (!PyArg_ParseTuple(args, "|Oi:split", &substring, &maxcount))
4308 return NULL;
4309
4310 if (substring == Py_None)
4311 return split(self, NULL, maxcount);
4312 else if (PyUnicode_Check(substring))
4313 return split(self, (PyUnicodeObject *)substring, maxcount);
4314 else
4315 return PyUnicode_Split((PyObject *)self, substring, maxcount);
4316}
4317
4318static char splitlines__doc__[] =
Guido van Rossum86662912000-04-11 15:38:46 +00004319"S.splitlines([keepends]]) -> list of strings\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +00004320\n\
4321Return a list of the lines in S, breaking at line boundaries.\n\
Guido van Rossum86662912000-04-11 15:38:46 +00004322Line breaks are not included in the resulting list unless keepends\n\
4323is given and true.";
Guido van Rossumd57fd912000-03-10 22:53:23 +00004324
4325static PyObject*
4326unicode_splitlines(PyUnicodeObject *self, PyObject *args)
4327{
Guido van Rossum86662912000-04-11 15:38:46 +00004328 int keepends = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004329
Guido van Rossum86662912000-04-11 15:38:46 +00004330 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004331 return NULL;
4332
Guido van Rossum86662912000-04-11 15:38:46 +00004333 return PyUnicode_Splitlines((PyObject *)self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004334}
4335
4336static
4337PyObject *unicode_str(PyUnicodeObject *self)
4338{
Fred Drakee4315f52000-05-09 19:53:39 +00004339 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004340}
4341
4342static char strip__doc__[] =
4343"S.strip() -> unicode\n\
4344\n\
4345Return a copy of S with leading and trailing whitespace removed.";
4346
4347static PyObject *
4348unicode_strip(PyUnicodeObject *self, PyObject *args)
4349{
4350 if (!PyArg_NoArgs(args))
4351 return NULL;
4352 return strip(self, 1, 1);
4353}
4354
4355static char swapcase__doc__[] =
4356"S.swapcase() -> unicode\n\
4357\n\
4358Return a copy of S with uppercase characters converted to lowercase\n\
4359and vice versa.";
4360
4361static PyObject*
4362unicode_swapcase(PyUnicodeObject *self, PyObject *args)
4363{
4364 if (!PyArg_NoArgs(args))
4365 return NULL;
4366 return fixup(self, fixswapcase);
4367}
4368
4369static char translate__doc__[] =
4370"S.translate(table) -> unicode\n\
4371\n\
4372Return a copy of the string S, where all characters have been mapped\n\
4373through the given translation table, which must be a mapping of\n\
4374Unicode ordinals to Unicode ordinals or None. Unmapped characters\n\
4375are left untouched. Characters mapped to None are deleted.";
4376
4377static PyObject*
4378unicode_translate(PyUnicodeObject *self, PyObject *args)
4379{
4380 PyObject *table;
4381
4382 if (!PyArg_ParseTuple(args, "O:translate", &table))
4383 return NULL;
4384 return PyUnicode_TranslateCharmap(self->str,
4385 self->length,
4386 table,
4387 "ignore");
4388}
4389
4390static char upper__doc__[] =
4391"S.upper() -> unicode\n\
4392\n\
4393Return a copy of S converted to uppercase.";
4394
4395static PyObject*
4396unicode_upper(PyUnicodeObject *self, PyObject *args)
4397{
4398 if (!PyArg_NoArgs(args))
4399 return NULL;
4400 return fixup(self, fixupper);
4401}
4402
#if 0
/* Docstring for the (currently disabled) zfill() method. */
static char zfill__doc__[] =
    "S.zfill(width) -> unicode\n"
    "\n"
    "Pad a numeric string S with zeros on the left, to fill a field\n"
    "of the specified width. The string S is never truncated.";

/* Method wrapper for S.zfill(width).

   Returns self (new reference) when it is already at least width
   characters long.  Otherwise left-pads with '0' via pad() and, if the
   original string started with '+' or '-', moves that sign in front of
   the padding.  Returns NULL on argument or padding failure. */
static PyObject *
unicode_zfill(PyUnicodeObject *self, PyObject *args)
{
    int fill;
    int width;
    PyUnicodeObject *u;

    if (!PyArg_ParseTuple(args, "i:zfill", &width))
        return NULL;

    if (self->length >= width) {
        Py_INCREF(self);
        return (PyObject*) self;
    }

    fill = width - self->length;

    u = pad(self, fill, 0, '0');
    /* pad()'s result is handed straight back to Python by other methods
       in this file, so a failure is presumably reported as NULL; do not
       dereference it in that case. */
    if (u == NULL)
        return NULL;

    /* u->str[fill] is the first character of the original string. */
    if (u->str[fill] == '+' || u->str[fill] == '-') {
        /* move sign to beginning of string */
        u->str[0] = u->str[fill];
        u->str[fill] = '0';
    }

    return (PyObject*) u;
}
#endif
4438
#if 0
/* Debugging aid (disabled): report the current value of
   unicode_freelist_size as a Python int.  Takes no arguments. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self, PyObject *args)
{
    if (PyArg_NoArgs(args))
        return PyInt_FromLong(unicode_freelist_size);
    return NULL;
}
#endif
4448
4449static char startswith__doc__[] =
4450"S.startswith(prefix[, start[, end]]) -> int\n\
4451\n\
4452Return 1 if S starts with the specified prefix, otherwise return 0. With\n\
4453optional start, test S beginning at that position. With optional end, stop\n\
4454comparing S at that position.";
4455
4456static PyObject *
4457unicode_startswith(PyUnicodeObject *self,
4458 PyObject *args)
4459{
4460 PyUnicodeObject *substring;
4461 int start = 0;
4462 int end = INT_MAX;
4463 PyObject *result;
4464
Guido van Rossumb8872e62000-05-09 14:14:27 +00004465 if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &substring,
4466 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004467 return NULL;
4468 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4469 (PyObject *)substring);
4470 if (substring == NULL)
4471 return NULL;
4472
4473 result = PyInt_FromLong(tailmatch(self, substring, start, end, -1));
4474
4475 Py_DECREF(substring);
4476 return result;
4477}
4478
4479
4480static char endswith__doc__[] =
4481"S.endswith(suffix[, start[, end]]) -> int\n\
4482\n\
4483Return 1 if S ends with the specified suffix, otherwise return 0. With\n\
4484optional start, test S beginning at that position. With optional end, stop\n\
4485comparing S at that position.";
4486
4487static PyObject *
4488unicode_endswith(PyUnicodeObject *self,
4489 PyObject *args)
4490{
4491 PyUnicodeObject *substring;
4492 int start = 0;
4493 int end = INT_MAX;
4494 PyObject *result;
4495
Guido van Rossumb8872e62000-05-09 14:14:27 +00004496 if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &substring,
4497 _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +00004498 return NULL;
4499 substring = (PyUnicodeObject *)PyUnicode_FromObject(
4500 (PyObject *)substring);
4501 if (substring == NULL)
4502 return NULL;
4503
4504 result = PyInt_FromLong(tailmatch(self, substring, start, end, +1));
4505
4506 Py_DECREF(substring);
4507 return result;
4508}
4509
4510
4511static PyMethodDef unicode_methods[] = {
4512
4513 /* Order is according to common usage: often used methods should
4514 appear first, since lookup is done sequentially. */
4515
4516 {"encode", (PyCFunction) unicode_encode, 1, encode__doc__},
4517 {"replace", (PyCFunction) unicode_replace, 1, replace__doc__},
4518 {"split", (PyCFunction) unicode_split, 1, split__doc__},
4519 {"join", (PyCFunction) unicode_join, 1, join__doc__},
4520 {"capitalize", (PyCFunction) unicode_capitalize, 0, capitalize__doc__},
4521 {"title", (PyCFunction) unicode_title, 0, title__doc__},
4522 {"center", (PyCFunction) unicode_center, 1, center__doc__},
4523 {"count", (PyCFunction) unicode_count, 1, count__doc__},
4524 {"expandtabs", (PyCFunction) unicode_expandtabs, 1, expandtabs__doc__},
4525 {"find", (PyCFunction) unicode_find, 1, find__doc__},
4526 {"index", (PyCFunction) unicode_index, 1, index__doc__},
4527 {"ljust", (PyCFunction) unicode_ljust, 1, ljust__doc__},
4528 {"lower", (PyCFunction) unicode_lower, 0, lower__doc__},
4529 {"lstrip", (PyCFunction) unicode_lstrip, 0, lstrip__doc__},
4530/* {"maketrans", (PyCFunction) unicode_maketrans, 1, maketrans__doc__}, */
4531 {"rfind", (PyCFunction) unicode_rfind, 1, rfind__doc__},
4532 {"rindex", (PyCFunction) unicode_rindex, 1, rindex__doc__},
4533 {"rjust", (PyCFunction) unicode_rjust, 1, rjust__doc__},
4534 {"rstrip", (PyCFunction) unicode_rstrip, 0, rstrip__doc__},
4535 {"splitlines", (PyCFunction) unicode_splitlines, 1, splitlines__doc__},
4536 {"strip", (PyCFunction) unicode_strip, 0, strip__doc__},
4537 {"swapcase", (PyCFunction) unicode_swapcase, 0, swapcase__doc__},
4538 {"translate", (PyCFunction) unicode_translate, 1, translate__doc__},
4539 {"upper", (PyCFunction) unicode_upper, 0, upper__doc__},
4540 {"startswith", (PyCFunction) unicode_startswith, 1, startswith__doc__},
4541 {"endswith", (PyCFunction) unicode_endswith, 1, endswith__doc__},
4542 {"islower", (PyCFunction) unicode_islower, 0, islower__doc__},
4543 {"isupper", (PyCFunction) unicode_isupper, 0, isupper__doc__},
4544 {"istitle", (PyCFunction) unicode_istitle, 0, istitle__doc__},
4545 {"isspace", (PyCFunction) unicode_isspace, 0, isspace__doc__},
4546 {"isdecimal", (PyCFunction) unicode_isdecimal, 0, isdecimal__doc__},
4547 {"isdigit", (PyCFunction) unicode_isdigit, 0, isdigit__doc__},
4548 {"isnumeric", (PyCFunction) unicode_isnumeric, 0, isnumeric__doc__},
Marc-André Lemburga7acf422000-07-05 09:49:44 +00004549 {"isalpha", (PyCFunction) unicode_isalpha, 0, isalpha__doc__},
4550 {"isalnum", (PyCFunction) unicode_isalnum, 0, isalnum__doc__},
Guido van Rossumd57fd912000-03-10 22:53:23 +00004551#if 0
4552 {"zfill", (PyCFunction) unicode_zfill, 1, zfill__doc__},
4553 {"capwords", (PyCFunction) unicode_capwords, 0, capwords__doc__},
4554#endif
4555
4556#if 0
4557 /* This one is just used for debugging the implementation. */
4558 {"freelistsize", (PyCFunction) unicode_freelistsize, 0},
4559#endif
4560
4561 {NULL, NULL}
4562};
4563
4564static PyObject *
4565unicode_getattr(PyUnicodeObject *self, char *name)
4566{
4567 return Py_FindMethod(unicode_methods, (PyObject*) self, name);
4568}
4569
4570static PySequenceMethods unicode_as_sequence = {
4571 (inquiry) unicode_length, /* sq_length */
4572 (binaryfunc) PyUnicode_Concat, /* sq_concat */
4573 (intargfunc) unicode_repeat, /* sq_repeat */
4574 (intargfunc) unicode_getitem, /* sq_item */
4575 (intintargfunc) unicode_slice, /* sq_slice */
4576 0, /* sq_ass_item */
4577 0, /* sq_ass_slice */
Guido van Rossum403d68b2000-03-13 15:55:09 +00004578 (objobjproc)PyUnicode_Contains, /*sq_contains*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004579};
4580
4581static int
4582unicode_buffer_getreadbuf(PyUnicodeObject *self,
4583 int index,
4584 const void **ptr)
4585{
4586 if (index != 0) {
4587 PyErr_SetString(PyExc_SystemError,
4588 "accessing non-existent unicode segment");
4589 return -1;
4590 }
4591 *ptr = (void *) self->str;
4592 return PyUnicode_GET_DATA_SIZE(self);
4593}
4594
4595static int
4596unicode_buffer_getwritebuf(PyUnicodeObject *self, int index,
4597 const void **ptr)
4598{
4599 PyErr_SetString(PyExc_TypeError,
4600 "cannot use unicode as modifyable buffer");
4601 return -1;
4602}
4603
4604static int
4605unicode_buffer_getsegcount(PyUnicodeObject *self,
4606 int *lenp)
4607{
4608 if (lenp)
4609 *lenp = PyUnicode_GET_DATA_SIZE(self);
4610 return 1;
4611}
4612
4613static int
4614unicode_buffer_getcharbuf(PyUnicodeObject *self,
4615 int index,
4616 const void **ptr)
4617{
4618 PyObject *str;
4619
4620 if (index != 0) {
4621 PyErr_SetString(PyExc_SystemError,
4622 "accessing non-existent unicode segment");
4623 return -1;
4624 }
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00004625 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004626 if (str == NULL)
4627 return -1;
4628 *ptr = (void *) PyString_AS_STRING(str);
4629 return PyString_GET_SIZE(str);
4630}
4631
4632/* Helpers for PyUnicode_Format() */
4633
4634static PyObject *
Thomas Wouters78890102000-07-22 19:25:51 +00004635getnextarg(PyObject *args, int arglen, int *p_argidx)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004636{
4637 int argidx = *p_argidx;
4638 if (argidx < arglen) {
4639 (*p_argidx)++;
4640 if (arglen < 0)
4641 return args;
4642 else
4643 return PyTuple_GetItem(args, argidx);
4644 }
4645 PyErr_SetString(PyExc_TypeError,
4646 "not enough arguments for format string");
4647 return NULL;
4648}
4649
/* Bit flags collected while parsing a format specifier in
   PyUnicode_Format(); each is set by one flag character. */
#define F_LJUST (1 << 0)    /* '-' */
#define F_SIGN  (1 << 1)    /* '+' */
#define F_BLANK (1 << 2)    /* ' ' */
#define F_ALT   (1 << 3)    /* '#' */
#define F_ZERO  (1 << 4)    /* '0' */
4655
4656static
Guido van Rossumd57fd912000-03-10 22:53:23 +00004657int usprintf(register Py_UNICODE *buffer, char *format, ...)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004658{
4659 register int i;
4660 int len;
4661 va_list va;
4662 char *charbuffer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004663 va_start(va, format);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004664
4665 /* First, format the string as char array, then expand to Py_UNICODE
4666 array. */
4667 charbuffer = (char *)buffer;
4668 len = vsprintf(charbuffer, format, va);
4669 for (i = len - 1; i >= 0; i--)
4670 buffer[i] = (Py_UNICODE) charbuffer[i];
4671
4672 va_end(va);
4673 return len;
4674}
4675
4676static int
4677formatfloat(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004678 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004679 int flags,
4680 int prec,
4681 int type,
4682 PyObject *v)
4683{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004684 /* fmt = '%#.' + `prec` + `type`
4685 worst case length = 3 + 10 (len of INT_MAX) + 1 = 14 (use 20)*/
Guido van Rossumd57fd912000-03-10 22:53:23 +00004686 char fmt[20];
4687 double x;
4688
4689 x = PyFloat_AsDouble(v);
4690 if (x == -1.0 && PyErr_Occurred())
4691 return -1;
4692 if (prec < 0)
4693 prec = 6;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004694 if (type == 'f' && (fabs(x) / 1e25) >= 1e25)
4695 type = 'g';
4696 sprintf(fmt, "%%%s.%d%c", (flags & F_ALT) ? "#" : "", prec, type);
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004697 /* worst case length calc to ensure no buffer overrun:
4698 fmt = %#.<prec>g
4699 buf = '-' + [0-9]*prec + '.' + 'e+' + (longest exp
4700 for any double rep.)
4701 len = 1 + prec + 1 + 2 + 5 = 9 + prec
4702 If prec=0 the effective precision is 1 (the leading digit is
4703 always given), therefore increase by one to 10+prec. */
4704 if (buflen <= (size_t)10 + (size_t)prec) {
4705 PyErr_SetString(PyExc_OverflowError,
4706 "formatted float is too long (precision too long?)");
4707 return -1;
4708 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004709 return usprintf(buf, fmt, x);
4710}
4711
Tim Peters38fd5b62000-09-21 05:43:11 +00004712static PyObject*
4713formatlong(PyObject *val, int flags, int prec, int type)
4714{
4715 char *buf;
4716 int i, len;
4717 PyObject *str; /* temporary string object. */
4718 PyUnicodeObject *result;
4719
4720 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
4721 if (!str)
4722 return NULL;
4723 result = _PyUnicode_New(len);
4724 for (i = 0; i < len; i++)
4725 result->str[i] = buf[i];
4726 result->str[len] = 0;
4727 Py_DECREF(str);
4728 return (PyObject*)result;
4729}
4730
Guido van Rossumd57fd912000-03-10 22:53:23 +00004731static int
4732formatint(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004733 size_t buflen,
Guido van Rossumd57fd912000-03-10 22:53:23 +00004734 int flags,
4735 int prec,
4736 int type,
4737 PyObject *v)
4738{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004739 /* fmt = '%#.' + `prec` + 'l' + `type`
Tim Peters38fd5b62000-09-21 05:43:11 +00004740 worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
4741 + 1 + 1 = 24*/
4742 char fmt[64]; /* plenty big enough! */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004743 long x;
4744
4745 x = PyInt_AsLong(v);
4746 if (x == -1 && PyErr_Occurred())
4747 return -1;
4748 if (prec < 0)
4749 prec = 1;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004750 /* buf = '+'/'-'/'0'/'0x' + '[0-9]'*max(prec,len(x in octal))
4751 worst case buf = '0x' + [0-9]*prec, where prec >= 11 */
4752 if (buflen <= 13 || buflen <= (size_t)2+(size_t)prec) {
4753 PyErr_SetString(PyExc_OverflowError,
4754 "formatted integer is too long (precision too long?)");
4755 return -1;
4756 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004757 sprintf(fmt, "%%%s.%dl%c", (flags & F_ALT) ? "#" : "", prec, type);
4758 return usprintf(buf, fmt, x);
4759}
4760
4761static int
4762formatchar(Py_UNICODE *buf,
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004763 size_t buflen,
4764 PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004765{
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004766 /* presume that the buffer is at least 2 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004767 if (PyUnicode_Check(v)) {
4768 if (PyUnicode_GET_SIZE(v) != 1)
4769 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004770 buf[0] = PyUnicode_AS_UNICODE(v)[0];
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004771 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004772
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004773 else if (PyString_Check(v)) {
4774 if (PyString_GET_SIZE(v) != 1)
4775 goto onError;
4776 buf[0] = (Py_UNICODE)PyString_AS_STRING(v)[0];
4777 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00004778
4779 else {
4780 /* Integer input truncated to a character */
4781 long x;
4782 x = PyInt_AsLong(v);
4783 if (x == -1 && PyErr_Occurred())
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004784 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004785 buf[0] = (char) x;
4786 }
4787 buf[1] = '\0';
4788 return 1;
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00004789
4790 onError:
4791 PyErr_SetString(PyExc_TypeError,
4792 "%c requires int or char");
4793 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004794}
4795
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the floats, ints, &
   chars are formatted.  XXX This is a magic number.  Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format.  For now, the current solution is sufficient.

   The expansion is fully parenthesized so that the macro behaves as a
   single operand in any expression (e.g. `sizeof FORMATBUFLEN`).
*/
#define FORMATBUFLEN ((size_t)120)
4805
Guido van Rossumd57fd912000-03-10 22:53:23 +00004806PyObject *PyUnicode_Format(PyObject *format,
4807 PyObject *args)
4808{
4809 Py_UNICODE *fmt, *res;
4810 int fmtcnt, rescnt, reslen, arglen, argidx;
4811 int args_owned = 0;
4812 PyUnicodeObject *result = NULL;
4813 PyObject *dict = NULL;
4814 PyObject *uformat;
4815
4816 if (format == NULL || args == NULL) {
4817 PyErr_BadInternalCall();
4818 return NULL;
4819 }
4820 uformat = PyUnicode_FromObject(format);
Fred Drakee4315f52000-05-09 19:53:39 +00004821 if (uformat == NULL)
4822 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004823 fmt = PyUnicode_AS_UNICODE(uformat);
4824 fmtcnt = PyUnicode_GET_SIZE(uformat);
4825
4826 reslen = rescnt = fmtcnt + 100;
4827 result = _PyUnicode_New(reslen);
4828 if (result == NULL)
4829 goto onError;
4830 res = PyUnicode_AS_UNICODE(result);
4831
4832 if (PyTuple_Check(args)) {
4833 arglen = PyTuple_Size(args);
4834 argidx = 0;
4835 }
4836 else {
4837 arglen = -1;
4838 argidx = -2;
4839 }
4840 if (args->ob_type->tp_as_mapping)
4841 dict = args;
4842
4843 while (--fmtcnt >= 0) {
4844 if (*fmt != '%') {
4845 if (--rescnt < 0) {
4846 rescnt = fmtcnt + 100;
4847 reslen += rescnt;
4848 if (_PyUnicode_Resize(result, reslen) < 0)
4849 return NULL;
4850 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
4851 --rescnt;
4852 }
4853 *res++ = *fmt++;
4854 }
4855 else {
4856 /* Got a format specifier */
4857 int flags = 0;
4858 int width = -1;
4859 int prec = -1;
4860 int size = 0;
4861 Py_UNICODE c = '\0';
4862 Py_UNICODE fill;
4863 PyObject *v = NULL;
4864 PyObject *temp = NULL;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004865 Py_UNICODE *pbuf;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004866 Py_UNICODE sign;
4867 int len;
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00004868 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{float,int,char}() */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004869
4870 fmt++;
4871 if (*fmt == '(') {
4872 Py_UNICODE *keystart;
4873 int keylen;
4874 PyObject *key;
4875 int pcount = 1;
4876
4877 if (dict == NULL) {
4878 PyErr_SetString(PyExc_TypeError,
4879 "format requires a mapping");
4880 goto onError;
4881 }
4882 ++fmt;
4883 --fmtcnt;
4884 keystart = fmt;
4885 /* Skip over balanced parentheses */
4886 while (pcount > 0 && --fmtcnt >= 0) {
4887 if (*fmt == ')')
4888 --pcount;
4889 else if (*fmt == '(')
4890 ++pcount;
4891 fmt++;
4892 }
4893 keylen = fmt - keystart - 1;
4894 if (fmtcnt < 0 || pcount > 0) {
4895 PyErr_SetString(PyExc_ValueError,
4896 "incomplete format key");
4897 goto onError;
4898 }
Fred Drakee4315f52000-05-09 19:53:39 +00004899 /* keys are converted to strings using UTF-8 and
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900 then looked up since Python uses strings to hold
4901 variables names etc. in its namespaces and we
Fred Drakee4315f52000-05-09 19:53:39 +00004902 wouldn't want to break common idioms. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00004903 key = PyUnicode_EncodeUTF8(keystart,
4904 keylen,
4905 NULL);
4906 if (key == NULL)
4907 goto onError;
4908 if (args_owned) {
4909 Py_DECREF(args);
4910 args_owned = 0;
4911 }
4912 args = PyObject_GetItem(dict, key);
4913 Py_DECREF(key);
4914 if (args == NULL) {
4915 goto onError;
4916 }
4917 args_owned = 1;
4918 arglen = -1;
4919 argidx = -2;
4920 }
4921 while (--fmtcnt >= 0) {
4922 switch (c = *fmt++) {
4923 case '-': flags |= F_LJUST; continue;
4924 case '+': flags |= F_SIGN; continue;
4925 case ' ': flags |= F_BLANK; continue;
4926 case '#': flags |= F_ALT; continue;
4927 case '0': flags |= F_ZERO; continue;
4928 }
4929 break;
4930 }
4931 if (c == '*') {
4932 v = getnextarg(args, arglen, &argidx);
4933 if (v == NULL)
4934 goto onError;
4935 if (!PyInt_Check(v)) {
4936 PyErr_SetString(PyExc_TypeError,
4937 "* wants int");
4938 goto onError;
4939 }
4940 width = PyInt_AsLong(v);
4941 if (width < 0) {
4942 flags |= F_LJUST;
4943 width = -width;
4944 }
4945 if (--fmtcnt >= 0)
4946 c = *fmt++;
4947 }
4948 else if (c >= '0' && c <= '9') {
4949 width = c - '0';
4950 while (--fmtcnt >= 0) {
4951 c = *fmt++;
4952 if (c < '0' || c > '9')
4953 break;
4954 if ((width*10) / 10 != width) {
4955 PyErr_SetString(PyExc_ValueError,
4956 "width too big");
4957 goto onError;
4958 }
4959 width = width*10 + (c - '0');
4960 }
4961 }
4962 if (c == '.') {
4963 prec = 0;
4964 if (--fmtcnt >= 0)
4965 c = *fmt++;
4966 if (c == '*') {
4967 v = getnextarg(args, arglen, &argidx);
4968 if (v == NULL)
4969 goto onError;
4970 if (!PyInt_Check(v)) {
4971 PyErr_SetString(PyExc_TypeError,
4972 "* wants int");
4973 goto onError;
4974 }
4975 prec = PyInt_AsLong(v);
4976 if (prec < 0)
4977 prec = 0;
4978 if (--fmtcnt >= 0)
4979 c = *fmt++;
4980 }
4981 else if (c >= '0' && c <= '9') {
4982 prec = c - '0';
4983 while (--fmtcnt >= 0) {
4984 c = Py_CHARMASK(*fmt++);
4985 if (c < '0' || c > '9')
4986 break;
4987 if ((prec*10) / 10 != prec) {
4988 PyErr_SetString(PyExc_ValueError,
4989 "prec too big");
4990 goto onError;
4991 }
4992 prec = prec*10 + (c - '0');
4993 }
4994 }
4995 } /* prec */
4996 if (fmtcnt >= 0) {
4997 if (c == 'h' || c == 'l' || c == 'L') {
4998 size = c;
4999 if (--fmtcnt >= 0)
5000 c = *fmt++;
5001 }
5002 }
5003 if (fmtcnt < 0) {
5004 PyErr_SetString(PyExc_ValueError,
5005 "incomplete format");
5006 goto onError;
5007 }
5008 if (c != '%') {
5009 v = getnextarg(args, arglen, &argidx);
5010 if (v == NULL)
5011 goto onError;
5012 }
5013 sign = 0;
5014 fill = ' ';
5015 switch (c) {
5016
5017 case '%':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005018 pbuf = formatbuf;
5019 /* presume that buffer length is at least 1 */
5020 pbuf[0] = '%';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005021 len = 1;
5022 break;
5023
5024 case 's':
5025 case 'r':
5026 if (PyUnicode_Check(v) && c == 's') {
5027 temp = v;
5028 Py_INCREF(temp);
5029 }
5030 else {
5031 PyObject *unicode;
5032 if (c == 's')
5033 temp = PyObject_Str(v);
5034 else
5035 temp = PyObject_Repr(v);
5036 if (temp == NULL)
5037 goto onError;
5038 if (!PyString_Check(temp)) {
5039 /* XXX Note: this should never happen, since
5040 PyObject_Repr() and PyObject_Str() assure
5041 this */
5042 Py_DECREF(temp);
5043 PyErr_SetString(PyExc_TypeError,
5044 "%s argument has non-string str()");
5045 goto onError;
5046 }
Fred Drakee4315f52000-05-09 19:53:39 +00005047 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
Guido van Rossumd57fd912000-03-10 22:53:23 +00005048 PyString_GET_SIZE(temp),
Fred Drakee4315f52000-05-09 19:53:39 +00005049 NULL,
Guido van Rossumd57fd912000-03-10 22:53:23 +00005050 "strict");
5051 Py_DECREF(temp);
5052 temp = unicode;
5053 if (temp == NULL)
5054 goto onError;
5055 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005056 pbuf = PyUnicode_AS_UNICODE(temp);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005057 len = PyUnicode_GET_SIZE(temp);
5058 if (prec >= 0 && len > prec)
5059 len = prec;
5060 break;
5061
5062 case 'i':
5063 case 'd':
5064 case 'u':
5065 case 'o':
5066 case 'x':
5067 case 'X':
5068 if (c == 'i')
5069 c = 'd';
Tim Petersa3a3a032000-11-30 05:22:44 +00005070 if (PyLong_Check(v)) {
Tim Peters38fd5b62000-09-21 05:43:11 +00005071 temp = formatlong(v, flags, prec, c);
5072 if (!temp)
5073 goto onError;
5074 pbuf = PyUnicode_AS_UNICODE(temp);
5075 len = PyUnicode_GET_SIZE(temp);
5076 /* unbounded ints can always produce
5077 a sign character! */
5078 sign = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005079 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005080 else {
5081 pbuf = formatbuf;
5082 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5083 flags, prec, c, v);
5084 if (len < 0)
5085 goto onError;
5086 /* only d conversion is signed */
5087 sign = c == 'd';
5088 }
5089 if (flags & F_ZERO)
5090 fill = '0';
Guido van Rossumd57fd912000-03-10 22:53:23 +00005091 break;
5092
5093 case 'e':
5094 case 'E':
5095 case 'f':
5096 case 'g':
5097 case 'G':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005098 pbuf = formatbuf;
5099 len = formatfloat(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
5100 flags, prec, c, v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005101 if (len < 0)
5102 goto onError;
5103 sign = 1;
Tim Peters38fd5b62000-09-21 05:43:11 +00005104 if (flags & F_ZERO)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005105 fill = '0';
5106 break;
5107
5108 case 'c':
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005109 pbuf = formatbuf;
5110 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005111 if (len < 0)
5112 goto onError;
5113 break;
5114
5115 default:
5116 PyErr_Format(PyExc_ValueError,
Andrew M. Kuchling6ca89172000-12-15 13:07:46 +00005117 "unsupported format character '%c' (0x%x) "
5118 "at index %i",
Andrew M. Kuchlingf947ffe2000-12-19 22:49:06 +00005119 (31<=c && c<=126) ? c : '?',
5120 c, fmt -1 - PyUnicode_AS_UNICODE(uformat));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005121 goto onError;
5122 }
5123 if (sign) {
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005124 if (*pbuf == '-' || *pbuf == '+') {
5125 sign = *pbuf++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005126 len--;
5127 }
5128 else if (flags & F_SIGN)
5129 sign = '+';
5130 else if (flags & F_BLANK)
5131 sign = ' ';
5132 else
5133 sign = 0;
5134 }
5135 if (width < len)
5136 width = len;
5137 if (rescnt < width + (sign != 0)) {
5138 reslen -= rescnt;
5139 rescnt = width + fmtcnt + 100;
5140 reslen += rescnt;
5141 if (_PyUnicode_Resize(result, reslen) < 0)
5142 return NULL;
5143 res = PyUnicode_AS_UNICODE(result)
5144 + reslen - rescnt;
5145 }
5146 if (sign) {
5147 if (fill != ' ')
5148 *res++ = sign;
5149 rescnt--;
5150 if (width > len)
5151 width--;
5152 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005153 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5154 assert(pbuf[0] == '0');
5155 assert(pbuf[1] == c);
5156 if (fill != ' ') {
5157 *res++ = *pbuf++;
5158 *res++ = *pbuf++;
5159 }
5160 rescnt -= 2;
5161 width -= 2;
5162 if (width < 0)
5163 width = 0;
5164 len -= 2;
5165 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005166 if (width > len && !(flags & F_LJUST)) {
5167 do {
5168 --rescnt;
5169 *res++ = fill;
5170 } while (--width > len);
5171 }
Tim Peters38fd5b62000-09-21 05:43:11 +00005172 if (fill == ' ') {
5173 if (sign)
5174 *res++ = sign;
5175 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
5176 assert(pbuf[0] == '0');
5177 assert(pbuf[1] == c);
5178 *res++ = *pbuf++;
5179 *res++ = *pbuf++;
5180 }
5181 }
Marc-André Lemburgf28dd832000-06-30 10:29:57 +00005182 memcpy(res, pbuf, len * sizeof(Py_UNICODE));
Guido van Rossumd57fd912000-03-10 22:53:23 +00005183 res += len;
5184 rescnt -= len;
5185 while (--width >= len) {
5186 --rescnt;
5187 *res++ = ' ';
5188 }
5189 if (dict && (argidx < arglen) && c != '%') {
5190 PyErr_SetString(PyExc_TypeError,
5191 "not all arguments converted");
5192 goto onError;
5193 }
5194 Py_XDECREF(temp);
5195 } /* '%' */
5196 } /* until end */
5197 if (argidx < arglen && !dict) {
5198 PyErr_SetString(PyExc_TypeError,
5199 "not all arguments converted");
5200 goto onError;
5201 }
5202
5203 if (args_owned) {
5204 Py_DECREF(args);
5205 }
5206 Py_DECREF(uformat);
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005207 if (_PyUnicode_Resize(result, reslen - rescnt))
5208 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005209 return (PyObject *)result;
5210
5211 onError:
5212 Py_XDECREF(result);
5213 Py_DECREF(uformat);
5214 if (args_owned) {
5215 Py_DECREF(args);
5216 }
5217 return NULL;
5218}
5219
5220static PyBufferProcs unicode_as_buffer = {
5221 (getreadbufferproc) unicode_buffer_getreadbuf,
5222 (getwritebufferproc) unicode_buffer_getwritebuf,
5223 (getsegcountproc) unicode_buffer_getsegcount,
5224 (getcharbufferproc) unicode_buffer_getcharbuf,
5225};
5226
5227PyTypeObject PyUnicode_Type = {
5228 PyObject_HEAD_INIT(&PyType_Type)
5229 0, /* ob_size */
5230 "unicode", /* tp_name */
5231 sizeof(PyUnicodeObject), /* tp_size */
5232 0, /* tp_itemsize */
5233 /* Slots */
5234 (destructor)_PyUnicode_Free, /* tp_dealloc */
5235 0, /* tp_print */
5236 (getattrfunc)unicode_getattr, /* tp_getattr */
5237 0, /* tp_setattr */
5238 (cmpfunc) unicode_compare, /* tp_compare */
5239 (reprfunc) unicode_repr, /* tp_repr */
5240 0, /* tp_as_number */
5241 &unicode_as_sequence, /* tp_as_sequence */
5242 0, /* tp_as_mapping */
5243 (hashfunc) unicode_hash, /* tp_hash*/
5244 0, /* tp_call*/
5245 (reprfunc) unicode_str, /* tp_str */
5246 (getattrofunc) NULL, /* tp_getattro */
5247 (setattrofunc) NULL, /* tp_setattro */
5248 &unicode_as_buffer, /* tp_as_buffer */
5249 Py_TPFLAGS_DEFAULT, /* tp_flags */
5250};
5251
5252/* Initialize the Unicode implementation */
5253
Thomas Wouters78890102000-07-22 19:25:51 +00005254void _PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005255{
5256 /* Doublecheck the configuration... */
5257 if (sizeof(Py_UNICODE) != 2)
5258 Py_FatalError("Unicode configuration error: "
5259 "sizeof(Py_UNICODE) != 2 bytes");
5260
Fred Drakee4315f52000-05-09 19:53:39 +00005261 /* Init the implementation */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005262 unicode_freelist = NULL;
5263 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005264 unicode_empty = _PyUnicode_New(0);
Marc-André Lemburg90e81472000-06-07 09:13:21 +00005265 strcpy(unicode_default_encoding, "ascii");
Guido van Rossumd57fd912000-03-10 22:53:23 +00005266}
5267
5268/* Finalize the Unicode implementation */
5269
5270void
Thomas Wouters78890102000-07-22 19:25:51 +00005271_PyUnicode_Fini(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005272{
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005273 PyUnicodeObject *u;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005274
Guido van Rossum4ae8ef82000-10-03 18:09:04 +00005275 Py_XDECREF(unicode_empty);
5276 unicode_empty = NULL;
Barry Warsaw5b4c2282000-10-03 20:45:26 +00005277
5278 for (u = unicode_freelist; u != NULL;) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005279 PyUnicodeObject *v = u;
5280 u = *(PyUnicodeObject **)u;
Guido van Rossumfd4b9572000-04-10 13:51:10 +00005281 if (v->str)
Guido van Rossumb18618d2000-05-03 23:44:39 +00005282 PyMem_DEL(v->str);
Marc-André Lemburgbff879c2000-08-03 18:46:08 +00005283 Py_XDECREF(v->defenc);
Guido van Rossumb18618d2000-05-03 23:44:39 +00005284 PyObject_DEL(v);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005285 }
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +00005286 unicode_freelist = NULL;
5287 unicode_freelist_size = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005288}